In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pandas as pd

2026-02-06 08:57:30.142162: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


kaggle datasets download -d mirzaniazmorshed/ntsb-aviation-accidents
unzip ntsb-aviation-accidents.zip -d ntsb_data

In [2]:

df_narr = pd.read_excel('ntsb_data/narratives.xlsx')
df_find = pd.read_excel('ntsb_data/Findings_merged.xlsx')
df_joined = pd.merge(df_narr, df_find, on='ev_id', how='inner')
df = df_joined[['narr_accp', 'finding_description']].dropna()

df['broad_cause'] = df['finding_description'].str.split('-').str[0]
df = df[df.groupby('broad_cause')['broad_cause'].transform('count') > 50]
df['broad_cause'].value_counts()

broad_cause
Personnel issues         24358
Aircraft                 21886
Environmental issues     11838
Not determined            2134
Organizational issues      709
Name: count, dtype: int64

In [4]:
df['finding_description']

0        Personnel issues-Action/decision-Info processi...
1        Personnel issues-Action/decision-Info processi...
2        Personnel issues-Action/decision-Info processi...
3        Personnel issues-Action/decision-Info processi...
4        Environmental issues-Conditions/weather/phenom...
                               ...                        
67039    Personnel issues-Psychological-Perception/orie...
67055    Aircraft-Aircraft systems-Electrical power sys...
67056    Personnel issues-Task performance-Maintenance-...
67060    Aircraft-Aircraft oper/perf/capability-Perform...
67061    Aircraft-Aircraft structures-Empennage structu...
Name: finding_description, Length: 58082, dtype: str

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight


class_counts = df['broad_cause'].value_counts()
valid_classes = class_counts[class_counts > 3000].index
df = df[df['broad_cause'].isin(valid_classes)].copy()

encoder = LabelEncoder()
df['label_int'] = encoder.fit_transform(df['broad_cause'])
num_classes = len(encoder.classes_)


doc_lens = df['narr_accp'].astype(str).apply(lambda x: len(x.split()))
sequence_length = int(np.percentile(doc_lens, 95))
print(f"Optimal sequence length (95th percentile): {sequence_length}")


max_tokens = 15000  # Increased for the 3090
vectorize_layer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=sequence_length,
    ngrams=(1, 2) # Captures bigrams like "engine_failure" or "pilot_error"
)
vectorize_layer.adapt(df['narr_accp'].values)


model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    layers.Embedding(input_dim=max_tokens, output_dim=128, mask_zero=False),
    
    # Convolutional block: Looks for "phrases" of 5 tokens
    layers.Conv1D(128, 5, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=2),
    
    # Deeper block for more abstract features
    layers.Conv1D(256, 5, activation='relu', padding='same'),
    layers.GlobalMaxPooling1D(), # Picks the most descriptive "feature" found
    
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5), # Essential to prevent over-memorizing technical templates
    layers.Dense(num_classes, activation='softmax')
])

train_x = df['narr_accp'].values.astype(str)
train_y = df['label_int'].values.astype('int32')

weights = class_weight.compute_class_weight('balanced', classes=np.unique(train_y), y=train_y)
class_weight_dict = dict(enumerate(weights))

dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
dataset = dataset.shuffle(5000).batch(64).prefetch(tf.data.AUTOTUNE)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(dataset, epochs=15, class_weight=class_weight_dict)


Optimal sequence length (95th percentile): 667


I0000 00:00:1770364713.724524    6393 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21764 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:2b:00.0, compute capability: 8.6
2026-02-06 08:58:39.042922: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 1081598880 exceeds 10% of free system memory.


Epoch 1/15


2026-02-06 08:58:51.016375: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91900


[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 20ms/step - accuracy: 0.3947 - loss: 1.0508
Epoch 2/15
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 19ms/step - accuracy: 0.4420 - loss: 0.9889
Epoch 3/15
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 19ms/step - accuracy: 0.4504 - loss: 0.9579
Epoch 4/15
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 19ms/step - accuracy: 0.4623 - loss: 0.9312
Epoch 5/15
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 19ms/step - accuracy: 0.4627 - loss: 0.9072
Epoch 6/15
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 19ms/step - accuracy: 0.4683 - loss: 0.8871
Epoch 7/15
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 19ms/step - accuracy: 0.4744 - loss: 0.8745
Epoch 8/15
[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 19ms/step - accuracy: 0.4811 - loss: 0.8603
Epoch 9/15
[1m908/908[0m [32m━━━

<keras.src.callbacks.history.History at 0x796629530ce0>