# Neiss Datasets

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
df_list = []
for year in range(2014, 2024):
    path = f'../data/neiss{year}.tsv'
    print(path, '...', sep='')
    try:
        df_list.append(pd.read_csv(path, sep='\t'))
    except UnicodeDecodeError:
        print('Exception Caught')
        print(path, '...', sep='')
        print(len(pd.read_csv(path, sep='\t', encoding='ISO-8859-1')))
        df_list.append(pd.read_csv(path, sep='\t', encoding='ISO-8859-1'))
df = pd.concat(df_list, axis=0, ignore_index=True)

../data/neiss2014.tsv...
../data/neiss2015.tsv...
../data/neiss2016.tsv...
../data/neiss2017.tsv...
Exception Caught
../data/neiss2017.tsv...


  print(len(pd.read_csv(path, sep='\t', encoding='ISO-8859-1')))


386907


  df_list.append(pd.read_csv(path, sep='\t', encoding='ISO-8859-1'))


../data/neiss2018.tsv...


  df_list.append(pd.read_csv(path, sep='\t'))


../data/neiss2019.tsv...


  df_list.append(pd.read_csv(path, sep='\t'))


../data/neiss2020.tsv...
../data/neiss2021.tsv...


  df_list.append(pd.read_csv(path, sep='\t'))


../data/neiss2022.tsv...
../data/neiss2023.tsv...


  df_list.append(pd.read_csv(path, sep='\t'))


In [4]:
# Drop records with missing Narrative_1 data
df.drop(df[df['Narrative_1'].isnull()].index, axis=0, inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3520522 entries, 0 to 3520529
Data columns (total 25 columns):
 #   Column             Dtype  
---  ------             -----  
 0   CPSC_Case_Number   object 
 1   Treatment_Date     object 
 2   Age                int64  
 3   Sex                float64
 4   Race               float64
 5   Other_Race         object 
 6   Hispanic           float64
 7   Body_Part          float64
 8   Diagnosis          float64
 9   Other_Diagnosis    object 
 10  Body_Part_2        float64
 11  Diagnosis_2        float64
 12  Other_Diagnosis_2  object 
 13  Disposition        float64
 14  Location           float64
 15  Fire_Involvement   float64
 16  Product_1          float64
 17  Product_2          float64
 18  Product_3          float64
 19  Alcohol            float64
 20  Drug               float64
 21  Narrative_1        object 
 22  Stratum            object 
 23  PSU                float64
 24  Weight             float64
dtypes: float64(17), int64(1

In [6]:
from transformers import BertTokenizer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# Fine Tuning Distil-BERT

In [7]:
from datasets import Dataset, ClassLabel, Features, Value
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from evaluate import load
import torch.nn.functional as F
import torch

from peft import LoraConfig, get_peft_model

from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

2025-05-16 15:26:12.859397: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. Combine labeled data to make the train/validation set

In [8]:
labeled_sample_1 = pd.read_csv('ped_accident_labels.csv', index_col='index')
labeled_sample_2 = pd.read_csv('performance_df.csv', index_col=0)
labeled_sample_3 = pd.read_csv('holdout_labeled.csv', index_col=0)

In [9]:
labeled_sample_1.head()

Unnamed: 0_level_0,Narrative_1,Pedestrian Label
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,16YOM PRESENTS AFTER BEING PEDESTRIAN STRUCK W...,0
1,55 YOM DX NECK AND BACK PAIN - S/P PT PEDESTRI...,0
2,23YOM WITH ELBOW PAIN AFTER RIDING MOPED TO WO...,0
3,50YOM W/THORACIC & LUMBAR BACK STRAIN S/P PEDE...,1
4,^66YOM PEDESTRIAN WHO JUMPED TO GET OUT OF WAY...,0


In [10]:
labeled_sample_2.head()

Unnamed: 0_level_0,Narrative_1,LLM Classification,Human Label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
522,"6YOM WHO WAS RIDING HIS BIKE AND LOST CONTROL,...",Pedestrian,1
116,27 YOM W/HELMET SWERVED BIKE TO AVOID PEDESTRI...,Not Pedestrian,0
56,49 YOM DX BACK CONTUSION - S/P BICYCLIST STRUC...,Not Pedestrian,0
72,"18YOM W/OPEN FXS OF TIBIA & FIBULA,ABRAS HIP &...",Pedestrian,1
1273,18YOF BIBEMS AFTER STRUCK BY CAR AND PROPELLED...,Pedestrian,1


In [11]:
labeled_sample_3.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,Body_Part_2,Diagnosis_2,Other_Diagnosis_2,Disposition,Location,Fire_Involvement,Product_1,Product_2,Product_3,Alcohol,Drug,Narrative_1,Stratum,PSU,Weight,group,Pedestrian Probability,Human Label
1373638,170957638,09/24/2017,52,2,0,,,75,62,,,,,1,4,0,5040,0,0,,,52YF HELM'D BICYCLIST CRASHED INTO PEDESTRIAN&...,V,21,16.1828,group_a,0.112776,0
2870136,220159454,01/25/2022,24,2,4,,2.0,37,64,,,,,1,4,0,5040,0,0,0.0,0.0,"24 YOF IN MVA , PEDESTRIAN STRUCK AND INJ LEG ...",V,38,17.2223,group_a,0.387114,1
3007698,220705908,06/09/2022,12,1,0,,0.0,75,52,,,,,4,4,0,5040,0,0,0.0,0.0,12YOM BROUGHT IN BY AMBULANCE AFTER A PEDESTRI...,C,37,6.6676,group_a,0.928713,1
703697,151238381,11/18/2015,40,1,2,,,36,57,,,,,1,4,0,5040,0,0,,,40 YOM DX DISPLACED FX OF LT TIBIAL SPINE - S/...,V,57,16.565,group_a,0.299209,0
849328,160538697,05/12/2016,25,2,2,,,81,71,LEG/MOUTH PAIN,,,,1,4,0,5040,0,0,,,25 YOF DX LEG/MOUTH PAIN - S/P PT PEDESTRIAN S...,V,57,14.6504,group_a,0.624338,0


In [12]:
# Combining the three labeled samples, only keeping the narrative and the human label
train_validation = pd.concat(
    [labeled_sample_1[['Narrative_1', 'Pedestrian Label']].rename(columns={'Pedestrian Label': 'Human Label'}),
     labeled_sample_2[['Narrative_1', 'Human Label']],
     labeled_sample_3[['Narrative_1', 'Human Label']]
     ],
    axis=0)

## 2. Create Holdout Set

In [13]:
# Step 2: Filter further by location codes of interest (e.g., street or highway)
filtered_df = df[df['Location'].isin([4, 5])]

# ~96,000 samples remain (~76.5 minutes to run locally)
print(len(filtered_df))

329568


In [14]:
# # Step 1: Filter narratives that contain relevant keywords related to pedestrian activity or motor vehicle involvement
# search_words = [
#     "walking", "walk", "jogging", "jog", "running", "run", "on foot", "bystander",
#     "standing", "biking", "bike", "roller skating", "roller skates", "skateboarding",
#     "skateboard", "scootering", "scooter", "pedestr", "pedst", "struck by", "hit by"
# ]

# filtered_df = filtered_df[filtered_df['Narrative_1'].str.contains('|'.join(search_words), case=False, na=False)]

# # ~470,000 samples contain at least one keyword (~6.25 hours to run locally)
# len(filtered_df)

In [15]:
# Step 3: Focus on narratives that explicitly mention "pedestrian", "struck by", or "hit by"
filtered_df = filtered_df[filtered_df['Narrative_1'].str.contains('|'.join(["pedestr", "pedst", "struck by", "hit by"]), case=False, na=False)]

# ~20,000 high-priority samples (~15 minutes to run locally)
print(len(filtered_df))

19489


This subset of ~20,000 samples contains location-relevant narratives with keywords most likely to reflect pedestrian injuries involving motor vehicles. This will serve as the primary dataset for local experimentation. In future iterations, model weights may be exported and applied at scale on larger datasets using cloud resources.

In [16]:
# Excluding data that out model will be trained on
filtered_df = filtered_df[~filtered_df['Narrative_1'].isin(train_validation['Narrative_1'])]

# There are 19,226 samples that are not in our train/validation set
# NOTE: Some data in our train_validation set are not in our original filtered_df since the location codes outside of 4 and 5 were not excluded initially
len(filtered_df)

19076

In [17]:
# Mask that contains all of the narratives containing pedestrian
pedestrian_mask = filtered_df['Narrative_1'].str.contains('pedst|pedestr', case=False, na=False)
# Mask containing all of the narratives containing struck by/hit by
struck_hit_mask = filtered_df['Narrative_1'].str.contains('struck by|hit by', case=False, na=False)

# df containing 50 "pedestrian" samples that don't contain "struck/hit" by
group_a = filtered_df[pedestrian_mask & ~struck_hit_mask].sample(50, random_state=42)

# df containing 50 "struck/hit by" samples that dont contain "pedestrian"
group_b = filtered_df[struck_hit_mask & ~pedestrian_mask].sample(50, random_state=42)

# df containing 50 samples where both "pedestrian" and "struck/hit by" are present in the narrative
group_c = filtered_df[struck_hit_mask & pedestrian_mask].sample(50, random_state=42)

# Creating flags to test the models performance among each group
group_a['group'] = 'group_a'
group_b['group'] = 'group_b'
group_c['group'] = 'group_c'

# Holdout set containing all three groups
holdout = pd.concat([group_a, group_b, group_c], axis=0)

In [18]:
holdout.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,Body_Part_2,Diagnosis_2,Other_Diagnosis_2,Disposition,Location,Fire_Involvement,Product_1,Product_2,Product_3,Alcohol,Drug,Narrative_1,Stratum,PSU,Weight,group
2995637,220649382,05/18/2022,17,2.0,2.0,,0.0,30.0,55.0,,,,,1.0,4.0,0.0,5040.0,0.0,0.0,0.0,0.0,"17YOF PRESENTED TO ED C/O PEDESTRAIN STRUCK, P...",V,57.0,18.1791,group_a
964545,160906268,05/17/2016,43,1.0,0.0,,,75.0,62.0,,,,,1.0,4.0,0.0,5040.0,0.0,0.0,,,"43YOM W/CHI,FACIAL CONT,ABRAS & PAIN TO ELBOW ...",V,41.0,14.6504,group_a
3253168,230424553,04/07/2023,23,1.0,5.0,,2.0,76.0,57.0,,88.0,59.0,,1.0,4.0,0.0,1333.0,0.0,0.0,0.0,0.0,23YOM PATIENT WAS ON HIS SKATEBOARD ON HIS WAY...,S,47.0,76.8216,group_a
3313305,230638234,06/10/2023,53,1.0,2.0,,2.0,87.0,71.0,OSTEOPHYTE,,,,1.0,4.0,0.0,5040.0,0.0,0.0,0.0,0.0,53YOM BROUGHT IN BY EMS WITH CC OF LEFT FOREAR...,M,76.0,79.7644,group_a
1316897,170817237,08/03/2017,25,1.0,2.0,,,93.0,57.0,,,,,1.0,5.0,0.0,1211.0,0.0,0.0,,,"25YOM S/P PEDESTRIAN VS CAR ACCID,5PM TODAY,PT...",L,3.0,67.2099,group_a


## 3. Train the model

In [51]:
features = Features({
    'text': Value('string'),
    'labels': ClassLabel(names=["Not Pedestrian", "Pedestrian"])
})

dataset = Dataset.from_pandas(
    df=train_validation.rename(columns={'Narrative_1':'text', 'Human Label': 'labels'}).reset_index(drop=True),
    features=features
)

dataset = dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column='labels')

In [52]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 389
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 98
    })
})

In [53]:
# Initializing tokenizer
pretrained_model="distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# defining the tokenization function
def tokenize(sample):
    return tokenizer(
        sample['text'],
        # padding='max_length',
        truncation=True
    )
    
results = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 389/389 [00:00<00:00, 3690.11 examples/s]
Map: 100%|██████████| 98/98 [00:00<00:00, 5421.21 examples/s]


In [54]:
id2label = {0: "Not Pedestrian", 1: "Pedestrian"}
label2id = {"Not Pedestrian":0, "Pedestrian": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model,
    num_labels = 2,
    id2label = id2label,
    label2id = label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
peft_config = LoraConfig(
    task_type='SEQ_CLS', # Defining the classification type to be Sequence Classification
    r=64, # Lora attention dimension (intrinsic rank of the low-rank matricies)
    lora_alpha=32, # Alpha Parameter for Lora scaling (like the learning rate)
    lora_dropout=0.1, # The dropout probability for Lora layers
    target_modules=['q_lin', 'k_lin', 'v_lin', 'ffn.lin1', 'ffn.lin2'] # We will start by allowing the query, key, and value linear layers to be modified by the model
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 5,310,722 || all params: 72,265,732 || trainable%: 7.3489


In [56]:
# Load evaluation metrics
accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1_score = load("f1")

# Define a metric function for evaluation
def compute_metrics(p):
    pred = np.argmax(p.predictions, axis=1)
    labels = p.label_ids

    return {
        "accuracy": accuracy.compute(predictions=pred, references=labels)['accuracy'],
        "precision": precision.compute(predictions=pred, references=labels, average='binary')['precision'],
        "recall": recall.compute(predictions=pred, references=labels, average='binary')['recall'],
        "f1": f1_score.compute(predictions=pred, references=labels, average='binary')['f1'],
    }

In [57]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyperparameters
lr = 0.0001 # Size of optimization step
batch_size = 8 # number of examples processed per optimization step
num_epochs = 10 # number of times the model runs through training data
weight_decay = 0.1

# Defining the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True, 
    save_total_limit=4,
    report_to='tensorboard',
    do_eval=True, 
    logging_strategy='epoch',
    overwrite_output_dir=True,
    metric_for_best_model="accuracy"
)

In [58]:
trainer = Trainer(
    model=model,
    args=training_args, # hyperparameters
    train_dataset=results['train'], # training data
    eval_dataset=results['test'], # validation data
    tokenizer=tokenizer, # The narratives from the training and testing sets are already pre-tokenized. Passing the tokenizer here is primarily used for decoding predictions
    data_collator=data_collator,
    compute_metrics=compute_metrics, # Runs on HuggingFace's EvalPrediction object (see compute metrics notes for how this works)
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [59]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6759,0.663032,0.581633,0.581633,1.0,0.735484
2,0.626,0.55648,0.734694,0.71831,0.894737,0.796875
3,0.4743,0.466987,0.785714,0.790323,0.859649,0.823529
4,0.3714,0.538394,0.806122,0.763889,0.964912,0.852713
5,0.3084,0.528038,0.816327,0.791045,0.929825,0.854839
6,0.2762,0.61347,0.795918,0.760563,0.947368,0.84375
7,0.2482,0.620929,0.806122,0.779412,0.929825,0.848
8,0.2412,0.617919,0.806122,0.779412,0.929825,0.848
9,0.1923,0.636749,0.806122,0.779412,0.929825,0.848
10,0.1596,0.662297,0.795918,0.768116,0.929825,0.84127


TrainOutput(global_step=490, training_loss=0.35733717217737315, metrics={'train_runtime': 282.9099, 'train_samples_per_second': 13.75, 'train_steps_per_second': 1.732, 'total_flos': 88370077390320.0, 'train_loss': 0.35733717217737315, 'epoch': 10.0})

In [28]:
print(type(trainer.optimizer.optimizer))

<class 'torch.optim.adamw.AdamW'>


## Sanity Check: Test model on training data

In [29]:
torch.backends.mps.is_available()

True

In [30]:
from torch.utils.data import DataLoader

In [32]:
model = trainer.model
# Move the model to Metal Performance Shaders (MPS), Apple’s optimized framework for fast tensor computations on Mac
model.to('mps')

# Set the model to evaluation mode (disables dropout and gradient tracking for layers that behave differently during training)
model.eval()

loader = DataLoader(
    results['test'].remove_columns(['text']),
    batch_size = 8,
    collate_fn=data_collator
)

# Create a list to store the model’s classification predictions
y_pred = []
y_true = []

with torch.no_grad():
    for batch in loader:
        batch = {k:v.to('mps') for k,v in batch.items()}
        
        logits = model(**batch).logits
        
        preds = torch.argmax(logits, dim=1)
        
        y_pred.extend(preds.to('cpu').numpy())
        y_true.extend(batch['labels'].to('cpu').numpy())

print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.68      0.76        41
           1       0.80      0.91      0.85        57

    accuracy                           0.82        98
   macro avg       0.82      0.80      0.80        98
weighted avg       0.82      0.82      0.81        98

