In [29]:
#import required libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import re
from nltk.tokenize import word_tokenize 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix

### Data Loading

In [2]:
#load the data
data=pd.read_excel("AccidentCategoryTrainingData.xlsx")

In [3]:
data.head()

Unnamed: 0,content,result,type
0,"While driving home from work, I was rear-ended...",Bodily Injury,
1,I was stopped at a red light when a distracted...,Bodily Injury,
2,"During a heavy rainstorm, I lost control of my...",Bodily Injury,
3,"While merging onto the highway, another car si...",Bodily Injury,
4,I was hit by a car running a red light while I...,Bodily Injury,


In [4]:
#Select the required columns
data=data[['content','result']]

In [5]:
data.head()

Unnamed: 0,content,result
0,"While driving home from work, I was rear-ended...",Bodily Injury
1,I was stopped at a red light when a distracted...,Bodily Injury
2,"During a heavy rainstorm, I lost control of my...",Bodily Injury
3,"While merging onto the highway, another car si...",Bodily Injury
4,I was hit by a car running a red light while I...,Bodily Injury


## Preprocess the data

In [6]:
stop_words=set(stopwords.words('english'))
def return_only_words(text):
    text=re.sub(r'[^a-zA-Z0-9-\s]','',text)
    text=text.lower()
    # Replace hyphens with spaces
    text = re.sub(r'-', ' ', text)
    text=word_tokenize(text)
  
    return ' '.join([word for word in text if word not in stop_words])

In [7]:
data['updated_content']=data['content'].apply(return_only_words)

In [8]:
data.head()

Unnamed: 0,content,result,updated_content
0,"While driving home from work, I was rear-ended...",Bodily Injury,driving home work rear ended speeding vehicle ...
1,I was stopped at a red light when a distracted...,Bodily Injury,stopped red light distracted driver plowed veh...
2,"During a heavy rainstorm, I lost control of my...",Bodily Injury,heavy rainstorm lost control car crashed tree ...
3,"While merging onto the highway, another car si...",Bodily Injury,merging onto highway another car sideswiped ca...
4,I was hit by a car running a red light while I...,Bodily Injury,hit car running red light crossing intersectio...


In [9]:
data['result'].unique()

array(['Bodily Injury', 'Property Damage', 'Uninsured or Underinsured',
       'Other'], dtype=object)

In [10]:
encoder=LabelEncoder()

In [11]:
data['encoded_result']=encoder.fit_transform(data['result'])

In [12]:
data.tail(5)

Unnamed: 0,content,result,updated_content,encoded_result
68,"While installing a new satellite dish, I accid...",Other,installing new satellite dish accidentally dro...,1
69,I was using my chainsaw to cut down a tree whe...,Other,using chainsaw cut tree fell wrong direction l...,1
70,"While using my leaf blower, I accidentally ble...",Other,using leaf blower accidentally blew pile leave...,1
71,I was using my pressure washer to clean my dri...,Other,using pressure washer clean driveway accidenta...,1
72,"While using my snow blower, I accidentally ble...",Other,using snow blower accidentally blew pile snow ...,1


In [13]:
# Function to count words in a dataframe column
max_words_size=max(data['updated_content'].apply(lambda x: len(x.split())))
print(max_words_size)

33


In [14]:
# Preprocessing: Tokenize the sentences in the 'text_column'
sentences = [sentence.split() for sentence in data['updated_content']]
sentences

[['driving',
  'home',
  'work',
  'rear',
  'ended',
  'speeding',
  'vehicle',
  'causing',
  'car',
  'spin',
  'control',
  'impact',
  'resulted',
  'severe',
  'whiplash',
  'injury',
  'necessitating',
  'immediate',
  'medical',
  'attention',
  'pain',
  'persists',
  'currently',
  'undergoing',
  'physical',
  'therapy'],
 ['stopped',
  'red',
  'light',
  'distracted',
  'driver',
  'plowed',
  'vehicle',
  'behind',
  'collision',
  'resulted',
  'fractured',
  'collarbone',
  'required',
  'surgery',
  'process',
  'recovery',
  'rehabilitation'],
 ['heavy',
  'rainstorm',
  'lost',
  'control',
  'car',
  'crashed',
  'tree',
  'accident',
  'caused',
  'concussion',
  'broken',
  'arm',
  'required',
  'hospitalization',
  'currently',
  'medication',
  'attending',
  'regular',
  'check',
  'ups'],
 ['merging',
  'onto',
  'highway',
  'another',
  'car',
  'sideswiped',
  'causing',
  'veer',
  'road',
  'incident',
  'resulted',
  'dislocated',
  'shoulder',
  'sever

### Word2Vec Model Initialization

In [15]:
embedding_size=max_words_size+20
word2vec_model=Word2Vec(sentences,vector_size=embedding_size, window=3, min_count=1, workers=2)

In [16]:
word2vec_model.wv['driving']

array([ 0.00140339,  0.01571719, -0.00845679,  0.00866563, -0.01344167,
       -0.00673493,  0.01859808, -0.00220385,  0.00120112, -0.00731929,
       -0.01355178, -0.00372208,  0.00368743, -0.00165041,  0.01010667,
       -0.00579368,  0.00415529,  0.01058814,  0.01501452, -0.00270645,
       -0.01752321,  0.00867661,  0.00174074,  0.01441794, -0.0013744 ,
       -0.00500134, -0.01687964, -0.00120708,  0.00474854,  0.0097266 ,
        0.0137422 , -0.0114922 ,  0.00392928,  0.01060062, -0.00901625,
       -0.00696039,  0.01309094,  0.00221633,  0.00108493,  0.00666032,
        0.00049215,  0.01863691,  0.00971145, -0.01633161, -0.0132462 ,
        0.00134089,  0.01225801, -0.01561564,  0.00623551,  0.00964798,
        0.01140446,  0.01485534, -0.01131086], dtype=float32)

In [17]:
# Create sentence embeddings by averaging word embeddings
def sentence_to_vector(sentence, model, embedding_size):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)  # Average word vectors
    else:
        return np.zeros(embedding_size)  # Return a zero vector if no words are found

In [18]:
# Create feature matrix X by applying sentence_to_vector to each sentence
X = np.array([sentence_to_vector(sentence, word2vec_model, embedding_size) for sentence in sentences])

In [19]:
X[0]

array([-1.4076519e-03, -1.6659399e-03, -1.8743487e-03,  4.5391229e-05,
       -3.7144581e-03, -7.1921211e-04,  7.2147069e-04, -3.2753407e-04,
        3.3784821e-03,  8.6978672e-04,  2.9302426e-03, -2.9163335e-03,
       -3.5317345e-03,  1.7614327e-03, -2.6239673e-04,  1.0654965e-03,
        1.1538198e-03, -5.6973466e-04, -8.9155196e-04, -2.3936196e-03,
        8.5802650e-04,  3.1352669e-04,  1.5213974e-03,  2.2217578e-03,
       -3.2452568e-03,  1.4807599e-05, -1.2236689e-03, -1.2974364e-04,
        1.6779594e-04, -1.2688751e-04,  2.1142175e-03, -2.6091512e-03,
        4.3306430e-03, -2.7928627e-03, -2.0305852e-04,  1.6078324e-03,
        2.0005694e-03, -1.6668354e-03,  5.4534976e-03,  1.8456756e-03,
        2.1186417e-03,  2.4536331e-03,  1.2083442e-03,  8.6309237e-04,
        2.1628034e-03, -2.3424020e-03, -5.8481219e-05,  1.9951374e-03,
       -3.9986488e-03, -1.9305806e-03,  2.6871806e-03,  1.3731865e-03,
       -3.3986515e-05], dtype=float32)

In [20]:
# Target variable y
y = np.array(data['encoded_result'])

In [21]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1])

### Train test split

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

### Model Building using Naive Bayes

In [23]:
nb_model=GaussianNB()

In [24]:
nb_model.fit(X_train,y_train)

In [25]:
y_pred=nb_model.predict(X_test)

### Model Evaluation

In [26]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9545


In [27]:
list(y_test)

[0, 1, 0, 0, 2, 1, 0, 2, 0, 3, 1, 2, 0, 3, 0, 2, 3, 3, 2, 2, 1, 0]

In [28]:
list(y_pred)

[0, 1, 0, 0, 2, 1, 2, 2, 0, 3, 1, 2, 0, 3, 0, 2, 3, 3, 2, 2, 1, 0]

In [33]:
# Evaluate the model
confusion_metrics = confusion_matrix(y_test, y_pred)
print(confusion_metrics)


[[7 0 1 0]
 [0 4 0 0]
 [0 0 6 0]
 [0 0 0 4]]
