In [1]:
#import all the necessary libraries
import pandas as pd 
import numpy as  np
import re

from sklearn.feature_extraction.text import TfidfVectorizer

#import all the necessary libraries to build a neural network classifier from tensorflow

import tensorflow as tf
from tensorflow.keras.models import Sequential                   
from tensorflow.keras.layers import Dense                        #import fully connected neural net layers
from tensorflow.keras.optimizers import Adam                     #choose adam as the optimization algorithm
from tensorflow.keras.losses import CategoricalCrossentropy      #cost function needed for softmax classification if the labels are 
                                                                 #one hot encoded

from tensorflow.keras.regularizers import l2                     #add regularization in order to avoid overfitting 

from sklearn.model_selection import train_test_split             #to split the training set into train and test set 

from sklearn.metrics import confusion_matrix, classification_report  #to proceed with error analysis on our predictions


import seaborn as sns                                                 #to visualize confusion matrices as a heatmap
import matplotlib.pyplot as plt


from tensorflow.keras.layers import LSTM, Dropout              #import long-short term memory neural net layers
                                                               #and dropout regularization layers

In [3]:
def  vectorize_string_csv_column_TF_idf(string_column):  
                                                #create a function that receives a column of strings from a csv file 
                                                  #and converts each entry into a unique tfidf vector depending on the 
                                                  #unique vocabulary
                                                   
                                                 #necessary to convert a text input into a numeric vector to be used 
                                                 #as input to the classifier

    
    tfidf_vectorizer = TfidfVectorizer()  
                                      #create a TF_idf_vectorizer model that will receive the entire csv column
                                      #and will eventually turn each entry into a numeric vector
                                      #the length of each vector will be the number of the unique words in the vocabulary
                                      #if this length=N the resulting vectors for each entry will be of dimension (N,)
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(string_column)  
                                                    #apply the created model to the column title of the csv


    tfidf_vectors = tfidf_matrix.toarray()          #each entry of the tfidf_vectors is a numeric vector
                                                    #corresponding to a title entry
                                                    #convert it to an array format for ease of handling

    Vocabulary=tfidf_vectorizer.get_feature_names_out()  #export the unique vocabulary out of the created vectorizer model

    return tfidf_vectors, Vocabulary   

In [7]:
#create a function that finds all the unique labels in a column 
def find_unique_column_labels(column):
     
    unique_labels_list=[]               #create a list that will contain all the unique labels found in the input column

    for i in column : #search the entire column
        if i not in unique_labels_list:   #if the element i is not found in the unique list 
            unique_labels_list.append(i)  #append it to the list
    
    return  unique_labels_list 

In [10]:
def one_hot_encode_labels(unique_labels_of_a_column):
    #create a dictionary for these categories that coresponds each one into a one hot numpy vector
    #this is necessary in order to use a softmax classifier 

    number_of_classes=len(unique_labels_of_a_column)  #find the number of classes/possible labels from the vector that contains the unique labels

                                               #create a numpy I matrix I lxl where l is the number of classes 
                                               #each row of the I matrix will correspond to a one hot encoding for each label
    I=np.eye(number_of_classes)

    #create a dictionary to correspond each class name with it's one hot encoded label

    dict_labels={} #initialize an empty dictionary where the keys will be the labels and the values will be their one hot encoding


    for i in range(number_of_classes):
        dict_labels[unique_labels_of_a_column[i]]=I[i,:]

    return dict_labels

In [13]:
def create_Y_label(initial_label_column,dict_labels):
    #create the Y part of the dataset by receiving  a column  and the dict_labels corresponding to that column



    #len(dict_labels['biological'])

    Y1={}   #initialize an empty dictionary
    count=0 #and a count variable
    for i in initial_label_column:    #search through the hazard_category column of the data frame
        for j in dict_labels.keys():  #and through all the keys of the labels dictionary with keys all the unique labels 
                                  #and values their one hot encoded representation
            if i==j:                   #if you find a match
                Y1[count]=dict_labels[j]   #assign the category with it's one hot encoding
                count+=1
    #Y
    #now the dictionary above should be turned into a numpy matrix with its elements being column vectors

    # Convert dictionary values to a numpy matrix
    matrix = np.array([v for v in Y1.values()]).T

    matrix.shape #Nxm format
    return matrix    

In [90]:
def compile_lstm(X_train,Y_train,X_test,Y_test,hu_1,hu_2,number_of_classes,a_epochs,a_batch_size):


#in the following lstm neural net dropout regularization layers have been added added and l2 regularization is turned to 0
    
# LSTM Model architecture      Tx, and Ty are set by default by tensorflow
    model = Sequential([  
    # first LSTM layer the hidden units are frozen to 256 and instead of a tanh which is the most common actiovation function for an 
        #lstm a relu function is being applied
        LSTM(256, activation='relu', return_sequences=True, input_shape=(1, X_train.shape[1]), kernel_regularizer=l2(0)),
        Dropout(0.2),  # Add dropout for regularization

    # second LSTM layer
        #similar logic to the previous lstm layer but the number of hidden units remains frozen to 128
        LSTM(128, activation='relu', kernel_regularizer=l2(0)),
        Dropout(0.2),

    # fully connected layer with hu_1 number of hidden units, no regularization and relu activation
        Dense(hu_1, activation='relu', kernel_regularizer=l2(0)),

    # output layer with softmax activation with hidden units equal to the number of classes
        Dense(number_of_classes, activation='softmax', kernel_regularizer=l2(0))
    ])



    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),  # adam optimizer
        loss=CategoricalCrossentropy(),       # categorical cross-entropy loss
        metrics=['accuracy']                  
    )

    # Reshape the input data to be compatible with LSTM
    X_train_new = X_train.reshape((X_train.shape[0],1, X_train.shape[1]))  # reshape to (samples, time_steps, features)
    X_test_new = X_test.reshape((X_test.shape[0],1, X_test.shape[1]))      # reshape to (samples, time_steps, features)


    #train the model to fit the training data
    model.fit(
        X_train_new, Y_train,  # training data
        validation_data=(X_test_new, Y_test),  # validation data
        epochs=a_epochs,   # number of epochs
        batch_size=a_batch_size,  # batch size
        verbose=1  
    )

    #model evaluation on the test data
    test_loss, test_accuracy = model.evaluate(X_test_new, Y_test, verbose=1)

    
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)

    return model

In [67]:
def lstm_create_heatmap_and_error_report(model,X_test,Y_test,num_of_classes):  #input the X_test 
                                                                    #and Y_test
                                                                    #make predictions on X_test with the model
                                                                    #compare the model predictions with 
                                                                    #the actual Y_test and create an error analysis

    X_test_new=X_test.reshape((X_test.shape[0],1, X_test.shape[1])) #reshape X_test to make it compatible with lstm 
    
    Y_predict_initial=model.predict(X_test_new) #we predict the model output for the X_test 
        #and we are going to compare with the actual lables from Y_test

    #Each entry y in Y_predict_initial is a vector of  outputs= number of classes, containing the probabilities that show 
    #how likely it is for the model to assign an entry x of x_test to a specific class.
    #For the entry x with prediction y if y[0] is the highest amongst the elements of y, x will be assigned to 
    #class 0. If y[1] is the highest then x will be assigned to class_1 and so on

    #As a result, we need to find the index of  maximum element of each y in Y_predict 
    #that will show us in which class x corresponds to:
    Y_intermediate = np.argmax(Y_predict_initial, axis=1) #find the index of the maximum element for each y in Y_predict_initial
    
    #we are also going to convert Y_test from a one hot encoding to 
    #the number of class this one hot encoding represents and compare it with 
    #the predicted class stored in Y_intermediate

    Y_true = np.argmax(Y_test, axis=1)  #due to the fact that we have a one hot encoding, finding the index of the max 
                                    #element will lead directly to the number of class it represents

    #
    #we are going to find out where Y_true matches our prediction in Y_intermediate 
    #and we are going to display a confusion matrix of the true vs the prediction

    
    conf_mat=confusion_matrix(Y_true,Y_intermediate)
    

  #  sns.heatmap(conf_mat,annot=True, fmt='d', cmap='Blues')
  #  plt.xlabel('Predicted Class')
  #  plt.ylabel('True Class')
  #  plt.title('Confusion Matrix')
  #  plt.show()
    
    #the classification report is applied to the test set which is a random split from the entire training set
    #as a result it may not include the entire number of classes and we will get the error report based on the 
    #classes stored in the confusion matrix and the test set

    report = classification_report(Y_true, Y_intermediate, target_names=[f"Class {i}" for i in range(conf_mat.shape[0])]) 
    print("\n",report)

    return 1

In [70]:
#make a function that inputs data X
#the model that makes predictions and the 
#dictionary with the labels and their one hot encoding. 
#It computes the numeric predictions for X and turns them into the text of the label 
#they correspond to

def lstm_from_pred_and_dictionary_to_labels(X,model,one_hot_dictionary):
    
        #predicted output
    X_new=X.reshape((X.shape[0],1, X.shape[1]))
    
    Y_pr=model.predict(X_new)

        #create a new dictionary with the key being the index/label of the one_hot_dictionary
        #and the value of this new  dictionary being the phrase of the one hot encoding 

    new_dict={}

    for key,value in one_hot_dictionary.items():   
        convert_one_hot_encode_to_number=np.argmax(value)    #get the number representation of the one hot encoding
        label=key                                            #get the phrase of the one hot encoding 
        new_dict[convert_one_hot_encode_to_number]=label     #store them as number-> phrase

    #use the newly created dictionary to map the predictions into the labels
    predictions_text_format=[]

    for i in Y_pr:
        chosen_label=np.argmax(i) #loop through the predictions of the model and choose to which label the prediction is assigned
        predictions_text_format.append(new_dict[chosen_label]) #get the phrase corresponding to that label and append it to a list
        
    return predictions_text_format

In [73]:
#load the csv incidents_train into a data frame

df=pd.read_csv('incidents_train.csv')
#df.head()

#isolate the title category which is to be used as the input X to a classifier
title=df['title']

In [76]:
X,unique_title_voc=vectorize_string_csv_column_TF_idf(title)
#X.shape

In [79]:
hazard=df['hazard']
unique_hazard_labels=find_unique_column_labels(hazard)

one_hot_encode_hazard=one_hot_encode_labels(unique_hazard_labels)
#one_hot_encode_hazard_category

Y_hazard_transposed=create_Y_label(hazard,one_hot_encode_hazard)

Y_hazard=Y_hazard_transposed.T
Y_hazard.shape

(5082, 128)

In [82]:
#split the data intro train and test set
#using the processed title data as X
#and hazard  as Y
X_train, X_test, Y_hazard_train, Y_hazard_test = train_test_split(X, Y_hazard, test_size=0.2, random_state=42)

In [92]:
#the number of hazards is 128 as it was found by the unique one hot encoding of the hazard labels
model_3=compile_lstm(X_train,Y_hazard_train,X_test,Y_hazard_test,256,128,128,10,32)

  super().__init__(**kwargs)


Epoch 1/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 96ms/step - accuracy: 0.1186 - loss: 4.3206 - val_accuracy: 0.1367 - val_loss: 3.3884
Epoch 2/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 92ms/step - accuracy: 0.1353 - loss: 3.1747 - val_accuracy: 0.2055 - val_loss: 3.2174
Epoch 3/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 93ms/step - accuracy: 0.2309 - loss: 2.8244 - val_accuracy: 0.3117 - val_loss: 2.9796
Epoch 4/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 94ms/step - accuracy: 0.3336 - loss: 2.4540 - val_accuracy: 0.3736 - val_loss: 2.9043
Epoch 5/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 93ms/step - accuracy: 0.4516 - loss: 2.0090 - val_accuracy: 0.3943 - val_loss: 3.0251
Epoch 6/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 93ms/step - accuracy: 0.5487 - loss: 1.6631 - val_accuracy: 0.4376 - val_loss: 3.2786
Epoch 7/10
[1m1

In [94]:
lstm_create_heatmap_and_error_report(model_3,X_test,Y_hazard_test,1)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step

               precision    recall  f1-score   support

     Class 0       0.87      0.76      0.81       139
     Class 1       0.00      0.00      0.00         2
     Class 2       0.53      0.59      0.56        39
     Class 3       0.89      0.68      0.77        47
     Class 4       0.00      0.00      0.00         1
     Class 5       0.74      0.83      0.78       121
     Class 6       0.07      0.25      0.11         4
     Class 7       0.00      0.00      0.00         3
     Class 8       0.00      0.00      0.00         4
     Class 9       0.28      0.21      0.24        34
    Class 10       0.54      0.53      0.54        36
    Class 11       0.59      0.54      0.56       108
    Class 12       0.33      0.24      0.28        21
    Class 13       0.17      0.50      0.25         4
    Class 14       0.06      0.17      0.09         6
    Class 15       0.31      0.51      0.38        45
    C

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


1

In [96]:
#read the validation_file for hazard  and create a prediction vector

#load the csv incidents_val into a data frame

df_validation=pd.read_csv('incidents_val.csv')
#df.head()

#isolate the title category which is to be used as the input X to a classifier
title_validation=df_validation['title']

In [98]:
#follow the same steps as before to vectorize the title_validation 
#into a form that can be loaded into the trained model

#to avoid having different number of features for the train set and the validation set with the tfidf vectorizer we are 
#going to use the tfidf vectorizer fitted for the train set to the validation set 
#the function we created previously will not suffise because it will create a unique representation based on the validation set 
#with number of features different than the ones in the training set

tfidf_vectorizer = TfidfVectorizer()  #create a tfidfvectorizer object


tfidf_vectorizer.fit(title)        #fit it for the title of the training set 
                                                   
X_center=tfidf_vectorizer.transform(title)      #transform the title of the training set into a format where each entry is a 
                                                #numeric vector of shape (m,n) where m 
                                                #is the number of examples in the given set and n is the number of features

X_center_format=X_center.toarray()             #follow this conversion to get an array form
                                               #


X_val_0=tfidf_vectorizer.transform(title_validation)   #use the vectorizer that is fitted to the training set to transform the input 
                                                       #from the validation set. the result will be a numeric vector of shape (m_val,n)
                                                       #where m_val is the number of entries in the validation set and 
                                                       #n is the number of features which is the same as the number of features 
                                                       # used for training

X_val_1=X_val_0.toarray()

X_val_1.shape

(565, 7372)

In [100]:
#predictions in text format from the trained dense model named model

H_val_prediction=lstm_from_pred_and_dictionary_to_labels(X_val_1,model_3,one_hot_encode_hazard)  #use the aforementioned function
                                                       #to predict the Hazard labels for the validation set using the 
                                                       #validation title. This function also receives as input the dictionary with 
                                                       #the labels and their one hot encoding in order to convert the predictions back to text

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


In [49]:
#HZ_val_prediction

In [103]:
#now we are going to follow the exact same procedure and train a seperate model to predict 
#the product  first for the training set and then for the validation set


In [105]:
product=df['product'] #get the product for the data frame refering to the training set
unique_product_labels=find_unique_column_labels(product) #find the unique labels of product category

one_hot_encode_product=one_hot_encode_labels(unique_product_labels) #one hot encode these labels and get a 
#dictionary with the key being the label and the value being its one hot encoding


Y_product_transposed=create_Y_label(product,one_hot_encode_product)
#create the Y part of the data set

Y_product=Y_product_transposed.T
Y_product.shape

(5082, 1022)

In [108]:
#split the data intro train and test set
#using the processed title data as X
#and product  as Y
X_train, X_test, Y_product_train, Y_product_test = train_test_split(X, Y_product, test_size=0.2, random_state=42)

In [113]:
#number of classes is 1022 for the product, as it was found by its unique one hot encoding
model_4=compile_lstm(X_train,Y_product_train,X_test,Y_product_test,256,128,1022,50,32)

  super().__init__(**kwargs)


Epoch 1/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 100ms/step - accuracy: 0.0219 - loss: 6.7283 - val_accuracy: 0.0492 - val_loss: 6.1027
Epoch 2/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 96ms/step - accuracy: 0.0352 - loss: 5.8265 - val_accuracy: 0.0482 - val_loss: 6.1095
Epoch 3/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 98ms/step - accuracy: 0.0318 - loss: 5.5826 - val_accuracy: 0.0511 - val_loss: 6.1516
Epoch 4/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 97ms/step - accuracy: 0.0372 - loss: 5.4600 - val_accuracy: 0.0678 - val_loss: 6.3678
Epoch 5/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 97ms/step - accuracy: 0.0488 - loss: 5.3491 - val_accuracy: 0.0649 - val_loss: 6.4963
Epoch 6/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 97ms/step - accuracy: 0.0558 - loss: 5.2713 - val_accuracy: 0.0698 - val_loss: 6.8244
Epoch 7/50
[1m

In [115]:
lstm_create_heatmap_and_error_report(model_3,X_test,Y_product_test,1)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

               precision    recall  f1-score   support

     Class 0       0.00      0.00      0.00         4
     Class 1       0.00      0.00      0.00         7
     Class 2       0.00      0.00      0.00         0
     Class 3       0.00      0.00      0.00         7
     Class 4       0.00      0.00      0.00         1
     Class 5       0.00      0.00      0.00         9
     Class 6       0.00      0.00      0.00         7
     Class 7       0.00      0.00      0.00         1
     Class 8       0.00      0.00      0.00         1
     Class 9       0.00      0.00      0.00        23
    Class 10       0.00      0.00      0.00         1
    Class 11       0.04      0.25      0.07         4
    Class 12       0.00      0.00      0.00         3
    Class 13       0.00      0.00      0.00         2
    Class 14       0.00      0.00      0.00         2
    Class 15       0.00      0.00      0.00         8
    Cl

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


1

In [117]:
#follow the same steps as before to vectorize the title_validation 
#into a form that can be loaded into the trained model

#to avoid having different number of features for the train set and the validation set with the tfidf vectorizer we are 
#going to use the tfidf vectorizer fitted for the train set to the validation set 
#the function we created previously will not suffise because it will create a unique representation based on the validation set 
#with number of features different than the ones in the training set

tfidf_vectorizer = TfidfVectorizer()  #create a tfidfvectorizer object


tfidf_vectorizer.fit(title)        #fit it for the title of the training set 
                                                   
X_center=tfidf_vectorizer.transform(title)      #transform the title of the training set into a format where each entry is a 
                                                #numeric vector of shape (m,n) where m 
                                                #is the number of examples in the given set and n is the number of features

X_center_format=X_center.toarray()             #follow this conversion to get an array form
                                               #


X_val_0=tfidf_vectorizer.transform(title_validation)   #use the vectorizer that is fitted to the training set to transform the input 
                                                       #from the validation set. the result will be a numeric vector of shape (m_val,n)
                                                       #where m_val is the number of entries in the validation set and 
                                                       #n is the number of features which is the same as the number of features 
                                                       # used for training

X_val_1=X_val_0.toarray()

X_val_1.shape

(565, 7372)

In [119]:
P_val_prediction=lstm_from_pred_and_dictionary_to_labels(X_val_1,model_4,one_hot_encode_product)  #use the aforementioned function
                                                       #to predict the product  labels for the validation set using the 
                                                       #validation title. This function also receives as input the dictionary with 
                                                       #the labels and their one hot encoding in order to convert the predictions back to text

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step


In [121]:
#PZ_val_prediction

len(H_val_prediction)
len(P_val_prediction)

565

In [123]:
#turn the predictions into columns of a new dataframe and save them to a csv file
predicted_data_final={
'hazard': H_val_prediction,
'product': P_val_prediction
    
}
df_final=pd.DataFrame(predicted_data_final)

# Save the DataFrame to a CSV file
df_final.to_csv('st2_title_lstm.csv', index=False)
