In [12]:
#import all the necessary libraries
import pandas as pd 
import numpy as  np
import re

from sklearn.feature_extraction.text import TfidfVectorizer

#import all the necessary libraries to build a neural network classifier from tensorflow

import tensorflow as tf
from tensorflow.keras.models import Sequential                   
from tensorflow.keras.layers import Dense                        #import fully connected neural net layers
from tensorflow.keras.optimizers import Adam                     #choose adam as the optimization algorithm
from tensorflow.keras.losses import CategoricalCrossentropy      #cost function needed for softmax classification if the labels are 
                                                                 #one hot encoded

from tensorflow.keras.regularizers import l2                     #add regularization in order to avoid overfitting 

from sklearn.model_selection import train_test_split             #to split the training set into train and test set 

from sklearn.metrics import confusion_matrix, classification_report  #to proceed with error analysis on our predictions


import seaborn as sns                                                 #to visualize confusion matrices as a heatmap
import matplotlib.pyplot as plt


from tensorflow.keras.layers import LSTM, Dropout              #import long-short term memory neural net layers
                                                               #and dropout regularization layers

In [15]:
def vectorize_string_csv_column_TF_idf(string_column):
    """Turn a column of strings into dense TF-IDF feature vectors.

    A new ``TfidfVectorizer`` is fitted on ``string_column`` itself, so the
    vocabulary (and with it the number of features) is derived from the
    entries of this column only. Text has to be converted to numeric vectors
    like this before it can be fed to the classifier.

    Parameters
    ----------
    string_column : iterable of str
        The text entries, e.g. one column of a DataFrame read from a csv file.

    Returns
    -------
    tfidf_vectors : numpy.ndarray
        Dense array with one row per entry and one column per vocabulary
        term, i.e. each entry becomes a vector of dimension (N,) where N is
        the number of unique terms.
    Vocabulary : numpy.ndarray
        The vocabulary terms exported from the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer()

    # fit and transform in one step, then densify for ease of handling
    dense_vectors = vectorizer.fit_transform(string_column).toarray()

    return dense_vectors, vectorizer.get_feature_names_out()

In [18]:
#create a function that finds all the unique labels in a column 
def find_unique_column_labels(column):
    """Return the distinct labels of ``column`` in order of first appearance.

    Parameters
    ----------
    column : iterable
        The labels to scan, e.g. a DataFrame column.

    Returns
    -------
    list
        Every distinct label exactly once, ordered by first occurrence.
    """
    labels = list(column)  # materialize once so the fallback below can re-scan
    try:
        # dict keys keep insertion order, which gives an order-preserving
        # de-duplication in linear time instead of a list scan per element
        return list(dict.fromkeys(labels))
    except TypeError:
        # unhashable labels cannot be dict keys: fall back to equality scans
        unique_labels_list = []
        for label in labels:
            if label not in unique_labels_list:
                unique_labels_list.append(label)
        return unique_labels_list

In [21]:
def one_hot_encode_labels(unique_labels_of_a_column):
    """Build a lookup from each label to its one-hot vector.

    Row ``k`` of an identity matrix of size ``len(unique_labels_of_a_column)``
    becomes the encoding of the ``k``-th label. One-hot targets are what the
    softmax classifier in this notebook is trained against.

    Parameters
    ----------
    unique_labels_of_a_column : sequence
        The distinct labels; must support ``len`` and integer indexing.

    Returns
    -------
    dict
        Maps every label to a 1-D numpy array of length ``number of labels``.
    """
    class_count = len(unique_labels_of_a_column)
    identity = np.eye(class_count)  # row k is the one-hot code of label k

    return {
        unique_labels_of_a_column[row]: identity[row, :]
        for row in range(class_count)
    }

In [24]:
def create_Y_label(initial_label_column,dict_labels):
    """Build the one-hot target matrix Y for a column of labels.

    Parameters
    ----------
    initial_label_column : iterable
        The label of every example, in dataset order.
    dict_labels : dict
        Maps each label to its one-hot encoding (a 1-D vector).

    Returns
    -------
    numpy.ndarray
        The encodings stacked as columns, i.e. shape
        ``(encoding_length, number_of_encoded_examples)``. Callers that need
        one row per example transpose the result.

    Notes
    -----
    Labels that are not keys of ``dict_labels`` are left out of the result,
    as before, but a warning is now issued because the returned matrix then
    has fewer columns than there are input rows and no longer lines up with
    the feature matrix.
    """
    import warnings

    encoded_rows = []
    skipped = 0
    for label in initial_label_column:
        try:
            # direct dictionary lookup instead of comparing against every key
            encoded_rows.append(dict_labels[label])
        except (KeyError, TypeError):
            # unknown (or unhashable, hence impossible as a key) label
            skipped += 1

    if skipped:
        warnings.warn(
            f"{skipped} label(s) missing from dict_labels were skipped; "
            "the result has fewer columns than there are input rows",
            stacklevel=2,
        )

    if not encoded_rows:
        # keep the 2-D (encoding_length, 0) layout so a later transpose still
        # produces a matrix with one column per class
        width = len(next(iter(dict_labels.values()))) if dict_labels else 0
        return np.empty((width, 0))

    # stack one encoding per row, then transpose so examples become columns
    return np.array(encoded_rows).T

In [27]:
def compile_lstm(X_train,Y_train,X_test,Y_test,hu_1,hu_2,number_of_classes,a_epochs,a_batch_size):
    """Build, train and evaluate a two-layer LSTM softmax classifier.

    Architecture: LSTM(64) -> Dropout(0.2) -> LSTM(32) -> Dropout(0.2)
    -> Dense(hu_1, relu) -> Dense(number_of_classes, softmax). Both LSTM
    layers use a relu activation instead of the default tanh. Every
    ``l2(0)`` regularizer has factor 0, so dropout is the only active
    regularization.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        2-D feature matrices of shape (samples, features). They are reshaped
        to (samples, 1, features), i.e. each sample is a sequence of length 1.
    Y_train, Y_test : array-like
        Targets for the categorical cross-entropy loss (expected to be
        one-hot encoded with ``number_of_classes`` columns).
    hu_1 : int
        Number of units of the fully connected hidden layer.
    hu_2 : int
        Accepted for interface compatibility but not used: the LSTM layer
        sizes are fixed to 64 and 32.
    number_of_classes : int
        Number of units of the softmax output layer.
    a_epochs : int
        Number of training epochs.
    a_batch_size : int
        Mini-batch size used for training.

    Returns
    -------
    The fitted Keras ``Sequential`` model. Test loss and accuracy are printed.
    """
    n_features = X_train.shape[1]

    model = Sequential([
        # Declare the input with an explicit Input object; passing
        # input_shape to the first layer makes Keras emit a warning.
        tf.keras.Input(shape=(1, n_features)),

        # first LSTM layer: 64 units, returns the full sequence for the next LSTM
        LSTM(64, activation='relu', return_sequences=True, kernel_regularizer=l2(0)),
        Dropout(0.2),  # dropout for regularization

        # second LSTM layer: 32 units, returns only the last output
        LSTM(32, activation='relu', kernel_regularizer=l2(0)),
        Dropout(0.2),

        # fully connected layer with hu_1 hidden units and relu activation
        Dense(hu_1, activation='relu', kernel_regularizer=l2(0)),

        # output layer: one softmax unit per class
        Dense(number_of_classes, activation='softmax', kernel_regularizer=l2(0))
    ])

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),  # adam optimizer
        loss=CategoricalCrossentropy(),       # categorical cross-entropy loss
        metrics=['accuracy']
    )

    # Reshape the input data to (samples, time_steps, features) for the LSTM
    X_train_new = X_train.reshape((X_train.shape[0], 1, n_features))
    X_test_new = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

    # train the model; the test split doubles as validation data
    model.fit(
        X_train_new, Y_train,
        validation_data=(X_test_new, Y_test),
        epochs=a_epochs,
        batch_size=a_batch_size,
        verbose=1
    )

    # model evaluation on the test data
    test_loss, test_accuracy = model.evaluate(X_test_new, Y_test, verbose=1)

    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)

    return model

In [30]:
def lstm_create_heatmap_and_error_report(model,X_test,Y_test,num_of_classes):
    """Print a per-class classification report for ``model`` on a test split.

    The model predicts class probabilities for ``X_test``; the index of the
    largest probability is the predicted class, and the index of the 1 in
    each one-hot row of ``Y_test`` is the true class. Both are compared in a
    scikit-learn classification report.

    Despite the function name, no heat map is drawn; only the text report is
    printed.

    Parameters
    ----------
    model :
        Trained model whose ``predict`` accepts input of shape
        (samples, 1, features).
    X_test : numpy.ndarray
        2-D feature matrix of shape (samples, features).
    Y_test : numpy.ndarray
        One-hot encoded true labels, one row per sample.
    num_of_classes :
        Not used; kept so existing calls keep working.

    Returns
    -------
    int
        Always 1.
    """
    # reshape X_test to (samples, time_steps, features) to make it compatible with the lstm
    X_test_new = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

    Y_predict_initial = model.predict(X_test_new)

    # predicted class = index of the highest probability of each prediction
    Y_intermediate = np.argmax(Y_predict_initial, axis=1)

    # true class = index of the 1 in each one-hot encoded row
    Y_true = np.argmax(Y_test, axis=1)

    # The test split is a random subset, so some classes may be missing from
    # it. Name every report row after the real class index instead of its
    # position among the classes that happen to be present; otherwise every
    # name after the first missing class would be shifted.
    present_classes = np.unique(np.concatenate([Y_true, Y_intermediate]))

    report = classification_report(
        Y_true,
        Y_intermediate,
        labels=present_classes,
        target_names=[f"Class {i}" for i in present_classes],
        zero_division=0,  # same 0.00 values as the default, without the repeated warnings
    )
    print("\n",report)

    return 1

In [33]:
#make a function that inputs data X
#the model that makes predictions and the 
#dictionary with the labels and their one hot encoding. 
#It computes the numeric predictions for X and turns them into the text of the label 
#they correspond to

def lstm_from_pred_and_dictionary_to_labels(X,model,one_hot_dictionary):
    """Predict with ``model`` and return the predictions as label text.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix of shape (samples, features); it is reshaped to
        (samples, 1, features) before being passed to ``model.predict``.
    model :
        Object with a ``predict`` method returning one score vector per sample.
    one_hot_dictionary : dict
        Maps each label to its one-hot encoding.

    Returns
    -------
    list
        For every sample, the label whose one-hot position matches the
        highest predicted score.
    """
    lstm_input = X.reshape((X.shape[0], 1, X.shape[1]))
    class_scores = model.predict(lstm_input)

    # invert the encoding: position of the 1 in the one-hot vector -> label
    index_to_label = {
        np.argmax(encoding): label
        for label, encoding in one_hot_dictionary.items()
    }

    # the winning class of each prediction is the index of its largest score
    return [index_to_label[np.argmax(scores)] for scores in class_scores]

In [36]:
# Load the training incidents (incidents_train.csv) into a data frame.

df=pd.read_csv('incidents_train.csv')

# Keep the 'text' column; it is vectorized in the next cell and used as the
# input X of the classifiers.
text=df['text']

In [39]:
# TF-IDF encode the training text. X holds one dense row per incident and
# unique_title_voc the vocabulary terms (despite its name it is built from
# the 'text' column).
X,unique_title_voc=vectorize_string_csv_column_TF_idf(text)

In [41]:
X.shape  # (number of training incidents, number of vocabulary terms)

(5082, 41409)

In [43]:
# Build the one-hot targets for the 'hazard' column.
hazard=df['hazard']
unique_hazard_labels=find_unique_column_labels(hazard)   # distinct hazard labels

# dictionary: hazard label -> one-hot vector
one_hot_encode_hazard=one_hot_encode_labels(unique_hazard_labels)

# create_Y_label returns the encodings as columns (classes x examples) ...
Y_hazard_transposed=create_Y_label(hazard,one_hot_encode_hazard)

# ... so transpose to get one row per incident and one column per class
Y_hazard=Y_hazard_transposed.T
Y_hazard.shape

(5082, 128)

In [46]:
# Split the data into a train and a held-out test set (80/20, fixed seed),
# using the TF-IDF vectors of the 'text' column as X
# and the one-hot encoded hazard labels as Y.
X_train, X_test, Y_hazard_train, Y_hazard_test = train_test_split(X, Y_hazard, test_size=0.2, random_state=42)

In [49]:
# Train the hazard classifier. The width of the softmax layer is read from
# the one-hot targets, so it always equals the number of distinct hazard
# labels (128 for the current training file) instead of a hard-coded value.
# hu_1=64; the hu_2 argument (32) is not used by compile_lstm.
# 25 epochs, batch size 32 (15 epochs already gave good results).
model_3=compile_lstm(X_train,Y_hazard_train,X_test,Y_hazard_test,64,32,Y_hazard_train.shape[1],25,32)

  super().__init__(**kwargs)


Epoch 1/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 134ms/step - accuracy: 0.1263 - loss: 4.6452 - val_accuracy: 0.1367 - val_loss: 3.4786
Epoch 2/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 126ms/step - accuracy: 0.1184 - loss: 3.3735 - val_accuracy: 0.1475 - val_loss: 3.2413
Epoch 3/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 124ms/step - accuracy: 0.1845 - loss: 3.0042 - val_accuracy: 0.3314 - val_loss: 2.9840
Epoch 4/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 124ms/step - accuracy: 0.3193 - loss: 2.6773 - val_accuracy: 0.3894 - val_loss: 2.7648
Epoch 5/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 124ms/step - accuracy: 0.4188 - loss: 2.2464 - val_accuracy: 0.4326 - val_loss: 2.6431
Epoch 6/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 124ms/step - accuracy: 0.4646 - loss: 1.9803 - val_accuracy: 0.4690 - val_loss: 2.5766
Epoch 7/25

In [52]:
# Per-class precision / recall / f1 of the hazard model on the held-out split.
# The last argument (num_of_classes) is not read by the function.
lstm_create_heatmap_and_error_report(model_3,X_test,Y_hazard_test,1)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step

               precision    recall  f1-score   support

     Class 0       0.91      0.92      0.92       139
     Class 1       0.00      0.00      0.00         2
     Class 2       0.59      0.74      0.66        39
     Class 3       0.97      0.81      0.88        47
     Class 4       0.00      0.00      0.00         1
     Class 5       0.88      0.88      0.88       121
     Class 6       0.07      0.25      0.11         4
     Class 7       0.00      0.00      0.00         3
     Class 8       0.00      0.00      0.00         4
     Class 9       0.37      0.53      0.43        34
    Class 10       0.82      0.64      0.72        36
    Class 11       0.72      0.73      0.72       108
    Class 12       0.34      0.52      0.42        21
    Class 13       0.57      1.00      0.73         4
    Class 14       0.17      0.17      0.17         6
    Class 15       0.54      0.73      0.62        45
    C

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


1

In [55]:
# Read the validation file; its text is used to create the prediction vectors.

# Load incidents_val.csv into a data frame.

df_validation=pd.read_csv('incidents_val.csv')

# Keep the 'text' column, which is the classifier input for the validation set.
text_validation=df_validation['text']

In [57]:
# Vectorize the validation text into a form that can be fed to the trained model.
#
# The TF-IDF vocabulary must come from the training text only, so that the
# validation matrix has exactly the same features (columns) as the X used for
# training. vectorize_string_csv_column_TF_idf cannot be used here because it
# would fit a new vocabulary on the validation text and produce a different
# number of features.

tfidf_vectorizer = TfidfVectorizer()  # create a tfidf vectorizer object

tfidf_vectorizer.fit(text)            # learn vocabulary and idf weights from the training text

# The training text is not transformed again here: that matrix already exists
# as X, and a second dense copy of it would only cost memory.

X_val_0=tfidf_vectorizer.transform(text_validation)   # shape (m_val, n): m_val validation entries,
                                                       # n features, the same n as used for training

X_val_1=X_val_0.toarray()                              # dense array form, as expected by the prediction helper

# fail early if the validation features do not line up with the training features
assert X_val_1.shape[1] == X.shape[1]

X_val_1.shape

(565, 41409)

In [58]:
# Hazard predictions in text format from the trained LSTM model model_3.

H_val_prediction=lstm_from_pred_and_dictionary_to_labels(X_val_1,model_3,one_hot_encode_hazard)
# X_val_1 is the vectorized validation text. The dictionary of labels and
# their one-hot encodings is passed as well so that the predicted class
# indices can be converted back to label text.

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


In [62]:
#Now we follow exactly the same procedure and train a separate model to predict
#the product: first on the training set, then for the validation set.

In [64]:
# Build the one-hot targets for the 'product' column of the training data frame.
product=df['product']
unique_product_labels=find_unique_column_labels(product) # distinct product labels

# dictionary with the label as key and its one-hot encoding as value
one_hot_encode_product=one_hot_encode_labels(unique_product_labels)


# create the Y part of the data set; the encodings come back as columns
Y_product_transposed=create_Y_label(product,one_hot_encode_product)

# transpose to get one row per incident and one column per class
Y_product=Y_product_transposed.T
Y_product.shape

(5082, 1022)

In [67]:
# Split the data into a train and a held-out test set,
# using the TF-IDF vectors of the 'text' column as X
# and the one-hot encoded product labels as Y.
# Note: this rebinds X_train and X_test; X, test_size and random_state are
# the same as in the hazard split.
X_train, X_test, Y_product_train, Y_product_test = train_test_split(X, Y_product, test_size=0.2, random_state=42)

In [70]:
# Train the product classifier. The width of the softmax layer is read from
# the one-hot targets, so it always equals the number of distinct product
# labels (1022 for the current training file) instead of a hard-coded value.
# hu_1=128; the hu_2 argument (64) is not used by compile_lstm.
model_4=compile_lstm(X_train,Y_product_train,X_test,Y_product_test,128,64,Y_product_train.shape[1],25,32)

  super().__init__(**kwargs)


Epoch 1/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 132ms/step - accuracy: 0.0315 - loss: 6.8126 - val_accuracy: 0.0492 - val_loss: 6.2377
Epoch 2/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 128ms/step - accuracy: 0.0269 - loss: 6.0268 - val_accuracy: 0.0492 - val_loss: 6.2102
Epoch 3/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 127ms/step - accuracy: 0.0327 - loss: 5.8082 - val_accuracy: 0.0492 - val_loss: 6.1589
Epoch 4/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 127ms/step - accuracy: 0.0362 - loss: 5.5635 - val_accuracy: 0.0570 - val_loss: 6.4026
Epoch 5/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 126ms/step - accuracy: 0.0390 - loss: 5.4693 - val_accuracy: 0.0619 - val_loss: 6.3977
Epoch 6/25
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 125ms/step - accuracy: 0.0485 - loss: 5.3533 - val_accuracy: 0.0669 - val_loss: 6.6538
Epoch 7/25

In [72]:
# Per-class precision / recall / f1 of the product model on the held-out split.
# The last argument (num_of_classes) is not read by the function.
lstm_create_heatmap_and_error_report(model_4,X_test,Y_product_test,22)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step

               precision    recall  f1-score   support

     Class 0       0.00      0.00      0.00         4
     Class 1       0.00      0.00      0.00         7
     Class 2       0.09      0.14      0.11         7
     Class 3       0.00      0.00      0.00         1
     Class 4       0.31      0.44      0.36         9
     Class 5       0.00      0.00      0.00         7
     Class 6       0.00      0.00      0.00         1
     Class 7       0.00      0.00      0.00         1
     Class 8       0.38      0.52      0.44        23
     Class 9       0.00      0.00      0.00         1
    Class 10       0.00      0.00      0.00         4
    Class 11       0.05      0.33      0.08         3
    Class 12       0.00      0.00      0.00         2
    Class 13       0.00      0.00      0.00         2
    Class 14       0.00      0.00      0.00         8
    Class 15       0.00      0.00      0.00         3
    C

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


1

In [74]:
# The validation features do not depend on which target is predicted: they
# are the validation text transformed with the TF-IDF vocabulary of the
# training text. The matrix X_val_1 built in the hazard-prediction section
# above is therefore reused for the product model, instead of refitting the
# vectorizer and rebuilding the same dense arrays a second time.

# one row per validation entry, same number of features as the training X
assert X_val_1.shape == (len(text_validation), X.shape[1])

X_val_1.shape

(565, 41409)

In [75]:
# Product predictions in text format from the trained LSTM model model_4.
P_val_prediction=lstm_from_pred_and_dictionary_to_labels(X_val_1,model_4,one_hot_encode_product)
# X_val_1 is the vectorized validation text. The dictionary of labels and
# their one-hot encodings is passed as well so that the predicted class
# indices can be converted back to label text.

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [79]:
# Turn the two prediction lists into the columns of a new data frame
# (one row per validation entry) and save them to a csv file.
predicted_data_final={
'hazard': H_val_prediction,
'product': P_val_prediction
    
}
df_final=pd.DataFrame(predicted_data_final)

# Save the DataFrame to a CSV file, without writing the row index
df_final.to_csv('st2_text_lstm.csv', index=False)
