In [77]:
#import all the necessary libraries
import pandas as pd 
import numpy as  np
import re

from sklearn.feature_extraction.text import TfidfVectorizer

#import all the necessary libraries to build a neural network classifier from tensorflow

import tensorflow as tf
from tensorflow.keras.models import Sequential                   
from tensorflow.keras.layers import Dense                        #import fully connected neural net layers
from tensorflow.keras.optimizers import Adam                     #choose adam as the optimization algorithm
from tensorflow.keras.losses import CategoricalCrossentropy      #cost function needed for softmax classification if the labels are 
                                                                 #one hot encoded

from tensorflow.keras.regularizers import l2                     #add regularization in order to avoid overfitting 

from sklearn.model_selection import train_test_split             #to split the training set into train and test set 

from sklearn.metrics import confusion_matrix, classification_report  #to proceed with error analysis on our predictions


import seaborn as sns                                                 #to visualize confusion matrices as a heatmap
import matplotlib.pyplot as plt


from tensorflow.keras.layers import LSTM, Dropout              #import long-short term memory neural net layers
                                                               #and dropout regularization layers

In [80]:
def vectorize_string_csv_column_TF_idf(string_column):
    """Turn a column of strings into dense TF-IDF vectors.

    A fresh ``TfidfVectorizer`` (default settings) is fitted on
    ``string_column`` and immediately used to transform it, so the feature
    space is the vocabulary of exactly this column.

    Parameters
    ----------
    string_column : iterable of str
        The text entries to vectorise (e.g. one column of a DataFrame).

    Returns
    -------
    tuple
        ``(vectors, vocabulary)`` where ``vectors`` is the TF-IDF matrix
        converted to a dense array (one row per entry, one column per
        vocabulary term) and ``vocabulary`` is the vectorizer's
        ``get_feature_names_out()`` result.
    """
    vectorizer = TfidfVectorizer()

    # fit_transform returns a sparse matrix; densify it for ease of handling
    dense_vectors = vectorizer.fit_transform(string_column).toarray()

    return dense_vectors, vectorizer.get_feature_names_out()

In [83]:
#create a function that finds all the unique labels in a column 
def find_unique_column_labels(column):
    """Return the distinct values of ``column`` in order of first appearance.

    Parameters
    ----------
    column : iterable
        The values to de-duplicate (e.g. a DataFrame column).

    Returns
    -------
    list
        Each distinct value once, ordered by where it first occurs.
    """
    values = list(column)  # materialise once so the fallback below can re-iterate

    try:
        # dicts keep insertion order, so this de-duplicates in one pass instead
        # of re-scanning a growing list for every element (quadratic)
        return list(dict.fromkeys(values))
    except TypeError:
        # unhashable entries (e.g. lists) cannot be dict keys:
        # fall back to the equality-based linear scan
        unique_labels_list = []
        for value in values:
            if value not in unique_labels_list:
                unique_labels_list.append(value)
        return unique_labels_list

In [86]:
def one_hot_encode_labels(unique_labels_of_a_column):
    """Assign a one-hot vector to every label.

    Parameters
    ----------
    unique_labels_of_a_column : sequence
        The distinct class labels; position ``i`` in this sequence becomes
        class index ``i``.

    Returns
    -------
    dict
        ``label -> row i of the l x l identity matrix`` (``l`` = number of
        labels), i.e. a vector with a single 1 at the label's position.
        This is the target format needed by a softmax classifier trained
        with categorical cross-entropy.
    """
    class_count = len(unique_labels_of_a_column)

    # row i of the identity matrix is the one-hot code of class i
    identity = np.eye(class_count)

    return {
        unique_labels_of_a_column[position]: identity[position, :]
        for position in range(class_count)
    }

In [89]:
def create_Y_label(initial_label_column,dict_labels):
    """Build the one-hot target matrix for a column of labels.

    Parameters
    ----------
    initial_label_column : iterable
        The label of every example, in dataset order.
    dict_labels : dict
        Maps each label to its one-hot vector (for instance the dictionary
        produced by ``one_hot_encode_labels``).

    Returns
    -------
    numpy.ndarray
        The encodings stacked row-wise and then transposed, so every encoded
        example is one *column*. With 1-D encodings of length k and m encoded
        examples the shape is (k, m); callers transpose it back to (m, k).

    Notes
    -----
    Each label is looked up directly in ``dict_labels`` instead of being
    compared with ``==`` against every key. Besides avoiding a scan of all
    keys per row, this also finds a NaN label that is present as a key
    (``nan == nan`` is False, so the comparison-based scan silently dropped
    such rows and produced a Y with fewer rows than X).

    Labels that are not keys of ``dict_labels`` are still skipped, as before,
    but a ``UserWarning`` now reports how many rows were lost, because the
    result is then no longer aligned with the feature matrix.
    """
    import warnings  # local import: only needed for the misalignment warning

    encoded_rows = []
    skipped = 0

    for label in initial_label_column:
        try:
            encoding = dict_labels[label]
        except (KeyError, TypeError):
            # KeyError: unknown label; TypeError: unhashable label, which can
            # never be a dictionary key and therefore can never match
            skipped += 1
            continue
        encoded_rows.append(encoding)

    if skipped:
        warnings.warn(
            f"create_Y_label skipped {skipped} row(s) whose label has no one-hot "
            "encoding; the returned matrix is not aligned with the input column",
            stacklevel=2,
        )

    # rows are examples at this point; transpose so that examples are columns
    return np.array(encoded_rows).T

In [92]:
def train_and_eval_dense_nn(X_train,Y_train,X_test,Y_test,hu_1,hu_2,number_of_classes,a_epochs,a_batch_size,
                            l2_strength=0.0,learning_rate=0.001):
    """Train a two-hidden-layer dense softmax classifier and report its test metrics.

    Parameters
    ----------
    X_train, Y_train
        Training features and one-hot training labels.
    X_test, Y_test
        Held-out features and one-hot labels. They are used both as the
        validation data monitored during ``fit`` and for the final
        ``evaluate`` call, so the printed test metrics are not independent
        of any tuning done by watching the validation curve.
    hu_1, hu_2 : int
        Number of units in the first and second hidden layer (ReLU).
    number_of_classes : int
        Width of the softmax output layer.
    a_epochs : int
        Number of training epochs.
    a_batch_size : int
        Mini-batch size, i.e. the number of samples per gradient update
        (not the number of batches).
    l2_strength : float, optional
        L2 penalty applied to every layer's kernel. The default 0.0 keeps
        the previous behaviour (regularisation switched off).
    learning_rate : float, optional
        Initial learning rate of the Adam optimiser (default 0.001, the
        value that used to be hard-coded).

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object. Passing
        # input_shape= to the first Dense layer makes recent Keras versions
        # emit a warning from the layer constructor.
        tf.keras.Input(shape=(X_train.shape[1],)),
        Dense(hu_1, activation='relu', kernel_regularizer=l2(l2_strength)),
        Dense(hu_2, activation='relu', kernel_regularizer=l2(l2_strength)),
        Dense(number_of_classes, activation='softmax', kernel_regularizer=l2(l2_strength)),
    ])

    # Labels are one-hot encoded, hence categorical cross-entropy as the loss.
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=CategoricalCrossentropy(),
        metrics=['accuracy'],
    )

    # Mini-batch training with Adam; X_test/Y_test are only monitored here.
    model.fit(
        X_train, Y_train,
        validation_data=(X_test, Y_test),
        epochs=a_epochs,
        batch_size=a_batch_size,
        verbose=1,
    )

    # Final loss/accuracy on the held-out data.
    test_loss, test_accuracy = model.evaluate(X_test, Y_test, verbose=1)

    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)

    return model

In [129]:
def create_heatmap_and_error_report(model,X_test,Y_test,num_of_classes):
    """Print the confusion-matrix shape and a per-class classification report.

    Parameters
    ----------
    model
        Fitted classifier whose ``predict`` returns one score per class for
        every row of ``X_test``.
    X_test
        Features to predict on.
    Y_test
        One-hot encoded true labels for ``X_test``.
    num_of_classes
        Unused; kept so that existing calls keep working.

    Returns
    -------
    int
        Always 1.

    Notes
    -----
    Only classes that occur in the true labels or in the predictions appear
    in the report. Each row is named after the actual class index
    (``"Class 7"`` is the class whose one-hot position is 7). Numbering the
    rows 0..n-1 instead would attach the wrong index to every class that
    follows a class absent from the split.
    """
    # predicted class = index of the largest score in each prediction row
    Y_predict_initial = model.predict(X_test)
    Y_intermediate = np.argmax(Y_predict_initial, axis=1)

    # the one-hot rows of Y_test decode to class indices the same way
    Y_true = np.argmax(Y_test, axis=1)

    # sorted class indices that actually occur in the truth or the predictions
    present_classes = np.unique(np.concatenate([Y_true, Y_intermediate]))

    conf_mat = confusion_matrix(Y_true, Y_intermediate, labels=present_classes)

    # heatmap rendering is currently disabled:
    # sns.heatmap(conf_mat,annot=True, fmt='d', cmap='Blues')
    # plt.xlabel('Predicted Class')
    # plt.ylabel('True Class')
    # plt.title('Confusion Matrix')
    # plt.show()

    print(conf_mat.shape)
    report = classification_report(
        Y_true,
        Y_intermediate,
        labels=present_classes,
        target_names=[f"Class {i}" for i in present_classes],
        # classes that are never predicted have undefined precision; report
        # them as 0 (the value already shown) without a warning per metric
        zero_division=0,
    )
    print("\n",report)

    return 1

In [125]:
#make a function that inputs data X
#the model that makes predictions and the 
#dictionary with the labels and their one hot encoding. 
#It computes the numeric predictions for X and turns them into the text of the label 
#they correspond to

def from_pred_and_dictionary_to_labels(X,model,one_hot_dictionary):
    """Predict on ``X`` and translate every prediction back to its text label.

    Parameters
    ----------
    X
        Input features accepted by ``model.predict``.
    model
        Object whose ``predict`` returns one row of class scores per input.
    one_hot_dictionary : dict
        ``label -> one-hot vector`` mapping used when the targets were built.

    Returns
    -------
    list
        The label whose one-hot position has the highest score, for every
        prediction row, in input order.
    """
    class_scores = model.predict(X)

    # invert the encoding: position of the 1 in each one-hot vector -> label
    index_to_label = {
        np.argmax(encoding): label
        for label, encoding in one_hot_dictionary.items()
    }

    # the winning class of a row is the index of its largest score
    return [index_to_label[np.argmax(row)] for row in class_scores]

In [101]:
# Read the training incidents into a DataFrame.
df = pd.read_csv('incidents_train.csv')

# The free-text title column is the classifier input X.
title = df['title']

In [104]:
# TF-IDF features of the training titles plus the vocabulary they are built on.
X, unique_title_voc = vectorize_string_csv_column_TF_idf(string_column=title)

In [107]:
# Hazard column of the training frame and its distinct labels.
hazard = df['hazard']
unique_hazard_labels = find_unique_column_labels(column=hazard)

# label -> one-hot vector
one_hot_encode_hazard = one_hot_encode_labels(
    unique_labels_of_a_column=unique_hazard_labels
)

# create_Y_label returns the targets with one column per example ...
Y_hazard_transposed = create_Y_label(
    initial_label_column=hazard,
    dict_labels=one_hot_encode_hazard,
)

# ... so transpose to get one row per example.
Y_hazard = Y_hazard_transposed.T
Y_hazard.shape

(5082, 128)

In [110]:
# Hold out 20% of the training data as a test split:
# X is the vectorised title, Y the one-hot hazard label.
X_train, X_test, Y_hazard_train, Y_hazard_test = train_test_split(
    X,
    Y_hazard,
    test_size=0.2,
    random_state=42,
)

In [112]:
# Train the hazard classifier. The width of the softmax layer is read from the
# one-hot label matrix (128 hazard classes for this data) instead of being
# hard-coded, so it stays correct if the label set changes.
model=train_and_eval_dense_nn(X_train,Y_hazard_train,X_test,Y_hazard_test,256,120,Y_hazard_train.shape[1],10,32)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.1777 - loss: 4.1353 - val_accuracy: 0.3746 - val_loss: 2.8878
Epoch 2/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.4294 - loss: 2.4668 - val_accuracy: 0.4789 - val_loss: 2.3555
Epoch 3/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.6345 - loss: 1.6112 - val_accuracy: 0.5506 - val_loss: 2.1160
Epoch 4/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.7685 - loss: 1.0957 - val_accuracy: 0.5624 - val_loss: 2.0551
Epoch 5/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.8424 - loss: 0.7576 - val_accuracy: 0.5811 - val_loss: 2.0348
Epoch 6/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9109 - loss: 0.4782 - val_accuracy: 0.5801 - val_loss: 2.0359
Epoch 7/10
[1m128/128

In [133]:
# The report is computed on the held-out split carved out of the training data.
# Not every hazard class necessarily occurs in that split, so the report only
# covers the classes that appear there.
create_heatmap_and_error_report(
    model=model,
    X_test=X_test,
    Y_test=Y_hazard_test,
    num_of_classes=0,
)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
(111, 111)

               precision    recall  f1-score   support

     Class 0       0.77      0.83      0.80       139
     Class 1       0.00      0.00      0.00         2
     Class 2       0.48      0.64      0.55        39
     Class 3       0.81      0.81      0.81        47
     Class 4       0.00      0.00      0.00         1
     Class 5       0.67      0.86      0.75       121
     Class 6       0.11      0.25      0.15         4
     Class 7       0.00      0.00      0.00         3
     Class 8       0.00      0.00      0.00         4
     Class 9       0.28      0.32      0.30        34
    Class 10       0.57      0.64      0.61        36
    Class 11       0.55      0.78      0.64       108
    Class 12       0.69      0.43      0.53        21
    Class 13       0.60      0.75      0.67         4
    Class 14       1.00      0.33      0.50         6
    Class 15       0.77      0.51      0.61      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


1

In [135]:
# Read the validation incidents; predictions will be produced for these rows.
df_validation = pd.read_csv('incidents_val.csv')

# As for training, the title column is the classifier input.
title_validation = df_validation['title']

In [137]:
# Vectorise the validation titles with a TF-IDF vocabulary learned from the
# TRAINING titles only, so the validation matrix has exactly the feature columns
# the trained models expect. vectorize_string_csv_column_TF_idf cannot be used
# here: it would refit on the validation text and produce a different vocabulary.

tfidf_vectorizer = TfidfVectorizer()  # same default settings as the training-time vectorizer
tfidf_vectorizer.fit(title)           # learn vocabulary and idf weights from the training titles

# The models were trained on X, which was produced by a different vectorizer
# instance. Fail loudly if the two vocabularies ever diverge, instead of feeding
# silently misaligned feature columns to the model.
assert np.array_equal(tfidf_vectorizer.get_feature_names_out(), unique_title_voc), (
    "TF-IDF vocabulary differs from the one used to build X; "
    "validation features would not line up with the trained model"
)

# Training titles in the fitted feature space, shape (m, n):
# m examples, n vocabulary terms.
X_center = tfidf_vectorizer.transform(title)
X_center_format = X_center.toarray()
# NOTE(review): X_center / X_center_format are not read by any later visible
# cell; the dense copy is large — consider dropping it.

# Validation titles in the same feature space, shape (m_val, n).
X_val_0 = tfidf_vectorizer.transform(title_validation)
X_val_1 = X_val_0.toarray()

X_val_1.shape

(565, 7372)

In [139]:
# Predict the hazard label of every validation title with the dense model and
# use the label -> one-hot dictionary to turn the predictions back into text.
H_val_prediction = from_pred_and_dictionary_to_labels(
    X=X_val_1,
    model=model,
    one_hot_dictionary=one_hot_encode_hazard,
)

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [141]:
#now we are going to follow the exact same procedure and train a separate model to predict
#the product, first for the training set and then for the validation set


In [147]:
# Product column of the training frame and its distinct labels.
product = df['product']
unique_product_labels = find_unique_column_labels(column=product)

# label -> one-hot vector
one_hot_encode_product = one_hot_encode_labels(
    unique_labels_of_a_column=unique_product_labels
)

# create_Y_label returns the targets with one column per example ...
Y_product_transposed = create_Y_label(
    initial_label_column=product,
    dict_labels=one_hot_encode_product,
)

# ... so transpose to get one row per example.
Y_product = Y_product_transposed.T
Y_product.shape

(5082, 1022)

In [150]:
# Hold out 20% of the training data as a test split:
# X is the vectorised title, Y the one-hot product label.
# Note that this rebinds X_train / X_test.
X_train, X_test, Y_product_train, Y_product_test = train_test_split(
    X,
    Y_product,
    test_size=0.2,
    random_state=42,
)

In [155]:
# Train a second dense network, this time to predict the product column.
# The number of output classes is the width of the one-hot product matrix.
model_2 = train_and_eval_dense_nn(
    X_train, Y_product_train,
    X_test, Y_product_test,
    hu_1=256, hu_2=128,
    number_of_classes=Y_product_train.shape[1],
    a_epochs=15, a_batch_size=32,
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.0344 - loss: 6.6708 - val_accuracy: 0.0659 - val_loss: 6.0547
Epoch 2/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.0588 - loss: 5.6696 - val_accuracy: 0.0846 - val_loss: 5.8744
Epoch 3/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.1015 - loss: 5.1193 - val_accuracy: 0.1504 - val_loss: 5.7652
Epoch 4/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.2377 - loss: 4.3486 - val_accuracy: 0.2409 - val_loss: 5.6202
Epoch 5/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.3920 - loss: 3.5048 - val_accuracy: 0.2822 - val_loss: 5.6845
Epoch 6/15
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.5528 - loss: 2.6193 - val_accuracy: 0.3215 - val_loss: 5.9198
Epoch 7/15
[1m128/128

In [159]:
# Display the shape of the held-out product targets: (examples, product classes).
Y_product_test.shape

(1017, 1022)

In [161]:
# Error report of the product model on the held-out split.
create_heatmap_and_error_report(
    model=model_2,
    X_test=X_test,
    Y_test=Y_product_test,
    num_of_classes=0,
)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
(538, 538)

               precision    recall  f1-score   support

     Class 0       0.00      0.00      0.00         4
     Class 1       0.50      0.29      0.36         7
     Class 2       0.20      0.43      0.27         7
     Class 3       0.20      1.00      0.33         1
     Class 4       0.47      0.78      0.58         9
     Class 5       0.00      0.00      0.00         7
     Class 6       0.00      0.00      0.00         1
     Class 7       0.00      0.00      0.00         1
     Class 8       0.28      0.48      0.35        23
     Class 9       0.00      0.00      0.00         1
    Class 10       0.33      0.25      0.29         4
    Class 11       0.14      0.33      0.20         3
    Class 12       0.25      0.50      0.33         2
    Class 13       0.50      0.50      0.50         2
    Class 14       0.38      0.38      0.38         8
    Class 15       1.00      0.33      0.50      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


1

In [163]:
# The validation features do not depend on which target is being predicted.
# tfidf_vectorizer, X_center, X_center_format, X_val_0 and X_val_1 were already
# built from `title` and `title_validation` in the hazard section above, and
# neither input has changed since. Reuse X_val_1 instead of refitting the
# vectorizer and densifying the same matrices a second time.
X_val_1.shape

(565, 7372)

In [165]:
# Predict the product label of every validation title with the second dense
# model (model_2) and use the label -> one-hot dictionary to turn the
# predictions back into text.
P_val_prediction = from_pred_and_dictionary_to_labels(
    X=X_val_1,
    model=model_2,
    one_hot_dictionary=one_hot_encode_product,
)

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [167]:
# Both prediction lists become columns of the same DataFrame, so they must have
# one entry per validation row each. A bare `len(...)` that is not the last
# expression of the cell is discarded, so check the lengths explicitly.
assert len(H_val_prediction) == len(P_val_prediction), (
    "hazard and product predictions have different lengths"
)
len(P_val_prediction)

565

In [169]:
# Collect the two prediction lists as the columns of a new DataFrame ...
predicted_data_final = dict(
    hazard=H_val_prediction,
    product=P_val_prediction,
)
df_final = pd.DataFrame(predicted_data_final)

# ... and write it to disk without the index column.
df_final.to_csv('st2_title_dense.csv', index=False)
