## Deep Learning Models


In [1]:
import pandas as pd
import nltk
import numpy as np
import csv
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import word_tokenize

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import models
from keras.models import load_model
from keras import layers
from keras.layers import SimpleRNN
from keras.layers import Embedding 
from keras.layers import Flatten
from keras.layers import Dense 
from keras.optimizers import RMSprop




In [60]:
train= pd.read_csv("data/training_data.csv")

In [61]:
train.head()

Unnamed: 0,Complaint,Class
0,hello mailed dispute letter bureau remove one ...,credit_reporting
1,equifax violation following section equifax re...,credit_reporting
2,deceptive offering promotional payment plan ma...,credit_card
3,whenever try connect pnc bank checking account...,retail_banking
4,account number concern presented yet handled r...,credit_reporting


In [62]:
#Pre-processing 
#Defining parameters

max_words = 10000
seq_len=100
embed_dim=100
tokenizer = Tokenizer(num_words = max_words, lower=True )
tokenizer.fit_on_texts(train['Complaint'].values)

In [66]:
#Tokenisation and padding
X = tokenizer.texts_to_sequences(train['Complaint'].values)
X = pad_sequences(X, maxlen=seq_len)

In [9]:
#Converting categorical labels to numeric
y = pd.get_dummies(train['Class']).values

In [10]:
#Splitting data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
print("")
print("Train shape: ",X_train.shape, y_train.shape)
print("Test shape: ",X_test.shape, y_test.shape)


Train shape:  (12668, 100) (12668, 5)
Test shape:  (3167, 100) (3167, 5)


### Bidirectional LSTM

In [25]:
################### Step 2: Building the model
model_lstm = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(max_words, embed_dim, input_length=X.shape[1]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])


In [28]:
################### Step 3: Compile
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print("")
print('-------------------- Model Summary --------------------')
model_lstm.summary() # print model summary


-------------------- Model Summary --------------------
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 bidirectional_6 (Bidirecti  (None, 100, 128)          84480     
 onal)                                                           
                                                                 
 bidirectional_7 (Bidirecti  (None, 100, 64)           41216     
 onal)                                                           
                                                                 
 bidirectional_8 (Bidirecti  (None, 64)                24832     
 onal)                                                           
                                                                 
 dense_4 (Dense)             (None, 24)                1560    

In [29]:
################# Step 4: Model fitting
epochs = 10
batch_size = 32
print("")
print('-------------------- Training --------------------')
lstm_model = model_lstm.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))


-------------------- Training --------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
from tensorflow.keras.utils import to_categorical

# Ensure y_test is 1D
y_test_flat = y_test.reshape(-1)

# Convert integer labels to one-hot encoded format
y_test_one_hot = to_categorical(y_test_flat, num_classes=5) 

# Step 5: Evaluation
print("")
print('--------------------Evaluating for Test Data--------------------')
lstm_eval = model_lstm.evaluate(X_test, y_test_one_hot, verbose=0)
print('Test set LSTM MODEL\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(lstm_eval[0], lstm_eval[1]))

print("")
print('--------------------Predicting for Test Data--------------------')
y_pred = model_lstm.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
print("")
print(confusion_matrix(y_test_flat, y_pred))
print("")
print(classification_report(y_test_flat, y_pred))



--------------------Evaluating for Test Data--------------------
Test set LSTM MODEL
  Loss: 0.753
  Accuracy: 0.803

--------------------Predicting for Test Data--------------------

[[ 206   35   24   18   30]
 [  48 1573   90   48    8]
 [  16  115  303   23    4]
 [  17   49   17  290    5]
 [  54   11    2    9  172]]

              precision    recall  f1-score   support

           0       0.60      0.66      0.63       313
           1       0.88      0.89      0.89      1767
           2       0.69      0.66      0.68       461
           3       0.75      0.77      0.76       378
           4       0.79      0.69      0.74       248

    accuracy                           0.80      3167
   macro avg       0.74      0.73      0.74      3167
weighted avg       0.80      0.80      0.80      3167



In [43]:
print(y_test.shape, y_pred.shape)

(3167,) (3167,)


In [35]:
model_lstm.save('models/LSTM')

INFO:tensorflow:Assets written to: models/LSTM\assets


INFO:tensorflow:Assets written to: models/LSTM\assets


In [36]:
################# Step 6 - Use model to make predictions
print("")
print('-------------------- Predicting for Test Dataset--------------------')
model = models.load_model('models/LSTM')
test_data = pd.read_csv('data/test_data.csv', names=['Complaint'])
print(test_data)


-------------------- Predicting for Test Dataset--------------------






                                              Complaint
0     debt cable bill result identity theft someone ...
1     payment deducted account fedloan service feder...
2     entered agreement company located ca called us...
3             ar resource inc original creditor removed
4     ive disputed debt transunion twice continue te...
...                                                 ...
4056  interest rate mortgage jumped one month change...
4057  chrysler capital reporting inaccurate late pay...
4058  federally protected consumer experian violated...
4059  please review attached documentation capital o...
4060  complaint experian reporting incorrect informa...

[4061 rows x 1 columns]


In [37]:
#Tokenisation and padding
seq_len = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_data['Complaint'].values)
test_array = tokenizer.texts_to_sequences(test_data['Complaint'].values)
test_array = pad_sequences(test_array, maxlen=seq_len)

In [38]:
label = {0: 'retail_banking', 1: 'credit_reporting', 2: 'mortgages_and_loans', 3: 'debt_collection', 4: 'credit_card'}
predictions = model.predict(test_array)
predictions = np.argmax(predictions, axis=1)
results = []
for i in range(4061):
    results.append(label[predictions[i]])
print('\nPredicted Labels:\n', results)


Predicted Labels:
 ['credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'mortgages_and_loans', 'credit_reporting', 'debt_collection', 'credit_reporting', 'debt_collection', 'credit_reporting', 'credit_reporting', 'debt_collection', 'credit_reporting', 'credit_reporting', 'retail_banking', 'credit_reporting', 'mortgages_and_loans', 'debt_collection', 'mortgages_and_loans', 'retail_banking', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'retail_banking', 'credit_reporting', 'retail_banking', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'debt_collection', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'mortgages_and_loans', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'retail_banking', 'debt_collection', 'credit_reporting', 'retail_banking', 'mortgages_and_loans', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'retail_banking', 'credit_reporting', 'mor

In [39]:
# storing the target labels of the test dataset 
np.savetxt("testdata_classlabels_LSTM.csv", results, delimiter = ',', fmt="%s")

---

### Feed Forward Neural Network

In [44]:
from keras import models
from keras.models import Sequential
from keras.models import load_model
from keras import layers
from keras.optimizers import RMSprop
from keras.models import Sequential 
from keras.layers import Dense 
from keras.layers import Embedding 
from keras.layers import Flatten
from keras.layers import Dense

In [45]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pre-processing

df_train = pd.DataFrame(data=train)

In [48]:
# defining parameters
max_words = 10000
seq_len = 100
embed_dim = 100
tokenizer = Tokenizer(num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_train['Complaint'].values)
word_index = tokenizer.word_index

In [49]:
#Tokenisation and padding
tokenizer = Tokenizer()
X = tokenizer.texts_to_sequences(df_train['Complaint'].values)
X = pad_sequences(X, maxlen=seq_len)

In [50]:
#Converting categorical labels to numeric
y = pd.get_dummies(df_train['Class']).values

In [51]:
# Training the model

# Splitting into train, validation and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
print("")
print("Train shape: ",X_train.shape, y_train.shape)
print("Test shape: ",X_test.shape, y_test.shape)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.20, random_state = 20)
print("Train shape: ",X_train.shape, y_train.shape)
print("Validation shape: ",X_val.shape, y_val.shape)


Train shape:  (12668, 100) (12668, 5)
Test shape:  (3167, 100) (3167, 5)
Train shape:  (10134, 100) (10134, 5)
Validation shape:  (2534, 100) (2534, 5)


In [52]:
# Building model
model = Sequential()
model.add(Embedding(max_words, embed_dim, input_length=X.shape[1]))
model.add(Flatten())
model.add(Dense(128, activation='relu')) 
model.add(Dense(64, activation='relu')) 
model.add(Dense(16, activation='relu'))
model.add(Dense(5, activation='softmax'))

In [53]:
# Step 3: Compile
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
print("")
print('-------------------- Model Summary --------------------')
model.summary() # print model summary
print("")


-------------------- Model Summary --------------------
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense_6 (Dense)             (None, 128)               1280128   
                                                                 
 dense_7 (Dense)             (None, 64)                8256      
                                                                 
 dense_8 (Dense)             (None, 16)                1040      
                                                                 
 dense_9 (Dense)             (None, 5)                 85        
                                                               

In [54]:
# Step 4: Model fitting
epochs = 10
batch_size = 32
print("")
print('-------------------- Training --------------------')
ff_model = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val))
print("")


-------------------- Training --------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



In [55]:
# Step 5: Evaluation

print("")
print('-------------------- Evaluating for test set --------------------')
ff_eval = model.evaluate(X_test, y_test)
print('Test set FF Model\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(ff_eval[0], ff_eval[1]))
print("")
print('-------------------- Predicting for test set --------------------')
y_pred= model.predict(X_test) 
y_pred= np.argmax(y_pred, axis=1)
y_test= np.argmax(y_test, axis=1)
print("")
print(confusion_matrix(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))
model.save('models/FF')


-------------------- Evaluating for test set --------------------
Test set FF Model
  Loss: 1.291
  Accuracy: 0.558

-------------------- Predicting for test set --------------------

[[   0  313    0    0    0]
 [   0 1767    0    0    0]
 [   0  461    0    0    0]
 [   0  378    0    0    0]
 [   0  248    0    0    0]]

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       313
           1       0.56      1.00      0.72      1767
           2       0.00      0.00      0.00       461
           3       0.00      0.00      0.00       378
           4       0.00      0.00      0.00       248

    accuracy                           0.56      3167
   macro avg       0.11      0.20      0.14      3167
weighted avg       0.31      0.56      0.40      3167



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


INFO:tensorflow:Assets written to: models/FF\assets


INFO:tensorflow:Assets written to: models/FF\assets


In [57]:
# Step 6 - Use model to make predictions

print("")
print('-------------------- Predicting for test dataset--------------------')
model = models.load_model('models/FF')
test_data = pd.read_csv('data/test_data.csv', names=['Complaint'])
print(test_data)


-------------------- Predicting for test dataset--------------------
                                              Complaint
0     debt cable bill result identity theft someone ...
1     payment deducted account fedloan service feder...
2     entered agreement company located ca called us...
3             ar resource inc original creditor removed
4     ive disputed debt transunion twice continue te...
...                                                 ...
4056  interest rate mortgage jumped one month change...
4057  chrysler capital reporting inaccurate late pay...
4058  federally protected consumer experian violated...
4059  please review attached documentation capital o...
4060  complaint experian reporting incorrect informa...

[4061 rows x 1 columns]


In [58]:
#Tokenisation and padding
seq_len = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_data['Complaint'].values)
test_array = tokenizer.texts_to_sequences(test_data['Complaint'].values)
test_array = pad_sequences(test_array, maxlen=seq_len)

label = {0: 'retail_banking', 1: 'credit_reporting', 2: 'mortgages_and_loans', 3: 'debt_collection', 4: 'credit_card'}
predictions = model.predict(test_array)
predictions = np.argmax(predictions, axis=1)
results = []
for i in range(4061):
    results.append(label[predictions[i]])
print('\nPredicted Labels:\n', results)

#Storing the target labels of the test dataset 
np.savetxt("testdata_classlabels_FF.csv", results, delimiter = ',', fmt="%s")


Predicted Labels:
 ['credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting', 'credit_reporting',