In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from pathlib import Path
import os
import logging
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Embedding
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('./data/consumer_complaints.csv')

data = df[['product', 'consumer_complaint_narrative']]
data.dropna(inplace=True)
text = data['consumer_complaint_narrative'].tolist()

  df = pd.read_csv('./data/consumer_complaints.csv')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


In [18]:
data

Unnamed: 0,product,consumer_complaint_narrative
190126,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
190135,Consumer Loan,Due to inconsistencies in the amount owed that...
190155,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
190207,Mortgage,I have an open and current mortgage with Chase...
190208,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...
...,...,...
553084,Credit reporting,"XXXX XXXX is reporting incorrectly, payments h..."
553085,Credit reporting,Reflecting incorrect payment status. Have been...
553086,Payday loan,I have been paying {$180.00} a month through d...
553090,Mortgage,I recently became aware that Amerisave Mortgag...


In [4]:
# Encoding labels for product category
product_category = list(data['product'].unique())
labels = []
for i in data['product']:
    labels.append(product_category.index(i))

In [5]:
# Tokenizing text
token = Tokenizer()
token.fit_on_texts(text)
vocabs = token.index_word
vocabs_len = len(token.word_index) + 1

# Mapping token back to text
encoded_text = token.texts_to_sequences(text)

In [6]:
# Preparing data for modeling
max_length = max(len(x) for x in encoded_text)
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')
X_train, X_test, y_train, y_test = train_test_split(X, np.array(labels), test_size=0.2, random_state=42)

In [9]:
# Building and training model
vec_size = 300

model = Sequential()
model.add(Embedding(vocabs_len, vec_size, input_length=max_length))

model.add(Conv1D(kernel_size=64, filters=8, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.5))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(16, activation='relu'))

model.add(GlobalMaxPooling1D())

model.add(Dense(len(product_category), activation='softmax'))

model.summary()

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs = 5, validation_data=(X_test, y_test))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 864, 300)          16553400  
                                                                 
 conv1d_2 (Conv1D)           (None, 801, 8)            153608    
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 400, 8)           0         
 1D)                                                             
                                                                 
 dropout_4 (Dropout)         (None, 400, 8)            0         
                                                                 
 dense_6 (Dense)             (None, 400, 32)           288       
                                                                 
 dropout_5 (Dropout)         (None, 400, 32)           0         
                                                      

In [15]:
import matplotlib.pyplot as plt 
from io import BytesIO
import seaborn as sns
sns.set_theme()

from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, BaseDocTemplate, Paragraph, Image, PageBreak, Frame, PageTemplate
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

In [21]:
# start pdf
doc = SimpleDocTemplate('./output/report.pdf', pagesize=A4)
styles = getSampleStyleSheet()

# frame_full = Frame(doc.leftMargin, doc.bottomMargin, doc.width, doc.height, id='full')
# frame_left = Frame(doc.leftMargin, doc.bottomMargin, (doc.width / 2) - 6, doc.height, id='left_col')
# frame_right = Frame(doc.leftMargin + (doc.width / 2) + 6, doc.bottomMargin, (doc.width / 2) - 6, doc.height, id='right_col')

# # Create page templates
# single_col_template = PageTemplate(id='OneCol', frames=[frame_full])
# two_col_template = PageTemplate(id='TwoCol', frames=[frame_left, frame_right])
# doc.addPageTemplates([single_col_template, two_col_template])

elements = []

centered_h3_style = ParagraphStyle(
        name='CenteredH3',
        parent=styles['Heading3'],  # Inherit properties from Heading3 style
        alignment=1,  # 0=left, 1=center, 2=right, 3=justify
    )


In [22]:
# start report
elements.append(Paragraph("Report Model Training: Finance Consumer Complaints Classification", styles['Title']))
elements.append(PageBreak())

In [23]:
loss=[1.216254711151123,
        0.7048466205596924,
        0.5876390337944031,
        0.5053926110267639,
        0.4354717433452606]

val_loss = [1.1236376762390137,
        0.975493848323822,
        0.897347092628479,
        0.8284736275672913,
        0.7970521450042725]

acc = [0.5927512645721436,
        0.7880959510803223,
        0.8237968683242798,
        0.8494873046875,
        0.8677307367324829]

val_acc = [0.7660529613494873,
        0.7914234399795532,
        0.8158957958221436,
        0.8089357614517212,
        0.8057925701141357]

In [24]:
# loss = history.history['loss']
# val_loss = history.history['val_loss']

# Determine the number of epochs
epochs = range(1, len(loss) + 1)

# Plotting the training and validation loss
plt.plot(epochs, loss, label='Training Loss')
plt.plot(epochs, val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

buffer = BytesIO()
plt.savefig(buffer, format='png')
buffer.seek(0)
plt.close()

elements.append(Image(buffer, width=400, height=300))

# acc = history.history['accuracy']
# val_acc = history.history['val_accuracy']

# Determine the number of epochs
epochs = range(1, len(loss) + 1)

# Plotting the training and validation loss
plt.plot(epochs, acc, label='Training Accuracy')
plt.plot(epochs, val_acc, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

buffer = BytesIO()
plt.savefig(buffer, format='png')
buffer.seek(0)
plt.close()

elements.append(Image(buffer, width=400, height=300))


In [25]:
doc.build(elements)