# **Text Classification**

---






In [None]:
# Mount Google Drive when running on Colab. Outside Colab the import fails, so
# the cell becomes a no-op and "Restart & Run All" keeps working locally.
try:
    from google.colab import drive
except ImportError:
    # google.colab is normally only available inside a Colab runtime.
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')

Load the consumer complaint dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the gzip-compressed pickle holding the complaints.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
df = pd.read_pickle(
    'consumer_complaint_dataset.data',
    compression='gzip',
)

In [None]:
# Preview the first five rows.
df.head(n=5)

## Distribution

In [None]:
# Complaint count per topic, most frequent topic first.
(
    pd.crosstab(index=df['topic'], columns="Count")
    .sort_values('Count', ascending=False)
)

## Group Labels

In [None]:
# Fold legacy topic names into their combined categories so each topic appears
# under a single label.
# NOTE: merging changes the number of distinct topics; anything downstream that
# hard-codes the class count should derive it from the data instead.
TOPIC_MERGES = {
    'Credit reporting': 'Credit reporting, credit repair services, or other personal consumer reports',
    'Credit card': 'Credit card or prepaid card',
    'Prepaid card': 'Credit card or prepaid card',
    # Previously spelled 'Payday load', which cannot match a 'Payday loan' label.
    # NOTE(review): confirm the exact spelling against the Distribution table.
    'Payday loan': 'Payday loan, title loan, or personal loan',
    'Virtual currency': 'Money transfer, virtual currency, or money service',
    'Money transfers': 'Money transfer, virtual currency, or money service',
}
for old_topic, merged_topic in TOPIC_MERGES.items():
    df.loc[df['topic'] == old_topic, 'topic'] = merged_topic

# Drop the catch-all category. .copy() makes the result an independent frame so
# later column assignments do not raise SettingWithCopyWarning.
df = df[df['topic'] != 'Other financial service'].copy()

## Labels after grouping

In [None]:
# Per-topic tally, sorted from the largest count to the smallest.
(
    pd.crosstab(index=df['topic'], columns="Count")
    .sort_values(by='Count', ascending=False)
)

In [None]:
# Bar chart of the number of complaints per topic, tallest bar first.
(
    df['topic']
    .value_counts()
    .sort_values(ascending=False)
    .plot.bar(title='Number of complaints per topic')
)
plt.show()

## Function to retrieve text

In [None]:
def print_plot(index, frame=None):
  """Print the complaint text and topic of the row labelled ``index``.

  Args:
    index: Index label of the row to show.
    frame: DataFrame with 'input' and 'topic' columns. Defaults to the
      notebook-level ``df``.

  Prints a short notice instead of raising IndexError when no row carries
  that label (for example because the row was filtered out earlier).
  """
  if frame is None:
    frame = df
  rows = frame.loc[frame.index == index, ['input', 'topic']].values
  if len(rows) == 0:
    print('No row with index', index)
    return
  text, topic = rows[0]
  print(text)
  print('Topic: ', topic)
# Show the complaint stored under index label 10.
print_plot(index=10)

## Clean Data

In [None]:
import nltk
# Fetch NLTK's 'stopwords' corpus; it is read via nltk.corpus.stopwords below.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# Peek at the first ten English stop words.
sample_stopwords = stopwords.words('english')
sample_stopwords[:10]

In [None]:
import re

# Raw strings keep the regex backslashes literal. In a plain string, '\[', '\]'
# and '\|' are invalid escape sequences that Python warns about.
# Characters that clean_text substitutes with a space.
REPLACE_BY_SPLACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything other than a digit, lowercase letter, space, '#', '+' or '_';
# clean_text deletes these matches.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words held in a set for fast membership tests.
STOPWORDS = {*stopwords.words('english')}

## Function to clean text

In [None]:
def clean_text(text):
  """Normalise one complaint string before tokenisation.

  Steps: lowercase; replace REPLACE_BY_SPLACE_RE matches with a space; delete
  BAD_SYMBOLS_RE matches; strip standalone runs of 'x'; drop STOPWORDS.

  Args:
    text: Raw complaint string.

  Returns:
    The cleaned text, with the remaining words joined by single spaces.
  """
  text = text.lower()
  text = REPLACE_BY_SPLACE_RE.sub(' ', text)
  text = BAD_SYMBOLS_RE.sub('', text)
  # Remove only runs of two or more 'x' that are not embedded in a word
  # (presumably anonymisation masks such as "XXXX"). Deleting every 'x' also
  # mangled real words, e.g. "tax" -> "ta" and "next" -> "net".
  text = re.sub(r'(?<![a-z])x{2,}(?![a-z])', '', text)
  text = ' '.join(word for word in text.split() if word not in STOPWORDS)
  return text

In [None]:
# Normalise every complaint with clean_text.
df['input'] = df['input'].apply(clean_text)

# Strip digit runs. regex=True is required: since pandas 2.0, str.replace treats
# the pattern as a literal string by default, which would leave digits in place.
# The raw string avoids the invalid '\d' escape of a plain literal.
df['input'] = df['input'].str.replace(r'\d+', '', regex=True)

In [None]:
# Show the row with index label 10 again, now that 'input' has been cleaned.
print_plot(index=10)

# Modeling
1. Vectorize input consumer complaints
2. Limit the vocabulary to the 50,000 most frequent words
3. Pad or truncate each complaint to a maximum of 250 words

## Train Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer

# Vocabulary size passed to the tokenizer and the embedding layer.
MAX_NB_WORDS = 50000

# Length each complaint sequence is padded to.
MAX_SEQUENCE_LENGTH = 250

# Width of the learned word vectors.
EMBEDDING_DIM = 100

# '\\]' spells the backslash explicitly: '\]' is an invalid escape sequence in
# a normal string literal. The resulting filter characters are unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                      lower=True)
tokenizer.fit_on_texts(df['input'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

## Pad sequence

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Work on the first 2500 complaints only. Slicing before tokenising yields the
# same sequences as slicing afterwards but avoids encoding the whole dataset.
X = tokenizer.texts_to_sequences(df['input'].values[:2500])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
# First cleaned complaint, for comparison with its encoded form.
df['input'].iloc[0]

In [None]:
# Padded integer sequence of the first complaint.
X[0, :]

## Convert output label into numeric format

In [None]:
# One-hot encode the topics over the whole frame so every topic gets a column,
# then keep the rows that correspond to X. The explicit float dtype avoids the
# boolean dummies that pandas >= 2.0 returns by default.
Y = pd.get_dummies(df['topic'], dtype='float32').values[:len(X)]
print('Shape of label tensor:', Y.shape)

In [None]:
# Display the one-hot label matrix.
Y

## Split the dataset into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# Report the shapes of both partitions.
print(f"{X_train.shape} {Y_train.shape}")
print(f"{X_test.shape} {Y_test.shape}")

## Construct LSTM Text Classifier

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from keras.callbacks import EarlyStopping

# Embedding -> spatial dropout -> LSTM -> softmax over the topic classes.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Size the output layer from the label matrix rather than a hard-coded 12, so
# it always matches the number of one-hot columns in Y.
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 3
batch_size = 64

# 10% of the training data is held out for validation during fitting.
# NOTE(review): with epochs=3, a patience of 3 never stops training early.
history = model.fit(X_train, Y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

## Evaluate the model

In [None]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
acc = model.evaluate(X_test, Y_test)
print('Test set \n\tLoss: {:0.3f}\n\tAccuracy: {:0.3f}'.format(*acc[:2]))

## Loss

In [None]:
# Training vs. validation loss per epoch. 'val_loss' comes from the
# validation_split used in fit(), not from the test set, so it is labelled
# accordingly.
plt.title('Loss')
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

## Accuracy

In [None]:
# Training vs. validation accuracy per epoch. 'val_accuracy' is measured on the
# validation_split used in fit(), not on the test set.
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='Train')
plt.plot(history.history['val_accuracy'], label='Validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Confusion Matrix

In [None]:
# Topic names in the column order produced by the one-hot encoding.
labels = pd.get_dummies(df['topic']).columns
labels.tolist()

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)

# Pass every class id explicitly. Otherwise scikit-learn only includes classes
# that occur in y_true or y_pred, so a topic missing from this sample would make
# the matrix smaller than `labels` and break the DataFrame construction.
class_ids = list(range(len(labels)))
pd.DataFrame(confusion_matrix(Y_test.argmax(axis=1),
                              y_pred.argmax(axis=1),
                              labels=class_ids),
             index=labels, columns=labels)

## Classification Report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out test set; classes are
# identified by their one-hot column position.
y_pred = model.predict(X_test)
print(
    classification_report(
        Y_test.argmax(axis=1),
        y_pred.argmax(axis=1),
    )
)

## Test using new complaints


In [None]:
import numpy as np
new_complaint = ['I am a victim of identity theft']
# Run the text through the same clean_text normalisation that was applied to the
# training column, so the tokenizer sees it in the form the model was fit on.
seq = tokenizer.texts_to_sequences([clean_text(text) for text in new_complaint])
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
# Topic names in one-hot column order; argmax over the probabilities picks one.
labels = pd.get_dummies(df['topic']).columns.values
print(pred, labels[np.argmax(pred)])