<a href="https://colab.research.google.com/github/Chintan2108/Consumer-Complaint-Classification-OPEN-AI/blob/master/complaints_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# coding: utf-8

# # Multiclass Classification For User Complaints in Banking
# 
# ## Introduction
# This is an NLP-based problem solving approach for the dataset available at http://www.cs.toronto.edu/~complingweb/data/karaOne/karaOne.html
#domain - automotive 

import nltk
import pickle
import gensim
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet as wn
from stop_words import get_stop_words
import re, sys, math, string
import calendar as cal
import numpy as np
from ast import literal_eval
import logging
from gensim.models import word2vec

#from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from  sklearn.calibration import CalibratedClassifierCV
from keras.layers import Embedding
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from gensim.models.keyedvectors import KeyedVectors
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import altair as alt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import array

# Raw consumer-complaints data, read from the project's data directory.
main_df = pd.read_csv('data/Consumer_Complaints.csv')


# Extra tokens treated as stop words on top of the standard English list.
stplist = ['title', 'body', 'xxxx']
# Build a fresh, de-duplicated list instead of extending the list returned by
# get_stop_words() in place: `+=` would mutate that object, which may be shared
# with the library (presumably it caches per-language lists - verify).
# Sorting keeps the result deterministic between runs.
english_stopwords = sorted(set(get_stop_words(language='english')).union(stplist))

def get_wordnet_pos(word):
    """
    Map a part-of-speech (POS) tag to the matching WordNet POS constant.
    Despite its name, `word` is the tag string (clean_up passes the second
    element of each nltk.pos_tag pair, e.g. 'NN' from ('complaint', 'NN')).
    The result is used as the `pos` argument of the lemmatizer.
    - Tags starting with 'N' -> noun, 'V' -> verb, 'J' -> adjective, 'R' -> adverb.
    - Any other tag falls back to noun.
    """
    first_letter_to_pos = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'J': wn.ADJ,
        'R': wn.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag on the noun fallback.
    return first_letter_to_pos.get(word[:1], wn.NOUN)


def clean_up(text):
    """
    Function to clean data.
    Steps:
    - Removing special characters, numbers
    - Lemmatization (using the POS tag of each token)
    - Stop-words removal; tokens of 3 characters or fewer are dropped too
    - Getting a unique list of words, kept in order of first appearance
    - TODO: try removing names and company names like Navient (Proper nouns)

    `text` is converted with str() first, so non-string values are accepted.
    Returns a list of unique, lower-cased lemmas.
    """
    lemmatizer = nltk.WordNetLemmatizer().lemmatize
    # Set lookup is O(1) per token; the module-level stop-word list is O(n).
    stop_words = set(english_stopwords)
    # Raw strings: '\W' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\W+', ' ', str(text))
    text = re.sub(r'[0-9]+', '', text.lower())
    # correcting spellings of words using TextBlob - user complaints are bound to have spelling mistakes
    # However, this idea was later dropped because TextBlob may change the words.
    # text = TextBlob(text).correct()
    word_pos = nltk.pos_tag(nltk.word_tokenize(text))
    normalized_text_lst = [lemmatizer(token, get_wordnet_pos(tag)).lower() for token, tag in word_pos]
    stop_words_free = [lemma for lemma in normalized_text_lst
                       if lemma not in stop_words and len(lemma) > 3]
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # output no longer depends on set iteration order.
    return list(dict.fromkeys(stop_words_free))


def get_average_word2vec(complaints_lst, model, num_features=300):
    """
    Function to average the word vectors of the tokens in a list.
    Say a list contains 'flower' and 'leaf'. Then this function gives
    (model.wv['flower'] + model.wv['leaf']) / 2.
    - Tokens that are not in the model's vocabulary contribute a zero vector
      of length num_features and still count towards the denominator.
    - An empty list gives a zero vector of length num_features (instead of
      the NaN that dividing by zero words would produce).

    Parameters:
    - complaints_lst: list of tokens of one complaint.
    - model: trained Word2Vec model; only its `wv` keyed vectors are used.
    - num_features: dimensionality of the word vectors.

    Returns the averaged vector as a numpy array.
    """
    keyed_vectors = model.wv
    # Membership is tested on the keyed vectors directly; building a set of
    # the whole vocabulary on every call made averaging a corpus very slow.
    vectorized_lst = [keyed_vectors[word] if word in keyed_vectors else np.zeros(num_features)
                      for word in complaints_lst]
    nwords = len(vectorized_lst)
    if nwords == 0:
        return np.zeros(num_features)
    summed = np.sum(vectorized_lst, axis=0)
    averaged_vector = np.divide(summed, nwords)
    return averaged_vector





# -----------------------------------------------------------------------------------------------------------------

# ## Technique 2: Word2Vec
# I tried creating my own model for Word2Vec. However, this only contained 17million words, as opposed to Google's GoogleNews' pretrained Word2Vec model (negative 300 bin layers). So, I chose to go ahead with the pre-trained model.
# Due to time constraints, I couldn't do this - but I would have preferred to complement the Google Word2Vec model with words from this dataset. This Word2Vec model only covers vocabulary up until 2013, after which slang/other important words might have been introduced. 
# Of course, these words could also be company-complaint specific. For example, for ATB Bank, someone might be using ATB bank or a specific Policy name like ATBUltraInsurance. These would also be removed.
# Apart from this, these complaints contain a lot of spelling mistakes and words joined together. Such as: `immeditalely`, `demaging`,  `practiciing`, etc. (shown as missing_words in the cells below), and two words joined together into one word, such as 'givenrequesting'.
# I tried looking into it and found out about a library called TextBlob. However, people also warned against its use because it might not always be right. So I chose not to use it and to skip over these words for now.
# There were also short forms not detected by the model.


# Creating a Word2Vec model using training set
# Each entry of input_df['complaint'] is passed to Word2Vec as one sentence.
vocabulary_of_all_words = input_df['complaint'].tolist()
num_features = 300    # Dimensionality of the word vectors
min_word_count = 10   # Words seen fewer times than this are left out of the model
num_workers = 8       # Number of worker threads used for training
context = 10          # Context window size
downsampling = 1e-3   # Downsampling for frequent words
word2vec_model_name = "trained_models/300features_10minwords_10context1"
word2vec_complaints = word2vec.Word2Vec(vocabulary_of_all_words, workers=num_workers, size=num_features,
                                        min_count=min_word_count, window=context, sample=downsampling)
word2vec_complaints.save(word2vec_model_name)

# Fetching trained model to save time.
# NOTE(review): the model is retrained and saved just above on every run, so
# this reload only saves time if the training lines are skipped - confirm intent.
word2vec_complaints = gensim.models.Word2Vec.load(word2vec_model_name)

# Unique tokens across all complaints; sorted so that the report below is
# the same on every run.
vocab_lst_flat = sorted({item for sublist in vocabulary_of_all_words for item in sublist})
# Loading a pre-trained GoogleNews model
# word2vec_model = KeyedVectors.load_word2vec_format("trained_models/GoogleNews-vectors-negative300.bin", binary=True)

# Exploring this model to see how well it has trained and checking for spelling mistakes in user-complaints
try:
    # Printed explicitly: a bare expression shows nothing outside a notebook cell.
    print(word2vec_complaints.wv.most_similar("good"))
except KeyError:
    print("Sorry, this word doesn't exist in the vocabulary.")

# Count how many unique tokens the trained model has a vector for. Membership
# is tested on the keyed vectors (`.wv`) instead of indexing the model and
# catching KeyError for every missing token.
word_vectors = word2vec_complaints.wv
total_unique_tokens = len(vocab_lst_flat)
missing_words = [token for token in vocab_lst_flat if token not in word_vectors]
words_not_present = len(missing_words)
words_present = total_unique_tokens - words_not_present
print(words_present, words_not_present, total_unique_tokens)

# Examples of spelling mistakes, grammatical errors, etc.
print(missing_words[:20])


# #### Choosing a Word2Vec Model
# - The Google word2vec model isn't able to account for a lot of words. It can be made better by retraining on more words from the training set. However, a lot of these words are spelling mistakes.
# - The presence of 'xxxx', 'xx', etc. in various forms is a simple fix which can also be implemented.
# - Initially, I had planned to use Google's pretrained Word2Vec model. However, after waiting for hours for training on Google word2vec model, I switched back to the Word2Vec model for want of speed.


# # These take a very long time to be averaged. Commenting this code and reading from file the saved output.
# embeddings_df = input_df['complaint'].apply(lambda complaint: get_average_word2vec(complaint, word2vec_complaints, 
#                                                                                    num_features)).to_frame()
# col_lst = []
# for i in range(num_features):
#     col_lst.append('vec_'+str(i+1))
# # Easy to write to file and process when exploded into columns
# exploded_em_df = pd.DataFrame(embeddings_df.complaint.tolist(), columns=col_lst)
# exploded_em_df = pd.DataFrame(embeddings_df)['complaint'].apply(pd.Series)
# exploded_em_df.head()
# exploded_em_df.to_csv("data/modified/vocab_trained_word2Vec.csv", index=False)

# Pre-computed averaged Word2Vec features, one row per complaint.
exploded_em_df = pd.read_csv('data/modified/vocab_trained_word2Vec.csv')
print("Word2Vec output:\n")
# Printed explicitly: a bare `.head()` shows nothing outside a notebook cell.
print(exploded_em_df.head())

# Reset the index so the column-wise concat lines up row-by-row with the
# freshly loaded features, then attach the 'product' label and shuffle.
input_df = input_df.reset_index(drop=True)
vectorized_df = pd.concat([exploded_em_df, input_df[['product']]], axis=1)
vectorized_df = shuffle(vectorized_df)

# `res` is a real boolean: True when no row contains a NaN. The previous
# "True"/"False" strings were both truthy, so the clean-up branch never ran.
rows_with_nan = vectorized_df.isnull().any(axis=1)
res = not rows_with_nan.any()
print(res)
print(vectorized_df.shape)
if not res:
    # dropna returns a new frame; assign it back so the rows are really removed.
    vectorized_df = vectorized_df.dropna(axis=0, how='any')
    print(vectorized_df.shape)


# ### Training and Test Sets

# Feature matrix: every column of vectorized_df except the 'product' label.
feature_columns = vectorized_df.drop('product', axis=1)
vectorized_data = np.array(feature_columns)
# Target vector: the 'product' label of each row.
vectorized_target = np.array(vectorized_df['product'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    vectorized_data,
    vectorized_target,
    test_size=0.3,
    random_state=123,
)
train_x, test_x, train_y, test_y = split_parts



# 3. Deep Neural Network - CNN: Upon reading online some discussion on this, I thought of implementing CNNs. It said - what has recently been shown to work much better and simpler than RNNs is using word vectors, pre-trained on a large corpus, as features to the neural network. RNNs were called 'slow and fickle to train'.


# Model 3: CNN using Keras
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
NUM_WORDS = 20000  # Upper bound on the vocabulary size kept by the tokenizer

# Remove the validation rows from the training frame BEFORE building the
# training texts. Previously the sequences were built from the full frame
# while y_train used the reduced index, so X_train and y_train had different
# numbers of rows.
train_df = train_df.drop(val_df.index)
texts = train_df.complaints_untokenized
products_unique = vectorized_df['product'].unique()

# Integer class id for each product, in order of first appearance.
dict_products = {product: class_id for class_id, product in enumerate(products_unique)}
labels = vectorized_df['product'].apply(lambda x: dict_products[x])

vocab_lst_flat = [item for sublist in vocabulary_of_all_words for item in sublist]

# The tokenizer is fitted on the training texts only; the validation texts
# are encoded with the same vocabulary.
tokenizer = Tokenizer(num_words=NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                      lower=True)
tokenizer.fit_on_texts(texts)
sequences_train = tokenizer.texts_to_sequences(texts)
sequences_valid = tokenizer.texts_to_sequences(val_df.complaints_untokenized)
word_index = tokenizer.word_index

EMBEDDING_DIM = 300
# +1 because tokenizer word indices start at 1 (0 is left for padding).
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
embedding_layer = Embedding(vocabulary_size, EMBEDDING_DIM)

size_train = len(train_x)
size_test = len(test_x)
output_labels_unique = np.asarray(sorted(set(labels)))

X_train = pad_sequences(sequences_train)
# Validation sequences are padded/truncated to the training sequence length.
X_val = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
# convert into dummy representation of the output labels
# num_classes is fixed so both matrices always have one column per product,
# even if a split happens to miss the highest class id.
num_classes = len(products_unique)
y_train = to_categorical(np.asarray(labels[train_df.index]), num_classes=num_classes)
y_val = to_categorical(np.asarray(labels[val_df.index]), num_classes=num_classes)

sequence_length = X_train.shape[1]
filter_sizes = [3,4,5]   # Heights (in tokens) of the convolution windows
num_filters = 100        # Number of filters per window size
drop = 0.5               # Dropout rate applied before the output layer

output_dim = len(products_unique)

print('Shape of X train and X test tensors:', X_train.shape, X_val.shape)
print('Shape of label train and test tensors:', y_train.shape, y_val.shape)

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
# Add a trailing channel axis so the embedded sequence can be fed to Conv2D.
reshape = Reshape((sequence_length, EMBEDDING_DIM, 1))(embedding)

# One convolution + max-pooling branch per filter size. Each filter spans the
# full embedding width, and the pooling window covers the whole convolution
# output (sequence_length - filter_size + 1 rows with Keras' default 'valid'
# padding), so each filter contributes a single value.
pooled_outputs = []
for filter_size in filter_sizes:
    conv = Conv2D(num_filters, (filter_size, EMBEDDING_DIM), activation='relu',
                  kernel_regularizer=regularizers.l2(0.01))(reshape)
    pooled = MaxPooling2D((sequence_length - filter_size + 1, 1), strides=(1,1))(conv)
    pooled_outputs.append(pooled)

merged_tensor = concatenate(pooled_outputs, axis=1)
# Flatten already yields one vector of len(filter_sizes) * num_filters values
# per sample; the extra Reshape that used to follow was never connected to
# the output and has been removed.
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=output_dim, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)

cnn_model = Model(inputs, output)
adam = Adam(lr=1e-3)
cnn_model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])
# Stop training once the validation loss stops improving.
callbacks = [EarlyStopping(monitor='val_loss')]

cnn_model.fit(X_train, y_train, batch_size=1000, epochs=10, verbose=1, validation_data=(X_val, y_val),
                callbacks=callbacks)

# Predicting on the test set
# NOTE(review): test_x comes from the train_test_split of the averaged
# Word2Vec feature matrix, whereas the CNN was trained on tokenizer index
# sequences. The test complaints probably need to go through
# tokenizer.texts_to_sequences() instead - confirm and fix at the data source.
sequences_test = test_x
X_test = pad_sequences(sequences_test, maxlen=X_train.shape[1])
cnn_preds = cnn_model.predict(X_test)
print("Predictions from CNN completed.")

# predict() returns one row of class probabilities per sample (softmax output).
# Take the most probable class id and map it back to its product name, so it
# can be compared with test_y. Class ids follow the order of products_unique.
predicted_class_ids = np.argmax(cnn_preds, axis=1)
predicted_labels = np.asarray(products_unique)[predicted_class_ids]

cnn_results = pd.DataFrame(data={"actual_label": test_y, "predicted_label": predicted_labels})
# Accuracy: wherever the labels were correctly predicted.
cnn_results['correctly_predicted'] = np.where(cnn_results['actual_label'] == cnn_results['predicted_label'],
                                              1, 0)
# Computed from cnn_results (it previously read another model's results frame).
cnn_accuracy = (cnn_results['correctly_predicted'].sum()/cnn_results.shape[0])*100
print("Accuracy of the CNN Model is: {0:.2f}.".format(cnn_accuracy))


# -----------------------------------------------------------------------------------------------------------------

# ## Conclusion
# 
# - The model that performed best was: CNN with SQuAD-like pre-training. It gave an accuracy measure of: 75.30%. This was obtained with the Word2Vec model built from the training set. Further, 
# the gensim word model was used to create the sentence-level representations of the consumer complaints after the pre-training.
# -----------------------------------------------------------------------------------------------------------------