##########################################################################
## Consumer complaints classification model
### Author : Fahid Latheef A
### Date written: 21-Oct-2020

##########################################################################
### Aim: To assign the complaints to most appropriate department
### Dataset for training: complaints.csv
### Description: 
##### This module builds machine learning models that classify consumer complaints into different categories
##### The model is saved in a serializable object using pickle
##########################################################################

##### Input: complaints.csv
##### Outputs:
###### CC_model_MNB.pkl
###### CC_model_LR.pkl
###### CC_model_RF.pkl
###### CC_model_BLSVC.pkl
##########################################################################

In [1]:
# import data processing libraries
import pandas as pd
import numpy as np
# import machine learning model libraries
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Stop-word list used by the text-cleaning cells
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# NLTK's English stop-words, with 'nan' added so it is filtered like any other stop-word
stop = stopwords.words('english') + ['nan']
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fahid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import warnings
# Install a blanket "ignore" filter: no warning category, message or module is
# specified, so the filter is not restricted to any particular warning.
# NOTE(review): this presumably also hides pandas / scikit-learn warnings raised
# by later cells — confirm that suppressing all of them is intended.
warnings.filterwarnings("ignore")

In [4]:
%%time
# load the dataset
cc = pd.read_csv('complaints.csv')

Wall time: 5.04 s


In [5]:
# size of the data
cc.shape

(903983, 13)

In [6]:
# snapshot of data
cc.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,3/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,Referral,3/17/2014,Closed with explanation,Yes,No,759217
1,10/1/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,Web,10/5/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,6/8/2014,Credit card,,Bankruptcy,,,,Web,6/10/2014,Closed with explanation,Yes,Yes,885638
4,9/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,Web,9/13/2014,Closed with explanation,Yes,Yes,1027760


In [7]:
# check for null values
cc.isnull().sum()

Date received                        0
Product                              0
Sub-product                     235160
Issue                                0
Sub-issue                       477597
Consumer complaint narrative    704013
Company public response         646002
Submitted via                        0
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Consumer disputed?              135408
Complaint ID                         0
dtype: int64

In [8]:
%%time
# Combine the three text columns into a single space-separated string per row.
# Whole columns are cast to str and concatenated, which is vectorised; a
# row-wise apply(axis=1) calls a Python function once per row instead.
# Missing values are cast to the string 'nan', exactly as the row-wise
# ' '.join(x.astype(str)) version produced.
narrative_parts = cc[['Issue', 'Sub-issue', 'Consumer complaint narrative']].astype(str)
cc['new_complaint_narrative'] = (
    narrative_parts['Issue']
    + ' ' + narrative_parts['Sub-issue']
    + ' ' + narrative_parts['Consumer complaint narrative']
)
cc['new_complaint_narrative'].head()

Wall time: 1min 41s


0     Loan modification,collection,foreclosure nan nan
1    Incorrect information on credit report Account...
2    Managing the loan or lease nan I purchased a n...
3                                   Bankruptcy nan nan
4    Communication tactics Frequent or repeated cal...
Name: new_complaint_narrative, dtype: object

In [9]:
%%time
# Normalise the combined text: trim surrounding whitespace, then lower-case it
cc['new_complaint_narrative'] = cc['new_complaint_narrative'].str.strip().str.lower()
cc['new_complaint_narrative'].head()

Wall time: 1.18 s


0     loan modification,collection,foreclosure nan nan
1    incorrect information on credit report account...
2    managing the loan or lease nan i purchased a n...
3                                   bankruptcy nan nan
4    communication tactics frequent or repeated cal...
Name: new_complaint_narrative, dtype: object

In [10]:
%%time
# Remove Stop-Words
# `stop` is the list of stopwords; converting it to a set once makes each
# membership test a hash lookup instead of a scan of the whole list per word.
stop_words = set(stop)
cc['new_complaint_narrative'] = cc['new_complaint_narrative'].apply(
    lambda words: ' '.join(word.lower() for word in words.split() if word not in stop_words)
)
cc['new_complaint_narrative'].head()

Wall time: 2min


0             loan modification,collection,foreclosure
1    incorrect information credit report account st...
2    managing loan lease purchased new car xxxx xxx...
3                                           bankruptcy
4        communication tactics frequent repeated calls
Name: new_complaint_narrative, dtype: object

In [11]:
# Function to remove punctuations
# Replace Punctuations with ' '
punctuations = [',', '.', ';', ':', '(', ')', '{', '}', '[', ']']
def remove_punctuations(text, punctuations = punctuations):
    """Return ``text`` with every punctuation character replaced by a space.

    Parameters
    ----------
    text : str
        The string to clean.
    punctuations : iterable of str, optional
        Characters to replace. Only single-character entries can match a
        character of ``text``, so any other entry is ignored. Defaults to the
        module-level ``punctuations`` list.

    Returns
    -------
    str
        A string of the same length as ``text`` in which each listed
        character has been replaced by ``' '``.
    """
    # Build the translation table once and rewrite the string in a single pass,
    # instead of calling str.replace on the whole text for every hit.
    table = {
        ord(mark): ' '
        for mark in punctuations
        if isinstance(mark, str) and len(mark) == 1
    }
    return text.translate(table)

In [12]:
%%time
# Remove Punctuations
cc['new_complaint_narrative'] = cc['new_complaint_narrative'].apply(remove_punctuations)
cc['new_complaint_narrative'].head()

Wall time: 53.4 s


0             loan modification collection foreclosure
1    incorrect information credit report account st...
2    managing loan lease purchased new car xxxx xxx...
3                                           bankruptcy
4        communication tactics frequent repeated calls
Name: new_complaint_narrative, dtype: object

In [13]:
# take only the required two columns
# .copy() makes df an independent frame, so later edits to df never depend on
# whether pandas handed back a view or a copy of cc
df = cc[['Product','new_complaint_narrative']].copy()

In [14]:
df.shape

(903983, 2)

In [15]:
df.isnull().sum()

Product                    0
new_complaint_narrative    0
dtype: int64

In [16]:
# what are the different categories?
df.Product.value_counts()

Mortgage                                                                        242194
Debt collection                                                                 171567
Credit reporting                                                                140424
Credit card                                                                      89190
Bank account or service                                                          86207
Credit reporting, credit repair services, or other personal consumer reports     59186
Student loan                                                                     38612
Consumer Loan                                                                    31608
Credit card or prepaid card                                                      11921
Checking or savings account                                                       9947
Payday loan                                                                       5546
Money transfers                            

In [17]:
%%time
# combining product categories
# Chained indexing (df.Product[mask] = value) assigns through an intermediate
# object: pandas flags it with SettingWithCopyWarning, and it leaves df
# unchanged when copy-on-write is enabled. Relabel in one pass instead and
# bind the result to df, which also makes this cell safe to re-run.
product_merges = {
    'Money transfer, virtual currency, or money service': 'Money transfers',
    'Prepaid card': 'Credit card or prepaid card',
    'Virtual currency': 'Money transfers',
    'Payday loan': 'Payday loan, title loan, or personal loan',
    'Credit card': 'Credit card or prepaid card',
}
# No target label is also a source label, so the mapping order does not matter.
df = df.assign(Product=df['Product'].replace(product_merges))

Wall time: 604 ms


In [18]:
df.Product.value_counts()

Mortgage                                                                        242194
Debt collection                                                                 171567
Credit reporting                                                                140424
Credit card or prepaid card                                                     104930
Bank account or service                                                          86207
Credit reporting, credit repair services, or other personal consumer reports     59186
Student loan                                                                     38612
Consumer Loan                                                                    31608
Checking or savings account                                                       9947
Payday loan, title loan, or personal loan                                         7791
Money transfers                                                                   7585
Vehicle loan or lease                      

In [19]:
# Multinomial Naive Bayes pipeline: token counts -> TF-IDF weights -> classifier
from sklearn.naive_bayes import MultinomialNB

mnb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
comp_class_MNB = Pipeline(steps=mnb_steps)

In [20]:
# Split into train and test sets (default split proportions, fixed seed)
features = df['new_complaint_narrative']
labels = df['Product']
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=123)

In [21]:
%%time
# train the model
comp_class_MNB.fit(X_train, y_train)

Wall time: 36.2 s


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [22]:
%%time
# accuracy on train data
comp_class_MNB.score(X_train,y_train)

Wall time: 31.6 s


0.9139939261372269

In [23]:
%%time
# accuracy on test data
comp_class_MNB.score(X_test,y_test)

Wall time: 10.7 s


0.9124497778721747

In [24]:
new_complaint = ["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."]
print(comp_class_MNB.predict(new_complaint))

['Debt collection']


In [25]:
%%time
# save the model for later use
import pickle
# The with-block closes the file as soon as the dump finishes, so the pickle is
# flushed to disk rather than left on a handle awaiting garbage collection.
with open("CC_model_MNB.pkl", 'wb') as model_file:
    pickle.dump(comp_class_MNB, model_file)

Wall time: 499 ms


In [26]:
# build ML model pipeline Logistic Regression
from sklearn.linear_model import LogisticRegression

# multi_class='multinomial' is no longer passed: the parameter is deprecated in
# recent scikit-learn releases, and with the default solver a multiclass
# target is already fitted with the multinomial loss.
comp_class_LogReg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

In [27]:
%%time
# train the model
comp_class_LogReg.fit(X_train, y_train)

Wall time: 3min 20s


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
   

In [28]:
%%time
# accuracy on train data
comp_class_LogReg.score(X_train,y_train)

Wall time: 28.9 s


0.9740231007379198

In [29]:
%%time
# accuracy on test data
comp_class_LogReg.score(X_test,y_test)

Wall time: 10.1 s


0.9714950707092161

In [30]:
new_complaint = ["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."]
print(comp_class_LogReg.predict(new_complaint))

['Debt collection']


In [31]:
%%time
# save the model for later use
# The with-block closes the file as soon as the dump finishes, so the pickle is
# flushed to disk rather than left on a handle awaiting garbage collection.
with open("CC_model_LR.pkl", 'wb') as model_file:
    pickle.dump(comp_class_LogReg, model_file)

Wall time: 363 ms


### Omitting Decision Tree because Random Forest is a collection of Decision Trees and is better suited to large data

In [32]:
# build ML model pipeline Random Forest
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's bootstrap samples and feature draws, so the
# fitted model and its scores are repeatable (same seed as the train/test split).
# n_jobs=-1 builds the independent trees on all available cores; for a fixed
# random_state it does not change the fitted model.
comp_class_RF = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=123, n_jobs=-1)),
])

In [33]:
%%time
# train the model
comp_class_RF.fit(X_train, y_train)

Wall time: 45min 56s


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [34]:
%%time
# accuracy on Train Data
comp_class_RF.score(X_train, y_train)

Wall time: 1min 26s


0.9879599461346604

In [35]:
%%time
# accuracy on test data
comp_class_RF.score(X_test,y_test)

Wall time: 32.5 s


0.9728269526894281

In [36]:
print(comp_class_RF.predict(new_complaint))

['Debt collection']


In [37]:
# save the model for later use
# The with-block closes the file as soon as the dump finishes, so the pickle is
# flushed to disk rather than left on a handle awaiting garbage collection.
with open("CC_model_RF.pkl", 'wb') as model_file:
    pickle.dump(comp_class_RF, model_file)

### I am using BLSVC (Bagging Linear Support Vector Classifier) since SVC is taking a lot of time to converge
### KNN is very computationally expensive for large datasets, so convergence is slow. Ignoring them

In [38]:
# build ML model pipeline Bagging Linear Support Vector Classifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier

# random_state seeds the bagging ensemble so repeated runs give the same model
# (same seed as the train/test split).
# NOTE(review): with bootstrap=False and max_samples left at its default, each
# base LinearSVC appears to be fitted on the full training set, so the
# estimators differ little from one another — confirm whether a smaller
# max_samples (sub-sampling) was intended.
comp_class_BLSVC = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BaggingClassifier(LinearSVC(verbose = 1), bootstrap = False, random_state = 123)),
])

In [39]:
%%time
# train the model
comp_class_BLSVC.fit(X_train, y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Wall time: 9min 23s
Parser   : 181 ms


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                                            class_weight=None,
                                                            dual=True,
                                                            fit_intercept=True,
                                                            intercept_scaling=1,
                         

In [40]:
%%time
# accuracy on train data
comp_class_BLSVC.score(X_train,y_train)

Wall time: 42 s


0.9813152759566186

In [41]:
# accuracy on test data
comp_class_BLSVC.score(X_test,y_test)

0.973999539814864

In [42]:
print(comp_class_BLSVC.predict(["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."]))

['Debt collection']


In [43]:
# save the model for later use
# The with-block closes the file as soon as the dump finishes, so the pickle is
# flushed to disk rather than left on a handle awaiting garbage collection.
with open("CC_model_BLSVC.pkl", 'wb') as model_file:
    pickle.dump(comp_class_BLSVC, model_file)

## Comparing the Models