# Complaints Classification model
## Author: Atul Raj Maurya
## Date: 21-10-2020

In [1]:
# importing pandas and numpy
import pandas as pd
import numpy as np

# import warnings to removing warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# reading csv file and storing it to cc dataframe
cc= pd.read_csv('../../Dataset/complaints.csv')

In [3]:
# top 5 rors of cc dataframe
cc.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03-12-2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,Referral,3/17/2014,Closed with explanation,Yes,No,759217
1,10-01-2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,Web,10-05-2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,06-08-2014,Credit card,,Bankruptcy,,,,Web,06-10-2014,Closed with explanation,Yes,Yes,885638
4,9/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,Web,9/13/2014,Closed with explanation,Yes,Yes,1027760


In [4]:
# shape of complaints dataset
cc.shape

(903983, 13)

## Data preprocessing

In [5]:
# checking for null values
cc.isnull().sum()

Date received                        0
Product                              0
Sub-product                     235160
Issue                                0
Sub-issue                       477597
Consumer complaint narrative    704013
Company public response         646002
Submitted via                        0
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Consumer disputed?              135408
Complaint ID                         0
dtype: int64

### 1. merging similer columns

In [6]:
# Here as we can see sub-issue and Consumer complaint narrative contains too many errors, 
# and both are defining the issue, so we will join them with Consumer complaint narrative column

cc['Consumer complaint narrative']= cc[['Issue','Sub-issue','Consumer complaint narrative']].apply(lambda x: ' '.join(x.astype(str)),axis=1)

In [7]:
# print Consumer complaint narrative from cc
cc['Consumer complaint narrative'].head()

0     Loan modification,collection,foreclosure nan nan
1    Incorrect information on credit report Account...
2    Managing the loan or lease nan I purchased a n...
3                                   Bankruptcy nan nan
4    Communication tactics Frequent or repeated cal...
Name: Consumer complaint narrative, dtype: object

### 2. dividing cc datframe

In [8]:
# as we can see conmplaint classification can be done by only product and coustomer complaint narrative 
# so we use only those 2 columns for ml model
df= cc[['Product','Consumer complaint narrative']]

In [9]:
df.head()

Unnamed: 0,Product,Consumer complaint narrative
0,Mortgage,"Loan modification,collection,foreclosure nan nan"
1,Credit reporting,Incorrect information on credit report Account...
2,Consumer Loan,Managing the loan or lease nan I purchased a n...
3,Credit card,Bankruptcy nan nan
4,Debt collection,Communication tactics Frequent or repeated cal...


In [10]:
df.shape

(903983, 2)

In [11]:
df.isnull().sum()

Product                         0
Consumer complaint narrative    0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903983 entries, 0 to 903982
Data columns (total 2 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Product                       903983 non-null  object
 1   Consumer complaint narrative  903983 non-null  object
dtypes: object(2)
memory usage: 13.8+ MB


### 3. reduce the items in a column

In [13]:
df.Product.value_counts()

Mortgage                                                                        242194
Debt collection                                                                 171567
Credit reporting                                                                140424
Credit card                                                                      89190
Bank account or service                                                          86207
Credit reporting, credit repair services, or other personal consumer reports     59186
Student loan                                                                     38612
Consumer Loan                                                                    31608
Credit card or prepaid card                                                      11921
Checking or savings account                                                       9947
Payday loan                                                                       5546
Money transfers                            

In [14]:
# As we can see here is too many items in this column and we will merge the catagories
df.Product[df.Product == 'Money transfer, virtual currency, or money service']= 'Money transfers'
df.Product[df.Product == 'Virtual currency']= 'Money transfers'

df.Product[df.Product == 'Prepaid card']= 'Credit card or prepaid card'
df.Product[df.Product == 'Credit card']= 'Credit card or prepaid card'

df.Product[df.Product == 'Payday loan']= 'Payday loan, title loan, or personal loan'
df.Product[df.Product == 'Credit reporting, credit repair services, or other personal consumer reports']= 'Credit reporting'

In [15]:
# So we can see that we have divided products to some usefull catagories
df.Product.value_counts()

Mortgage                                     242194
Credit reporting                             199610
Debt collection                              171567
Credit card or prepaid card                  104930
Bank account or service                       86207
Student loan                                  38612
Consumer Loan                                 31608
Checking or savings account                    9947
Payday loan, title loan, or personal loan      7791
Money transfers                                7585
Vehicle loan or lease                          2873
Other financial service                        1059
Name: Product, dtype: int64

### 4. Split Train and Test data

In [16]:
# import aklearn library to split
from sklearn.model_selection import train_test_split

In [17]:
X_train,X_test,Y_train,Y_test= train_test_split(df['Consumer complaint narrative'], df['Product'], test_size=0.2, random_state=5)

## ML Model

In [18]:
# import ML libraries
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [19]:
# creating ML pipeline
comp_class= Pipeline([
    ('vect',CountVectorizer()),
    ('ifidf',TfidfTransformer()),
    ('clf',MultinomialNB())
])

In [20]:
# fit the training data to pipeline
comp_class.fit(X_train,Y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('ifidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [21]:
# see accuracy score
comp_class.score(X_train,Y_train)

0.9310536984952695

In [22]:
comp_class.score(X_test,Y_test)

0.9309999612825434

### Check by predicting by ML model

In [23]:
# type your complaint here
new_comp= ['how dont know how to manage loan']
comp_class.predict(new_comp)

array(['Student loan'], dtype='<U41')

## Save the Model

In [24]:
import pickle

In [25]:
# dump ml model in pickle file
pickle.dump(comp_class,open("ComplaintClassificationMlModel.pkl",'wb'))  
# wb is for write binary