In [35]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
try:
    #scikit-learn >= 0.18 exposes train_test_split from model_selection
    from sklearn.model_selection import train_test_split
except ImportError:
    #fallback for old releases; this module was removed in scikit-learn 0.20
    from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

In [36]:
#load the consumer-complaint export into a dataframe
complaints_csv = 'Consumer_Complaints_with_Consumer_Complaint_Narratives.csv'
data = pd.read_csv(complaints_csv)

In [37]:
#preview the first three raw records
data.head(3)

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided?,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response?,Consumer_disputed?,Complaint_ID
0,12/2/2016,Consumer Loan,Vehicle loan,Taking out the loan or lease,,Capitol One 360 keeps changing what I owe on m...,,Capital One,OH,453XX,,Consent provided,Web,12/2/2016,Closed with explanation,Yes,No,2233355
1,12/15/2016,Credit reporting,,Unable to get credit report/credit score,Problem getting my free annual report,I requested all XXXX reports. I got through th...,Company has responded to the consumer and the ...,Experian,FL,320XX,,Consent provided,Web,12/15/2016,Closed with non-monetary relief,Yes,No,2252210
2,11/13/2016,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Having problems with customer service,I received a forberance on my loans last sprin...,,AES/PHEAA,PA,151XX,,Consent provided,Web,11/15/2016,Closed with explanation,Yes,No,2205926


In [38]:
#-->data['Company_response_to_consumer']

In [39]:
#filter and retain records with desired company response labels
responselist = ['Closed with explanation', 'Closed with non-monetary relief', 'Closed with monetary relief']
#take an explicit copy: later cells assign converted columns into df, and assigning into a
#filtered slice of `data` triggers pandas' SettingWithCopyWarning
df = data[data['Company_response_to_consumer'].isin(responselist)].copy()

In [40]:
#number of records kept by the response filter
df.shape[0]

134855

In [41]:
#drop features/columns not needed for experiment
#(one call instead of twelve; axis is passed by keyword because pandas 2.0
# no longer accepts it as a positional argument)
unused_columns = [
    'Date_received',
    'Sub-product',
    'Sub-issue',
    'Consumer_complaint_narrative',
    'Company_public_response',
    'Tags',
    'Consumer_consent_provided?',
    'Submitted_via',
    'Date_sent_to_company',
    'Timely_response?',
    'Consumer_disputed?',
    'Complaint_ID',
]
df = df.drop(unused_columns, axis=1)

In [42]:
#preview the first three records with only the retained columns
df.head(3)

Unnamed: 0,Product,Issue,Company,State,ZIP_code,Company_response_to_consumer
0,Consumer Loan,Taking out the loan or lease,Capital One,OH,453XX,Closed with explanation
1,Credit reporting,Unable to get credit report/credit score,Experian,FL,320XX,Closed with non-monetary relief
2,Student loan,Dealing with my lender or servicer,AES/PHEAA,PA,151XX,Closed with explanation


In [43]:
#inspect the dtype of each retained column
df.dtypes

Product                         object
Issue                           object
Company                         object
State                           object
ZIP_code                        object
Company_response_to_consumer    object
dtype: object

In [44]:
#re-type every retained column as pandas 'category'
for column_name in ('Product', 'Issue', 'Company', 'State', 'ZIP_code', 'Company_response_to_consumer'):
    df[column_name] = df[column_name].astype('category')

In [45]:
#re-check the column dtypes after the conversion to 'category'
df.dtypes

Product                         category
Issue                           category
Company                         category
State                           category
ZIP_code                        category
Company_response_to_consumer    category
dtype: object

In [46]:
#names of the columns whose dtype is 'category'
cat_columns = df.select_dtypes(include=['category']).columns
cat_columns

Index(['Product', 'Issue', 'Company', 'State', 'ZIP_code',
       'Company_response_to_consumer'],
      dtype='object')

In [47]:
#replace each category value with its integer code
#(pandas encodes a missing value as -1)
def _category_codes(column):
    """Return the integer category codes of a categorical column."""
    return column.cat.codes

df[cat_columns] = df[cat_columns].apply(_category_codes)

In [48]:
#separate class/target column from rest of data
df_target = df["Company_response_to_consumer"] #Series holding only the encoded company response (the class label)
#axis is passed by keyword: pandas 2.0 no longer accepts it as a positional argument
df_vec = df.drop('Company_response_to_consumer', axis=1) #dataframe with every column except the company response
df_vec.head()

Unnamed: 0,Product,Issue,Company,State,ZIP_code
0,1,82,461,43,433
1,3,85,874,13,307
2,10,34,31,46,144
3,3,49,2416,3,343
4,6,57,2524,31,596


In [49]:
#turn the feature dataframe into a plain numpy matrix
df_matrix = df_vec.values
df_matrix

array([[   1,   82,  461,   43,  433],
       [   3,   85,  874,   13,  307],
       [  10,   34,   31,   46,  144],
       ..., 
       [  10,   34, 1688,   20,  568],
       [  10,   34, 1688,   35,  264],
       [  10,   16, 1688,   15,  287]], dtype=int16)

In [50]:
#hold out 20% of the records as the test set (fixed seed keeps the split reproducible)
vec_train, vec_test, target_train, target_test = train_test_split(
    df_matrix,
    df_target,
    test_size=0.2,
    random_state=4,
)

In [51]:
#naive Bayes classifiers used for the classification experiments
gnb = GaussianNB()
mnb = MultinomialNB()

In [52]:
#convert to integer type to enable classification
#target_train = target_train.astype('int')
#vec_train = vec_train.astype('int')

#=======================================================================================================================#
# Beginning of Experiment 1: GaussianNB on Product, Issue, Company, State, ZIP_code

In [53]:
#fit GaussianNB on the training features and labels
gnb.fit(X=vec_train, y=target_train)

GaussianNB()

In [54]:
#label the held-out test records with the fitted GaussianNB model
gnb_pred = gnb.predict(X=vec_test)
gnb_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [55]:
#true labels of the test records as a numpy array (same order as the predictions)
actual = np.array(target_test, copy=True)
actual

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [56]:
#reset the confusion tallies before scoring
#labels: 0 = 'closed with explanation', 1 = 'closed with monetary relief',
#        2 = 'closed with non-monetary relief'

#number of test records that actually carry label 0 / 1 / 2
actualExp = actualRel = actualNon = 0

#predicted 0: correct / actually 1 / actually 2
predExp = predExp_Rel = predExp_Non = 0

#predicted 1: correct / actually 0 / actually 2
predRel = predRel_Exp = predRel_Non = 0

#predicted 2: correct / actually 0 / actually 1
predNon = predNon_Exp = predNon_Rel = 0

In [57]:
#number of test records GaussianNB labelled correctly
#(vectorised element-wise comparison instead of a Python index loop)
count = int(np.sum(gnb_pred == actual))
count


21654

In [59]:
#tally the confusion counts of GaussianNB on the test set
#every tally is rebuilt from the arrays, so re-running this cell cannot double count
#labels: 0 = 'closed with explanation', 1 = 'closed with monetary relief',
#        2 = 'closed with non-monetary relief'
actual_exp = (actual == 0)
actual_rel = (actual == 1)
actual_non = (actual == 2)
said_exp = (gnb_pred == 0)
said_rel = (gnb_pred == 1)
said_non = (gnb_pred == 2)

#number of test records that actually carry each label
actualExp = int(actual_exp.sum())
actualRel = int(actual_rel.sum())
actualNon = int(actual_non.sum())

#predicted 0: correct / actually 1 / actually 2
predExp = int((said_exp & actual_exp).sum())
predExp_Rel = int((said_exp & actual_rel).sum())
predExp_Non = int((said_exp & actual_non).sum())

#predicted 1: correct / actually 0 / actually 2
predRel = int((said_rel & actual_rel).sum())
predRel_Exp = int((said_rel & actual_exp).sum())
predRel_Non = int((said_rel & actual_non).sum())

#predicted 2: correct / actually 0 / actually 1
predNon = int((said_non & actual_non).sum())
predNon_Exp = int((said_non & actual_exp).sum())
predNon_Rel = int((said_non & actual_rel).sum())

In [60]:
#GaussianNB: test records correctly predicted as label 0 ('closed with explanation')
predExp

21654

In [61]:
#GaussianNB: predicted 0 but actually 1 ('closed with monetary relief')
predExp_Rel

1948

In [62]:
#GaussianNB: predicted 0 but actually 2 ('closed with non-monetary relief')
predExp_Non

3369

In [63]:
#GaussianNB: test records correctly predicted as label 1 ('closed with monetary relief')
predRel

0

In [64]:
#GaussianNB: predicted 1 but actually 0 ('closed with explanation')
predRel_Exp

0

In [65]:
#GaussianNB: predicted 1 but actually 2 ('closed with non-monetary relief')
predRel_Non

0

In [66]:
#GaussianNB: test records correctly predicted as label 2 ('closed with non-monetary relief')
predNon

0

In [67]:
#GaussianNB: predicted 2 but actually 0 ('closed with explanation')
predNon_Exp

0

In [68]:
#GaussianNB: predicted 2 but actually 1 ('closed with monetary relief')
predNon_Rel

0

In [69]:
#test records whose actual label is 0 ('closed with explanation')
actualExp

21654

In [70]:
#test records whose actual label is 1 ('closed with monetary relief')
actualRel

1948

In [71]:
#test records whose actual label is 2 ('closed with non-monetary relief')
actualNon

3369

In [72]:
# End of Experiment 1: GaussianNB on Product, Issue, Company, State, ZIP_code
# =======================================================================================================================#
# Beginning of Experiment 2: MultinomialNB on Product, Issue, Company

In [73]:
# NOTE: MultinomialNB did not work initially because 'State' and 'ZIP_code' contain null inputs, which the
# category transformation turns into -1. MultinomialNB cannot operate on negative values, so it raises an error.
# To test prediction with MultinomialNB, 'State' and 'ZIP_code' are removed from the data and matrix.

#number of records whose 'State' code is negative
int((df_vec['State'] < 0).sum())

414

In [74]:
#feature frame without 'State' and 'ZIP_code'
#(axis is passed by keyword: pandas 2.0 no longer accepts it as a positional argument)
df_vec2 = df_vec.drop(['State', 'ZIP_code'], axis=1)
df_vec2.head()

Unnamed: 0,Product,Issue,Company
0,1,82,461
1,3,85,874
2,10,34,31
3,3,49,2416
4,6,57,2524


In [75]:
#turn the reduced feature dataframe (df_vec2) into a plain numpy matrix
df_matrix2 = df_vec2.values
df_matrix2

array([[   1,   82,  461],
       [   3,   85,  874],
       [  10,   34,   31],
       ..., 
       [  10,   34, 1688],
       [  10,   34, 1688],
       [  10,   16, 1688]], dtype=int16)

In [76]:
#hold out 20% of the reduced feature matrix as the test set (same seed as the first split)
vec_train2, vec_test2, target_train2, target_test2 = train_test_split(
    df_matrix2,
    df_target,
    test_size=0.2,
    random_state=4,
)

In [77]:
#fit MultinomialNB on the reduced training features and labels
mnb.fit(X=vec_train2, y=target_train2)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [93]:
#label the held-out test records with the fitted MultinomialNB model
mnb_pred = mnb.predict(X=vec_test2)
mnb_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [94]:
#true labels of the second test split as a numpy array (same order as mnb_pred)
actual2 = np.array(target_test2, copy=True)
actual2

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [95]:
#number of test records MultinomialNB labelled correctly
#(vectorised element-wise comparison instead of a Python index loop)
count2 = int(np.sum(mnb_pred == actual2))
count2

19261

In [96]:
#reset the confusion tallies before scoring the MultinomialNB predictions
#labels: 0 = 'closed with explanation', 1 = 'closed with monetary relief',
#        2 = 'closed with non-monetary relief'

#number of test records that actually carry label 0 / 1 / 2
actualExp, actualRel, actualNon = 0, 0, 0

#predicted 0: correct / actually 1 / actually 2
predExp, predExp_Rel, predExp_Non = 0, 0, 0

#predicted 1: correct / actually 0 / actually 2
predRel, predRel_Exp, predRel_Non = 0, 0, 0

#predicted 2: correct / actually 0 / actually 1
predNon, predNon_Exp, predNon_Rel = 0, 0, 0

In [97]:
#tally the confusion counts of MultinomialNB on the test set
#the tallies use this experiment's own arrays (mnb_pred and actual2) rather than the
#GaussianNB arrays, and are rebuilt from scratch so re-running the cell cannot double count
#labels: 0 = 'closed with explanation', 1 = 'closed with monetary relief',
#        2 = 'closed with non-monetary relief'
actual_exp = (actual2 == 0)
actual_rel = (actual2 == 1)
actual_non = (actual2 == 2)
said_exp = (mnb_pred == 0)
said_rel = (mnb_pred == 1)
said_non = (mnb_pred == 2)

#number of test records that actually carry each label
actualExp = int(actual_exp.sum())
actualRel = int(actual_rel.sum())
actualNon = int(actual_non.sum())

#predicted 0: correct / actually 1 / actually 2
predExp = int((said_exp & actual_exp).sum())
predExp_Rel = int((said_exp & actual_rel).sum())
predExp_Non = int((said_exp & actual_non).sum())

#predicted 1: correct / actually 0 / actually 2
predRel = int((said_rel & actual_rel).sum())
predRel_Exp = int((said_rel & actual_exp).sum())
predRel_Non = int((said_rel & actual_non).sum())

#predicted 2: correct / actually 0 / actually 1
predNon = int((said_non & actual_non).sum())
predNon_Exp = int((said_non & actual_exp).sum())
predNon_Rel = int((said_non & actual_rel).sum())

In [98]:
#MultinomialNB: test records correctly predicted as label 0 ('closed with explanation')
predExp

18693

In [99]:
#MultinomialNB: predicted 0 but actually 1 ('closed with monetary relief')
predExp_Rel

1380

In [100]:
#MultinomialNB: predicted 0 but actually 2 ('closed with non-monetary relief')
predExp_Non

2934

In [101]:
#MultinomialNB: test records correctly predicted as label 1 ('closed with monetary relief')
predRel

568

In [102]:
#MultinomialNB: predicted 1 but actually 0 ('closed with explanation')
predRel_Exp

2961

In [103]:
#MultinomialNB: predicted 1 but actually 2 ('closed with non-monetary relief')
predRel_Non

435

In [104]:
#MultinomialNB: test records correctly predicted as label 2 ('closed with non-monetary relief')
predNon

0

In [105]:
#MultinomialNB: predicted 2 but actually 0 ('closed with explanation')
predNon_Exp

0

In [106]:
#MultinomialNB: predicted 2 but actually 1 ('closed with monetary relief')
predNon_Rel

0

In [107]:
#accuracy of MultinomialNB: fraction of test records whose predicted label equals the actual label
#(computed from the arrays rather than from numbers copied out of earlier outputs,
# so it stays correct if the data or the split changes)
float(np.mean(mnb_pred == actual2))

0.7141374068443884

In [None]:
# End of Experiment 2: MultinomialNB on Product, Issue, Company
# =======================================================================================================================#