In [1]:
#importing packages
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
%matplotlib inline



In [36]:
data = pd.read_csv('Consumer_Complaints_with_Consumer_Complaint_Narratives.csv')

In [37]:
data.head(3)

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided?,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response?,Consumer_disputed?,Complaint_ID
0,12/2/2016,Consumer Loan,Vehicle loan,Taking out the loan or lease,,Capitol One 360 keeps changing what I owe on m...,,Capital One,OH,453XX,,Consent provided,Web,12/2/2016,Closed with explanation,Yes,No,2233355
1,12/15/2016,Credit reporting,,Unable to get credit report/credit score,Problem getting my free annual report,I requested all XXXX reports. I got through th...,Company has responded to the consumer and the ...,Experian,FL,320XX,,Consent provided,Web,12/15/2016,Closed with non-monetary relief,Yes,No,2252210
2,11/13/2016,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Having problems with customer service,I received a forberance on my loans last sprin...,,AES/PHEAA,PA,151XX,,Consent provided,Web,11/15/2016,Closed with explanation,Yes,No,2205926


In [38]:
#-->data['Company_response_to_consumer']

In [39]:
#filter and retain records with desired company response labels
responselist = ['Closed with explanation', 'Closed with non-monetary relief', 'Closed with monetary relief']
df = data[data.Company_response_to_consumer.isin(responselist)]

In [40]:
#-->df['Company_response_to_consumer']
len(df.index) #number of records

134855

In [41]:
#drop features/columns not needed for experiment
df = df.drop('Sub-product', 1)
df = df.drop('Sub-issue', 1)
df = df.drop('Consumer_complaint_narrative', 1)
df = df.drop('Company_public_response', 1)
df = df.drop('Tags', 1)
df = df.drop('Consumer_consent_provided?', 1)
df = df.drop('Submitted_via', 1)
df = df.drop('Date_sent_to_company', 1)
df = df.drop('Timely_response?', 1)
df = df.drop('Consumer_disputed?', 1)
df = df.drop('Complaint_ID', 1)

In [42]:
df.head(3)

Unnamed: 0,Date_received,Product,Issue,Company,State,ZIP_code,Company_response_to_consumer
0,12/2/2016,Consumer Loan,Taking out the loan or lease,Capital One,OH,453XX,Closed with explanation
1,12/15/2016,Credit reporting,Unable to get credit report/credit score,Experian,FL,320XX,Closed with non-monetary relief
2,11/13/2016,Student loan,Dealing with my lender or servicer,AES/PHEAA,PA,151XX,Closed with explanation


In [43]:
df.dtypes

Date_received                   object
Product                         object
Issue                           object
Company                         object
State                           object
ZIP_code                        object
Company_response_to_consumer    object
dtype: object

In [44]:
#Override null entries to avoid negative numbers when converting to category
df.State.replace(np.NaN, 'XX', inplace=True)
df.ZIP_code.replace(np.NaN, 'XXXXX', inplace=True)

In [45]:
#convert column types to 'category'
df['Product'] = df['Product'].astype('category')
df['Issue'] = df['Issue'].astype('category')
df['Company'] = df['Company'].astype('category')
df['State'] = df['State'].astype('category')
df['ZIP_code'] = df['ZIP_code'].astype('category')
df['Company_response_to_consumer'] = df['Company_response_to_consumer'].astype('category')

In [46]:
df.dtypes

Date_received                     object
Product                         category
Issue                           category
Company                         category
State                           category
ZIP_code                        category
Company_response_to_consumer    category
dtype: object

In [47]:
#select columns of 'category' type
cat_columns = df.select_dtypes(['category']).columns
cat_columns

Index(['Product', 'Issue', 'Company', 'State', 'ZIP_code',
       'Company_response_to_consumer'],
      dtype='object')

In [48]:
#assign number values to categories
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
#-->df

In [49]:
##increment State and ZIP_code by 1 to change -1 to 0 and preserve distinction property
#for index, row in df.iterrows():
#    if row['State']<0:
#        print (row['State'])
## NOTE: MultinomialNB did not work initially because 'State' and 'ZIP_code' contain null inputs which gets changed to -1 by
## the category transformation. This causes an error because MultinomialNB cannot operate on negative values. So in order to test
## prediction on MultinomialNB I'll be removing 'State' and 'ZIP_code' from the data and matrix

len(df[df.ZIP_code < 0])

0

In [479]:
#sort by date recieved and split data to train and test set at this point
df['Date_received'] = pd.to_datetime(df['Date_received'])
df = df.sort_values('Date_received')
cutoff = int(df.shape[0] * 0.2)
train = df.iloc[:cutoff]
test = df.iloc[cutoff +1:]
#test.tail(10)

In [480]:
#seperate class/target column from rest of data
target_train = train["Company_response_to_consumer"] #dataframe with only consumer complaint
target_test = test["Company_response_to_consumer"]
#remove other features not needed in experiment
vec_train = train.drop('Company_response_to_consumer', 1) #dataframe with all except consumer complaint
vec_train = vec_train.drop('Date_received', 1)
vec_test = test.drop('Company_response_to_consumer', 1)
vec_test = vec_test.drop('Date_received', 1)

target_test.tail(3)

85680    0
85040    0
85083    0
Name: Company_response_to_consumer, dtype: int8

In [481]:
#convert *_vec dataframe to vector (any of the two sets of code will do)
vec_train2 = vec_train.values
vec_train2
#numpyMatrix = df_vec.as_matrix()
#numpyMatrix

array([[   3,   49,  874,   50,  280],
       [   2,   28,  530,   25,   18],
       [   2,   14, 1273,   13,  324],
       ..., 
       [   4,   24,  822,   20,  568],
       [   3,   49,  851,   58,  906],
       [   4,   24, 1879,   35,  263]], dtype=int16)

In [482]:
vec_test2 = vec_test.values
vec_test2

array([[   4,   24,  812,    8,  868],
       [   6,   56, 1791,   25,   16],
       [   6,   57, 1701,   45,  896],
       ..., 
       [  10,   34, 2287,   45,  896],
       [   2,   22,  529,    8,  869],
       [   4,   37, 2161,   39,   65]], dtype=int16)

In [483]:
#machine learning algorithhm used for classification
mnb = MultinomialNB()
gnb = GaussianNB()

In [484]:
#covert to integer type to enable classification
#target_train = target_train.astype('int')
#vec_train = vec_train.astype('int')

#=======================================================================================================================#
# Beginning of Experiment 1: GaussianNB on Product	Issue	Company 	State	ZIP_code

In [485]:
#build classifier on train set
mnb.fit(vec_train2, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [486]:
#use built classifier to predict labels of test set
mnb_pred = mnb.predict(vec_test2)
mnb_pred

array([2, 0, 2, ..., 0, 2, 0], dtype=int8)

In [487]:
#get actual labels of test set
actual = np.array(target_test)
actual

array([2, 0, 0, ..., 0, 0, 0], dtype=int8)

In [488]:
#count correct predictions
actualExp = 0 # counts actual 'closed with explanation' OR (0) labels
actualRel = 0 # counts actual 'closed with monetary relief' OR (1) labels
actualNon = 0 # counts actual 'closed with non-monetary relief' OR (2) labels
    
predExp = 0 # counts correct predicted 0 labels
predExp_Rel = 0 # counts incorrect predictions of 0 when 1
predExp_Non = 0 # counts incorrect predictions of 0 when 2
   
predRel = 0 # counts correct predicted 1 labels
predRel_Exp = 0 # counts incorrect predicitons of 1 when 0
predRel_Non = 0 # counts incorrect predictions of 1 when 2
    
predNon = 0 # counts correct predicted 2 labels
predNon_Exp = 0 # counts incorrect predictions of 2 when 0
predNon_Rel = 0 # counts incorrect predicitons of 2 when 1

In [489]:
for i in range (len(mnb_pred)):
    act = actual[i]
    pred = mnb_pred[i]
    
    if pred == act:
        if act == 0:
            predExp = predExp + 1
            actualExp = actualExp + 1
        elif act == 1:
            predRel = predRel + 1
            actualRel = actualRel + 1
        elif act == 2:
            predNon = predNon + 1
            actualNon = actualNon + 1
    
    else:
        if act == 0:
            actualExp = actualExp + 1
            if pred == 1:
                predRel_Exp = predRel_Exp + 1
            elif pred == 2:
                predNon_Exp = predNon_Exp + 1
        if act == 1:
            actualRel = actualRel + 1
            if pred == 0:
                predExp_Rel = predExp_Rel + 1
            elif pred == 2:
                predNon_Rel = predNon_Rel + 1
        if act == 2:
            actualNon = actualNon + 1
            if pred == 0:
                predExp_Non = predExp_Non + 1
            elif pred == 1:
                predRel_Non = predRel_Non + 1

In [490]:
print ("predExp: %d" % ( predExp ))
print ("predExp_Rel: %d" % ( predExp_Rel ))
print ("predExp_Non: %d" % ( predExp_Non ))
print ("predRel: %d" % ( predRel ))
print ("predRel_Exp: %d" % ( predRel_Exp ))
print ("predRel_Non: %d" % ( predRel_Non ))
print ("predNon: %d" % ( predNon ))
print ("predNon_Exp: %d" % ( predNon_Exp ))
print ("predNon_Rel: %d" % ( predNon_Rel ))
print ("actualExp: %d" % ( actualExp ))
print ("actualRel: %d" % ( actualRel ))
print ("actualNon: %d" % ( actualNon ))

accuracy = (predExp+predRel+predNon)/(actualExp+actualRel+actualNon)
accuracy

predExp: 46880
predExp_Rel: 3802
predExp_Non: 7165
predRel: 440
predRel_Exp: 1934
predRel_Non: 291
predNon: 5811
predNon_Exp: 37419
predNon_Rel: 4141
actualExp: 86233
actualRel: 8383
actualNon: 13267


0.4924872315378697

In [491]:
# End of Experiment 1: GaussianNB on Product	Issue	Company 	State	ZIP_code
# =======================================================================================================================#
# Beginning of Experiment 2:MultinomialNB on Product	Issue	Company

In [492]:
#df_vec2 = df_vec.drop('State', 1)
#df_vec2 = df_vec2.drop('ZIP_code', 1)
gnb.fit(vec_train, target_train)

GaussianNB(priors=None)

In [493]:
#use built classifier to predict labels of test set
gnb_pred = gnb.predict(vec_test)
gnb_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [494]:
#count correct predictions
actualExp = 0 # counts actual 'closed with explanation' OR (0) labels
actualRel = 0 # counts actual 'closed with monetary relief' OR (1) labels
actualNon = 0 # counts actual 'closed with non-monetary relief' OR (2) labels

predExp = 0 # counts correct predicted 0 labels
predExp_Rel = 0 # counts incorrect predictions of 0 when 1
predExp_Non = 0 # counts incorrect predictions of 0 when 2

predRel = 0 # counts correct predicted 1 labels
predRel_Exp = 0 # counts incorrect predicitons of 1 when 0
predRel_Non = 0 # counts incorrect predictions of 1 when 2

predNon = 0 # counts correct predicted 2 labels
predNon_Exp = 0 # counts incorrect predictions of 2 when 0
predNon_Rel = 0 # counts incorrect predicitons of 2 when 1

In [495]:
for i in range (len(gnb_pred)):
    act = actual[i]
    pred = gnb_pred[i]
    
    if pred == act:
        if act == 0:
            predExp = predExp + 1
            actualExp = actualExp + 1
        elif act == 1:
            predRel = predRel + 1
            actualRel = actualRel + 1
        elif act == 2:
            predNon = predNon + 1
            actualNon = actualNon + 1
    
    else:
        if act == 0:
            actualExp = actualExp + 1
            if pred == 1:
                predRel_Exp = predRel_Exp + 1
            elif pred == 2:
                predNon_Exp = predNon_Exp + 1
        if act == 1:
            actualRel = actualRel + 1
            if pred == 0:
                predExp_Rel = predExp_Rel + 1
            elif pred == 2:
                predNon_Rel = predNon_Rel + 1
        if act == 2:
            actualNon = actualNon + 1
            if pred == 0:
                predExp_Non = predExp_Non + 1
            elif pred == 1:
                predRel_Non = predRel_Non + 1

In [496]:
print ("predExp: %d" % ( predExp ))
print ("predExp_Rel: %d" % ( predExp_Rel ))
print ("predExp_Non: %d" % ( predExp_Non ))
print ("predRel: %d" % ( predRel ))
print ("predRel_Exp: %d" % ( predRel_Exp ))
print ("predRel_Non: %d" % ( predRel_Non ))
print ("predNon: %d" % ( predNon ))
print ("predNon_Exp: %d" % ( predNon_Exp ))
print ("predNon_Rel: %d" % ( predNon_Rel ))
print ("actualExp: %d" % ( actualExp ))
print ("actualRel: %d" % ( actualRel ))
print ("actualNon: %d" % ( actualNon ))

accuracy = (predExp+predRel+predNon)/(actualExp+actualRel+actualNon)
accuracy

predExp: 86233
predExp_Rel: 8383
predExp_Non: 13267
predRel: 0
predRel_Exp: 0
predRel_Non: 0
predNon: 0
predNon_Exp: 0
predNon_Rel: 0
actualExp: 86233
actualRel: 8383
actualNon: 13267


0.7993196333064524

In [497]:
# End of Experiment 2:MultinomialNB on Product	Issue	Company
# =======================================================================================================================#

In [498]:
#remove other features not needed in experiment
vec_train = vec_train.drop('State', 1) 
vec_train = vec_train.drop('ZIP_code', 1)
vec_test = vec_test.drop('State', 1)
vec_test = vec_test.drop('ZIP_code', 1)

target_test.tail(3)

85680    0
85040    0
85083    0
Name: Company_response_to_consumer, dtype: int8

In [499]:
#convert *_vec dataframe to vector (any of the two sets of code will do)
vec_train2 = vec_train.values
vec_train2

array([[   3,   49,  874],
       [   2,   28,  530],
       [   2,   14, 1273],
       ..., 
       [   4,   24,  822],
       [   3,   49,  851],
       [   4,   24, 1879]], dtype=int16)

In [500]:
vec_test2 = vec_test.values
vec_test2

array([[   4,   24,  812],
       [   6,   56, 1791],
       [   6,   57, 1701],
       ..., 
       [  10,   34, 2287],
       [   2,   22,  529],
       [   4,   37, 2161]], dtype=int16)

In [501]:
#build classifier on train set
mnb.fit(vec_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [502]:
#use built classifier to predict labels of test set
mnb_pred = mnb.predict(vec_test)
mnb_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [503]:
#get actual labels of test set
actual = np.array(target_test)
actual

array([2, 0, 0, ..., 0, 0, 0], dtype=int8)

In [504]:
#count correct predictions
actualExp = 0 # counts actual 'closed with explanation' OR (0) labels
actualRel = 0 # counts actual 'closed with monetary relief' OR (1) labels
actualNon = 0 # counts actual 'closed with non-monetary relief' OR (2) labels
    
predExp = 0 # counts correct predicted 0 labels
predExp_Rel = 0 # counts incorrect predictions of 0 when 1
predExp_Non = 0 # counts incorrect predictions of 0 when 2
   
predRel = 0 # counts correct predicted 1 labels
predRel_Exp = 0 # counts incorrect predicitons of 1 when 0
predRel_Non = 0 # counts incorrect predictions of 1 when 2
    
predNon = 0 # counts correct predicted 2 labels
predNon_Exp = 0 # counts incorrect predictions of 2 when 0
predNon_Rel = 0 # counts incorrect predicitons of 2 when 1

In [505]:
for i in range (len(mnb_pred)):
    act = actual[i]
    pred = mnb_pred[i]
    
    if pred == act:
        if act == 0:
            predExp = predExp + 1
            actualExp = actualExp + 1
        elif act == 1:
            predRel = predRel + 1
            actualRel = actualRel + 1
        elif act == 2:
            predNon = predNon + 1
            actualNon = actualNon + 1
    
    else:
        if act == 0:
            actualExp = actualExp + 1
            if pred == 1:
                predRel_Exp = predRel_Exp + 1
            elif pred == 2:
                predNon_Exp = predNon_Exp + 1
        if act == 1:
            actualRel = actualRel + 1
            if pred == 0:
                predExp_Rel = predExp_Rel + 1
            elif pred == 2:
                predNon_Rel = predNon_Rel + 1
        if act == 2:
            actualNon = actualNon + 1
            if pred == 0:
                predExp_Non = predExp_Non + 1
            elif pred == 1:
                predRel_Non = predRel_Non + 1

In [506]:
print ("predExp: %d" % ( predExp ))
print ("predExp_Rel: %d" % ( predExp_Rel ))
print ("predExp_Non: %d" % ( predExp_Non ))
print ("predRel: %d" % ( predRel ))
print ("predRel_Exp: %d" % ( predRel_Exp ))
print ("predRel_Non: %d" % ( predRel_Non ))
print ("predNon: %d" % ( predNon ))
print ("predNon_Exp: %d" % ( predNon_Exp ))
print ("predNon_Rel: %d" % ( predNon_Rel ))
print ("actualExp: %d" % ( actualExp ))
print ("actualRel: %d" % ( actualRel ))
print ("actualNon: %d" % ( actualNon ))

accuracy = (predExp+predRel+predNon)/(actualExp+actualRel+actualNon)
accuracy

predExp: 69178
predExp_Rel: 5636
predExp_Non: 11031
predRel: 48
predRel_Exp: 132
predRel_Non: 4
predNon: 2232
predNon_Exp: 16923
predNon_Rel: 2699
actualExp: 86233
actualRel: 8383
actualNon: 13267


0.6623657110017334