In [188]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer sklearn.model_selection and fall back to the old module only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib inline

In [189]:
# Load the consumer-complaints CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv('Consumer_Complaints_with_Consumer_Complaint_Narratives.csv')

In [198]:
# Preview the first three rows of the raw data.
data.head(3)

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided?,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response?,Consumer_disputed?,Complaint_ID
0,12/2/2016,Consumer Loan,Vehicle loan,Taking out the loan or lease,,Capitol One 360 keeps changing what I owe on m...,,Capital One,OH,453XX,,Consent provided,Web,12/2/2016,Closed with explanation,Yes,No,2233355
1,12/15/2016,Credit reporting,,Unable to get credit report/credit score,Problem getting my free annual report,I requested all XXXX reports. I got through th...,Company has responded to the consumer and the ...,Experian,FL,320XX,,Consent provided,Web,12/15/2016,Closed with non-monetary relief,Yes,No,2252210
2,11/13/2016,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Having problems with customer service,I received a forberance on my loans last sprin...,,AES/PHEAA,PA,151XX,,Consent provided,Web,11/15/2016,Closed with explanation,Yes,No,2205926


In [199]:
# Preview the last two rows of the raw data.
data.tail(2)

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided?,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response?,Consumer_disputed?,Complaint_ID
139376,8/7/2015,Student loan,Non-federal student loan,Dealing with my lender or servicer,Keep getting calls about my loan,"XXXX : XXXX University, XXXX, NC. \nXXXX XXXX ...",,"Navient Solutions, LLC.",NC,277XX,,Consent provided,Web,8/13/2015,Closed with explanation,Yes,No,1508714
139377,2/11/2016,Student loan,Non-federal student loan,Can't repay my loan,Can't get flexible payment options,I am have been making payments to Navient priv...,,"Navient Solutions, LLC.",GA,300XX,,Consent provided,Web,2/16/2016,Closed with explanation,Yes,No,1782604


In [200]:
#-->data['Company_response_to_consumer']

In [201]:
# Restrict the data to complaints whose company response is one of the three
# outcomes used as class labels in the experiments.
responselist = [
    'Closed with explanation',
    'Closed with non-monetary relief',
    'Closed with monetary relief',
]
df = data[data['Company_response_to_consumer'].isin(responselist)]

In [202]:
#-->df['Company_response_to_consumer']
# Number of records that remain after the response-label filter.
df.shape[0]

134855

In [203]:
# Drop the columns that are not used in this experiment, leaving the feature
# columns and the Company_response_to_consumer target.
# The axis is passed by keyword: the positional form `df.drop(name, 1)` was
# deprecated in pandas 1.3 and is rejected from pandas 2.0 onward. `axis=1`
# (rather than `columns=`) also keeps the cell working on older pandas releases.
unused_columns = [
    'Date_received',
    'Sub-product',
    'Sub-issue',
    'Consumer_complaint_narrative',
    'Company_public_response',
    'Tags',
    'Consumer_consent_provided?',
    'Submitted_via',
    'Date_sent_to_company',
    'Timely_response?',
    'Consumer_disputed?',
    'Complaint_ID',
]
df = df.drop(unused_columns, axis=1)

In [204]:
# Preview the first rows of the trimmed frame to confirm which columns remain.
df.head()

Unnamed: 0,Product,Issue,Company,State,ZIP_code,Company_response_to_consumer
0,Consumer Loan,Taking out the loan or lease,Capital One,OH,453XX,Closed with explanation
1,Credit reporting,Unable to get credit report/credit score,Experian,FL,320XX,Closed with non-monetary relief
2,Student loan,Dealing with my lender or servicer,AES/PHEAA,PA,151XX,Closed with explanation
3,Credit reporting,Incorrect information on credit report,"TransUnion Intermediate Holdings, Inc.",AL,358XX,Closed with explanation
4,Mortgage,"Loan servicing, payments, escrow account",Vanderbilt Mortgage & Finance,MO,630XX,Closed with explanation


In [205]:
# Inspect the dtype of each remaining column.
df.dtypes

Product                         object
Issue                           object
Company                         object
State                           object
ZIP_code                        object
Company_response_to_consumer    object
dtype: object

In [206]:
# Re-type each remaining column (features and target) as a pandas 'category'.
for column in ['Product', 'Issue', 'Company', 'State', 'ZIP_code',
               'Company_response_to_consumer']:
    df[column] = df[column].astype('category')

In [207]:
# Re-check the column dtypes now that the columns have been cast to 'category'.
df.dtypes

Product                         category
Issue                           category
Company                         category
State                           category
ZIP_code                        category
Company_response_to_consumer    category
dtype: object

In [208]:
# Collect the labels of every column whose dtype is 'category', then display them.
cat_columns = df.select_dtypes(include=['category']).columns
cat_columns

Index(['Product', 'Issue', 'Company', 'State', 'ZIP_code',
       'Company_response_to_consumer'],
      dtype='object')

In [209]:
# Replace each categorical column with its integer category codes.
# Missing values have no category, so pandas encodes them as -1.
for column_name in cat_columns:
    df[column_name] = df[column_name].cat.codes

In [210]:
# TODO (author's note): the data should probably be split into train and test
# sets before this point.

# Separate the class/target column from the feature columns.
# The axis is passed by keyword because the positional form `df.drop(name, 1)`
# was deprecated in pandas 1.3 and is rejected from pandas 2.0 onward.
df_target = df["Company_response_to_consumer"]  # Series holding only the encoded company-response label
df_vec = df.drop('Company_response_to_consumer', axis=1)  # every column except the label
df_vec.head()

Unnamed: 0,Product,Issue,Company,State,ZIP_code
0,1,82,461,43,433
1,3,85,874,13,307
2,10,34,31,46,144
3,3,49,2416,3,343
4,6,57,2524,31,596


In [211]:
# Convert the df_vec feature DataFrame to a NumPy array and display it.
# (The df_vec.as_matrix() alternative originally noted here was removed in pandas 1.0,
# so .values is the form to use.)
df_matrix=df_vec.values
df_matrix

array([[   1,   82,  461,   43,  433],
       [   3,   85,  874,   13,  307],
       [  10,   34,   31,   46,  144],
       ..., 
       [  10,   34, 1688,   20,  568],
       [  10,   34, 1688,   35,  264],
       [  10,   16, 1688,   15,  287]], dtype=int16)

In [212]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
vec_train, vec_test, target_train, target_test = train_test_split(
    df_matrix,
    df_target,
    test_size=0.2,
    random_state=4,
)

In [213]:
# Instantiate the two Naive Bayes classifiers used in the experiments.
mnb, gnb = MultinomialNB(), GaussianNB()

In [214]:
#convert to integer type to enable classification
#target_train = target_train.astype('int')
#vec_train = vec_train.astype('int')

#=======================================================================================================================#
# Beginning of Experiment 1: GaussianNB on Product, Issue, Company, State, ZIP_code

In [215]:
# Fit the Gaussian Naive Bayes classifier on the training features and training labels.
gnb.fit(vec_train, target_train)

GaussianNB()

In [216]:
# Use the fitted GaussianNB classifier to predict labels for the held-out test
# features, then display the predictions.
gnb_pred = gnb.predict(vec_test)
gnb_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [217]:
# Copy the true test-set labels into a NumPy array so they can be compared
# element by element with the predictions.
actual = np.array(target_test)
actual

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [239]:
# Count the test rows where the GaussianNB prediction equals the true label.
# A single element-wise comparison of the two arrays replaces the Python-level
# loop; int() keeps the result a plain Python integer, as before.
count = int(np.sum(gnb_pred == actual))
count

21654

In [219]:
# Number of test-set predictions (the denominator of the accuracy).
len(gnb_pred)

26971

In [220]:
#accuracy
# Accuracy = fraction of test predictions that match the true labels.
# Computed from the arrays rather than from hand-copied counts, so the figure
# stays correct whenever the data, the split or the model changes.
float(np.mean(gnb_pred == actual))

0.8028623336175893

In [221]:
# End of Experiment 1: GaussianNB on Product, Issue, Company, State, ZIP_code
# =======================================================================================================================#
# Beginning of Experiment 2: MultinomialNB on Product, Issue, Company

In [223]:
# NOTE: MultinomialNB did not work initially because 'State' and 'ZIP_code' contain null entries, which the
# category-code transformation turns into -1. MultinomialNB cannot operate on negative values, so it raised an
# error. To test prediction with MultinomialNB, 'State' and 'ZIP_code' are therefore removed from the data and matrix.

# Number of rows whose State is encoded as a negative (i.e. missing) code.
int((df_vec['State'] < 0).sum())

414

In [233]:
# Build the MultinomialNB feature set by removing 'State' and 'ZIP_code', the two
# columns being excluded from this experiment, in a single call.
# The axis is passed by keyword because the positional form `df.drop(name, 1)`
# was deprecated in pandas 1.3 and is rejected from pandas 2.0 onward.
df_vec2 = df_vec.drop(['State', 'ZIP_code'], axis=1)
df_vec2.head()

Unnamed: 0,Product,Issue,Company
0,1,82,461
1,3,85,874
2,10,34,31
3,3,49,2416
4,6,57,2524


In [234]:
# Convert the df_vec2 feature DataFrame to a NumPy array and display it.
# (The as_matrix() alternative originally noted here was removed in pandas 1.0,
# so .values is the form to use.)
df_matrix2 = df_vec2.values
df_matrix2

array([[   1,   82,  461],
       [   3,   85,  874],
       [  10,   34,   31],
       ..., 
       [  10,   34, 1688],
       [  10,   34, 1688],
       [  10,   16, 1688]], dtype=int16)

In [235]:
# Hold out 20% of the reduced feature matrix as a test set, using the same
# test_size and random_state values as the first experiment's split.
vec_train2, vec_test2, target_train2, target_test2 = train_test_split(
    df_matrix2,
    df_target,
    test_size=0.2,
    random_state=4,
)

In [236]:
# Fit the Multinomial Naive Bayes classifier on the reduced training features and labels.
mnb.fit(vec_train2, target_train2)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [237]:
# Use the fitted MultinomialNB classifier to predict labels for the held-out test
# features, then display the predictions.
mnb_pred = mnb.predict(vec_test2)
mnb_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [238]:
# Copy the true test-set labels into a NumPy array so they can be compared
# element by element with the MultinomialNB predictions.
actual2 = np.array(target_test2)
actual2

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [240]:
# Count the test rows where the MultinomialNB prediction equals the true label.
# A single element-wise comparison of the two arrays replaces the Python-level
# loop; int() keeps the result a plain Python integer, as before.
count2 = int(np.sum(mnb_pred == actual2))
count2

19261

In [242]:
# Number of test-set predictions (the denominator of the accuracy).
len(mnb_pred)

26971

In [244]:
#accuracy
# Accuracy = fraction of test predictions that match the true labels.
# Computed from the arrays rather than from hand-copied counts, so the figure
# stays correct whenever the data, the split or the model changes.
float(np.mean(mnb_pred == actual2))

0.7141374068443884

In [None]:
# End of Experiment 2: MultinomialNB on Product, Issue, Company
# =======================================================================================================================#