In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

import re
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn import tree, preprocessing
from sklearn.tree import _tree
from sklearn_pandas import DataFrameMapper, cross_val_score



In [2]:
data = pd.read_csv('Consumer_Complaints_with_Consumer_Complaint_Narratives.csv')

In [3]:
#filter and retain records with desired company response labels
responselist = ['Closed with explanation', 'Closed with non-monetary relief', 'Closed with monetary relief']
df = data[data.Company_response_to_consumer.isin(responselist)]

In [4]:
#drop features/columns not needed for experiment
#df = df.drop('Product', 1)
#df = df.drop('Issue', 1)
#df = df.drop('Company', 1)
#df = df.drop('State', 1)
#df = df.drop('ZIP_code', 1)
df = df.drop('Consumer_complaint_narrative', 1)
#df = df.drop('Sub-product', 1)
#df = df.drop('Sub-issue', 1)
df = df.drop('Company_public_response', 1)
df = df.drop('Tags', 1)
df = df.drop('Consumer_consent_provided?', 1)
df = df.drop('Submitted_via', 1)
df = df.drop('Date_sent_to_company', 1)
df = df.drop('Timely_response?', 1)
df = df.drop('Consumer_disputed?', 1)
df = df.drop('Complaint_ID', 1)

In [5]:
df.head(3)

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Company,State,ZIP_code,Company_response_to_consumer
0,12/2/2016,Consumer Loan,Vehicle loan,Taking out the loan or lease,,Capital One,OH,453XX,Closed with explanation
1,12/15/2016,Credit reporting,,Unable to get credit report/credit score,Problem getting my free annual report,Experian,FL,320XX,Closed with non-monetary relief
2,11/13/2016,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Having problems with customer service,AES/PHEAA,PA,151XX,Closed with explanation


In [6]:
def predict(entry, cls, values):
    prediction = cls.predict_proba(entry)
    leaf = cls.apply(entry)
    classifier = np.argmax(prediction)
    samples = cls.tree_.n_node_samples
    confident = False
    keep = False
    prediction_class = values[classifier]
    confident_prediction = prediction[0][classifier] == 1 and samples[leaf] > 50
    closed_with_explanation = prediction_class == 'Closed with explanation' and prediction[0][classifier] > 0.8
    if confident_prediction:
        confident = True
    return (prediction_class, confident)

In [7]:
def create_new_features(data):
    data['combined_product'] = data.apply(lambda x: "%s-%s" % (x['Product'], x['Sub-product']), axis = 1)
    data['combined_issue'] = data.apply(lambda x: "%s-%s" % (x['Issue'], x['Sub-issue']), axis = 1)

In [8]:
def shape_data(data):
    decision_tree_data = data.ix[:, ['combined_product', 'combined_issue', 'State', 'Company']]
    return pd.get_dummies(decision_tree_data)

In [9]:
def split_data(data):
    data['Date_received'] = pd.to_datetime(data['Date_received'])
    data = data.sort_values('Date_received')
    cutoff = int(data.shape[0] * 0.8)
    return(data, cutoff)

In [10]:
def run_prediction(data):
    create_new_features(data)
    data, cutoff = split_data(data)
    shaped_data = shape_data(data)
    train = data.iloc[:cutoff]
    test = data.iloc[cutoff +1:]
    train_shaped = shaped_data.iloc[:cutoff]
    test_shaped = shaped_data.iloc[cutoff +1:]
    print('Train Set')
    print(train[['Company_response_to_consumer']].groupby(['Company_response_to_consumer']).size())
    print('Test Set')
    print(test[['Company_response_to_consumer']].groupby(['Company_response_to_consumer']).size())
    clf = tree.DecisionTreeClassifier()
    target = train['Company_response_to_consumer'].values
    clf = clf.fit( train_shaped.values, y = target )
    train['Prediction'] = train_shaped.apply(lambda x: predict(x, clf, clf.classes_), axis = 1)
    train['Confident'] = train.apply(lambda x: x.Prediction[1], axis = 1)
    train['Prediction'] = train.apply(lambda x: x.Prediction[0], axis = 1)
    test['Prediction'] = test_shaped.apply(lambda x: predict(x, clf, clf.classes_), axis = 1)
    test['Confident'] = test.apply(lambda x: x.Prediction[1], axis = 1)
    test['Prediction'] = test.apply(lambda x: x.Prediction[0], axis = 1)
    return (train, test)

In [None]:
train, test = run_prediction(df)

Train Set
Company_response_to_consumer
Closed with explanation            85862
Closed with monetary relief         8013
Closed with non-monetary relief    14009
dtype: int64
Test Set
Company_response_to_consumer
Closed with explanation            21749
Closed with monetary relief         2173
Closed with non-monetary relief     3048
dtype: int64




In [15]:
train.head()

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Company,State,ZIP_code,Company_response_to_consumer,combined_product,combined_issue,Prediction,Confident
21011,2015-03-19,Credit reporting,,Incorrect information on credit report,Account status,Experian,SC,293XX,Closed with explanation,Credit reporting-nan,Incorrect information on credit report-Account...,Closed with explanation,False
20919,2015-03-19,Credit card,,Credit determination,,"Citizens Financial Group, Inc.",MA,023XX,Closed with explanation,Credit card-nan,Credit determination-nan,Closed with explanation,False
20925,2015-03-19,Credit card,,Billing statement,,JPMorgan Chase & Co.,FL,337XX,Closed with explanation,Credit card-nan,Billing statement-nan,Closed with explanation,False
20999,2015-03-19,Bank account or service,Checking account,"Account opening, closing, or management",,Ally Financial Inc.,FL,331XX,Closed with explanation,Bank account or service-Checking account,"Account opening, closing, or management-nan",Closed with explanation,False
41540,2015-03-19,Consumer Loan,Vehicle loan,Managing the loan or lease,,DriveTime,FL,336XX,Closed with explanation,Consumer Loan-Vehicle loan,Managing the loan or lease-nan,Closed with explanation,True


In [18]:
#filter out records marked confident in train and test set
responselist = [False]
ftrain = train[train.Confident.isin(responselist)]
ftest = test[test.Confident.isin(responselist)]

ftrain.head()

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Company,State,ZIP_code,Company_response_to_consumer,combined_product,combined_issue,Prediction,Confident
21011,2015-03-19,Credit reporting,,Incorrect information on credit report,Account status,Experian,SC,293XX,Closed with explanation,Credit reporting-nan,Incorrect information on credit report-Account...,Closed with explanation,False
20919,2015-03-19,Credit card,,Credit determination,,"Citizens Financial Group, Inc.",MA,023XX,Closed with explanation,Credit card-nan,Credit determination-nan,Closed with explanation,False
20925,2015-03-19,Credit card,,Billing statement,,JPMorgan Chase & Co.,FL,337XX,Closed with explanation,Credit card-nan,Billing statement-nan,Closed with explanation,False
20999,2015-03-19,Bank account or service,Checking account,"Account opening, closing, or management",,Ally Financial Inc.,FL,331XX,Closed with explanation,Bank account or service-Checking account,"Account opening, closing, or management-nan",Closed with explanation,False
41544,2015-03-19,Credit reporting,,Incorrect information on credit report,Reinserted previously deleted info,Equifax,TX,751XX,Closed with explanation,Credit reporting-nan,Incorrect information on credit report-Reinser...,Closed with explanation,False


In [19]:
print('Filtered Train Set')
print(ftrain[['Company_response_to_consumer']].groupby(['Company_response_to_consumer']).size())
print('Filtered Test Set')
print(ftest[['Company_response_to_consumer']].groupby(['Company_response_to_consumer']).size())

Filtered Train Set
Company_response_to_consumer
Closed with explanation            49334
Closed with monetary relief         8013
Closed with non-monetary relief    13178
dtype: int64
Filtered Test Set
Company_response_to_consumer
Closed with explanation            13337
Closed with monetary relief         1982
Closed with non-monetary relief     2728
dtype: int64


In [20]:
# filtered out records 
responselist = [True]
removedtrain = train[train.Confident.isin(responselist)]
removedtest = test[test.Confident.isin(responselist)]

removedtrain.head()

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Company,State,ZIP_code,Company_response_to_consumer,combined_product,combined_issue,Prediction,Confident
41540,2015-03-19,Consumer Loan,Vehicle loan,Managing the loan or lease,,DriveTime,FL,336XX,Closed with explanation,Consumer Loan-Vehicle loan,Managing the loan or lease-nan,Closed with explanation,True
41546,2015-03-19,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,Wells Fargo & Company,FL,322XX,Closed with explanation,Mortgage-Conventional fixed mortgage,"Loan servicing, payments, escrow account-nan",Closed with explanation,True
21005,2015-03-19,Student loan,Non-federal student loan,Can't repay my loan,Can't get flexible payment options,Genesis Lending,FL,331XX,Closed with explanation,Student loan-Non-federal student loan,Can't repay my loan-Can't get flexible payment...,Closed with explanation,True
21025,2015-03-19,Debt collection,I do not know,Cont'd attempts collect debt not owed,Debt is not mine,"AFS Acceptance, LLC",MD,207XX,Closed with explanation,Debt collection-I do not know,Cont'd attempts collect debt not owed-Debt is ...,Closed with explanation,True
21027,2015-03-19,Mortgage,FHA mortgage,Settlement process and costs,,"Carrington Mortgage Holdings, LLC.",MN,553XX,Closed with explanation,Mortgage-FHA mortgage,Settlement process and costs-nan,Closed with explanation,True


In [157]:
combineddf = ftrain.append(ftest, ignore_index=False)
combineddf.head()

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Company,State,ZIP_code,Company_response_to_consumer,combined_product,combined_issue,Prediction,Confident
21011,2015-03-19,Credit reporting,,Incorrect information on credit report,Account status,Experian,SC,293XX,Closed with explanation,Credit reporting-nan,Incorrect information on credit report-Account...,Closed with explanation,False
20919,2015-03-19,Credit card,,Credit determination,,"Citizens Financial Group, Inc.",MA,023XX,Closed with explanation,Credit card-nan,Credit determination-nan,Closed with explanation,False
20925,2015-03-19,Credit card,,Billing statement,,JPMorgan Chase & Co.,FL,337XX,Closed with explanation,Credit card-nan,Billing statement-nan,Closed with explanation,False
20999,2015-03-19,Bank account or service,Checking account,"Account opening, closing, or management",,Ally Financial Inc.,FL,331XX,Closed with explanation,Bank account or service-Checking account,"Account opening, closing, or management-nan",Closed with explanation,False
41544,2015-03-19,Credit reporting,,Incorrect information on credit report,Reinserted previously deleted info,Equifax,TX,751XX,Closed with explanation,Credit reporting-nan,Incorrect information on credit report-Reinser...,Closed with explanation,False


In [158]:
#remove other features not needed in experiment
combineddf = combineddf.drop('Date_received', 1)
combineddf = combineddf.drop('Sub-issue', 1)
combineddf = combineddf.drop('combined_product', 1)
combineddf = combineddf.drop('Sub-product', 1)
combineddf = combineddf.drop('combined_issue', 1)
combineddf = combineddf.drop('ZIP_code', 1)
combineddf = combineddf.drop('Prediction', 1)
combineddf = combineddf.drop('Confident', 1)

combineddf.head()

Unnamed: 0,Product,Issue,Company,State,Company_response_to_consumer
21011,Credit reporting,Incorrect information on credit report,Experian,SC,Closed with explanation
20919,Credit card,Credit determination,"Citizens Financial Group, Inc.",MA,Closed with explanation
20925,Credit card,Billing statement,JPMorgan Chase & Co.,FL,Closed with explanation
20999,Bank account or service,"Account opening, closing, or management",Ally Financial Inc.,FL,Closed with explanation
41544,Credit reporting,Incorrect information on credit report,Equifax,TX,Closed with explanation


In [159]:
#Override null entries to avoid negative numbers when converting to category
#combineddf.State.replace(np.NaN, 'XX', inplace=True)
#combineddf.State.replace(np.NaN, 'XX', inplace=True)
#cftrain.ZIP_code.replace(np.NaN, 'XXXXX', inplace=True)

#cftrain.dtypes

In [160]:
#convert column types to 'category'
combineddf['Company'] = combineddf['Company'].astype('category')
combineddf['State'] = combineddf['State'].astype('category')
combineddf['Company_response_to_consumer'] = combineddf['Company_response_to_consumer'].astype('category')
combineddf['Product'] = combineddf['Product'].astype('category')
combineddf['Issue'] = combineddf['Issue'].astype('category')
#combineddf['Sub-product'] = combineddf['Sub-product'].astype('category')
#combineddf['Sub-issue'] = combineddf['Sub-issue'].astype('category')
#combineddf['ZIP_code'] = combineddf['ZIP_code'].astype('category')

In [161]:
#select columns of 'category' type
cat_columns = combineddf.select_dtypes(['category']).columns
cat_columns

Index(['Product', 'Issue', 'Company', 'State', 'Company_response_to_consumer'], dtype='object')

In [162]:
#assign number values to categories
combineddf[cat_columns] = combineddf[cat_columns].apply(lambda x: x.cat.codes)

combineddf.head()

Unnamed: 0,Product,Issue,Company,State,Company_response_to_consumer
21011,3,49,305,50,0
20919,2,28,188,25,0
20925,2,14,441,13,0
20999,0,1,41,13,0
41544,3,49,299,53,0


In [163]:
cutoff = len(cftrain.index)
ccftrain = combineddf.iloc[:cutoff]
ccftest = combineddf.iloc[cutoff +1:]

In [164]:
#seperate class/target column from rest of data
target_train = ccftrain["Company_response_to_consumer"] #dataframe with only consumer complaint
target_test = ccftest["Company_response_to_consumer"]
#remove other features not needed in experiment
vec_train = ccftrain.drop('Company_response_to_consumer', 1) #dataframe with all except consumer complaint
vec_test = ccftest.drop('Company_response_to_consumer', 1)

In [165]:
#convert *_vec dataframe to vector (any of the two sets of code will do)
vec_cftrain = vec_train.values
vec_cftest = vec_test.values

In [378]:
# create dictionary of class weight penalization
scaleclass = {0: 1.2, 1: 2., 2: 1.5}
# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 50,
                                criterion = "entropy",
                                min_samples_split = 50,
                                min_samples_leaf = 10,
                                class_weight = scaleclass,
                               n_jobs = -1) 

In [379]:
# This may take a few minutes to run
forest = forest.fit( vec_cftrain, target_train )

In [380]:
# Use the random forest to make predictions
result = forest.predict(vec_cftest)
result

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [381]:
#get actual labels of test set
actual = np.array(target_test)
actual

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [382]:
#count correct predictions
actualExp = 0 # counts actual 'closed with explanation' OR (0) labels
actualRel = 0 # counts actual 'closed with monetary relief' OR (1) labels
actualNon = 0 # counts actual 'closed with non-monetary relief' OR (2) labels
    
predExp = 0 # counts correct predicted 0 labels
predExp_Rel = 0 # counts incorrect predictions of 0 when 1
predExp_Non = 0 # counts incorrect predictions of 0 when 2
   
predRel = 0 # counts correct predicted 1 labels
predRel_Exp = 0 # counts incorrect predicitons of 1 when 0
predRel_Non = 0 # counts incorrect predictions of 1 when 2
    
predNon = 0 # counts correct predicted 2 labels
predNon_Exp = 0 # counts incorrect predictions of 2 when 0
predNon_Rel = 0 # counts incorrect predicitons of 2 when 1

In [383]:
for i in range (len(result)):
    act = actual[i]
    pred = result[i]
    
    if pred == act:
        if act == 0:
            predExp = predExp + 1
            actualExp = actualExp + 1
        elif act == 1:
            predRel = predRel + 1
            actualRel = actualRel + 1
        elif act == 2:
            predNon = predNon + 1
            actualNon = actualNon + 1
    
    else:
        if act == 0:
            actualExp = actualExp + 1
            if pred == 1:
                predRel_Exp = predRel_Exp + 1
            elif pred == 2:
                predNon_Exp = predNon_Exp + 1
        if act == 1:
            actualRel = actualRel + 1
            if pred == 0:
                predExp_Rel = predExp_Rel + 1
            elif pred == 2:
                predNon_Rel = predNon_Rel + 1
        if act == 2:
            actualNon = actualNon + 1
            if pred == 0:
                predExp_Non = predExp_Non + 1
            elif pred == 1:
                predRel_Non = predRel_Non + 1

In [384]:
print ("predExp: %d" % ( predExp ))
print ("predExp_Rel: %d" % ( predExp_Rel ))
print ("predExp_Non: %d" % ( predExp_Non ))
print ("predRel: %d" % ( predRel ))
print ("predRel_Exp: %d" % ( predRel_Exp ))
print ("predRel_Non: %d" % ( predRel_Non ))
print ("predNon: %d" % ( predNon ))
print ("predNon_Exp: %d" % ( predNon_Exp ))
print ("predNon_Rel: %d" % ( predNon_Rel ))
print ("actualExp: %d" % ( actualExp ))
print ("actualRel: %d" % ( actualRel ))
print ("actualNon: %d" % ( actualNon ))

accuracy = (predExp+predRel+predNon)/(actualExp+actualRel+actualNon)
accuracy

predExp: 11828
predExp_Rel: 1293
predExp_Non: 1926
predRel: 597
predRel_Exp: 624
predRel_Non: 76
predNon: 726
predNon_Exp: 884
predNon_Rel: 92
actualExp: 13336
actualRel: 1982
actualNon: 2728


0.7287487531863017

In [84]:
mnb = MultinomialNB()
mnb.fit(vec_cftrain, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [85]:
mnb_pred = mnb.predict(vec_cftest)
mnb_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [86]:
#count correct predictions
actualExp = 0 # counts actual 'closed with explanation' OR (0) labels
actualRel = 0 # counts actual 'closed with monetary relief' OR (1) labels
actualNon = 0 # counts actual 'closed with non-monetary relief' OR (2) labels
    
predExp = 0 # counts correct predicted 0 labels
predExp_Rel = 0 # counts incorrect predictions of 0 when 1
predExp_Non = 0 # counts incorrect predictions of 0 when 2
   
predRel = 0 # counts correct predicted 1 labels
predRel_Exp = 0 # counts incorrect predicitons of 1 when 0
predRel_Non = 0 # counts incorrect predictions of 1 when 2
    
predNon = 0 # counts correct predicted 2 labels
predNon_Exp = 0 # counts incorrect predictions of 2 when 0
predNon_Rel = 0 # counts incorrect predicitons of 2 when 1

In [87]:
for i in range (len(mnb_pred)):
    act = actual[i]
    pred = mnb_pred[i]
    
    if pred == act:
        if act == 0:
            predExp = predExp + 1
            actualExp = actualExp + 1
        elif act == 1:
            predRel = predRel + 1
            actualRel = actualRel + 1
        elif act == 2:
            predNon = predNon + 1
            actualNon = actualNon + 1
    
    else:
        if act == 0:
            actualExp = actualExp + 1
            if pred == 1:
                predRel_Exp = predRel_Exp + 1
            elif pred == 2:
                predNon_Exp = predNon_Exp + 1
        if act == 1:
            actualRel = actualRel + 1
            if pred == 0:
                predExp_Rel = predExp_Rel + 1
            elif pred == 2:
                predNon_Rel = predNon_Rel + 1
        if act == 2:
            actualNon = actualNon + 1
            if pred == 0:
                predExp_Non = predExp_Non + 1
            elif pred == 1:
                predRel_Non = predRel_Non + 1

In [88]:
print ("predExp: %d" % ( predExp ))
print ("predExp_Rel: %d" % ( predExp_Rel ))
print ("predExp_Non: %d" % ( predExp_Non ))
print ("predRel: %d" % ( predRel ))
print ("predRel_Exp: %d" % ( predRel_Exp ))
print ("predRel_Non: %d" % ( predRel_Non ))
print ("predNon: %d" % ( predNon ))
print ("predNon_Exp: %d" % ( predNon_Exp ))
print ("predNon_Rel: %d" % ( predNon_Rel ))
print ("actualExp: %d" % ( actualExp ))
print ("actualRel: %d" % ( actualRel ))
print ("actualNon: %d" % ( actualNon ))

accuracy = (predExp+predRel+predNon)/(actualExp+actualRel+actualNon)
accuracy

predExp: 9681
predExp_Rel: 1262
predExp_Non: 2016
predRel: 656
predRel_Exp: 3296
predRel_Non: 652
predNon: 60
predNon_Exp: 359
predNon_Rel: 64
actualExp: 13336
actualRel: 1982
actualNon: 2728


0.5761387565111382