In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

import re
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem import PorterStemmer, WordNetLemmatizer



In [2]:
data = pd.read_csv('Consumer_Complaints_with_Consumer_Complaint_Narratives.csv')

In [3]:
data.head(1)

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided?,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response?,Consumer_disputed?,Complaint_ID
0,12/2/2016,Consumer Loan,Vehicle loan,Taking out the loan or lease,,Capitol One 360 keeps changing what I owe on m...,,Capital One,OH,453XX,,Consent provided,Web,12/2/2016,Closed with explanation,Yes,No,2233355


In [4]:
#filter and retain records with desired company response labels
responselist = ['Closed with explanation', 'Closed with non-monetary relief', 'Closed with monetary relief']
df = data[data.Company_response_to_consumer.isin(responselist)]

In [5]:
len(df.index) #number of records

134855

In [6]:
#drop features/columns not needed for experiment
df = df.drop('Product', 1)
df = df.drop('Issue', 1)
df = df.drop('Company', 1)
df = df.drop('State', 1)
df = df.drop('ZIP_code', 1)

df = df.drop('Sub-product', 1)
df = df.drop('Sub-issue', 1)
df = df.drop('Company_public_response', 1)
df = df.drop('Tags', 1)
df = df.drop('Consumer_consent_provided?', 1)
df = df.drop('Submitted_via', 1)
df = df.drop('Date_sent_to_company', 1)
df = df.drop('Timely_response?', 1)
df = df.drop('Consumer_disputed?', 1)
df = df.drop('Complaint_ID', 1)

In [7]:
df.head(3)
#df[234:236]

Unnamed: 0,Date_received,Consumer_complaint_narrative,Company_response_to_consumer
0,12/2/2016,Capitol One 360 keeps changing what I owe on m...,Closed with explanation
1,12/15/2016,I requested all XXXX reports. I got through th...,Closed with non-monetary relief
2,11/13/2016,I received a forberance on my loans last sprin...,Closed with explanation


In [10]:
#convert column types to 'category'
df['Company_response_to_consumer'] = df['Company_response_to_consumer'].astype('category')
df.dtypes

Date_received                   datetime64[ns]
Consumer_complaint_narrative            object
Company_response_to_consumer          category
dtype: object

In [11]:
#select columns of 'category' type
cat_columns = df.select_dtypes(['category']).columns
cat_columns

Index(['Company_response_to_consumer'], dtype='object')

In [12]:
#assign number values to categories
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
#-->df

In [13]:
#sort by date recieved and split data to train and test set at this point
df['Date_received'] = pd.to_datetime(df['Date_received'])
df = df.sort_values('Date_received')
cutoff = int(df.shape[0] * 0.8)
train = df.iloc[:cutoff]
test = df.iloc[cutoff +1:]
train.tail()

Unnamed: 0,Date_received,Consumer_complaint_narrative,Company_response_to_consumer
119648,2016-09-28,My name is XXXX XXXX and I filed a debit card ...,0
123539,2016-09-28,I have paid my bills on time and have a good r...,0
116496,2016-09-28,I have filed a dispute regarding a Judgement t...,0
111382,2016-09-28,Included in this complaint are documents to va...,0
115242,2016-09-28,Merchants and Medical is attempting to collect...,0


In [14]:
#remove other features not needed in experiment
train = train.drop('Date_received', 1)
test = test.drop('Date_received', 1)

test.head(3)

Unnamed: 0,Consumer_complaint_narrative,Company_response_to_consumer
117122,Experian continues to list items on my credit ...,2
115040,"I sent request for information, about payment ...",0
115253,I contacted Capital One on XXXX/XXXX/16 to set...,0


In [15]:
# function to do text preprocessing
def narrative_to_words( raw_narrative ):
    # Function to convert a raw narrative to a string of words
    # The input is a single string (a raw complaint narrative), and 
    # the output is a single string (a preprocessed complaint narrative)
    #
    # 1. Remove HTML--- not needed in this case
    # review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_narrative )
    #
    # 3. Convert to lower case, split into individual words
    lower_case = letters_only.lower()        # Convert to lower case
    words = lower_case.split()               # Split into words   
    #
    # remove small and large words
    words = [x for x in words if len(x) > 3 and len(x) < 15 ]
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stopset = set(stopwords.words("english")) 
    stopset.update(['xxxx', 'xx', 'alreadi', 'also', ])
    # 
    # 5. Remove stop words
    words = [w for w in words if not w in stopset]   
    #
    # 6. stem words
    port = PorterStemmer()
    stemwords = " ".join(port.stem(i) for i in words)
    return( stemwords)
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    #return( " ".join( meaningful_words ))  

In [16]:
count = 0
# Now loop through dataset and use function to clean
# Get the number of reviews based on the dataframe column size
num_narr = train["Consumer_complaint_narrative"].size
print ("Cleaning and parsing the training set movie reviews...\n")
# Initialize an empty list to hold the clean reviews
clean_train_narr = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
#for i in range( num_narr ):
for item, frame in train["Consumer_complaint_narrative"].iteritems():
    # If the count is evenly divisible by 50000, print a message
    count += 1
    if( (count+1)%5000 == 0 ):
        print ("Train narrative %d of %d" % ( count+1, num_narr ))
    # Call our function for each one, and add the result to the list of
    # clean reviews
    if item in train.index:
        clean_train_narr.append( narrative_to_words( frame ) )

Cleaning and parsing the training set movie reviews...

Narrative 65000 of 107884
Narrative 25000 of 107884
Narrative 45000 of 107884
Narrative 5000 of 107884
Narrative 70000 of 107884
Narrative 30000 of 107884
Narrative 10000 of 107884
Narrative 50000 of 107884
Narrative 55000 of 107884
Narrative 75000 of 107884
Narrative 35000 of 107884
Narrative 15000 of 107884
Narrative 20000 of 107884
Narrative 80000 of 107884
Narrative 90000 of 107884
Narrative 95000 of 107884
Narrative 100000 of 107884
Narrative 115000 of 107884


In [17]:
print ("Creating the bag of words...\n")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_narr)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()
print (train_data_features.shape)

Creating the bag of words...

(107884, 5000)


In [18]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print (vocab)

['aa', 'aadvantag', 'aaf', 'abandon', 'abc', 'abet', 'abid', 'abil', 'abl', 'abroad', 'abruptli', 'absenc', 'absent', 'absolut', 'absolv', 'absorb', 'absurd', 'abus', 'ac', 'academ', 'academi', 'acc', 'acceler', 'accent', 'accept', 'access', 'accid', 'accident', 'accommod', 'accompani', 'accomplish', 'accord', 'accordingli', 'account', 'accout', 'accredit', 'accru', 'accrual', 'acct', 'accumul', 'accur', 'accuraci', 'accus', 'ace', 'ach', 'achiev', 'acknowledg', 'acount', 'acquir', 'acquisit', 'acr', 'across', 'act', 'action', 'activ', 'actual', 'acura', 'ad', 'adam', 'adamantli', 'add', 'addendum', 'addit', 'address', 'adequ', 'adher', 'adjud', 'adjust', 'admin', 'administ', 'administr', 'admiss', 'admit', 'admittedli', 'adopt', 'adress', 'adult', 'advanc', 'advantag', 'advers', 'advertis', 'advic', 'advis', 'advisor', 'advoc', 'advocaci', 'ae', 'affadavit', 'affair', 'affect', 'affiant', 'affidavit', 'affili', 'affirm', 'afford', 'afloat', 'afni', 'aforement', 'afraid', 'afternoon', 

In [19]:
# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train["Company_response_to_consumer"] )

In [20]:
# preprocess test narrative data
count = 0
# Create an empty list and append the clean reviews one by one
num_narr = len(test["Consumer_complaint_narrative"])
clean_test_narr = [] 

print ("Cleaning and parsing the test set movie reviews...\n")
for item, frame in test["Consumer_complaint_narrative"].iteritems():
    # If the count is evenly divisible by 50000, print a message
    count += 1
    if( (count+1)%5000 == 0 ):
        print ("Test narrative %d of %d" % ( count+1, num_narr ))
    # Call our function for each one, and add the result to the list of
    # clean reviews
    if item in test.index:
        clean_test_narr.append( narrative_to_words( frame ) )

Cleaning and parsing the test set movie reviews...

Test narrative 5000 of 107884
Test narrative 10000 of 107884
Test narrative 15000 of 107884
Test narrative 20000 of 107884
Test narrative 25000 of 107884


In [51]:
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_narr)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)
result

## Copy the results to a pandas dataframe with an "id" column and
## a "sentiment" column
#output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

## Use pandas to write the comma-separated output file
#output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [22]:
#get actual labels of test set in array form
actual = np.array(test["Company_response_to_consumer"])
actual

array([2, 0, 0, ..., 0, 2, 0], dtype=int8)

In [None]:
#count correct predictions
actualExp = 0 # counts actual 'closed with explanation' OR (0) labels
actualRel = 0 # counts actual 'closed with monetary relief' OR (1) labels
actualNon = 0 # counts actual 'closed with non-monetary relief' OR (2) labels
    
predExp = 0 # counts correct predicted 0 labels
predExp_Rel = 0 # counts incorrect predictions of 0 when 1
predExp_Non = 0 # counts incorrect predictions of 0 when 2
   
predRel = 0 # counts correct predicted 1 labels
predRel_Exp = 0 # counts incorrect predicitons of 1 when 0
predRel_Non = 0 # counts incorrect predictions of 1 when 2
    
predNon = 0 # counts correct predicted 2 labels
predNon_Exp = 0 # counts incorrect predictions of 2 when 0
predNon_Rel = 0 # counts incorrect predicitons of 2 when 1

In [67]:
    for i in range (len(result)):
        act = actual[i]
        pred = result[i]
        
        if pred == act:
            if act == 0:
                predExp = predExp + 1
                actualExp = actualExp + 1
            elif act == 1:
                predRel = predRel + 1
                actualRel = actualRel + 1
            elif act == 2:
                predNon = predNon + 1
                actualNon = actualNon + 1
        
        else:
            if act == 0:
                actualExp = actualExp + 1
                if pred == 1:
                    predRel_Exp = predRel_Exp + 1
                elif pred == 2:
                    predNon_Exp = predNon_Exp + 1
            if act == 1:
                actualRel = actualRel + 1
                if pred == 0:
                    predExp_Rel = predExp_Rel + 1
                elif pred == 2:
                    predNon_Rel = predNon_Rel + 1
            if act == 2:
                actualNon = actualNon + 1
                if pred == 0:
                    predExp_Non = predExp_Non + 1
                elif pred == 1:
                    predRel_Non = predRel_Non + 1

In [68]:
print ("predExp: %d" % ( predExp ))
print ("predExp_Rel: %d" % ( predExp_Rel ))
print ("predExp_Non: %d" % ( predExp_Non ))
print ("predRel: %d" % ( predRel ))
print ("predRel_Exp: %d" % ( predRel_Exp ))
print ("predRel_Non: %d" % ( predRel_Non ))
print ("predNon: %d" % ( predNon ))
print ("predNon_Exp: %d" % ( predNon_Exp ))
print ("predNon_Rel: %d" % ( predNon_Rel ))
print ("actualExp: %d" % ( actualExp ))
print ("actualRel: %d" % ( actualRel ))
print ("actualNon: %d" % ( actualNon ))

accuracy = (predExp+predRel+predNon)/(actualExp+actualRel+actualNon)
accuracy

predExp: 21591
predExp_Rel: 2121
predExp_Non: 2893
predRel: 52
predRel_Exp: 25
predRel_Non: 1
predNon: 159
predNon_Exp: 121
predNon_Rel: 7
actualExp: 21737
actualRel: 2180
actualNon: 3053


0.8083796811271784