# Reading Consumer Financial Protection Bureau’s consumer complaint data

In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_colwidth", 200)



In [2]:
!ls

[34m-[m[m
[34mAlgorithms[m[m
[34mArticlesToRemember[m[m
BogyungKim_Finding&InterviewingSources.docx
[34mCV, Resume[m[m
Cisco AnyConnect Secure Mobility Client
[34mConsumerComplaint[m[m
[34mConsumerCreditRating[m[m
[34mDD_FinalProject[m[m
Data_Questions.docx
[34mGitHub Desktop.app[m[m
Google Chrome
[34mHomeownership[m[m
[34mIdeas[m[m
[34mInsurance[m[m
Insurance_ContactList.docx
[34mJ-School[m[m
JeremiaVersion_VapeShops.ipynb
JobOpp..docx
KakaoTalk
Kim_AlgosFinalProject.ipynb
[34mLittle Geocoder.app[m[m
[34mMaster's Project[m[m
Motor_Vehicle_Collisions_-_Crashes.csv
[34mNarrative Writing[m[m
NatureOfSuit.png
OpenRefine
[34mPicsFromPhone2021[m[m
[34mPitch1_PPP[m[m
[34mPitch3_CityFundedShelters[m[m
[34mPitch3_NYC_PE[m[m
[34mPitch4_Conviction[m[m
[34mPitch5_311Reports[m[m
[34mPithch4_Arrests2020[m[m
Postgres
Project_Template.docx
Python Wrangler
QGIS
Research_Highlights.docx
[34mSoma's 

In [3]:
import os
os.chdir("ConsumerComplaint")

In [4]:
!ls

CC_Forbearance.csv
CC_MORTGAGE.docx
CC_STUDENTLOANS.docx
ConsumerComplaint.ipynb
ConsumerComplaintData2020.csv
ConsumerComplaintData2020_new.csv
ConsumerComplaintData_ProbCompInvest.csv
Original_complaints.csv
StudentLoan.csv


In [5]:
df = pd.read_csv("ConsumerComplaintData2020.csv")

In [6]:
df.shape

(5708, 18)

## Cleaning df

In [7]:
# The bureau replaced Xs for the names of companies other the ones each consumer filed a complaint againt.
# So I'll removed the Xs in the 'Consumer complaint narrative' column.

In [8]:
df['Consumer complaint narrative'] = df['Consumer complaint narrative'].str.replace('XXXX', '', regex=False)

In [9]:
df.head(2)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2020-03-20,"Credit reporting, credit repair services, or other personal consumer reports",Credit reporting,Problem with a credit reporting company's investigation into an existing problem,Their investigation did not fix an error on your report,", NY Dear , The reason why I am contacting you is to discuss a couple of late payments that greatly affected my current credit score. Through my four leases with my payments have alwa...",,"EQUIFAX, INC.",NY,143XX,,Consent provided,Web,2020-03-20,Closed with explanation,Yes,,3574222
1,2020-01-16,"Credit reporting, credit repair services, or other personal consumer reports",Credit reporting,Incorrect information on your report,Information belongs to someone else,PLEASE REMOVE THE DISPUTED ACCOUNTS OFF MY CREDIT REPORT. THE TRANSACTIONS ARE INACCURATE AND I WOULD LIKE THOSE ACCOUNTS OFF MY FILE. THEY HAVE BEEN HINDERING MY CREDIT AND HAVE CAUSED HARM OFF A...,,"EQUIFAX, INC.",NY,115XX,,Consent provided,Web,2020-01-16,Closed with non-monetary relief,Yes,,3499342


In [10]:
df['Sub-issue'].value_counts()

Information belongs to someone else                                                      1964
Their investigation did not fix an error on your report                                   957
Account status incorrect                                                                  416
Investigation took more than 30 days                                                      409
Credit inquiries on your report that you don't recognize                                  364
Account information incorrect                                                             363
Was not notified of investigation status or results                                       280
Problem getting your free annual credit report                                            138
Difficulty submitting a dispute or getting information about a dispute over the phone     120
Old information reappears or never goes away                                              111
Personal information incorrect                              

# Looking for a correlation between complaint narratives and the issues consumers raised

### Of the issues, I'm going to examine 'Account status incorrect' & 'Investigation took more than 30 days'

In [11]:
complaint_df = df[df['Sub-issue'].isin(['Account status incorrect', 'Investigation took more than 30 days'])].dropna(subset=['Consumer complaint narrative'])
complaint_df['Sub-issue'].value_counts()

Account status incorrect                416
Investigation took more than 30 days    409
Name: Sub-issue, dtype: int64

In [12]:
complaint_df['is_AccountStatusIncorrect'] =  (df['Sub-issue'] == 'Account status incorrect').astype(int)
complaint_df['is_AccountStatusIncorrect'].value_counts()

1    416
0    409
Name: is_AccountStatusIncorrect, dtype: int64

In [13]:
pd.DataFrame({
    'late': complaint_df['Consumer complaint narrative'].str.contains("late", na=False).astype(int),
    'wrong': complaint_df['Consumer complaint narrative'].str.contains("wrong", na=False).astype(int),
    'inaccurate': complaint_df['Consumer complaint narrative'].str.contains("inaccurate", na=False).astype(int),
})

Unnamed: 0,late,wrong,inaccurate
7,0,0,0
22,0,0,0
28,1,0,0
30,0,0,0
34,0,0,0
...,...,...,...
5672,1,0,0
5673,0,1,0
5695,0,0,0
5700,0,0,0


# TfidfVectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=100)

matrix = vectorizer.fit_transform(complaint_df['Consumer complaint narrative'])

words_df = pd.DataFrame(matrix.toarray(),
                        columns=vectorizer.get_feature_names())
words_df

Unnamed: 0,00,2020,30,account,accounts,agencies,bureaus,closed,company,credit,date,day,days,did,dispute,filed,help,inaccurate,incorrect,information,investigation,items,late,letter,letters,loan,negative,paid,payment,payments,receive,received,regards,remove,removed,report,reported,reporting,reports,request,respond,response,results,score,sent,showing,time,xx
0,0.000000,0.180463,0.000000,0.000000,0.000000,0.196769,0.161291,0.0,0.000000,0.240092,0.000000,0.000000,0.000000,0.000000,0.312301,0.408723,0.000000,0.177449,0.0,0.147574,0.191663,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.224500,0.145878,0.431787,0.0,0.0,0.000000,0.000000,0.262733,0.217791,0.000000,0.0,0.000000,0.214043,0.000000,0.000000,0.000000,0.000000,0.264710
1,0.000000,0.000000,0.000000,0.216426,0.000000,0.000000,0.000000,0.0,0.000000,0.127526,0.000000,0.000000,0.000000,0.335389,0.000000,0.000000,0.000000,0.000000,0.0,0.235154,0.000000,0.000000,0.000000,0.303988,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.357733,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.582595,0.351202,0.293212,0.000000
2,0.086424,0.000000,0.000000,0.560760,0.072068,0.000000,0.000000,0.0,0.087873,0.033042,0.000000,0.000000,0.000000,0.173799,0.000000,0.000000,0.000000,0.000000,0.0,0.060929,0.000000,0.000000,0.000000,0.000000,0.0,0.089919,0.0,0.0,0.000000,0.000000,0.000000,0.060228,0.000000,0.0,0.0,0.042440,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.090996,0.151943,0.765031
3,0.000000,0.000000,0.000000,0.205536,0.264152,0.000000,0.000000,0.0,0.000000,0.121109,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.282187,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.441510,0.000000,0.0,0.0,0.311109,0.000000,0.000000,0.000000,0.000000,0.0,0.630115,0.323909,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.458510,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.527139,0.000000,0.000000,0.000000,0.0,0.249093,0.000000,0.000000,0.000000,0.322008,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.347010,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.361287,0.000000,0.308565,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,0.244991,0.000000,0.000000,0.000000,0.000000,0.000000,0.094385,0.0,0.000000,0.093666,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.106285,0.000000,0.0,0.000000,0.0,0.0,0.528022,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.060153,0.111122,0.000000,0.000000,0.127825,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.774524
821,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.276469,0.415832,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.284587,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.518844,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.246666,0.170642,0.000000,0.000000,0.0,0.000000,0.000000,0.286297,0.000000,0.000000,0.478048,0.000000
822,0.000000,0.000000,0.000000,0.153739,0.000000,0.000000,0.000000,0.0,0.000000,0.181177,0.252541,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.493046,0.0,0.0,0.000000,0.226059,0.254117,0.000000,0.000000,0.0,0.0,0.000000,0.214943,0.148697,0.000000,0.000000,0.0,0.000000,0.000000,0.249477,0.000000,0.000000,0.624852,0.000000
823,0.000000,0.271265,0.371875,0.000000,0.000000,0.147888,0.121223,0.0,0.000000,0.180448,0.000000,0.336409,0.105479,0.000000,0.234720,0.307189,0.000000,0.133367,0.0,0.110914,0.144051,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.168730,0.109639,0.324523,0.0,0.0,0.000000,0.000000,0.197465,0.163688,0.000000,0.0,0.156475,0.000000,0.000000,0.000000,0.000000,0.000000,0.397903


In [15]:
# features
X = words_df
# labels
y = complaint_df['is_AccountStatusIncorrect']

# RandomForestClassifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)
clf.fit(X, y)

RandomForestClassifier(n_estimators=10)

In [17]:
clf.score(X, y)

0.9987878787878788

In [18]:
from sklearn.metrics import confusion_matrix

y_true = y
y_pred = clf.predict(X)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['AccountStatusIncorrect', 'InvestgationLonger30Days'])

pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted AccountStatusIncorrect,Predicted InvestgationLonger30Days
Is AccountStatusIncorrect,409,0
Is InvestgationLonger30Days,1,415


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [20]:
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10)

In [21]:
clf.score(X_test, y_test)

0.9468599033816425

In [22]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['AccountStatusIncorrect', 'InvestgationLonger30Days'])

pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted AccountStatusIncorrect,Predicted InvestgationLonger30Days
Is AccountStatusIncorrect,88,8
Is InvestgationLonger30Days,3,108


# LinearSVC

In [23]:
from sklearn.svm import LinearSVC

clf = LinearSVC()

clf.fit(X_train, y_train)

LinearSVC()

In [24]:
clf.score(X_test, y_test)

0.9178743961352657

In [25]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['AccountStatusIncorrect', 'InvestgationLonger30Days'])

pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted AccountStatusIncorrect,Predicted InvestgationLonger30Days
Is AccountStatusIncorrect,87,9
Is InvestgationLonger30Days,8,103


In [26]:
import eli5

feature_names = list(words_df.columns)
eli5.show_weights(clf, feature_names=feature_names)

Weight?,Feature
+1.692,loan
+1.433,late
+1.266,paid
+1.243,remove
+1.150,reported
+0.861,payment
+0.840,closed
+0.764,company
+0.761,did
… 11 more positive …,… 11 more positive …


# ExtraTreeClassifier

In [27]:
from sklearn.tree import ExtraTreeClassifier

clf = ExtraTreeClassifier()

clf.fit(X_train, y_train)

ExtraTreeClassifier()

In [28]:
clf.score(X_test, y_test)

0.9178743961352657

In [29]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['AccountStatusIncorrect', 'InvestgationLonger30Days'])

pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted AccountStatusIncorrect,Predicted InvestgationLonger30Days
Is AccountStatusIncorrect,86,10
Is InvestgationLonger30Days,7,104


# ExtraTreesClassifier

In [30]:
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier()

clf.fit(X_train, y_train)

ExtraTreesClassifier()

In [31]:
clf.score(X_test, y_test)

0.9516908212560387

In [32]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['AccountStatusIncorrect', 'InvestgationLonger30Days'])

pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted AccountStatusIncorrect,Predicted InvestgationLonger30Days
Is AccountStatusIncorrect,87,9
Is InvestgationLonger30Days,1,110
