In [3]:
import os 
import pandas as pd 
import statsmodels.api as sm 
import sklearn 

pd.set_option('display.max_rows',10)
pd.set_option('display.max_columns',10)
pd.set_option('display.notebook_repr_html',True)



#### Hypothesis: Type of product will predict whether or not a consumer files a complaint. 

#Background 
The Consumer Financial Protection Bureau 
receives customer complaints about financial products 
and refers them to the financial institutions. But, it is unclear if there are underlying patterns in the complaints. It will matter to anyone who has struggled with their financial institution and the CFPB. 

#What type of problem
I am predicting a binary classifier for each category. It seems to reside in the classification realm of machine learning. 


#What kind of impact do you think it could have?
There could be evidence for better customer support, regulation or enforcement around banking in particular product areas. 


What do you think will have the most impact in predicting the value you are interested in solving for?





In [6]:
df = pd.read_csv(os.path.join('..',"data","consumer_complaints.csv.gz"))

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,...,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,...,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,...,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,...,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,...,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,...,08/30/2013,Closed with explanation,Yes,Yes,511067


In [10]:
df.sample(n = 1%10)

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,...,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
86402,06/23/2014,Credit reporting,,Incorrect information on credit report,Account terms,...,06/26/2014,Closed with non-monetary relief,Yes,No,907337


In [7]:
#what columns do we have 
df.columns


Index(['date_received', 'product', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?', 'complaint_id'],
      dtype='object')

In [39]:
df

Unnamed: 0_level_0,date_received,product,sub_product,issue,sub_issue,...,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?
complaint_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
511074,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,...,Referral,09/03/2013,Closed with explanation,Yes,Yes
511080,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,...,Referral,09/03/2013,Closed with explanation,Yes,Yes
510473,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,...,Postal mail,09/18/2013,Closed with explanation,Yes,No
510326,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,...,Email,08/30/2013,Closed with explanation,Yes,Yes
511067,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,...,Web,08/30/2013,Closed with explanation,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...
919529,07/01/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,...,Referral,07/07/2014,Closed with explanation,Yes,No
918447,07/01/2014,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,...,Referral,07/23/2014,Closed with explanation,No,No
114550,07/10/2012,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,...,Phone,11/18/2013,Closed with explanation,Yes,No
1329963,04/14/2015,Debt collection,I do not know,Communication tactics,Frequent or repeated calls,...,Phone,04/14/2015,Untimely response,No,No


In [11]:
#change complaint id to index 
#df = df.set_index('complaint_id')

In [9]:
#perhaps i should drop some of these? but, how many? 

df.groupby('issue')

#wdf.issue

<pandas.core.groupby.DataFrameGroupBy object at 0x12175af60>

In [10]:
#the most common issue is loan modification, collection or foreclosure 
df.issue.value_counts(dropna = False)

Loan modification,collection,foreclosure    97191
Incorrect information on credit report      66718
Loan servicing, payments, escrow account    60375
Cont'd attempts collect debt not owed       42285
Account opening, closing, or management     26661
                                            ...  
Lost or stolen money order                     25
Incorrect exchange rate                        16
Lender sold the property                        5
Lender damaged or destroyed vehicle             5
Lender damaged or destroyed property            1
Name: issue, dtype: int64

In [11]:
#looks like mortgages are the greatest number of complaints 

df['product'].value_counts(dropna = False)

Mortgage                   186475
Debt collection            101052
Credit reporting            91854
Credit card                 66468
Bank account or service     62563
                            ...  
Student loan                15839
Payday loan                  3877
Money transfers              3812
Prepaid card                 2470
Other financial service       557
Name: product, dtype: int64

In [12]:
df.isnull().sum()

df.drop

<bound method NDFrame.drop of        date_received           product                  sub_product  \
0         08/30/2013          Mortgage               Other mortgage   
1         08/30/2013          Mortgage               Other mortgage   
2         08/30/2013  Credit reporting                          NaN   
3         08/30/2013      Student loan     Non-federal student loan   
4         08/30/2013   Debt collection                  Credit card   
...              ...               ...                          ...   
555952    07/01/2014          Mortgage               Other mortgage   
555953    07/01/2014          Mortgage               Other mortgage   
555954    07/10/2012          Mortgage  Conventional fixed mortgage   
555955    04/14/2015   Debt collection                I do not know   
555956    08/14/2014   Debt collection                I do not know   

                                           issue  \
0       Loan modification,collection,foreclosure   
1       Loan 

In [38]:
#df = df.set_index('complaint_id')
df.columns

Index(['date_received', 'product', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?'],
      dtype='object')

In [42]:

#create new dataframes & join 'em for a super cohort heavy df
df_product = pd.get_dummies(df['product'])
#df_issue = pd.get_dummies(df['issue'])
df_timely_response = pd.get_dummies(df['timely_response'], prefix = 'timely_response')
df_consumer_disputed = pd.get_dummies(df['consumer_disputed?'], prefix = 'consumer_disputed')

df = df.join([df_product, df_timely_response, df_consumer_disputed])

In [48]:
#df = df.merge(df_product, df_timely_response, df_consumer_disputed, how="left")

In [43]:
df.head()

Unnamed: 0_level_0,date_received,product,sub_product,issue,sub_issue,...,Student loan,timely_response_No,timely_response_Yes,consumer_disputed_No,consumer_disputed_Yes
complaint_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
511074,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,...,0.0,0.0,1.0,0.0,1.0
511080,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,...,0.0,0.0,1.0,0.0,1.0
510473,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,...,0.0,0.0,1.0,1.0,0.0
510326,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,...,1.0,0.0,1.0,0.0,1.0
511067,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,...,0.0,0.0,1.0,0.0,1.0


In [45]:
# how often is there no timely response about each type of product 


#the vast majority of folks writing about their loans do not dispute their response 
df.groupby(['Student loan']).sum()

Unnamed: 0_level_0,Bank account or service,Consumer Loan,Credit card,Credit reporting,Debt collection,...,Prepaid card,timely_response_No,timely_response_Yes,consumer_disputed_No,consumer_disputed_Yes
Student loan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,62563.0,20990.0,66468.0,91854.0,101052.0,...,2470.0,13859.0,526259.0,431087.0,109031.0
1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,189.0,15650.0,12736.0,3103.0


In [None]:
df.groupby(['Mortgage', 'zipcode']).complaint_id.count()

In [51]:
df.columns

Index(['date_received', 'product', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?', 'Bank account or service',
       'Consumer Loan', 'Credit card', 'Credit reporting', 'Debt collection',
       'Money transfers', 'Mortgage', 'Other financial service', 'Payday loan',
       'Prepaid card', 'Student loan', 'timely_response_No',
       'timely_response_Yes', 'consumer_disputed_No', 'consumer_disputed_Yes'],
      dtype='object')

In [None]:
#which types of products have the most complaints?
df.groupby.

#which products are the least likely to get a quick response 
