In [78]:
import os
import pandas as pd
import statsmodels
from sklearn import preprocessing, linear_model, model_selection, metrics

import matplotlib
import matplotlib.pyplot as plt

pd.set_option('display.max_rows',10)
pd.set_option('display.max_columns',10)
pd.set_option('display.notebook_repr_html',True)
%matplotlib notebook

matplotlib.style.use('ggplot')

df = pd.read_csv(os.path.join('..',"data","consumer_complaints.csv.gz"))



  interactivity=interactivity, compiler=compiler, result=result)


In [79]:
df

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,...,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,...,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,...,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,...,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,...,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,...,08/30/2013,Closed with explanation,Yes,Yes,511067
...,...,...,...,...,...,...,...,...,...,...,...
555952,07/01/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,...,07/07/2014,Closed with explanation,Yes,No,919529
555953,07/01/2014,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,...,07/23/2014,Closed with explanation,No,No,918447
555954,07/10/2012,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,...,11/18/2013,Closed with explanation,Yes,No,114550
555955,04/14/2015,Debt collection,I do not know,Communication tactics,Frequent or repeated calls,...,04/14/2015,Untimely response,No,No,1329963


In [80]:
df_datetime = pd.to_datetime(df['date_received'])

In [81]:
df_datetime.head()

0   2013-08-30
1   2013-08-30
2   2013-08-30
3   2013-08-30
4   2013-08-30
Name: date_received, dtype: datetime64[ns]

In [82]:
products_df = df[ ['product', 'complaint_id'] ].groupby('product').count().rename(columns = {'complaint_id': 'count'}).reset_index()

### Mortgages are product complained about most

In [83]:
products_df = products_df.sort_values(by = 'count', ascending = False)

In [142]:
products_df.plot(x = 'product', y ='count', kind = 'bar', alpha = .75, rot = 90)
plt.tight_layout()




<IPython.core.display.Javascript object>

In [85]:
df_mortgage_issues = df[df['product'] == 'Mortgage'][['issue','complaint_id']].groupby('issue').count().rename(columns = {'complaint_id': 'count'}).reset_index()

In [86]:
df_mortgage_issues = df_mortgage_issues.sort_values(by = 'count', ascending = False)

### The most common issues are foreclosure & mortgage broker complaints

In [141]:
df_mortgage_issues.plot(x='issue', y= 'count', kind = 'bar', alpha=0.75, rot=90)

plt.tight_layout()




<IPython.core.display.Javascript object>

In [88]:
df_mortgage_issues_NY = df[df['product'] == 'Mortgage'][df['state'] == 'NY'][['issue','complaint_id']].groupby('issue').count().reset_index()

  if __name__ == '__main__':


# New York Issues 

In [143]:
#these look similar to the national average

df_mortgage_issues_NY.plot(x='issue', y= 'complaint_id', kind = 'bar', alpha=0.75, rot=90)
plt.tight_layout()




<IPython.core.display.Javascript object>

In [90]:
#plot all mortgage complaints responded to in a timely-manner
#compare all mortgage complaints not responded to in a timely manner


#df_mortgage_timely_response = df[df['product'] == 'Mortgage'][df['timely_response'== 'yes']][['issue','complaint_id']].groupby('issue').count().reset_index()

In [91]:
df_product = pd.get_dummies(df['product'])

In [92]:
df_product.columns

Index(['Bank account or service', 'Consumer Loan', 'Credit card',
       'Credit reporting', 'Debt collection', 'Money transfers', 'Mortgage',
       'Other financial service', 'Payday loan', 'Prepaid card',
       'Student loan'],
      dtype='object')

In [93]:
df_product = pd.get_dummies(df, columns=['product', 'issue', 'sub_product', 'company_public_response', 'company', 'state'])

In [94]:
df_product.columns

Index(['date_received', 'sub_issue', 'consumer_complaint_narrative', 'zipcode',
       'tags', 'consumer_consent_provided', 'submitted_via',
       'date_sent_to_company', 'company_response_to_consumer',
       'timely_response',
       ...
       'state_TN', 'state_TX', 'state_UT', 'state_VA', 'state_VI', 'state_VT',
       'state_WA', 'state_WI', 'state_WV', 'state_WY'],
      dtype='object', length=3841)

In [95]:
#i want to know what issues people in new york have. are there more foreclosures? 
#foreclosure top 10 : delaware, new jersey, maryland, illinois, south carolina, nevada, florida, ohio, pennsylvania, georgia 



In [96]:
df_with_expanded_product = pd.get_dummies(df, columns=['product'])

In [97]:
df_with_expanded_product.columns

Index(['date_received', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?', 'complaint_id',
       'product_Bank account or service', 'product_Consumer Loan',
       'product_Credit card', 'product_Credit reporting',
       'product_Debt collection', 'product_Money transfers',
       'product_Mortgage', 'product_Other financial service',
       'product_Payday loan', 'product_Prepaid card', 'product_Student loan'],
      dtype='object')

## Complaints by state

In [98]:
df_mortgage_counts_state = df_with_expanded_product.groupby('state').sum()

In [99]:
df_mortgage_counts_state

Unnamed: 0_level_0,complaint_id,product_Bank account or service,product_Consumer Loan,product_Credit card,product_Credit reporting,...,product_Mortgage,product_Other financial service,product_Payday loan,product_Prepaid card,product_Student loan
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA,7534741,1.0,0.0,2.0,0.0,...,4.0,0.0,0.0,0.0,2.0
AE,214480925,21.0,15.0,24.0,47.0,...,68.0,0.0,0.0,1.0,10.0
AK,587575953,66.0,19.0,98.0,120.0,...,157.0,0.0,5.0,3.0,17.0
AL,5759014037,600.0,373.0,552.0,1096.0,...,1395.0,8.0,79.0,27.0,164.0
AP,160372554,7.0,9.0,20.0,28.0,...,39.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
VT,853127304,128.0,43.0,156.0,113.0,...,302.0,0.0,1.0,5.0,55.0
WA,11191050471,1094.0,275.0,1284.0,1829.0,...,4315.0,11.0,48.0,50.0,312.0
WI,5754954013,572.0,266.0,993.0,798.0,...,1955.0,7.0,75.0,45.0,209.0
WV,1425126201,139.0,72.0,182.0,322.0,...,342.0,6.0,0.0,7.0,83.0


In [100]:
# read in census data 

df_pop_state = pd.read_csv(os.path.join('..',"data","state_pop_abbreviations.csv"))


In [101]:
df_pop_state.head()



Unnamed: 0,State,State_,Pop
0,.Alabama,AL,4863300
1,.Alaska,AK,741894
2,.Arizona,AZ,6931071
3,.Arkansas,AR,2988248
4,.California,CA,39250017


In [102]:
df_pop_state

Unnamed: 0,State,State_,Pop
0,.Alabama,AL,4863300
1,.Alaska,AK,741894
2,.Arizona,AZ,6931071
3,.Arkansas,AR,2988248
4,.California,CA,39250017
...,...,...,...
46,.Virginia,VT,8411808
47,.Washington,WA,7288000
48,.West Virginia,WI,1831102
49,.Wisconsin,WV,5778708


In [103]:
df_mortgage_by_counts_state = df_with_expanded_product.groupby(['state']).sum()

In [104]:
df_mortgage_by_counts_state

Unnamed: 0_level_0,complaint_id,product_Bank account or service,product_Consumer Loan,product_Credit card,product_Credit reporting,...,product_Mortgage,product_Other financial service,product_Payday loan,product_Prepaid card,product_Student loan
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA,7534741,1.0,0.0,2.0,0.0,...,4.0,0.0,0.0,0.0,2.0
AE,214480925,21.0,15.0,24.0,47.0,...,68.0,0.0,0.0,1.0,10.0
AK,587575953,66.0,19.0,98.0,120.0,...,157.0,0.0,5.0,3.0,17.0
AL,5759014037,600.0,373.0,552.0,1096.0,...,1395.0,8.0,79.0,27.0,164.0
AP,160372554,7.0,9.0,20.0,28.0,...,39.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
VT,853127304,128.0,43.0,156.0,113.0,...,302.0,0.0,1.0,5.0,55.0
WA,11191050471,1094.0,275.0,1284.0,1829.0,...,4315.0,11.0,48.0,50.0,312.0
WI,5754954013,572.0,266.0,993.0,798.0,...,1955.0,7.0,75.0,45.0,209.0
WV,1425126201,139.0,72.0,182.0,322.0,...,342.0,6.0,0.0,7.0,83.0


In [105]:
#create a series with summed number of complaints by state

df_mortgage_state_only = df_mortgage_by_counts_state['product_Mortgage']

In [106]:
type(df_mortgage_state_only)

pandas.core.series.Series

In [107]:
df_mortgage_plot = df_mortgage_state_only.to_frame()

In [108]:
df_mortgage_plot.plot(kind = 'bar')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1160742e8>

In [109]:
#turn object into dataframe
df_mortgage_state_complaints = df_mortgage_state_only.to_frame().reset_index()

In [110]:
df_mortgage_state_complaints

Unnamed: 0,state,product_Mortgage
0,AA,4.0
1,AE,68.0
2,AK,157.0
3,AL,1395.0
4,AP,39.0
...,...,...
57,VT,302.0
58,WA,4315.0
59,WI,1955.0
60,WV,342.0


In [111]:
df_mortgage_state_complaints.state.unique()

array(['AA', 'AE', 'AK', 'AL', 'AP', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT',
       'DC', 'DE', 'FL', 'FM', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN',
       'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MH', 'MI', 'MN', 'MO', 'MP',
       'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
       'OK', 'OR', 'PA', 'PR', 'PW', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT',
       'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

In [112]:
df_pop_state.State_.unique()

array(['AL', 'AK ', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

In [113]:
df_mortgage_state_complaints = df_mortgage_state_complaints.merge(right = df_pop_state, how = 'inner', left_on = 'state', right_on = 'State_')

In [114]:
df_mortgage_state_complaints.Pop = df_mortgage_state_complaints.Pop.apply(lambda x: int(x.replace(',', '')))

In [115]:
df_mortgage_state_complaints

Unnamed: 0,state,product_Mortgage,State,State_,Pop
0,AL,1395.0,.Alabama,AL,4863300
1,AR,619.0,.Arkansas,AR,2988248
2,AZ,4358.0,.Arizona,AZ,6931071
3,CA,32988.0,.California,CA,39250017
4,CO,3303.0,.Colorado,CO,5540545
...,...,...,...,...,...
45,VT,302.0,.Virginia,VT,8411808
46,WA,4315.0,.Washington,WA,7288000
47,WI,1955.0,.West Virginia,WI,1831102
48,WV,342.0,.Wisconsin,WV,5778708


In [116]:
df_mortgage_state_complaints

Unnamed: 0,state,product_Mortgage,State,State_,Pop
0,AL,1395.0,.Alabama,AL,4863300
1,AR,619.0,.Arkansas,AR,2988248
2,AZ,4358.0,.Arizona,AZ,6931071
3,CA,32988.0,.California,CA,39250017
4,CO,3303.0,.Colorado,CO,5540545
...,...,...,...,...,...
45,VT,302.0,.Virginia,VT,8411808
46,WA,4315.0,.Washington,WA,7288000
47,WI,1955.0,.West Virginia,WI,1831102
48,WV,342.0,.Wisconsin,WV,5778708


In [117]:
#create a new column that has the complaints per capita 
#complaints per capita == num of complaints/ population 

In [118]:
df_mortgage_state_complaints.head()




Unnamed: 0,state,product_Mortgage,State,State_,Pop
0,AL,1395.0,.Alabama,AL,4863300
1,AR,619.0,.Arkansas,AR,2988248
2,AZ,4358.0,.Arizona,AZ,6931071
3,CA,32988.0,.California,CA,39250017
4,CO,3303.0,.Colorado,CO,5540545


In [119]:
df_mortgage_state_complaints['mortgage_complaints_per_capita'] = df_mortgage_state_complaints['product_Mortgage']/ df_mortgage_state_complaints['Pop']


In [120]:
df_mortgage_state_complaints.head()

Unnamed: 0,state,product_Mortgage,State,State_,Pop,mortgage_complaints_per_capita
0,AL,1395.0,.Alabama,AL,4863300,0.000287
1,AR,619.0,.Arkansas,AR,2988248,0.000207
2,AZ,4358.0,.Arizona,AZ,6931071,0.000629
3,CA,32988.0,.California,CA,39250017,0.00084
4,CO,3303.0,.Colorado,CO,5540545,0.000596


In [121]:
state_mortgage_complaintspc = df_mortgage_state_complaints[['mortgage_complaints_per_capita', 'state']]


In [122]:
state_mortgage_complaintspc.set_index('state', inplace = True)


In [123]:
state_mortgage_complaintspc.plot(kind='bar')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x117692550>

In [124]:
df_mortgage_by_counts_state['product_Mortgage'].plot(kind = "bar")

<matplotlib.axes._subplots.AxesSubplot at 0x117692550>

In [125]:
df_counts_by_state = df_with_expanded_product.groupby(['state']).count()

In [126]:
df_counts_by_state

Unnamed: 0_level_0,date_received,sub_product,issue,sub_issue,consumer_complaint_narrative,...,product_Mortgage,product_Other financial service,product_Payday loan,product_Prepaid card,product_Student loan
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA,9,7,9,2,1,...,9,9,9,9,9
AE,219,148,219,88,41,...,219,219,219,219,219
AK,638,420,638,287,66,...,638,638,638,638,638
AL,5635,3987,5635,2651,799,...,5635,5635,5635,5635,5635
AP,153,105,153,78,27,...,153,153,153,153,153
...,...,...,...,...,...,...,...,...,...,...,...
VT,943,674,943,300,95,...,943,943,943,943,943
WA,11554,8441,11554,4442,1518,...,11554,11554,11554,11554,11554
WI,6125,4334,6125,2241,677,...,6125,6125,6125,6125,6125
WV,1457,953,1457,692,203,...,1457,1457,1457,1457,1457


In [127]:
df_mortgage_by_counts_state['product_Payday loan'].plot(kind = "bar")

<matplotlib.axes._subplots.AxesSubplot at 0x117692550>

In [128]:
df_mortgage_by_counts_state['product_Credit card'].plot(kind = "bar")

<matplotlib.axes._subplots.AxesSubplot at 0x117692550>

## Group by Product 

In [129]:
#this is creating a new column that is summing something unknown ???!!!
#df_product['sum_of_complaints_by_product'] = df_product.sum(axis=1)

In [130]:
df_product

Unnamed: 0,date_received,sub_issue,consumer_complaint_narrative,zipcode,tags,...,state_VT,state_WA,state_WI,state_WV,state_WY
0,08/30/2013,,,95993,,...,0,0,0,0,0
1,08/30/2013,,,91104,,...,0,0,0,0,0
2,08/30/2013,Account status,,11764,,...,0,0,0,0,0
3,08/30/2013,Repaying your loan,,21402,,...,0,0,0,0,0
4,08/30/2013,Attempted to collect wrong amount,,30106,,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
555952,07/01/2014,,,,,...,0,0,0,0,0
555953,07/01/2014,,,,"Older American, Servicemember",...,0,0,0,0,0
555954,07/10/2012,,,,,...,0,0,0,0,0
555955,04/14/2015,Frequent or repeated calls,,,,...,0,0,0,0,0


In [131]:
df_with_expanded_product.columns

Index(['date_received', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?', 'complaint_id',
       'product_Bank account or service', 'product_Consumer Loan',
       'product_Credit card', 'product_Credit reporting',
       'product_Debt collection', 'product_Money transfers',
       'product_Mortgage', 'product_Other financial service',
       'product_Payday loan', 'product_Prepaid card', 'product_Student loan'],
      dtype='object')

In [132]:

df_issue = df_with_expanded_product.groupby(['issue']).plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

in singular transformations; automatically expanding.
left=133798.0, right=133798.0
  'left=%s, right=%s') % (left, right))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [133]:
df_issue

issue
APR or interest rate                       Axes(0.125,0.1;0.775x0.8)
Account opening, closing, or management    Axes(0.125,0.1;0.775x0.8)
Account terms and changes                  Axes(0.125,0.1;0.775x0.8)
Adding money                               Axes(0.125,0.1;0.775x0.8)
Advertising and marketing                  Axes(0.125,0.1;0.775x0.8)
                                                     ...            
Unauthorized transactions/trans. issues    Axes(0.125,0.1;0.775x0.8)
Unexpected/Other fees                      Axes(0.125,0.1;0.775x0.8)
Unsolicited issuance of credit card        Axes(0.125,0.1;0.775x0.8)
Using a debit or ATM card                  Axes(0.125,0.1;0.775x0.8)
Wrong amount charged or received           Axes(0.125,0.1;0.775x0.8)
Length: 95, dtype: object

In [134]:
consumer_complaint_narrative = df_with_expanded_product[ ['issue', 'consumer_complaint_narrative'] ].groupby('product_Mortgage').count().rename(columns = {'complaint_id': 'count'}).reset_index()

KeyError: 'product_Mortgage'

## Predict who complains based on the type of product 

In [None]:
X = df_with_expanded_product[ ['state'] ]
y = df_with_expanded_product.product_Mortgage

In [None]:
type(X)

In [None]:
model = linear_model.LinearRegression()
model.fit(X,y)

print (model.intercept_)
print (model.coef_)

In [None]:
logistic regression -- time and state 
    

In [None]:
df_with_expanded_product.head()

# Random Forest 

In [144]:
from sklearn import tree, ensemble, metrics, model_selection, externals

In [148]:
df_with_expanded_product.head()

Unnamed: 0,date_received,sub_product,issue,sub_issue,consumer_complaint_narrative,...,product_Mortgage,product_Other financial service,product_Payday loan,product_Prepaid card,product_Student loan
0,08/30/2013,Other mortgage,"Loan modification,collection,foreclosure",,,...,1,0,0,0,0
1,08/30/2013,Other mortgage,"Loan servicing, payments, escrow account",,,...,1,0,0,0,0
2,08/30/2013,,Incorrect information on credit report,Account status,,...,0,0,0,0,0
3,08/30/2013,Non-federal student loan,Repaying your loan,Repaying your loan,,...,0,0,0,0,1
4,08/30/2013,Credit card,False statements or representation,Attempted to collect wrong amount,,...,0,0,0,0,0


In [150]:
df_with_expanded_product.columns

Index(['date_received', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?', 'complaint_id',
       'product_Bank account or service', 'product_Consumer Loan',
       'product_Credit card', 'product_Credit reporting',
       'product_Debt collection', 'product_Money transfers',
       'product_Mortgage', 'product_Other financial service',
       'product_Payday loan', 'product_Prepaid card', 'product_Student loan'],
      dtype='object')

In [145]:
X = df_with_expanded_product[df_with_expanded_product.columns.values]
X.drop('product_Mortgage', axis = 1, inplace = True)

y = df_with_expanded_product.product_Mortgage

In [146]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(X, y, train_size = .6, random_state = 0)


In [147]:
model = tree.DecisionTreeRegressor(random_state = 0).fit(train_X, train_y)


ValueError: could not convert string to float: 'No'

In [139]:
train_y_hat = model.predict(train_X)
print (np.sqrt(metrics.mean_squared_error(train_y, train_y_hat)))

test_y_hat = model.predict(test_X)
print (np.sqrt(metrics.mean_squared_error(test_y, test_y_hat)))

NameError: name 'model' is not defined

In [140]:
dot_data = externals.six.StringIO()
tree.export_graphviz(model, out_file = dot_data, feature_names = X.columns)
%dotstr dot_data.getvalue()

NameError: name 'externals' is not defined