In [1]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random

from pyzipcode import ZipCodeDatabase
zipcode=ZipCodeDatabase()

%matplotlib inline

In [2]:
def fill_missing_data(df):
    """
    Take input df and fill all missing/NaN or non-sensical data with something
    that is useful
    INPUT - dataframe
    OUTPUT - dataframe
    """
    df['Product'].fillna('Not Provided', inplace=True)
    df['Sub-product'].fillna('Not Provided', inplace=True)
    df['Sub-issue'].fillna('Not Provided', inplace=True)
    df['Issue'].fillna('Not Provided', inplace=True)
    df['Consumer complaint narrative'].fillna('Not Provided', inplace=True)
    df['Company public response'].fillna('Not Provided', inplace=True)
    df['Company'].fillna('Not Provided', inplace=True)
    # df['State'].fillna('Not Provided', inplace=True)
    ## Modified by "find_state_by_zip" function
    # df['ZIP code'].fillna('Not Provided', inplace=True)
    ## Modified by "find_state_by_zip" function
    df['Tags'].fillna('Not Provided', inplace=True)
    df['Consumer consent provided?'].fillna('Not Provided', inplace=True)
    df['Submitted via'].fillna('Not Provided',inplace=True)
    df['Consumer disputed?'].fillna('Not Provided', inplace=True)

    return df

In [3]:
def date_cleaning(df):
    """
    Clean dates, transform to datetime
    """
    df['Recieved Year'] = df['Date received'].apply(lambda x: x.year)
    df['Recieved Month'] = df['Date received'].apply(lambda x: x.month)
    df['Recieved Day'] = df['Date received'].apply(lambda x: x.day)

    df['Submitted Year'] = df['Date sent to company'].apply(lambda x: x.year)
    df['Submitted Month'] = df['Date sent to company'].apply(lambda x: x.month)
    df['Submitted Day'] = df['Date sent to company'].apply(lambda x: x.day)

    return df

In [4]:
def modify_categoricals(df):
    """
    Turn categorical variables into Yes/No, True/False, or 1/0 for input into models.
    INPUT - dataframe
    OUTPUT - dataframe
    """

    # MODIFY 'Consumer consent provided?' to be T/F
    df['Consumer consent provided?'] = df['Consumer consent provided?'].apply(lambda x:'Consent not provided' if x=='Other' or x=='Consent withdrawn' or x=='Not Provided' else x)
    
    replace_consent = {'Consent provided': True, "Consent not provided": False}
    df['Consumer consent provided?'] = df['Consumer consent provided?'].apply(lambda x: replace_consent[x])

    # MODIFY 'Consumer disputed?' to be yes/no
    df['Consumer disputed?'] = df['Consumer disputed?'].apply(lambda x: 'No' if x=='Not Provided' else x)

    replace_Y_N_to_TF = {'Yes': True, 'No':False}

    df['Consumer disputed?'] = df['Consumer disputed?'].apply(lambda x: replace_Y_N_to_TF[x])

#     MODIFY 'Timely response?'
#     df['Timely response?'] = df['Timely response?'].apply(lambda x: replace[x])

    return df


In [5]:
def count_company_complaints(df):
    """
    Create count of complaints for each company, add column
    """
    count_company_complaints = df['Company'].value_counts()
    df['Count of Company Complaints'] = df['Company'].apply(lambda x: count_company_complaints[x])

    return df

In [6]:
def find_state_by_zip(df):
    zipcode = ZipCodeDatabase
    for item in df[pd.isnull(df['State']) & pd.notnull(df['ZIP code'])].index:
        try:
            df['State'][i] = str(zip[df['ZIP code'][i]].state)
        except:
            continue

    #  Fill in empties that can't be filled with pyzipcode
    df['State'].fillna('Not provided', inplace=True)
    df['ZIP code'].fillna('Not provided', inplace=True)

    return df

In [7]:
def create_numerical_features(df):
    """
    Create numerical values for columns with many different values.
    ['Product', 'Sub-product','Issue','Sub-issue','Tags', 'State']
    Count unique items (see EDA for more info) and assign number
    INPUT - dataframe
    OUTPUT - dataframe
    """
    column=['Product', 'Sub-product','Issue','Sub-issue','Tags', 'State']
    for name in column:
        repl={}
        i=0
        for value in df[name].unique():
            repl[value] = i
            i+=1

    df[name] = df[name].apply(lambda x: repl[x])
#     df_model[name] = df[name].astype('category')

    return df


In [8]:
df = pd.read_csv('../data/Consumer_Complaints_with_Consumer_Complaint_Narratives.csv')

In [9]:
fill_missing_data(df)
# date_cleaning(df)
modify_categoricals(df)
count_company_complaints(df)
find_state_by_zip(df)
create_numerical_features(df)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,Count of Company Complaints
0,03/24/2015,Credit card,Not Provided,Other,Not Provided,Received Capital One charge card offer XXXX. A...,Not Provided,Capital One,0,440XX,Not Provided,True,Web,03/30/2015,Closed with explanation,Yes,False,1297939,1907
1,03/23/2015,Debt collection,"Other (i.e. phone, health club, etc.)",Improper contact or sharing of info,Contacted me after I asked not to,I do n't know how they got my cell number. I t...,Not Provided,"CCS Financial Services, Inc.",1,727XX,Not Provided,True,Web,03/23/2015,Closed with explanation,Yes,False,1296593,139
2,03/23/2015,Credit card,Not Provided,Rewards,Not Provided,I 'm a longtime member of Charter One Bank/RBS...,Not Provided,"Citizens Financial Group, Inc.",2,482XX,Not Provided,True,Web,03/23/2015,Closed with explanation,Yes,True,1296693,305
3,03/23/2015,Credit reporting,Not Provided,Incorrect information on credit report,Information is not mine,"After looking at my credit report, I saw a col...",Company chooses not to provide a public response,Experian,3,331XX,Not Provided,True,Web,03/27/2015,Closed with non-monetary relief,Yes,False,1296955,5075
4,03/23/2015,Debt collection,Payday loan,Improper contact or sharing of info,Talked to a third party about my debt,I received a call from a XXXX XXXX from XXXX @...,Not Provided,"Big Picture Loans, LLC",4,291XX,Not Provided,True,Web,03/23/2015,Closed with explanation,Yes,False,1296727,131
5,03/23/2015,Student loan,Non-federal student loan,Dealing with my lender or servicer,Having problems with customer service,Was not contacted 4 years later about some pri...,Not Provided,Oliphant Financial Corporation,5,605XX,Not Provided,True,Web,03/25/2015,Closed with explanation,Yes,False,1296773,5
6,03/23/2015,Debt collection,Medical,Cont'd attempts collect debt not owed,Debt was paid,Collection Consultants is reporting a collecti...,Not Provided,Collection Consultants of California,6,923XX,Not Provided,True,Web,04/30/2015,Closed with explanation,Yes,False,1296774,20
7,03/23/2015,Credit reporting,Not Provided,Incorrect information on credit report,Information is not mine,I had my purse stolen in 2007. They never foun...,Company chooses not to provide a public response,Experian,7,044XX,Not Provided,True,Web,03/23/2015,Closed with explanation,Yes,False,1296785,5075
8,03/23/2015,Credit card,Not Provided,Other,Not Provided,I attempted to apply for a Discover Card Onlin...,Not Provided,Discover,8,217XX,Not Provided,True,Web,03/23/2015,Closed with non-monetary relief,Yes,False,1295056,730
9,03/23/2015,Debt collection,"Other (i.e. phone, health club, etc.)",Cont'd attempts collect debt not owed,Debt resulted from identity theft,Continued attempts by XXXX XXXX XXXX to collec...,Not Provided,Stellar Recovery Inc.,9,060XX,Not Provided,True,Web,03/25/2015,Closed with explanation,Yes,False,1296777,135


In [10]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,Count of Company Complaints
0,03/24/2015,Credit card,Not Provided,Other,Not Provided,Received Capital One charge card offer XXXX. A...,Not Provided,Capital One,0,440XX,Not Provided,True,Web,03/30/2015,Closed with explanation,Yes,False,1297939,1907
1,03/23/2015,Debt collection,"Other (i.e. phone, health club, etc.)",Improper contact or sharing of info,Contacted me after I asked not to,I do n't know how they got my cell number. I t...,Not Provided,"CCS Financial Services, Inc.",1,727XX,Not Provided,True,Web,03/23/2015,Closed with explanation,Yes,False,1296593,139
2,03/23/2015,Credit card,Not Provided,Rewards,Not Provided,I 'm a longtime member of Charter One Bank/RBS...,Not Provided,"Citizens Financial Group, Inc.",2,482XX,Not Provided,True,Web,03/23/2015,Closed with explanation,Yes,True,1296693,305
3,03/23/2015,Credit reporting,Not Provided,Incorrect information on credit report,Information is not mine,"After looking at my credit report, I saw a col...",Company chooses not to provide a public response,Experian,3,331XX,Not Provided,True,Web,03/27/2015,Closed with non-monetary relief,Yes,False,1296955,5075
4,03/23/2015,Debt collection,Payday loan,Improper contact or sharing of info,Talked to a third party about my debt,I received a call from a XXXX XXXX from XXXX @...,Not Provided,"Big Picture Loans, LLC",4,291XX,Not Provided,True,Web,03/23/2015,Closed with explanation,Yes,False,1296727,131


# Create 'df_no_text' for non-text analysis

In [16]:
def create_df_no_text(df):
    """
    Take input df and fill all missing/NaN or non-sensical data with something
    that is useful in a model.  Also create labels column 'Company response to consumer', with 3 categories.
    INPUT - dataframe
    OUTPUT - 'df_no_text' dataframe for use in modeling non-text features
    """
    df_no_text = pd.DataFrame()  # Create empty df to fill
    
    df['Product'].fillna('Not Provided', inplace=True)
    df['Sub-product'].fillna('Not Provided', inplace=True)
    df['Sub-issue'].fillna('Not Provided', inplace=True)
    df['Issue'].fillna('Not Provided', inplace=True)
    df['Consumer complaint narrative'].fillna('Not Provided', inplace=True)
    df['Company public response'].fillna('Not Provided', inplace=True)
    df['Company'].fillna('Not Provided', inplace=True)
    df['State'].fillna('Not Provided', inplace=True)
    df['ZIP code'].fillna('Not Provided', inplace=True)
    df['Tags'].fillna('Not Provided', inplace=True)
    df['Consumer consent provided?'].fillna('Not Provided', inplace=True)
    df['Submitted via'].fillna('Not Provided',inplace=True)
    df['Consumer disputed?'].fillna('Not Provided', inplace=True)
    
    column=['Product', 'Sub-product','Issue','Sub-issue', 'Company', 'Tags', 'State']
    for name in column:
        repl={}
        i=0
        for value in df[name].unique():
            repl[value] = i
            i+=1

        df[name] = df[name].apply(lambda x: repl[x])
        df_no_text[name] = df[name].astype('category')
        
    cust_resp_dict ={'Closed':0,
                 'Untimely response':0,
                 'Closed with explanation':1,
                 'Closed with non-monetary relief':2,
                 'Closed with monetary relief':2}
    
    df_no_text['Company response to consumer'] = df['Company response to consumer'].apply(lambda x: cust_resp_dict[x])

    return df_no_text



In [17]:
# df

In [18]:
# df_no_text = pd.DataFrame()

In [19]:
create_df_no_text(df);

In [20]:
df_no_text.head()

NameError: name 'df_no_text' is not defined

# Create 'df_text' for text modeling

In [26]:
def create_df_text(df):
    df_text = pd.DataFrame()  # Create empty df to fill
    
    df_text['Consumer complaint narrative'] = df['Consumer complaint narrative']
    
    cust_resp_dict ={'Closed':0,
                 'Untimely response':0,
                 'Closed with explanation':1,
                 'Closed with non-monetary relief':2,
                 'Closed with monetary relief':2}
    
    df_text['Company response to consumer'] = df['Company response to consumer'].apply(lambda x: cust_resp_dict[x])
    
    return df_text
    

In [27]:
create_df_text(df)

Unnamed: 0,Consumer complaint narrative,Company response to consumer
0,Received Capital One charge card offer XXXX. A...,1
1,I do n't know how they got my cell number. I t...,1
2,I 'm a longtime member of Charter One Bank/RBS...,1
3,"After looking at my credit report, I saw a col...",2
4,I received a call from a XXXX XXXX from XXXX @...,1
5,Was not contacted 4 years later about some pri...,1
6,Collection Consultants is reporting a collecti...,1
7,I had my purse stolen in 2007. They never foun...,1
8,I attempted to apply for a Discover Card Onlin...,2
9,Continued attempts by XXXX XXXX XXXX to collec...,1
