## Data Analysis Question: Which customer complaints might be hurting the company, and how large is each group of complaints?

Data Source: data.gov

Data Analyst: Alex Idachaba

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import datetime

In [102]:
# Load the raw consumer-complaints CSV (path is relative to the notebook's working directory)
complaints = pd.read_csv('consumer_complaints.csv')
# Preview the first three rows
complaints.head(3)

Unnamed: 0.1,Unnamed: 0,Complaint ID,Product,Sub-product,Issue,Sub-issue,State,ZIP code,Date received,Date sent to company,Company,Company response,Timely response?,Consumer disputed?
0,0,1291006,Debt collection,,Communication tactics,Frequent or repeated calls,TX,76119.0,2015-03-19,2015-03-19,"Premium Asset Services, LLC",In progress,Yes,
1,1,1290580,Debt collection,Medical,Cont'd attempts collect debt not owed,Debt is not mine,TX,77479.0,2015-03-19,2015-03-19,Accounts Receivable Consultants Inc.,Closed with explanation,Yes,
2,2,1290564,Mortgage,FHA mortgage,"Application, originator, mortgage broker",,MA,2127.0,2015-03-19,2015-03-19,RBS Citizens,Closed with explanation,Yes,Yes


In [103]:
# Summarise the columns: dtypes and non-null counts show where values are missing
complaints.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28156 entries, 0 to 28155
Data columns (total 14 columns):
Unnamed: 0              28156 non-null int64
Complaint ID            28156 non-null int64
Product                 28156 non-null object
Sub-product             17582 non-null object
Issue                   28154 non-null object
Sub-issue               13211 non-null object
State                   27735 non-null object
ZIP code                27876 non-null float64
Date received           28156 non-null object
Date sent to company    28156 non-null object
Company                 28156 non-null object
Company response        28156 non-null object
Timely response?        28156 non-null object
Consumer disputed?      6006 non-null object
dtypes: float64(1), int64(2), object(11)
memory usage: 3.0+ MB


In [104]:
# Keep only the complaints filed against Bank of America.
# .copy() makes the result an independent frame, so the cleaning cells that follow
# modify it directly rather than a slice of the full data set
# (this avoids pandas' SettingWithCopyWarning).
complaints = complaints[complaints['Company'] == 'Bank of America'].copy()
complaints.head(3)

Unnamed: 0.1,Unnamed: 0,Complaint ID,Product,Sub-product,Issue,Sub-issue,State,ZIP code,Date received,Date sent to company,Company,Company response,Timely response?,Consumer disputed?
73,73,1287644,Mortgage,Conventional adjustable mortgage (ARM),"Loan modification,collection,foreclosure",,CO,80516.0,2015-03-16,2015-03-17,Bank of America,In progress,Yes,
81,81,1283979,Mortgage,FHA mortgage,"Loan modification,collection,foreclosure",,OH,43204.0,2015-03-16,2015-03-16,Bank of America,In progress,Yes,
637,637,1275316,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,CA,91325.0,2015-03-10,2015-03-10,Bank of America,Closed with explanation,Yes,


In [105]:
# Check whether any row is an exact duplicate of an earlier row (True if at least one is)
complaints.duplicated().any()

False

In [106]:
# Drop rows in which every column is null (rows with only some nulls are kept).
# The result is reassigned instead of using inplace=True, so a filtered slice is
# never mutated in place.
complaints = complaints.dropna(how='all')

In [107]:
# Replace every remaining null with the string 'None'.
# NOTE: this applies to all columns, so a missing value in a numeric column
# also becomes the text 'None'.
# The result is reassigned instead of using inplace=True, so a filtered slice is
# never mutated in place.
complaints = complaints.fillna(value='None')
complaints.head(2)

Unnamed: 0.1,Unnamed: 0,Complaint ID,Product,Sub-product,Issue,Sub-issue,State,ZIP code,Date received,Date sent to company,Company,Company response,Timely response?,Consumer disputed?
73,73,1287644,Mortgage,Conventional adjustable mortgage (ARM),"Loan modification,collection,foreclosure",,CO,80516,2015-03-16,2015-03-17,Bank of America,In progress,Yes,
81,81,1283979,Mortgage,FHA mortgage,"Loan modification,collection,foreclosure",,OH,43204,2015-03-16,2015-03-16,Bank of America,In progress,Yes,


In [108]:
# Convert 'Date received' to datetime values so rows can be sorted and grouped by date.
# assign() returns a new frame with the column replaced (column order is unchanged),
# so nothing is written into a filtered slice.
complaints = complaints.assign(
    **{'Date received': pd.to_datetime(complaints['Date received'], yearfirst=True)}
)

In [109]:
# Sort by date received, oldest complaint first.
# The sorted frame is reassigned instead of sorting with inplace=True.
complaints = complaints.sort_values('Date received', ascending=True)
complaints.head(3)

Unnamed: 0.1,Unnamed: 0,Complaint ID,Product,Sub-product,Issue,Sub-issue,State,ZIP code,Date received,Date sent to company,Company,Company response,Timely response?,Consumer disputed?
28138,28138,1178167,Credit card,,Billing disputes,,NJ,7042,2015-01-01,2015-01-09,Bank of America,Closed with explanation,Yes,
28136,28136,1179173,Credit card,,Billing disputes,,DC,20017,2015-01-01,2015-01-02,Bank of America,Closed with explanation,Yes,No
28040,28040,1177932,Credit card,,Billing disputes,,FL,33544,2015-01-01,2015-01-01,Bank of America,Closed with explanation,Yes,No


In [110]:
# Renumber the rows 0..n-1 after sorting. drop=True discards the old index instead of
# inserting it as an 'index' column that would then have to be dropped again.
# The 'Unnamed: 0' column is removed as well.
complaints = complaints.reset_index(drop=True).drop(columns=['Unnamed: 0'])

In [111]:
# List the distinct 'Company response' values. These are the values that
# totalComplaints() maps to 'Open' / 'Closed' for the new 'Total Complaints' column.
complaints['Company response'].unique()

array(['Closed with explanation', 'Closed with non-monetary relief',
       'Closed with monetary relief', 'Closed', 'In progress'],
      dtype=object)

In [112]:
def totalComplaints(value):
    """Collapse a detailed company response into a coarse complaint status.

    Parameters
    ----------
    value
        The 'Company response' entry of a single complaint.

    Returns
    -------
    'Closed' for any of the four closed responses, 'Open' for 'In progress',
    and None for any other value.
    """
    closed_responses = (
        'Closed with monetary relief',
        'Closed with explanation',
        'Closed with non-monetary relief',
        'Closed',
    )
    if value in closed_responses:
        return 'Closed'
    # Anything that is neither closed nor in progress has no status.
    return 'Open' if value == 'In progress' else None

In [113]:
# Derive the 'Open' / 'Closed' status of every complaint from its company response
complaints['Total Complaints'] = complaints['Company response'].map(totalComplaints)
complaints.head(n=3)

Unnamed: 0,Complaint ID,Product,Sub-product,Issue,Sub-issue,State,ZIP code,Date received,Date sent to company,Company,Company response,Timely response?,Consumer disputed?,Total Complaints
0,1178167,Credit card,,Billing disputes,,NJ,7042,2015-01-01,2015-01-09,Bank of America,Closed with explanation,Yes,,Closed
1,1179173,Credit card,,Billing disputes,,DC,20017,2015-01-01,2015-01-02,Bank of America,Closed with explanation,Yes,No,Closed
2,1177932,Credit card,,Billing disputes,,FL,33544,2015-01-01,2015-01-01,Bank of America,Closed with explanation,Yes,No,Closed


In [114]:
# Build boolean masks for closed and open complaints (the open mask is used in the next cell)
closed = complaints['Total Complaints'] == 'Closed'
open_com = complaints['Total Complaints'] == 'Open'
# Number of closed complaints
len(complaints[closed])

1329

In [115]:
# Number of complaints still open (in progress)
len(complaints[open_com])

189

### Bank of America closed 1329 complaints and had 189 complaints still in progress.

In [116]:
# Count complaints per issue and keep the 15 issues with the most complaints
reason = complaints.groupby(['Issue'])
top_fifteen = reason['Total Complaints'].count().nlargest(15)
top_fifteen

Issue
Loan modification,collection,foreclosure    424
Loan servicing, payments, escrow account    203
Account opening, closing, or management     181
Deposits and withdrawals                    110
Problems caused by my funds being low        52
Billing disputes                             49
Other                                        43
Making/receiving payments, sending money     42
Application, originator, mortgage broker     36
Using a debit or ATM card                    35
Settlement process and costs                 30
Cont'd attempts collect debt not owed        27
Closing/Cancelling account                   25
Identity theft / Fraud / Embezzlement        25
Credit decision / Underwriting               20
Name: Total Complaints, dtype: int64

"Loan modification,collection,foreclosure" is the most common complaint issue, with a total of 424 complaints.

In [117]:
# Count complaints per state and show the 15 states with the most complaints
state = complaints.groupby(['State'])
state['Total Complaints'].count().nlargest(15)

State
CA    259
FL    197
TX    103
NY     97
NJ     74
GA     65
PA     57
MA     47
MD     45
VA     43
IL     39
NC     39
AZ     38
OH     32
CT     28
Name: Total Complaints, dtype: int64

### California leads in the number of complaints with 259 complaints

In [118]:
# Group complaints by the calendar month in which they were received
# (change 'M' to 'Y' in to_period to group by year instead).
period = complaints['Date received'].dt.to_period('M')
# NOTE: despite its name, `year` holds the month-level groups built from `period`.
year = complaints.groupby(period)

In [119]:
# Number of complaints received in each period of the grouping built in the previous cell
year['Total Complaints'].count()

Date received
2015-01    710
2015-02    701
2015-03    107
Freq: M, Name: Total Complaints, dtype: int64

### The month of January 2015 has the highest number of customer complaints with 710 complaints

In [120]:
# Write the cleaned frame, including the derived 'Total Complaints' column, to a new CSV.
# index=False leaves the row index out of the file.
complaints.to_csv('complaints_new.csv', encoding='utf-8', index=False)