In [1]:
import pandas as pd
import numpy as np
import copy
import seaborn as sns
import plotly.plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from patsy.contrasts import Sum

In [18]:
# Load the interim and the label-processed complaint tables. Each read is
# chained with a drop of the 'Unnamed: 0' column, so both frames come out
# without it (presumably a saved index column -- confirm against the CSVs).
dataset = pd.read_csv(
    "/home/andrija/Desktop/customer_complaints/notebooks/data_interim.csv"
).drop(columns=['Unnamed: 0'])

dataset_processed = pd.read_csv(
    "/home/andrija/Desktop/customer_complaints/notebooks/processed_data_label.csv"
).drop(columns=['Unnamed: 0'])

### Proportion of Disputes (Obvious data imbalance in dependent variable)

In [3]:
# Pie chart of the dependent variable: how many complaints were disputed.
#
# value_counts() orders its result by frequency, not by value, so the slice
# labels are derived from the counted values themselves instead of being
# assigned by position. 'Consumer disputed?' is summed elsewhere in this
# notebook to count disputes, so a truthy (1/True) value is taken to mean
# "disputed"; any other value falls back to its string form.
dispute_counts = dataset['Consumer disputed?'].value_counts()
label_for_value = {0: 'Consumer did not dispute', 1: 'Consumer disputed'}

labels = [label_for_value.get(value, str(value)) for value in dispute_counts.index]
values = np.array(dispute_counts)
colors = ['rgb(55, 83, 109)']

trace = go.Pie(labels=labels, values=values, hoverinfo='label+percent', textinfo='value',
               textfont=dict(size=18), marker=dict(colors=colors))
py.iplot([trace])


Consider using IPython.display.IFrame instead



### Complaints vs Disputes by Year

In [20]:
# Complaint totals per year; value_counts() is evaluated once and supplies
# both the bar heights and the matching x positions (ordered by frequency).
year_counts = dataset_processed['Year received'].value_counts()
complaints_year = np.array(year_counts)
x0 = np.array(year_counts.index)

In [21]:
# Dispute totals per year: the sum of the 'Consumer disputed?' column within
# each 'Year received' group, plus the group keys as x positions.
year_dispute_totals = dataset_processed.groupby('Year received')['Consumer disputed?'].sum()
disputes_year = np.array(year_dispute_totals)
x1 = np.array(year_dispute_totals.index)

In [22]:
# Grouped bar chart: complaints (dark) next to disputes (light) for each year.
trace0 = go.Bar(
    x=x0,
    y=complaints_year,
    name='Complaints by Year',
    marker={'color': 'rgb(55, 83, 109)'},
)
trace1 = go.Bar(
    x=x1,
    y=disputes_year,
    name='Disputes by Year',
    marker={'color': 'rgb(26, 118, 255)'},
)

layout = go.Layout(barmode='group', title='Complaints vs Disputes by Year')
data = [trace0, trace1]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Disputes by Year')


Consider using IPython.display.IFrame instead



### Complaints vs Disputes by Month

In [23]:
# Complaint totals per month; one value_counts() call feeds both the bar
# heights and their x positions (ordered by frequency).
month_counts = dataset_processed['Month received'].value_counts()
complaints_month = np.array(month_counts)
x0 = np.array(month_counts.index)

In [24]:
# Dispute totals per month: the sum of 'Consumer disputed?' within each
# 'Month received' group, plus the group keys as x positions.
month_dispute_totals = dataset_processed.groupby('Month received')['Consumer disputed?'].sum()
disputes_month = np.array(month_dispute_totals)
x1 = np.array(month_dispute_totals.index)

In [25]:
# Grouped bar chart: complaints (dark) next to disputes (light) for each month.
trace0 = go.Bar(
    x=x0,
    y=complaints_month,
    name='Complaints by Month',
    marker={'color': 'rgb(55, 83, 109)'},
)
trace1 = go.Bar(
    x=x1,
    y=disputes_month,
    name='Disputes by Month',
    marker={'color': 'rgb(26, 118, 255)'},
)

layout = go.Layout(barmode='group', title='Complaints vs Disputes by Month')
data = [trace0, trace1]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Disputes by Month')


Consider using IPython.display.IFrame instead



### Complaints vs Disputes by Day

In [26]:
# Complaint totals per weekday name; one value_counts() call feeds both the
# bar heights and their x labels (ordered by frequency).
day_counts = dataset['Day received names'].value_counts()
complaints_day = np.array(day_counts)
x0 = np.array(day_counts.index)

In [27]:
# Dispute totals per weekday name: the sum of 'Consumer disputed?' within each
# 'Day received names' group, plus the group keys as x labels.
day_dispute_totals = dataset.groupby('Day received names')['Consumer disputed?'].sum()
disputes_day = np.array(day_dispute_totals)
x1 = np.array(day_dispute_totals.index)

In [28]:
# Grouped bar chart: complaints (dark) next to disputes (light) for each weekday.
trace0 = go.Bar(
    x=x0,
    y=complaints_day,
    name='Complaints by Day',
    marker={'color': 'rgb(55, 83, 109)'},
)
trace1 = go.Bar(
    x=x1,
    y=disputes_day,
    name='Disputes by Day',
    marker={'color': 'rgb(26, 118, 255)'},
)

layout = go.Layout(barmode='group', title='Complaints vs Disputes by Day')
data = [trace0, trace1]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Disputes by Day')


Consider using IPython.display.IFrame instead



### Complaints vs Disputes by State

In [29]:
# The 13 states with the most complaints: value_counts() sorts by frequency,
# so the first 13 entries are the largest ones.
state_counts = dataset['State'].value_counts()
complaints_state = np.array(state_counts)[:13]
x0 = np.array(state_counts.index)[:13]

In [30]:
# Dispute totals for the SAME 13 states that lead the complaint counts.
#
# The complaints trace of this chart shows the 13 most frequent states. Taking
# the 13 states with the most disputes instead can select a different set of
# states, leaving some states with only one of the two grouped bars. Looking
# the dispute totals up by the top-complaint states keeps both traces aligned.
top_complaint_states = dataset['State'].value_counts().index[:13]
disputes_per_state = dataset.groupby('State')['Consumer disputed?'].sum()

# fill_value=0 is a safety net: a state without a group would plot as zero
# disputes rather than as NaN.
disputes_state = np.array(disputes_per_state.reindex(top_complaint_states, fill_value=0))
x1 = np.array(top_complaint_states)

In [31]:
# Grouped bar chart: complaints (dark) next to disputes (light) per state.
trace0 = go.Bar(
    x=x0,
    y=complaints_state,
    name='Complaints by State',
    marker={'color': 'rgb(55, 83, 109)'},
)
trace1 = go.Bar(
    x=x1,
    y=disputes_state,
    name='Disputes by State',
    marker={'color': 'rgb(26, 118, 255)'},
)

layout = go.Layout(barmode='group', title='Complaints vs Disputes by State')
data = [trace0, trace1]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Disputes by State')


Consider using IPython.display.IFrame instead



In [32]:
# The 20 largest per-state dispute totals, in descending order.
disputes_state = np.array(
    dataset.groupby('State')['Consumer disputed?']
    .sum()
    .sort_values(ascending=False)
)[:20]

In [33]:
# State labels for the 20 largest dispute totals, using the same descending
# sort as the totals themselves.
state_dispute_totals = (
    dataset.groupby('State')['Consumer disputed?']
    .sum()
    .sort_values(ascending=False)
)
states = np.array(state_dispute_totals.index)[:20].copy()

In [34]:
# Single-series bar chart of the 20 states with the most disputes; tick labels
# are rotated 45 degrees so the state names stay readable.
trace = go.Bar(x=states, y=disputes_state, name='Disputes by State', marker=dict(color='rgb(55, 83, 109)'))

data = [trace]
layout = go.Layout(xaxis=dict(tickangle=45), barmode='group', title='Disputes by State')

fig = go.Figure(data=data, layout=layout)
# The "Complaints vs Disputes by State" chart earlier in this notebook is
# uploaded as 'Disputes by State'. Reusing that filename would overwrite the
# earlier hosted figure, so this chart is stored under its own name.
py.iplot(fig, filename='Top 20 Disputes by State')


Consider using IPython.display.IFrame instead



### Complaints vs Disputes by Product 

In [21]:
# Complaint totals per product; one value_counts() call feeds both the bar
# heights and their x labels (ordered by frequency).
product_counts = dataset['Product'].value_counts()
complaints_product = np.array(product_counts)
x0 = np.array(product_counts.index)

In [22]:
# Dispute totals per product: the sum of 'Consumer disputed?' within each
# 'Product' group, plus the group keys as x labels.
product_dispute_totals = dataset.groupby('Product')['Consumer disputed?'].sum()
disputes_product = np.array(product_dispute_totals)
x1 = np.array(product_dispute_totals.index)

In [23]:
# Grouped bar chart: complaints (dark) next to disputes (light) per product.
trace0 = go.Bar(
    x=x0,
    y=complaints_product,
    name='Complaints by Product',
    marker={'color': 'rgb(55, 83, 109)'},
)
trace1 = go.Bar(
    x=x1,
    y=disputes_product,
    name='Disputes by Product',
    marker={'color': 'rgb(26, 118, 255)'},
)

layout = go.Layout(barmode='group', title='Complaints vs Disputes by Product')
data = [trace0, trace1]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Disputes by Product')


Consider using IPython.display.IFrame instead



### Company Response vs Dispute

In [24]:
# Dispute totals per company response type, largest first, together with the
# response labels in the same order.
response_dispute_totals = (
    dataset.groupby('Company response to consumer')['Consumer disputed?']
    .sum()
    .sort_values(ascending=False)
)
complaint_dispute = np.array(response_dispute_totals)
complaint_response = np.array(response_dispute_totals.index)

In [25]:
# Overall frequency of each company response type. The final [:-1] slice drops
# the last (least frequent) entry of the frequency-sorted counts.
response_counts = dataset['Company response to consumer'].value_counts()
response = np.array(response_counts.index)[:-1].copy()
response_values = np.array(response_counts)[:-1].copy()

# Grouped bars: how often each response occurs (dark) vs. how many of those
# complaints ended in a dispute (light).
trace0 = go.Bar(
    x=response,
    y=response_values,
    name='Company responses',
    marker={'color': 'rgb(55, 83, 109)'},
)
trace1 = go.Bar(
    x=complaint_response,
    y=complaint_dispute,
    name='Disputes by Company Response',
    marker={'color': 'rgb(26, 118, 255)'},
)

layout = go.Layout(barmode='group', title='Disputes by Company Response')
data = [trace0, trace1]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Disputes vs Total Products')


Consider using IPython.display.IFrame instead



### Disputes by Issue

In [26]:
# Dispute totals per issue, largest first; the [:-9] slice leaves out the nine
# smallest totals at the tail of the sorted series.
issue_dispute_totals = (
    dataset.groupby('Issue')['Consumer disputed?']
    .sum()
    .sort_values(ascending=False)
)
disputes_issue = np.array(issue_dispute_totals)[:-9].copy()

In [27]:
# Issue labels for the bar chart, sorted by dispute total (largest first).
# disputes_issue drops the nine smallest totals with [:-9]; the same slice is
# applied here so the x labels and y values have equal length and stay paired
# one-to-one instead of relying on the plotting library to truncate the extras.
issues = np.copy(
    np.array(
        dataset.groupby('Issue')['Consumer disputed?'].sum().sort_values(ascending=False).index
    )[:-9]
)

In [28]:
# Single-series bar chart of dispute totals per issue; tick labels are rotated
# 45 degrees so the issue names stay readable.
trace = go.Bar(
    x=issues,
    y=disputes_issue,
    name='Disputes by Issue',
    marker={'color': 'rgb(55, 83, 109)'},
)

layout = go.Layout(
    title='Disputes by Issue',
    barmode='group',
    xaxis={'tickangle': 45},
)
data = [trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Disputes by Issue')


Consider using IPython.display.IFrame instead

