In [90]:
import pandas as pd
import numpy as np
import copy
import datetime

from sklearn.preprocessing import LabelEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.hashing import HashingEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder

# Preprocessing

## Data Import & Cleaning of Consumer Dispute Values

In [91]:
# Load the raw complaints export into a DataFrame.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
dataset = pd.read_csv(
    filepath_or_buffer="/home/andrija/Desktop/customer_complaints/complaints-2017-12-08_11_01.csv"
)

In [92]:
# Discard rows whose target value ('Consumer disputed?') is missing, then
# renumber the index from 0 without keeping the old index as a column.
dataset.dropna(subset=['Consumer disputed?'], inplace=True)
dataset.reset_index(drop=True, inplace=True)

In [93]:
# Work on an independent (deep) copy so `dataset` keeps the filtered raw rows.
data = dataset.copy()

## Data Report

In [94]:
def missing_values(data):
    """Print the per-column count and percentage of missing values.

    Only columns with at least one missing value are listed, ordered from
    the most to the fewest missing entries.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to stdout.
    """
    null_mask = data.isnull()
    per_column = null_mask.sum()

    total = per_column.sort_values(ascending=False)
    # count() on the boolean mask equals the number of rows for every column.
    percent = (per_column / null_mask.count() * 100).sort_values(ascending=False)

    report = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    print("Total missing values by feature: ")
    print(report[report['Total'] != 0])
    
# data.isnull().sum()    

In [95]:
# missing_values(data)

In [96]:
# import pandas_profiling
# pandas_profiling.ProfileReport(data, correlations={"cramers": False})

## Categorical Data Cleaning 

In [97]:
# Drop the columns this notebook does not use any further.
data.drop(
    columns=['Complaint ID', 'ZIP code', 'Company', 'Consumer consent provided?', 'Sub-issue'],
    inplace=True,
)

### Target Variable: Consumer disputed?

In [98]:
# Encode the target as 1 ('Yes') / 0 ('No').
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the selected column: under pandas
# Copy-on-Write the selected column is a temporary object, so the in-place
# form would leave `data` unchanged.
data['Consumer disputed?'] = data['Consumer disputed?'].replace(to_replace=['Yes', 'No'], value=[1, 0])

### Date received

In [99]:
# Parse the receipt date strings (month/day/two-digit-year) into datetimes.
data['Date received'] = pd.to_datetime(
    arg=data['Date received'],
    format='%m/%d/%y',
)

In [100]:
# Split the receipt date into weekday number, month and year. Each insert goes
# to position 0, so the final leading column order is: year, month, day.
data.insert(loc=0, column='Day received', value=data['Date received'].dt.dayofweek)
data.insert(loc=0, column='Month received', value=data['Date received'].dt.month)
data.insert(loc=0, column='Year received', value=data['Date received'].dt.year)

### Data for Visual Analysis

In [101]:
# Weekday name of each complaint's receipt date; kept as a separate Series so
# it can be added to the frame prepared for visual analysis.
day_name = data['Date received'].dt.day_name()

In [102]:
# Build the frame used for visual analysis: an independent copy of `data` in
# which the numeric weekday and the raw date are replaced by the weekday name
# (placed as the third column).
data_interim = data.copy()
data_interim.drop(columns=['Day received', 'Date received'], inplace=True)
data_interim.insert(loc=2, column='Day received names', value=day_name)

In [103]:
# The raw receipt date is no longer needed once its parts have been extracted.
data.drop(columns=['Date received'], inplace=True)

### Date sent to company

In [104]:
# Parse the forwarding date strings (month/day/two-digit-year) into datetimes.
data['Date sent to company'] = pd.to_datetime(
    arg=data['Date sent to company'],
    format='%m/%d/%y',
)

In [105]:
# Weekday number (Monday=0 ... Sunday=6) on which the complaint was sent to the company.
day = data['Date sent to company'].dt.weekday

In [106]:
# Place the weekday of forwarding as the fourth column of the frame.
data.insert(loc=3, column='Day sent to company', value=day)

In [107]:
# The raw forwarding date is dropped; only its weekday is kept.
data.drop(columns=['Date sent to company'], inplace=True)

### Index reset

In [108]:
# Order rows by the extracted date parts, then renumber the index from 0
# so it matches the new row order.
data.sort_values(
    by=['Year received', 'Month received', 'Day received', 'Day sent to company'],
    inplace=True,
)
data.reset_index(drop=True, inplace=True)

### Timely response?

In [109]:
# Encode the flag as 1 ('Yes') / 0 ('No').
# Assigned back to the frame: calling replace(..., inplace=True) on the
# selected column acts on a temporary object under pandas Copy-on-Write and
# would leave `data` unchanged.
data['Timely response?'] = data['Timely response?'].replace(to_replace=['Yes', 'No'], value=[1, 0])

### Tags (1 if a tag is present, 0 if NaN)

In [110]:
# Collapse the tag column to a presence flag: each of the three listed tag
# values becomes 1 and a missing tag becomes 0.
# Assigned back to the frame: calling replace(..., inplace=True) on the
# selected column acts on a temporary object under pandas Copy-on-Write and
# would leave `data` unchanged.
data['Tags'] = data['Tags'].replace(
    to_replace=['Servicemember', 'Older American', 'Older American, Servicemember', np.nan],
    value=[1, 1, 1, 0],
)

### Consumer complaint narrative (Indicate whether there is narrative or not)

In [111]:
# Mark complaints without a narrative with 0.
# Assigned back to the frame: calling fillna(..., inplace=True) on the selected
# column acts on a temporary object under pandas Copy-on-Write and would leave
# `data` unchanged.
data['Consumer complaint narrative'] = data['Consumer complaint narrative'].fillna(value=0)

In [112]:
# Every entry that is not 0 (i.e. a narrative is present) becomes 1.
# A single .loc call on the frame is used instead of the chained form
# data[col].loc[mask] = 1, which assigns into an intermediate object and is
# not guaranteed to reach `data` (it never does under pandas Copy-on-Write).
data.loc[data['Consumer complaint narrative'] != 0, 'Consumer complaint narrative'] = 1

### Sub-issue (Indicate whether there is sub-issue stated or not)

In [113]:
# data['Sub-issue'].fillna(value=0, inplace=True)
# data['Sub-issue'].loc[data['Sub-issue'] != 0] = 1

In [114]:
# data_interim['Sub-issue'].fillna(value=0, inplace=True)
# data_interim['Sub-issue'].loc[data_interim['Sub-issue'] != 0] = 1

## Encoding of Residual Categorical Features

In [115]:
# Categorical features still to be encoded, and the binary target.
# NOTE(review): X and y are assigned again from the same columns in the
# target-encoding cell further down.
X = data.loc[:, [
    'Product',
    'Company response to consumer',
    'Sub-product',
    'Issue',
    'Company public response',
    'State',
    'Submitted via',
]]
y = data['Consumer disputed?']

In [116]:
# Give missing categories an explicit 'zero' label so they are encoded as a
# category of their own.
# Assigned back to the frame: calling fillna(..., inplace=True) on a selected
# column acts on a temporary object under pandas Copy-on-Write and would leave
# `data` unchanged.
data['Sub-product'] = data['Sub-product'].fillna(value='zero')
data['Company public response'] = data['Company public response'].fillna(value='zero')
data['State'] = data['State'].fillna(value='zero')

### Label Encoder

In [86]:
# transformed = data[['Product', 'Company response to consumer', 'Sub-product', 'Issue', 'Company public response',
#       'State', 'Submitted via']].transform(lambda x:LabelEncoder().fit_transform(x))

# data[['Product', 'Company response to consumer', 'Sub-product', 'Issue', 'Company public response',
#       'State', 'Submitted via']] = transformed

### Target (Probabilistic) Encoding

In [117]:
# Categorical features to encode and the binary target.
X = data.loc[:, [
    'Product',
    'Company response to consumer',
    'Sub-product',
    'Issue',
    'Company public response',
    'State',
    'Submitted via',
]]
y = data['Consumer disputed?']

# Fit the target encoder and encode the same rows it was fitted on.
# NOTE(review): the encoder is fitted on every row together with the target;
# if a train/test split happens downstream, consider fitting on the training
# part only to avoid target leakage.
enc = TargetEncoder(smoothing=2)
enc.fit(X, y)
X_transformed = enc.transform(X)

# Write the encoded values back over the original categorical columns.
data[list(X.columns)] = X_transformed

### Weight of Evidence Encoder

In [517]:
# Fit a Weight of Evidence encoder on the categorical features and the target.
enc = WOEEncoder()
enc.fit(X, y)

In [518]:
# Apply the fitted encoder to the categorical feature frame.
X_transformed = enc.transform(X)

In [519]:
# Overwrite the categorical columns of `data` with the encoded values.
# NOTE(review): this replaces the values written by the target-encoding cell;
# when the notebook is run top to bottom, the frame saved at the end as
# processed_data_target.csv will therefore not hold the target encodings.
data[list(X.columns)] = X_transformed

### Leave One Out Encoder

In [562]:
# Fit a leave-one-out encoder on the categorical features and the target.
enc = LeaveOneOutEncoder()
enc.fit(X, y)

In [563]:
# Encode the rows the encoder was fitted on. The target is passed as well:
# per the category_encoders documentation, training data must be transformed
# with transform(X, y); without y each row receives the plain category mean
# (including its own target value), i.e. no leave-one-out is applied.
X_transformed = enc.transform(X, y)

In [564]:
# Overwrite the categorical columns of `data` with the encoded values.
# NOTE(review): this replaces whatever the earlier encoding cells wrote; when
# the notebook is run top to bottom, these are the values that end up in the
# frame saved as processed_data_target.csv.
data[list(X.columns)] = X_transformed

### Final modifications for Label Encoder

In [358]:
# ind1 = np.array(data['Sub-product'] == 0)
# ind2 = np.array(data['Sub-product'] == 46)

# s1 = data['Sub-product'].copy(deep=False)
# s1[ind2] = 0
# s1[ind1] = 46

In [359]:
# ind1 = np.array(data['Company public response'] == 0)
# ind2 = np.array(data['Company public response'] == 3)

# s1 = data['Company public response'].copy(deep=False)
# s1[ind2] = 0
# s1[ind1] = 3

In [360]:
# ind1 = np.array(data['State'] == 0)
# ind2 = np.array(data['State'] == 62)

# s1 = data['State'].copy(deep=False)
# s1[ind2] = 0
# s1[ind1] = 62

### Processed Dataset

In [121]:
# Persist both frames as CSV (the index is written as the first column).
# NOTE(review): absolute, machine-specific paths; adjust when running elsewhere.
data_interim.to_csv(
    path_or_buf="/home/andrija/Desktop/customer_complaints/notebooks/data_interim.csv"
)
data.to_csv(
    path_or_buf="/home/andrija/Desktop/customer_complaints/notebooks/processed_data_target.csv"
)