# Data Audit Report

This is the first part of our Competition 2 where we performed our preprocessing steps on the data. More details can be found in our [README.MD](README.md) file.

### Importing Our Required Packages and things

In [25]:
#importing required libraries and packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from pandas import ExcelWriter
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

#set plot style to 'ggplot' and don't limit the view of DF when diaplyhign them to screen
plt.style.use('ggplot')
pd.options.display.max_columns = None

### Read the Data In

In [2]:
#importing our data and resetting our index
df = pd.read_excel('Data/Comp2_Raw_Data.xls')

### Renaming and Dropping Columns

To make our dataset cleaner, we renamed our columns and we also dropped the ID column because it had no value to our model

In [3]:
df = df.drop(columns = ['ID'])
df.columns = ['Credit_Limit', 'Gender', 'Education', 'Marriage',  'Age', 'Pay_Sept', 'Pay_Aug', 'Pay_Jul', 'Pay_Jun', 'Pay_May', 'Pay_Apr', 
             'Bill_Amt_Sept', 'Bill_Amt_Aug', 'Bill_Amt_Jul', 'Bill_Amt_Jun', 'Bill_Amt_May', 'Bill_Amt_Apr','Pay_Amt_Sept', 'Pay_Amt_Aug',
             'Pay_Amt_Jul', 'Pay_Amt_Jun', 'Pay_Amt_May', 'Pay_Amt_Apr', 'Default']

### Creating A Target DF and a Feature DF 

We seperated our data into a df_target which held all of our target variables. This way we don't accidently scale/transform them or include them as a feature in our feature selection/reduction  below

In [4]:
#Copy our target variables to their own df
df_target = df[['Default']].copy()
df_target.head(10)
#change the data type to categorical
df_target['Default'] = pd.Categorical(df_target.Default)
#Drop Default from our target variable df
df = df.drop(['Default'], axis=1)

### Changing Our DataTypes

We want to make our fields from integer to float so we do that here

In [5]:
#chnage column datatypes to float
for col in df:
    df[col]=pd.to_numeric(df[col], errors='coerce', downcast='float')

### EDA On Our Data

Here we check for missing values as well as begin our preocessing steps to transofrm and scale our data. 

We will want to scale and transform our continous fields. We copy these to a new dataframe so we don't impact our categorical variables

In [6]:
#scale our continous fields
columns = ['Bill_Amt_Apr', 'Bill_Amt_May', 'Bill_Amt_Jun', 'Bill_Amt_Jul', 'Bill_Amt_Aug', 'Bill_Amt_Sept', 
           'Pay_Amt_Apr', 'Pay_Amt_May', 'Pay_Amt_Jun', 'Pay_Amt_Jul', 'Pay_Amt_Aug', 'Pay_Amt_Sept', 'Credit_Limit']
#Copy our target variables to their own df
df_cont = df[columns].copy()

First thing we do is handle outliers, on all of our continous columns. We move all the data to be within 3 std dev of the mean. 

In [7]:
for col in df_cont.columns:
    u_bound = df_cont[col].mean() + 3* df_cont[col].std()
    l_bound = df_cont[col].mean() - 3* df_cont[col].std()
    df_cont[col][df_cont[col] > u_bound] = u_bound
    df_cont[col][df_cont[col] < l_bound] = l_bound

Here we normalize our data to remove the skewness. We Use Sckitlearns `normalize` function

In [8]:
df_cont = pd.DataFrame(preprocessing.normalize(df_cont,norm='l2'),columns = df_cont.columns)

Here we use `MinMaxScaler` to scale all of oour data so it is between `[0,1]`

In [9]:
min_max_scaler = preprocessing.MinMaxScaler()
df_cont = pd.DataFrame(min_max_scaler.fit_transform(df_cont),columns = df_cont.columns)

In [10]:
pay_cols = ['Pay_Amt_Apr', 'Pay_Amt_May', 'Pay_Amt_Jun', 'Pay_Amt_Jul', 'Pay_Amt_Aug', 'Pay_Amt_Sept']
for col in pay_cols:
     df_cont[col]=np.log(df_cont[col]+.001)

In [11]:
df_cont.reset_index(drop=True, inplace=True)
for col in df_cont:
    df[col] = df_cont[col]

In [12]:
X_train, x_test, y_train, y_test = train_test_split(df,df_target, test_size= 0.20, random_state=2019)
oversample = pd.concat([X_train,y_train],axis=1)
max_size = oversample['Default'].value_counts().max()
lst = [oversample]
    
for class_index, group in oversample.groupby('Default'):
    lst.append(group.sample(max_size-len(group), replace=True))
X_train = pd.concat(lst)
y_train=pd.DataFrame.copy(X_train['Default'])
del X_train['Default']

In [13]:
xgb_model = xgb.XGBClassifier(seed=2019)

In [None]:
xgb_model.fit(X_train,y_train)
y_pred = xgb_model.predict(x_test)

In [15]:
print(classification_report(y_test,y_pred)) 

             precision    recall  f1-score   support

          0       0.88      0.82      0.85      4710
          1       0.48      0.61      0.54      1290

avg / total       0.80      0.77      0.78      6000



In [16]:
print(confusion_matrix(y_test,y_pred)) 

[[3856  854]
 [ 504  786]]


In [17]:
print(metrics.accuracy_score(y_test,y_pred))

0.7736666666666666


In [None]:
kfold = model_selection.KFold(n_splits=10, random_state=7)
scores = cross_val_score(xgb_model, x_test, y_test.Default.ravel(), cv=kfold)

In [27]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.83 (+/- 0.04)


## Trying to Optimze

Below we try to optimze our hyperparameters using RandomSearchCV

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

In [29]:
#setting parameters we want to tune
cv_params = {'max_depth': [1,2,3,4,5,6], 'min_child_weight': [1,2,3,4], 'learning_rate': [0.1,0.2,0.3], 'n_estimators': [50,100,150]}    # parameters to be tries in the grid search
fix_params = {'learning_rate': 0.2, 'n_estimators': 100, 'objective': 'binary:logistic'}   #other parameters, fixed for the moment 
csv = RandomizedSearchCV(xgb.XGBClassifier(**fix_params), cv_params, scoring = 'f1', cv = 5)

In [None]:
csv.fit(X_train, y_train)

In [31]:
#get the best parameters
csv.best_params_

{'n_estimators': 100,
 'min_child_weight': 4,
 'max_depth': 6,
 'learning_rate': 0.3}

In [37]:
xgb_model = xgb.XGBClassifier(seed=2019, min_child_weight=4, max_depth=6, learning_rate= 0.3, n_estimators=100, objective='binary:logistic')
xgb_model.fit(X_train,y_train)
y_pred = xgb_model.predict(x_test)

  if diff:


In [38]:
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.88      0.82      0.85      4710
          1       0.47      0.57      0.52      1290

avg / total       0.79      0.77      0.78      6000

[[3872  838]
 [ 549  741]]
0.7688333333333334


In [None]:
kfold = model_selection.KFold(n_splits=10, random_state=7)
scores = cross_val_score(xgb_model, x_test, y_test.Default.ravel(), cv=kfold)

In [40]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.82 (+/- 0.04)


In [41]:
xgb_model.feature_importances_

array([0.08562401, 0.00758294, 0.01484992, 0.00947867, 0.05876777,
       0.01958926, 0.00726698, 0.00947867, 0.0056872 , 0.00663507,
       0.00695103, 0.08404423, 0.0600316 , 0.06066351, 0.06477093,
       0.05276461, 0.06192733, 0.07677725, 0.05845182, 0.07772512,
       0.0600316 , 0.05466035, 0.05624013], dtype=float32)