# Data Audit Report

This is the first part of our Competition 2 where we performed our preprocessing steps on the data. More details can be found in our [README.MD](README.md) file.

### Importing Our Required Packages and things

In [1]:
#importing required libraries and packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from pandas import ExcelWriter
from sklearn.model_selection import train_test_split
from sklearn import svm



# Show every column when a DataFrame is displayed, and draw plots with the 'ggplot' theme.
pd.options.display.max_columns = None
plt.style.use('ggplot')

### Read the Data In

In [2]:
# Load the raw competition data from the Excel workbook and preview its first row.
df = pd.read_excel(io='Data/Comp2_Raw_Data.xls')
df.head(n=1)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1


### Renaming and Dropping Columns

To make our dataset cleaner, we renamed our columns and dropped the ID column because it adds no value to our model.

In [3]:
# Discard the ID column, then relabel the remaining 24 columns by position.
df = df.drop('ID', axis=1)
df.columns = [
    # demographics
    'Credit_Limit', 'Gender', 'Education', 'Marriage', 'Age',
    # monthly repayment status
    'Pay_Sept', 'Pay_Aug', 'Pay_Jul', 'Pay_Jun', 'Pay_May', 'Pay_Apr',
    # monthly bill amounts
    'Bill_Amt_Sept', 'Bill_Amt_Aug', 'Bill_Amt_Jul', 'Bill_Amt_Jun', 'Bill_Amt_May', 'Bill_Amt_Apr',
    # monthly payment amounts
    'Pay_Amt_Sept', 'Pay_Amt_Aug', 'Pay_Amt_Jul', 'Pay_Amt_Jun', 'Pay_Amt_May', 'Pay_Amt_Apr',
    # label
    'Default',
]
df.head()

Unnamed: 0,Credit_Limit,Gender,Education,Marriage,Age,Pay_Sept,Pay_Aug,Pay_Jul,Pay_Jun,Pay_May,Pay_Apr,Bill_Amt_Sept,Bill_Amt_Aug,Bill_Amt_Jul,Bill_Amt_Jun,Bill_Amt_May,Bill_Amt_Apr,Pay_Amt_Sept,Pay_Amt_Aug,Pay_Amt_Jul,Pay_Amt_Jun,Pay_Amt_May,Pay_Amt_Apr,Default
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


### Creating A Target DF and a Feature DF 

We separated our data into a `df_target` frame that holds our target variable. This way we don't accidentally scale/transform it or include it as a feature in our feature selection/reduction below.

In [4]:
# Copy the label into its own frame so it cannot be transformed or selected as a feature.
df_target = df[['Default']].copy()
# Store the label as a categorical rather than a plain integer column.
df_target['Default'] = pd.Categorical(df_target['Default'])
# Remove the label from the feature frame (df), leaving only predictors there.
df = df.drop(columns=['Default'])

In [5]:
# Keep only the credit limit and the demographic fields as features;
# the monthly repayment, bill and payment columns are not carried forward.
df = df.loc[:, ['Credit_Limit', 'Gender', 'Education', 'Marriage', 'Age']].copy()
df.head()

Unnamed: 0,Credit_Limit,Gender,Education,Marriage,Age
0,20000,2,2,1,24
1,120000,2,2,2,26
2,90000,2,2,2,34
3,50000,2,2,1,37
4,50000,1,2,1,57


In [6]:
# Check that the label column was carried over into df_target.
df_target.head()

Unnamed: 0,Default
0,1
1,1
2,0
3,0
4,0


### Changing Our DataTypes

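We one-hot encode the categorical fields (gender, education, marital status) as 0/1 indicator columns and bin age into quartiles with one indicator per bin. The original columns are dropped afterwards.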

In [7]:
# Gender: one 0/1 indicator per code (1 and 2).
df['Gender_M'] = (df['Gender'] == 1).astype(int)
df['Gender_F'] = (df['Gender'] == 2).astype(int)

# Education: code 2 versus every other code.
is_code2_education = df['Education'] == 2
df['Edu_College'] = is_code2_education.astype(int)
df['Edu_Other'] = (~is_code2_education).astype(int)

# Marriage: code 1 versus every other code.
is_code1_marriage = df['Marriage'] == 1
df['Married'] = is_code1_marriage.astype(int)
df['Married_Other'] = (~is_code1_marriage).astype(int)

# Age: split into quartile bins labelled 0-3, then one 0/1 indicator per bin.
age_quartile = pd.qcut(df['Age'], q=4, labels=[0, 1, 2, 3])
for quartile in range(4):
    df['Age_Range{}'.format(quartile)] = (age_quartile == quartile).astype(int)

# Only the derived indicator columns (plus Credit_Limit) remain as features.
df = df.drop(columns=['Gender', 'Education', 'Marriage', 'Age'])

In [8]:
# Preview the encoded feature frame.
df.head(n=5)

Unnamed: 0,Credit_Limit,Gender_M,Gender_F,Edu_College,Edu_Other,Married,Married_Other,Age_Range0,Age_Range1,Age_Range2,Age_Range3
0,20000,0,1,1,0,1,0,1,0,0,0
1,120000,0,1,1,0,0,1,1,0,0,0
2,90000,0,1,1,0,0,1,0,1,0,0
3,50000,0,1,1,0,1,0,0,0,1,0
4,50000,1,0,1,0,1,0,0,0,0,1


## Bagging Ensemble

In [9]:
# Use every remaining column of the feature frame as a model input.
cols = df.columns.tolist()
X = df[cols]
X.head()

Unnamed: 0,Credit_Limit,Gender_M,Gender_F,Edu_College,Edu_Other,Married,Married_Other,Age_Range0,Age_Range1,Age_Range2,Age_Range3
0,20000,0,1,1,0,1,0,1,0,0,0
1,120000,0,1,1,0,0,1,1,0,0,0
2,90000,0,1,1,0,0,1,0,1,0,0
3,50000,0,1,1,0,1,0,0,0,1,0
4,50000,1,0,1,0,1,0,0,0,0,1


In [10]:

# The label series that pairs row-for-row with X.
y = df_target['Default']
y.head()

0    1
1    1
2    0
3    0
4    0
Name: Default, dtype: category
Categories (2, int64): [0, 1]

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2019,
)

In [12]:
# Balance the training classes by random oversampling with replacement.
# Only the training split is resampled; X_test / y_test are left untouched.
# Features and label are joined first so that each row is resampled as a unit.
oversample = pd.concat([X_train, y_train], axis=1)
max_size = oversample['Default'].value_counts().max()
lst = [oversample]

for class_index, group in oversample.groupby('Default'):
    # Top each class up to the size of the largest class (the largest class
    # draws zero extra rows). The seed makes the resampled rows identical on
    # every run, so downstream scores are reproducible.
    lst.append(group.sample(max_size - len(group), replace=True, random_state=2019))

# Split the balanced frame back into features and label.
X_train = pd.concat(lst)
y_train = X_train['Default'].copy()
X_train = X_train.drop(columns=['Default'])

In [None]:
# Build a linear-kernel support vector classifier and fit it on the training data
# (fit returns the estimator itself, so construction and training are chained).
# NOTE(review): Credit_Limit is on a much larger numeric scale than the 0/1
# indicator columns; SVMs are generally scale-sensitive, so standardising the
# features first may be worth considering - confirm before relying on results.
clf = svm.SVC(kernel='linear', random_state=2019).fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = clf.predict(X_test)

In [None]:
# `metrics` is otherwise only imported in a later cell, which makes this cell fail
# with a NameError on a fresh kernel; import it here so the cell runs in order.
from sklearn import metrics

# Fraction of held-out rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression

In [None]:
# Repeat the split -> oversample -> fit -> evaluate cycle N_RUNS times and report
# the average weighted F1 and AUC across runs.
N_RUNS = 10      # number of repetitions averaged below
BASE_SEED = 123  # seed of the first run; later runs use BASE_SEED + 1, + 2, ...

# lists for f1-score and AUC (one entry per run)
f1_score_lst = []
auc_lst = []


# loop to calculate f1 and auc scores and present averages after N_RUNS runs
for count in range(1, N_RUNS + 1):
    print(count)

    # Each run gets its own seed. With a single fixed seed every run would reuse
    # the same train/test split and the "average" would not reflect any
    # split-to-split variation. The first run uses BASE_SEED itself.
    run_seed = BASE_SEED + count - 1

    # Model building
    clf1 = svm.SVC(kernel='linear', random_state=2019)

    # Splitting data into testing and training
    X2_train, X2_test, y2_train, y2_test = train_test_split(
        X, y, test_size=0.2, random_state=run_seed
    )

    # Oversample the training split only: top every class up to the size of the
    # largest class by sampling rows with replacement (seeded for reproducibility).
    oversample = pd.concat([X2_train, y2_train], axis=1)
    max_size = oversample['Default'].value_counts().max()
    lst = [oversample]
    for class_index, group in oversample.groupby('Default'):
        lst.append(group.sample(max_size - len(group), replace=True, random_state=run_seed))
    X2_train = pd.concat(lst)
    y2_train = X2_train['Default'].copy()
    X2_train = X2_train.drop(columns=['Default'])

    # fitting model on oversampled data
    clf1.fit(X2_train, y2_train)

    y2_pred = clf1.predict(X2_test)

    # 10-fold cross validation on the oversampled training data.
    # shuffle=True is required for random_state to be accepted by KFold, and it
    # also stops the resampled rows appended at the end of X2_train from forming
    # their own folds.
    # NOTE(review): the oversampled rows are exact duplicates, so copies of one
    # row can fall in both the fitting and validation folds; this CV accuracy
    # may therefore be optimistic - confirm whether that is acceptable.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=run_seed)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(clf1, X2_train, y2_train, cv=kfold, scoring=scoring)

    # AUC for this run.
    # NOTE(review): this is computed from hard class predictions rather than
    # continuous decision scores - confirm that is the intended AUC.
    clf1_roc_auc = roc_auc_score(y2_test, y2_pred)

    # record this run's weighted f1-score (index 2 of the returned tuple) and AUC
    f1_score_lst.append(precision_recall_fscore_support(y2_test, y2_pred, average='weighted')[2])
    auc_lst.append(clf1_roc_auc)


# Averages over all runs.
print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

# Everything below describes the LAST run only (clf1, X2_test, y2_test, y2_pred
# and results still hold the values from the final loop iteration).
confusion_matrix_y2 = confusion_matrix(y2_test, y2_pred)


print('Accuracy of classifier on test set: {:.3f}'.format(clf1.score(X2_test, y2_test)))

print("10-fold cross validation average accuracy of clf1: %.3f" % (results.mean()))

print('Confusion Matrix for Classfier:')
print(confusion_matrix_y2)

print('Classification Report for Classfier:')
print(classification_report(y2_test, y2_pred))


