# Imports

In [122]:
import pandas as pd
import numpy as np

# Data Cleaning

## Selecting Relevant Fields
the dataset will be loaded and transformed and relevant dimensions will remain

In [123]:
#TODO: remove the nrows argument when done testing
df = pd.read_csv('database.csv', nrows=2500)[[
    'NAME_CONTRACT_STATUS',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    # 'NAME_INCOME_TYPE',
    # 'NAME_EDUCATION_TYPE',
    # 'NAME_FAMILY_STATUS',
    # 'NAME_HOUSING_TYPE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    # 'OCCUPATION_TYPE',
    'CNT_FAM_MEMBERS'
]]

# keep only approved and refused examples (then reset the pandas row indices - which does not happen automatically by default)
df = df.loc[(df['NAME_CONTRACT_STATUS'] == 'Approved') | (df['NAME_CONTRACT_STATUS'] == 'Refused')].reset_index().drop(labels=['index'],axis=1)

df.head(10)

Unnamed: 0,NAME_CONTRACT_STATUS,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,Approved,F,N,Y,0,171000.0,491580.0,State servant,Secondary / secondary special,Married,House / apartment,-14548,-1187,Medicine staff,2.0
1,Approved,F,Y,Y,1,175500.0,29700.0,State servant,Higher education,Married,House / apartment,-11081,-3244,High skill tech staff,3.0
2,Approved,F,N,Y,0,135000.0,48600.0,Working,Secondary / secondary special,Married,House / apartment,-12939,-629,Sales staff,2.0
3,Approved,M,N,N,0,180000.0,196740.0,Working,Secondary / secondary special,Married,Rented apartment,-8945,-672,Sales staff,2.0
4,Refused,M,Y,N,0,225000.0,774229.5,Pensioner,Secondary / secondary special,Married,House / apartment,-23919,365243,,2.0
5,Refused,M,Y,N,1,225000.0,36166.5,Working,Secondary / secondary special,Married,House / apartment,-15173,-3397,Drivers,3.0
6,Approved,F,N,Y,1,90000.0,120582.0,Pensioner,Secondary / secondary special,Married,House / apartment,-18834,365243,,3.0
7,Approved,M,N,N,0,135000.0,30550.5,Working,Secondary / secondary special,Married,House / apartment,-9950,-146,Laborers,2.0
8,Approved,F,N,N,0,54000.0,112500.0,Pensioner,Secondary / secondary special,Widow,House / apartment,-23154,365243,,1.0
9,Approved,M,N,Y,0,315000.0,26811.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-17154,-4006,Drivers,2.0


## Reductions to Binary Variables

The target variable, NAME_CONTRACT_STATUS, will be reduced from one of 4 possible values, to one of two generic but still correct values – for example, the dataset distingushes between cancelled and rejected and granted loans, however we will only distinguish between granted and not granted loans. Values that are binary but that don't use the binary alphabet will be transformed to use the binary alphabet as well.

Defining a function that will return a copy of the dataframe with reduced fields.

In [124]:
def reduce(df, name, value):
    if type(name) != str:
        raise Exception('only one dimension is reduced at a time')
    idx_name = df.columns.get_loc(name)
    reduced = [(1 if df[name][i] == value else 0) for i in range(len(df))]
    df_reduced = df.drop(labels=[name], axis=1)
    df_reduced.insert(loc=idx_name, column=name, value=reduced)
    return df_reduced

In [125]:
df0 = reduce(df, 'NAME_CONTRACT_STATUS', 'Approved')  # 1 if approved else 0
df1 = reduce(df0, 'CODE_GENDER', 'M')  # 1 if male else 0
df2 = reduce(df1, 'FLAG_OWN_CAR', 'Y')  # 1 if owns car else 0
df3 = reduce(df2, 'FLAG_OWN_REALTY', 'Y')  # 1 if owns property else 0

## One-Hot Encoding
defining a function that returns a copy of the input dataframe with a specific dimension one-hot encoded

In [126]:
def one_hot_encode(df, name):
    if type(name) != str:
        raise Exception('one hot encoding applies to one dimension at a time')
    if len(df) == 0:
        raise Exception('dataframe is empty')
    
    df = df.copy()
    values = df[name].unique()
    
    #for each unique value, we create a new column where df[row][new column] is 1 if the value of df[row][value] == new column
    for v in values:
        one_hot_column = [(1 if df[name][i] == v else 0) for i in range(len(df))]
        df.insert(loc=len(df.loc[0]), column=v, value=one_hot_column)

    return df.drop(labels=[name], axis=1)

performing one-hot encoding on any dimension whose values are one of a set of string values

In [127]:
# df4 = one_hot_encode(df3, 'NAME_INCOME_TYPE')
# df5 = one_hot_encode(df4, 'NAME_EDUCATION_TYPE')
# df6 = one_hot_encode(df5, 'NAME_FAMILY_STATUS')
# df7 = one_hot_encode(df6, 'NAME_HOUSING_TYPE')
# df8 = one_hot_encode(df7, 'OCCUPATION_TYPE')
df8=df3

# Training the models

## Split the dataframe into X and y as numpy arrays

convert the dataframe into a numpy tensor

In [128]:
data = np.array(df8)

#undersample the target 1's
num_zero = len(data[data[:,0]==0])
num_one = num_zero

#balance the amount of approvals and rejections
data = np.vstack((
    data[data[:,0]==1][:num_one,:],
    data[data[:,0]==0]
))
print(num_zero, num_one/float(len(data)))

X = data[:,2:]
y = data[:,1].reshape((-1,1)).astype(np.int32).ravel()

409 0.5


normalize the data

In [129]:
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit_transform(X)

ValueError: could not convert string to float: 'State servant'

## Creating the test class
We will create a class that collects everything we need to build and to analyze a model. The class will point to our data, and to a model building algorithm, and will provide functionality for presenting results.

In [113]:
from sklearn.model_selection import KFold

class Test:
    def __init__(self, name, X, y, algorithm, args):
        self.name = name
        self.X = X  #pointer, not copy
        self.y = y
        self.algorithm = algorithm
        self.args = args
        self.prediction=None
        self.avg_acc_train=None
        self.avg_acc_test=None
    
    def get_best_prediction(self):
        if self.prediction is None:
            raise Exception('the algorithm has not yet been run')
        return self.prediction
    
    def get_avg_accuracies(self):
        if self.avg_acc_train is None or self.avg_acc_test is None:
            raise Exception('the algorithm has not yet been run')
        return (self.avg_acc_train, self.avg_acc_test)
    
    def get_stats(self):
        return (*self.get_avg_accuracies(), self.get_best_prediction())

    #obtain accuracy using k-fold cross validation
    def run(self, n_splits=5):
        accs_train = []
        accs_test = []
        best_acc_test = 0
        best_acc_index = -1
        predictions = []
        kfold_model = KFold(n_splits=n_splits, random_state=None, shuffle=False)
        i = 0
        for train_index, test_index in kfold_model.split(self.X):
            X_train = self.X[train_index]
            y_train = self.y[train_index]
            X_test = self.X[test_index]
            y_test = self.y[test_index]
            model = self.algorithm(**self.args)
            model.fit(X_train, y_train)
            accs_train.append(model.score(X_train,y_train))
            acc_test = model.score(X_test,y_test)
            accs_test.append(acc_test)
            if acc_test > best_acc_test:
                best_acc_test = acc_test
                best_acc_index = i
            predictions.append(model.predict(X_test))
            i += 1
        self.avg_acc_train = sum(accs_train)/float(len(accs_train))
        self.avg_acc_test = sum(accs_test)/float(len(accs_test))
        self.prediction = predictions[best_acc_index]
        return self.get_stats()
    
    def display(self):
        print(f'\n\n{self.name}\n' + '='*len(self.name))
        print(f'Average training accuracy: {self.avg_acc_train}')
        print(f'Average test accuracy: {self.avg_acc_test}')
        print(f'Best prediction: {self.prediction}')

## Support Vector Machine Modelling
The motivation behind support vector machines is that we are building a line of best fit between two datasets, where "best" is defined by an objective function of distance between our line of best fit and between critical points, called support vectors, of these datasets. Support vectors are the closest points to a line of best fit. Our best fit line is also a decision boundary.

In [114]:
from sklearn import svm

### Tests comparing the three kernels

In [115]:
svm_basic_tests = [
    Test(
        name='SVM linear',
        X=X,
        y=y,
        algorithm=svm.SVC,
        args={'kernel': 'linear', 'C': 10, 'class_weight': {0:1,1:2}}
    ),
    Test(
        name='SVM rbf',
        X=X,
        y=y,
        algorithm=svm.SVC,
        args={'kernel': 'rbf', 'C': 10, 'class_weight': {0:1,1:2}}
    ),
]

for test in svm_basic_tests:
    test.run()
    test.display()



SVM linear
Average training accuracy: 0.7839223101524382
Average test accuracy: 0.7421068382462965
Best prediction: [1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 0 0
 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 1 1 1 0 0 0
 1 1 0 1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 1 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 0
 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1
 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1]


SVM rbf
Average training accuracy: 0.8783598291196864
Average test accuracy: 0.7567335029178512
Best prediction: [0 0 1 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1 0 1 1 0
 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 1
 0 1 0 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 1
 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 1]


### Tests comparing different degrees of the polynomial kernel

In [120]:
svm_poly_tests = [
    Test(
        name=f'SVM poly of degree {i}',
        X=X,
        y=y,
        algorithm=svm.SVC,
        args={'class_weight': {0:1,1:2}, 'kernel': 'poly', 'C': 0.01, 'degree': i}
    )
    for i in range(1,11)
]

for test in svm_poly_tests:
    test.run()
    test.display()



SVM poly of degree 1
Average training accuracy: 0.3997586198846791
Average test accuracy: 0.39608708663773756
Best prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1]


SVM poly of degree 2
Average training accuracy: 0.5718206223591755
Average test accuracy: 0.5660332186143947
Best prediction: [1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 1 0 1 1 1 0 1
 0 1 1 0 1 1 1 1 1 1 0 1 1 0 0 0 0 1 0 1 0 0 1 1 1 1 1 0 1 0 0 1 1 1 1 0 0
 0 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0
 1 1 1 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1
 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1]


SVM poly of degree 3
Average training accuracy: 0.6913359945841212
Average te

## Logistic Regression Modelling

In [117]:
from sklearn import linear_model
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [118]:
logreg_l1_tests = [
    Test(
        name=f'Logreg l1 penalty with C={i}',
        X=X,
        y=y,
        algorithm=linear_model.LogisticRegression,
        args={'class_weight': {1:10}, 'penalty':'l1', 'solver':'saga', 'C':i}
    )
    for i in [0.0001, 0.001, 0.01, 0.1, 1, 10]
]

for test in logreg_l1_tests:
    test.run()
    test.display()



Logreg l1 penalty with C=0.0001
Average training accuracy: 0.35085603567009827
Average test accuracy: 0.3508603920395032
Best prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Logreg l1 penalty with C=0.001
Average training accuracy: 0.35085603567009827
Average test accuracy: 0.3508603920395032
Best prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Logreg l1 penalty with C=0.01
Average training accuracy

In [119]:
logreg_l2_tests = [
    Test(
        name=f'Logreg l2 penalty with C={i}',
        X=X,
        y=y,
        algorithm=linear_model.LogisticRegression,
        args={'class_weight': {1:10}, 'penalty':'l2', 'solver':'saga', 'C':i}
    )
    for i in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
]

for test in logreg_l2_tests:
    test.run()
    test.display()



Logreg l2 penalty with C=0.0001
Average training accuracy: 0.35085603567009827
Average test accuracy: 0.3508603920395032
Best prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Logreg l2 penalty with C=0.001
Average training accuracy: 0.35085603567009827
Average test accuracy: 0.3508603920395032
Best prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Logreg l2 penalty with C=0.01
Average training accuracy