# Imports

In [110]:
import pandas as pd
import numpy as np

# Data Cleaning

## Selecting Relevant Fields
the dataset will be loaded and transformed and relevant dimensions will remain

In [111]:
#TODO: remove the nrows argument when done testing
df = pd.read_csv('database.csv', nrows=2500)[[
    'NAME_CONTRACT_STATUS',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    # 'NAME_INCOME_TYPE',
    # 'NAME_EDUCATION_TYPE',
    # 'NAME_FAMILY_STATUS',
    # 'NAME_HOUSING_TYPE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    # 'OCCUPATION_TYPE',
    'CNT_FAM_MEMBERS'
]]

## Reductions to Binary Variables

The target variable, NAME_CONTRACT_STATUS, will be reduced from one of 4 possible values, to one of two generic but still correct values – for example, the dataset distingushes between cancelled and rejected and granted loans, however we will only distinguish between granted and not granted loans. Values that are binary but that don't use the binary alphabet will be transformed to use the binary alphabet as well.

Defining a function that will return a copy of the dataframe with reduced fields.

In [112]:
def reduce(df, name, value):
    if type(name) != str:
        raise Exception('only one dimension is reduced at a time')
    idx_name = df.columns.get_loc(name)
    reduced = [(1 if df[name][i] == value else 0) for i in range(len(df))]
    df_reduced = df.drop(labels=[name], axis=1)
    df_reduced.insert(loc=idx_name, column=name, value=reduced)
    return df_reduced

In [113]:
df0 = reduce(df, 'NAME_CONTRACT_STATUS', 'Approved')  # 1 if approved else 0
df1 = reduce(df0, 'CODE_GENDER', 'M')  # 1 if male else 0
df2 = reduce(df1, 'FLAG_OWN_CAR', 'Y')  # 1 if owns car else 0
df3 = reduce(df2, 'FLAG_OWN_REALTY', 'Y')  # 1 if owns property else 0

## One-Hot Encoding
defining a function that returns a copy of the input dataframe with a specific dimension one-hot encoded

In [114]:
def one_hot_encode(df, name):
    if type(name) != str:
        raise Exception('one hot encoding applies to one dimension at a time')
    if len(df) == 0:
        raise Exception('dataframe is empty')
    
    df = df.copy()
    values = df[name].unique()
    
    #for each unique value, we create a new column where df[row][new column] is 1 if the value of df[row][value] == new column
    for v in values:
        one_hot_column = [(1 if df[name][i] == v else 0) for i in range(len(df))]
        df.insert(loc=len(df.loc[0]), column=v, value=one_hot_column)

    return df.drop(labels=[name], axis=1)

performing one-hot encoding on any dimension whose values are one of a set of string values

In [115]:
# df4 = one_hot_encode(df3, 'NAME_INCOME_TYPE')
# df5 = one_hot_encode(df4, 'NAME_EDUCATION_TYPE')
# df6 = one_hot_encode(df5, 'NAME_FAMILY_STATUS')
# df7 = one_hot_encode(df6, 'NAME_HOUSING_TYPE')
# df8 = one_hot_encode(df7, 'OCCUPATION_TYPE')
df8=df3

# Training the models

## Split the dataframe into X and y as numpy arrays

convert the dataframe into a numpy tensor

In [155]:
data = np.array(df8)

num_zero = len(data[data[:,0]==0])
num_one = int(num_zero / 3)

#balance the amount of approvals and rejections
data = np.vstack((
    data[data[:,0]==1][:num_one,:],
    data[data[:,0]==0]
))
print(num_zero, num_one/float(len(data)))

X = data[:,1:]
y = data[:,0].reshape((-1,1)).astype(np.int32).ravel()

888 0.25


normalize the data

In [156]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

## Creating the test class
We will create a class that collects everything we need to build and to analyze a model. The class will point to our data, and to a model building algorithm, and will provide functionality for presenting results.

In [157]:
from sklearn.model_selection import KFold

class Test:
    def __init__(self, name, X, y, algorithm, args):
        self.name = name
        self.X = X  #pointer, not copy
        self.y = y
        self.algorithm = algorithm
        self.args = args
    
    # def run(self, split=0.8):
    #     X_train, X_test = train_test_split(self.X, train_size=split, test_size=(1.0-split))
    #     y_train, y_test = train_test_split(self.y, train_size=split, test_size=(1.0-split))
    #     model = self.algorithm(**self.args)
    #     print("before")
    #     model.fit(X_train, y_train.ravel())
    #     print("after", model.predict(X_test))
    #     acc_train = model.score(X_train,y_train.ravel())
    #     acc_test = model.score(X_test,y_test)
    #     return acc_train, acc_test

    #obtain accuracy using k-fold cross validation
    def run(self, n_splits=5):
        accs_train = []
        accs_test = []
        kfold_model = KFold(n_splits=n_splits, random_state=None, shuffle=False)
        for train_index, test_index in kfold_model.split(self.X):
            X_train = self.X[train_index]
            y_train = self.y[train_index]
            X_test = self.X[test_index]
            y_test = self.y[test_index]
            model = self.algorithm(**self.args)
            model.fit(X_train, y_train)
            accs_train.append(model.score(X_train,y_train))
            accs_test.append(model.score(X_test,y_test))
        avg_acc_train = sum(accs_train)/float(len(accs_train))
        avg_acc_test = sum(accs_test)/float(len(accs_test))
        return avg_acc_train, avg_acc_test

## Support Vector Machine Modelling
The motivation behind support vector machines is that we are building a line of best fit between two datasets, where "best" is defined by an objective function of distance between our line of best fit and between critical points, called support vectors, of these datasets. Support vectors are the closest points to a line of best fit. Our best fit line is also a decision boundary.

In [149]:
from sklearn import svm

### Tests comparing the three kernels

In [158]:
svm_basic_tests = [
    Test(
        name='SVM rbf',
        X=X,
        y=y,
        algorithm=svm.SVC,
        args={'kernel': 'rbf', 'C': 512}
    ),
]

for test in svm_basic_tests:
    print(f'\n\n{test.name}\n' + '='*len(test.name))
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')



SVM rbf
Training accuracy: 0.9117435026889267
Test accuracy: 0.5862547378960166


### Tests comparing different degrees of the polynomial kernel

In [131]:
svm_poly_tests = [
    Test(
        name=f'SVM poly of degree {i}',
        X=X,
        y=y,
        algorithm=svm.SVC,
        args={'class_weight': {0:1,1:10}, 'kernel': 'poly', 'C': 0.01, 'degree': i}
    )
    for i in range(1,11)
]

for test in svm_poly_tests:
    print(f'\n\n{test.name}\n' + '='*len(test.name))
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')



SVM poly of degree 1
Training accuracy: 0.7500131438831932
Test accuracy: 0.7502109704641351


SVM poly of degree 2
Training accuracy: 0.7500131438831932
Test accuracy: 0.7502109704641351


SVM poly of degree 3
Training accuracy: 0.7500131438831932
Test accuracy: 0.7502109704641351


SVM poly of degree 4
Training accuracy: 0.7504355303668258
Test accuracy: 0.7502109704641351


SVM poly of degree 5
Training accuracy: 0.7531803741773935
Test accuracy: 0.7476793248945148


SVM poly of degree 6
Training accuracy: 0.7559247724325986
Test accuracy: 0.7442930701566188


SVM poly of degree 7
Training accuracy: 0.75740267956995
Test accuracy: 0.738378745619681


SVM poly of degree 8
Training accuracy: 0.7588803639296201
Test accuracy: 0.7366909819066009


SVM poly of degree 9
Training accuracy: 0.7641590810866205
Test accuracy: 0.7375277122219839


SVM poly of degree 10
Training accuracy: 0.7690149662046257
Test accuracy: 0.7358327969677465


## Logistic Regression Modelling

In [15]:
from sklearn import linear_model

In [16]:
logreg_l1_tests = [
    Test(
        name=f'Logreg l1 penalty with C={i}',
        X=X,
        y=y,
        algorithm=linear_model.LogisticRegression,
        args={'class_weight': {1:10}, 'penalty':'l1', 'solver':'saga', 'C':i}
    )
    for i in [0.0001, 0.001, 0.01, 0.1, 1, 10]
]

for test in logreg_l1_tests:
    print(f'\n\n{test.name}\n' + '='*len(test.name))
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')



Logreg l1 penalty with C=0.0001
before
after [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Training accuracy: 0.5077464788732394
Test accuracy: 0.4691011235955056


Logreg l1 penalty with C=0.001
before
after [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [17]:
logreg_l2_tests = [
    Test(
        name=f'Logreg l2 penalty with C={i}',
        X=X,
        y=y,
        algorithm=linear_model.LogisticRegression,
        args={'class_weight': {1:10}, 'penalty':'l2', 'solver':'saga', 'C':i}
    )
    for i in [0.0001, 0.001, 0.01, 0.1, 1, 10]
]

for test in logreg_l2_tests:
    print(f'\n\n{test.name}\n' + '='*len(test.name))
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')



Logreg l2 penalty with C=0.0001
before
after [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Training accuracy: 0.5183098591549296
Test accuracy: 0.42696629213483145


Logreg l2 penalty with C=0.001
before
after [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1