# Imports

In [1]:
import pandas as pd
import numpy as np

# Data Cleaning

## Selecting Relevant Fields
the dataset will be loaded and transformed and relevant dimensions will remain

In [2]:
#TODO: remove the nrows argument when done testing
df = pd.read_csv('database.csv', nrows=2500)[[
    'NAME_CONTRACT_STATUS',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'OCCUPATION_TYPE',
    'CNT_FAM_MEMBERS'
]]

## Reductions to binary variables

The target variable, NAME_CONTRACT_STATUS, will be reduced from one of 4 possible values, to one of two generic but still correct values – for example, the dataset distingushes between cancelled and rejected and granted loans, however we will only distinguish between granted and not granted loans. Values that are binary but that don't use the binary alphabet will be transformed to use the binary alphabet as well.

Defining a function that will return a copy of the dataframe with reduced fields.

In [3]:
def reduce(df, name, value):
    if type(name) != str:
        raise Exception('only one dimension is reduced at a time')
    idx_name = df.columns.get_loc(name)
    reduced = [(1 if df[name][i] == value else 0) for i in range(len(df))]
    df_reduced = df.drop(labels=[name], axis=1)
    df_reduced.insert(loc=idx_name, column=name, value=reduced)
    return df_reduced

In [4]:
df0 = reduce(df, 'NAME_CONTRACT_STATUS', 'Approved')  # 1 if approved else 0
df1 = reduce(df0, 'CODE_GENDER', 'M')  # 1 if male else 0
df2 = reduce(df1, 'FLAG_OWN_CAR', 'Y')  # 1 if owns car else 0
df3 = reduce(df2, 'FLAG_OWN_REALTY', 'Y')  # 1 if owns property else 0

## One-Hot Encoding
defining a function that returns a copy of the input dataframe with a specific dimension one-hot encoded

In [5]:
def one_hot_encode(df, name):
    if type(name) != str:
        raise Exception('one hot encoding applies to one dimension at a time')
    if len(df) == 0:
        raise Exception('dataframe is empty')
    
    df = df.copy()
    values = df[name].unique()
    
    #for each unique value, we create a new column where df[row][new column] is 1 if the value of df[row][value] == new column
    for v in values:
        one_hot_column = [(1 if df[name][i] == v else 0) for i in range(len(df))]
        df.insert(loc=len(df.loc[0]), column=v, value=one_hot_column)

    return df.drop(labels=[name], axis=1)

performing one-hot encoding on any dimension whose values are one of a set of string values

In [6]:
df4 = one_hot_encode(df3, 'NAME_INCOME_TYPE')
df5 = one_hot_encode(df4, 'NAME_EDUCATION_TYPE')
df6 = one_hot_encode(df5, 'NAME_FAMILY_STATUS')
df7 = one_hot_encode(df6, 'NAME_HOUSING_TYPE')
df8 = one_hot_encode(df7, 'OCCUPATION_TYPE')

# Training the models

## Split the dataframe into X and y as numpy arrays

convert the dataframe into a numpy tensor

In [7]:
dataset_as_array = np.array(df8)
X = dataset_as_array[:,1:]
y = dataset_as_array[:,0].reshape((-1,1)).astype(np.int32)

normalize the data

In [8]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

## Creating the test class
We will create a class that collects everything we need to build and to analyze a model. The class will point to our data, and to a model building algorithm, and will provide functionality for presenting results.

In [9]:
from sklearn.model_selection import train_test_split

class Test:
    def __init__(self, name, X, y, algorithm, args):
        self.name = name
        self.X = X  #pointer, not copy
        self.y = y
        self.algorithm = algorithm
        self.args = args
    
    def run(self, split=0.8):
        X_train, X_test = train_test_split(self.X, train_size=split, test_size=(1.0-split))
        y_train, y_test = train_test_split(self.y, train_size=split, test_size=(1.0-split))
        model = self.algorithm(**self.args)
        print("before")
        model.fit(X_train, y_train.ravel())
        print("after")
        acc_train = model.score(X_train,y_train.ravel())
        acc_test = model.score(X_test,y_test)
        return acc_train, acc_test

## Support Vector Machine Modelling
The motivation behind support vector machines is that we are building a line of best fit between two datasets, where "best" is defined by an objective function of distance between our line of best fit and between critical points, called support vectors, of these datasets. Support vectors are the closest points to a line of best fit. Our best fit line is also a decision boundary.

In [10]:
from sklearn import svm

### Tests comparing the three kernels

In [11]:
svm_basic_tests = [
    Test(
        name='SVM linear',
        X=X,
        y=y,
        algorithm=svm.SVC,
        args={'probability': False, 'kernel': 'linear', 'C': 100}
    )
]

In [17]:
for test in svm_basic_tests:
    print(f'\n\n{test.name}\n' + '='*len(test.name))
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')

NameError: name 'svm_basic_tests' is not defined

### Tests comparing different degrees of the polynomial kernel

In [13]:
svm_poly_tests = [
    Test(
        name=f'SVM poly of degree {i}',
        X=X,
        y=y,
        algorithm=svm.SVC,
        args={'probability': False, 'kernel': 'poly', 'C': 0.001, 'degree': i}
    )
    for i in range(1,11)
]

In [14]:
for test in svm_poly_tests:
    print(f'\n\n{test.name}\n' + '='*len(test.name))
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')



SVM poly of degree 1
Training accuracy: 0.643
Test accuracy: 0.652


SVM poly of degree 2
Training accuracy: 0.6415
Test accuracy: 0.658


SVM poly of degree 3
Training accuracy: 0.632
Test accuracy: 0.696


SVM poly of degree 4
Training accuracy: 0.643
Test accuracy: 0.652


SVM poly of degree 5
Training accuracy: 0.6445
Test accuracy: 0.646


SVM poly of degree 6
Training accuracy: 0.6525
Test accuracy: 0.614


SVM poly of degree 7
Training accuracy: 0.643
Test accuracy: 0.652


SVM poly of degree 8
Training accuracy: 0.6475
Test accuracy: 0.634


SVM poly of degree 9
Training accuracy: 0.649
Test accuracy: 0.628


SVM poly of degree 10
Training accuracy: 0.644
Test accuracy: 0.648


### Linear Regression Modelling

In [16]:
from sklearn import linear_model

In [25]:
logreg_l1_tests = [
    Test(
        name=f'Logreg l1 penalty with C={i}',
        X=X,
        y=y,
        algorithm=linear_model.LogisticRegression,
        args={'penalty':'l1', 'solver':'saga', 'C':i}
    )
    for i in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
]

for test in logreg_l1_tests:
    print(f'\n\n{test.name}\n' + '='*len(test.name))
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')



Logreg l1 penalty with C=0.0001
Training accuracy: 0.6535
Test accuracy: 0.61


Logreg l1 penalty with C=0.001
Training accuracy: 0.6515
Test accuracy: 0.618


Logreg l1 penalty with C=0.01
Training accuracy: 0.635
Test accuracy: 0.684


Logreg l1 penalty with C=0.1
Training accuracy: 0.65
Test accuracy: 0.624


Logreg l1 penalty with C=1
Training accuracy: 0.6475
Test accuracy: 0.634


Logreg l1 penalty with C=10
Training accuracy: 0.6435
Test accuracy: 0.65


Logreg l1 penalty with C=100
Training accuracy: 0.652
Test accuracy: 0.616


Logreg l1 penalty with C=1000
Training accuracy: 0.6485
Test accuracy: 0.63


In [26]:
logreg_l2_tests = [
    Test(
        name=f'Logreg l2 penalty with C={i}',
        X=X,
        y=y,
        algorithm=linear_model.LogisticRegression,
        args={'penalty':'l1', 'solver':'saga', 'C':i}
    )
    for i in [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
]

for test in logreg_l2_tests:
    print(f'\n\n{test.name}\n' + '='*len(test.name))
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')



Logreg l2 penalty with C=0.0001
Training accuracy: 0.641
Test accuracy: 0.66


Logreg l2 penalty with C=0.001
Training accuracy: 0.6455
Test accuracy: 0.642


Logreg l2 penalty with C=0.01
Training accuracy: 0.6495
Test accuracy: 0.626


Logreg l2 penalty with C=0.1
Training accuracy: 0.641
Test accuracy: 0.66


Logreg l2 penalty with C=1
Training accuracy: 0.6465
Test accuracy: 0.638


Logreg l2 penalty with C=10
Training accuracy: 0.6545
Test accuracy: 0.606


Logreg l2 penalty with C=100
Training accuracy: 0.6465
Test accuracy: 0.638


Logreg l2 penalty with C=1000
Training accuracy: 0.6505
Test accuracy: 0.622
