# Imports

In [64]:
import pandas as pd
import numpy as np

# Data Cleaning

## Selecting Relevant Fields
the dataset will be loaded and transformed and relevant dimensions will remain

In [65]:
#TODO: remove the nrows argument when done testing
df = pd.read_csv('database.csv', nrows=2500)[[
    'NAME_CONTRACT_STATUS',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'OCCUPATION_TYPE',
    'CNT_FAM_MEMBERS'
]]

## Reductions to binary variables

The target variable, NAME_CONTRACT_STATUS, will be reduced from one of 4 possible values, to one of two generic but still correct values – for example, the dataset distingushes between cancelled and rejected and granted loans, however we will only distinguish between granted and not granted loans. Values that are binary but that don't use the binary alphabet will be transformed to use the binary alphabet as well.

Defining a function that will return a copy of the dataframe with reduced fields.

In [66]:
def reduce(df, name, value):
    if type(name) != str:
        raise Exception('only one dimension is reduced at a time')
    idx_name = df.columns.get_loc(name)
    reduced = [(1 if df[name][i] == value else 0) for i in range(len(df))]
    df_reduced = df.drop(labels=[name], axis=1)
    df_reduced.insert(loc=idx_name, column=name, value=reduced)
    return df_reduced

In [67]:
df0 = reduce(df, 'NAME_CONTRACT_STATUS', 'Approved')  # 1 if approved else 0
df1 = reduce(df0, 'CODE_GENDER', 'M')  # 1 if male else 0
df2 = reduce(df1, 'FLAG_OWN_CAR', 'Y')  # 1 if owns car else 0
df3 = reduce(df2, 'FLAG_OWN_REALTY', 'Y')  # 1 if owns property else 0

## One-Hot Encoding
defining a function that returns a copy of the input dataframe with a specific dimension one-hot encoded

In [68]:
def one_hot_encode(df, name):
    if type(name) != str:
        raise Exception('one hot encoding applies to one dimension at a time')
    if len(df) == 0:
        raise Exception('dataframe is empty')
    
    df = df.copy()
    values = df[name].unique()
    
    #for each unique value, we create a new column where df[row][new column] is 1 if the value of df[row][value] == new column
    for v in values:
        one_hot_column = [(1 if df[name][i] == v else 0) for i in range(len(df))]
        df.insert(loc=len(df.loc[0]), column=v, value=one_hot_column)

    return df.drop(labels=[name], axis=1)

performing one-hot encoding on any dimension whose values are one of a set of string values

In [69]:
df4 = one_hot_encode(df3, 'NAME_INCOME_TYPE')
df5 = one_hot_encode(df4, 'NAME_EDUCATION_TYPE')
df6 = one_hot_encode(df5, 'NAME_FAMILY_STATUS')
df7 = one_hot_encode(df6, 'NAME_HOUSING_TYPE')
df8 = one_hot_encode(df7, 'OCCUPATION_TYPE')

# Training the models

## Split the dataframe into X and y as numpy arrays

convert the dataframe into a numpy tensor

In [94]:
dataset_as_array = np.array(df8)
X = dataset_as_array[:,1:]
y = dataset_as_array[:,0].reshape((-1,1))

normalize the data

In [95]:
from sklearn import preprocessing
X_norm = preprocessing.normalize(X)
y_norm = preprocessing.normalize(y)

## Creating the test class
We will create a class that collects everything we need to build and to analyze a model. The class will point to our data, and to a model building algorithm, and will provide functionality for presenting results.

In [124]:
from sklearn.model_selection import train_test_split

class Test:
    def __init__(self, name, X, y, algorithm, args):
        self.name = name
        self.X = X  #pointer, not copy
        self.y = y
        self.algorithm = algorithm
        self.args = args
    
    def run(self, split=0.8):
        X_train, X_test = train_test_split(X_norm, train_size=split, test_size=(1.0-split))
        y_train, y_test = train_test_split(y_norm, train_size=split, test_size=(1.0-split))
        model = self.algorithm(**self.args)
        model.fit(X_train, y_train.ravel())
        acc_train = model.score(X_train,y_train)
        acc_test = model.score(X_test,y_test)
        return acc_train, acc_test

## Support Vector Machine Modelling
The motivation behind support vector machines is that we are building a line of best fit between two datasets, where "best" is defined by an objective function of distance between our line of best fit and between critical points, called support vectors, of these datasets. Support vectors are the closest points to a line of best fit. Our best fit line is also a decision boundary.

In [126]:
from sklearn import svm

In [136]:
svm_tests = [
    Test(
        name='SVM linear',
        X=X_norm,
        y=y_norm,
        algorithm=svm.SVC,
        args={'probability': False, 'kernel': 'linear', 'C': 1}
    ),
    Test(
        name='SVM rbf',
        X=X_norm,
        y=y_norm,
        algorithm=svm.SVC,
        args={'probability': False, 'kernel': 'rbf', 'C': 1}
    ),
    Test(
        name='SVM poly',
        X=X_norm,
        y=y_norm,
        algorithm=svm.SVC,
        args={'probability': False, 'kernel': 'poly', 'C': 1}
    )
]

### Run tests

In [137]:
for test in tests:
    print(f'\n\n{test.name}\n' + '='*10)
    acc_train, acc_test = test.run()
    print(f'Training accuracy: {acc_train}')
    print(f'Test accuracy: {acc_test}')



SVM linear
Training accuracy: 0.6485
Test accuracy: 0.63


SVM rbf
Training accuracy: 0.647
Test accuracy: 0.636


SVM poly
Training accuracy: 0.6415
Test accuracy: 0.658


SVM linear
Training accuracy: 0.64
Test accuracy: 0.664


SVM rbf
Training accuracy: 0.6495
Test accuracy: 0.626


SVM poly
Training accuracy: 0.642
Test accuracy: 0.656
