# Imports

In [3]:
import pandas as pd
import numpy as np
import sklearn as skl

# Data Cleaning

## Selecting Relevant Fields
The dataset will be loaded and only the relevant dimensions will be kept; they are transformed in the sections that follow.

In [4]:
# Columns kept for the analysis. NAME_CONTRACT_STATUS (the target) must stay
# first: the X/y split at the end of the notebook takes column 0 as y.
RELEVANT_COLUMNS = [
    'NAME_CONTRACT_STATUS',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'OCCUPATION_TYPE',
    'CNT_FAM_MEMBERS',
]

#TODO: set N_ROWS to None (read the whole file) when done testing
N_ROWS = 1000

# `usecols` avoids parsing columns that would be discarded immediately.
# read_csv ignores the order of `usecols`, so the frame is re-selected with
# RELEVANT_COLUMNS to guarantee the column order listed above.
df = pd.read_csv(
    'database.csv',
    usecols=RELEVANT_COLUMNS,
    nrows=N_ROWS,
)[RELEVANT_COLUMNS]

## Reductions to binary variables

The target variable, NAME_CONTRACT_STATUS, will be reduced from one of 4 possible values to one of two more generic, but still correct, values. For example, the dataset distinguishes between cancelled, rejected and granted loans; we will only distinguish between granted and not-granted loans. Variables that are binary but that don't use the binary alphabet (0/1) will be transformed to use it as well.

The function defined below returns a copy of the dataframe in which one field has been reduced to a binary (0/1) value.

In [5]:
def reduce(df, name, value):
    """Return a copy of ``df`` with column ``name`` reduced to a 0/1 indicator.

    Each entry of the column becomes 1 where it equals ``value`` and 0
    otherwise (missing entries never compare equal, so they become 0).
    The column keeps its name and its position; ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Any index is accepted; rows are processed by position.
    name : str
        Name of the single column to reduce.
    value : object
        The value that maps to 1.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order.

    Raises
    ------
    TypeError
        If ``name`` is not a string (TypeError is a subclass of Exception,
        so existing ``except Exception`` handlers still catch it).
    KeyError
        If ``name`` is not a column of ``df``.
    """
    if not isinstance(name, str):
        raise TypeError('only one dimension is reduced at a time')

    df_reduced = df.copy()
    # Vectorised comparison: no per-row label lookup, so it works for any
    # index (not only 0..n-1). Going through a NumPy array means the result is
    # assigned by position, and overwriting an existing column keeps its place.
    df_reduced[name] = (df[name] == value).to_numpy().astype('int64')
    return df_reduced

In [6]:
# Each step builds a new frame from the previous one, so re-running this cell
# always starts from the untouched `df`.
# NOTE(review): reduce() uses an exact, case-sensitive comparison -- confirm
# that 'APPROVED' is spelled exactly as in database.csv, otherwise every
# target value becomes 0.
df0 = reduce(df, name='NAME_CONTRACT_STATUS', value='APPROVED')  # approved -> 1, anything else -> 0
df1 = reduce(df0, name='CODE_GENDER', value='M')  # male -> 1, anything else -> 0
df2 = reduce(df1, name='FLAG_OWN_CAR', value='Y')  # owns a car -> 1, otherwise 0
df3 = reduce(df2, name='FLAG_OWN_REALTY', value='Y')  # owns property -> 1, otherwise 0

## One-Hot Encoding
Defining a function that returns a copy of the input dataframe with one specific dimension one-hot encoded.

In [7]:
def one_hot_encode(df, name):
    """Return a copy of ``df`` with column ``name`` one-hot encoded.

    For every distinct value of the column (in order of first appearance) a
    new 0/1 column labelled with that value is appended at the end of the
    frame; the original column is then dropped. ``df`` itself is not modified.

    Missing values are treated as a category of their own: the column created
    for them flags the rows where the value is missing. (An ``==`` comparison
    can never match a missing value, so it is detected with ``isna`` instead.)

    Parameters
    ----------
    df : pandas.DataFrame
        Non-empty input frame. Any index is accepted; rows are processed by
        position.
    name : str
        Name of the single column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``name`` and with one indicator column per
        distinct value appended.

    Raises
    ------
    TypeError
        If ``name`` is not a string.
    ValueError
        If ``df`` has no rows, or (raised by ``DataFrame.insert``) if a
        distinct value collides with an existing column label.
    KeyError
        If ``name`` is not a column of ``df``.

    TypeError and ValueError are subclasses of Exception, so existing
    ``except Exception`` handlers still catch them.
    """
    if not isinstance(name, str):
        raise TypeError('one hot encoding applies to one dimension at a time')
    if len(df) == 0:
        raise ValueError('dataframe is empty')

    encoded = df.copy()
    source = encoded[name]

    # One indicator column per distinct value. The comparison is vectorised
    # and the result is inserted as a NumPy array, i.e. by position, so the
    # function does not depend on the frame having a 0..n-1 index.
    for v in source.unique():
        matches = source.isna() if pd.isna(v) else source == v
        encoded.insert(
            loc=len(encoded.columns),  # append after the last column
            column=v,
            value=matches.to_numpy().astype('int64'),
        )

    return encoded.drop(labels=[name], axis=1)

Performing one-hot encoding on every dimension whose values are drawn from a set of string values.

In [8]:
# Encode the categorical columns one at a time; every call returns a new
# frame, so the intermediate results stay available for inspection.
df4 = one_hot_encode(df3, name='NAME_INCOME_TYPE')
df5 = one_hot_encode(df4, name='NAME_EDUCATION_TYPE')
df6 = one_hot_encode(df5, name='NAME_FAMILY_STATUS')
df7 = one_hot_encode(df6, name='NAME_HOUSING_TYPE')
df8 = one_hot_encode(df7, name='OCCUPATION_TYPE')

# Training the models

## Split the dataframe into X and y as numpy arrays

In [9]:
# Convert the fully encoded frame into a single NumPy array.
dataset_as_array = np.array(df8)

# Column 0 is NAME_CONTRACT_STATUS: it is selected first when the data is
# loaded, reduce() keeps a column in place, and one_hot_encode() only appends
# new columns at the end. It is therefore the target; the rest are features.
y, X = dataset_as_array[:, 0], dataset_as_array[:, 1:]