In [1]:
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pickle

In [2]:
datatset = pd.read_csv('../data/merged_data.csv')

In [3]:
datatset.head()

Unnamed: 0,ID,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_HOUSING_TYPE,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,APPROVED
0,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,0,1
1,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,-1,1
2,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,-2,1
3,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,-3,1
4,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,House / apartment,-1134,1,0,0,0,Security staff,2.0,-4,1


In [4]:
datatset = datatset.drop('ID', axis=1)

In [5]:
train_set, val_set = train_test_split(datatset, test_size=0.25, random_state=42)

In [6]:
scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder()
onehot_encoder = OneHotEncoder()

In [7]:
datatset['NAME_INCOME_TYPE'].unique()

array(['Working', 'Commercial associate', 'State servant', 'Student',
       'Pensioner'], dtype=object)

In [8]:
datatset['NAME_EDUCATION_TYPE'].unique()

array(['Secondary / secondary special', 'Higher education',
       'Incomplete higher', 'Lower secondary', 'Academic degree'],
      dtype=object)

In [9]:
datatset['NAME_HOUSING_TYPE'].unique()

array(['House / apartment', 'Rented apartment', 'Municipal apartment',
       'With parents', 'Co-op apartment', 'Office apartment'],
      dtype=object)

In [10]:
datatset['OCCUPATION_TYPE'].unique()

array(['Security staff', 'Sales staff', 'Accountants', 'Laborers',
       'Managers', 'Drivers', 'Core staff', 'High skill tech staff',
       'Cleaning staff', 'Cooking staff', 'Low-skill Laborers',
       'Medicine staff', 'Private service staff', 'Secretaries',
       'Waiters/barmen staff', 'HR staff', 'IT staff', 'Realty agents'],
      dtype=object)

In [11]:
train_set_encoded = pd.get_dummies(train_set, columns=['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'], dtype=int)
val_set_encoded = pd.get_dummies(val_set, columns=['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'], dtype=int)

In [12]:
train_set_encoded['FLAG_OWN_CAR'] = train_set_encoded['FLAG_OWN_CAR'].replace({'Y': 1, 'N': 0})
train_set_encoded['FLAG_OWN_REALTY'] = train_set_encoded['FLAG_OWN_REALTY'].replace({'Y': 1, 'N': 0})
val_set_encoded['FLAG_OWN_CAR'] = val_set_encoded['FLAG_OWN_CAR'].replace({'Y': 1, 'N': 0})
val_set_encoded['FLAG_OWN_REALTY'] = val_set_encoded['FLAG_OWN_REALTY'].replace({'Y': 1, 'N': 0})

In [13]:
train_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']] = scaler.fit_transform(train_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']])

In [14]:
val_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']] = scaler.fit_transform(val_set_encoded[['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'MONTHS_BALANCE']])

In [15]:
train_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']] = ordinal_encoder.fit_transform(train_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']])

In [16]:
val_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']] = ordinal_encoder.fit_transform(val_set_encoded[['CNT_FAM_MEMBERS', 'CNT_CHILDREN']])

In [17]:
column_names = list(train_set_encoded.columns.values)
column_names

['FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'DAYS_EMPLOYED',
 'FLAG_MOBIL',
 'FLAG_WORK_PHONE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'MONTHS_BALANCE',
 'APPROVED',
 'NAME_INCOME_TYPE_Commercial associate',
 'NAME_INCOME_TYPE_Pensioner',
 'NAME_INCOME_TYPE_State servant',
 'NAME_INCOME_TYPE_Student',
 'NAME_INCOME_TYPE_Working',
 'NAME_EDUCATION_TYPE_Academic degree',
 'NAME_EDUCATION_TYPE_Higher education',
 'NAME_EDUCATION_TYPE_Incomplete higher',
 'NAME_EDUCATION_TYPE_Lower secondary',
 'NAME_EDUCATION_TYPE_Secondary / secondary special',
 'NAME_HOUSING_TYPE_Co-op apartment',
 'NAME_HOUSING_TYPE_House / apartment',
 'NAME_HOUSING_TYPE_Municipal apartment',
 'NAME_HOUSING_TYPE_Office apartment',
 'NAME_HOUSING_TYPE_Rented apartment',
 'NAME_HOUSING_TYPE_With parents',
 'OCCUPATION_TYPE_Accountants',
 'OCCUPATION_TYPE_Cleaning staff',
 'OCCUPATION_TYPE_Cooking staff',
 'OCCUPATION_TYPE_Core staff',
 'OCCUPATION_TYPE_Drivers',
 'OCCUPATION_T

In [18]:
rf_classifier = RandomForestClassifier(criterion="entropy", n_estimators=150, max_depth=5)
rf_classifier.fit(train_set_encoded.drop(columns=['APPROVED'], axis=1), train_set_encoded[['APPROVED']])

  return fit_method(estimator, *args, **kwargs)


In [19]:
accuracy_score(val_set_encoded[['APPROVED']], rf_classifier.predict(val_set_encoded.drop(columns=['APPROVED'], axis=1)))

0.9801515499425947

In [20]:
filename = '../output_model/model.sav'
pickle.dump(rf_classifier, open(filename, 'wb'))