# Importing phase

## Importing libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

## Importing the dataset

In [2]:
data_train = pd.read_csv("application_train.csv")

# Preparing the data

## Removing missing values

Removing rows with at least one missing value

In [3]:
data_train = data_train.dropna(axis = 0)

## Transforming to dummies

Transforming non-numerical variables to dummies

In [4]:
data_train = pd.get_dummies(data_train, columns = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'])

## Separating the target

We have to separate the 'TARGET' variable from the dataset, as it will be predicted

In [5]:
Y = data_train['TARGET']

Dropping the 'TARGET' variable from the original set

In [6]:
X = data_train.drop(columns = ['TARGET'])

## Splitting the data

We use scikit-learn to split the data into train and test sets

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Building models

## Random Forest

Initiating the random forest model

In [8]:
randomforest = RandomForestClassifier(n_estimators = 1000)

Training the model

In [9]:
randomforest.fit(X_train, Y_train)

RandomForestClassifier(n_estimators=1000)

Predicting targets for test set

In [10]:
rf_targets = randomforest.predict(X_test)

Calculating accuracy using sklearn

In [11]:
print("Accuracy = ",metrics.accuracy_score(Y_test, rf_targets))

Accuracy =  0.9460715946071595


## Gradient Boosting

Initiating the gradient boosting model

In [12]:
gradientboost = GradientBoostingClassifier(n_estimators = 1000)

Training the model

In [13]:
gradientboost.fit(X_train, Y_train)

GradientBoostingClassifier(n_estimators=1000)

Predicting targets for test set

In [14]:
gb_targets = gradientboost.predict(X_test)

Calculating accuracy using sklearn

In [15]:
print("Accuracy = ",metrics.accuracy_score(Y_test, gb_targets))

Accuracy =  0.9386331938633193


## XGBoost

Initiating the XGBoost model

In [16]:
xgboost = xgb.XGBClassifier(n_estimators = 1000, use_label_encoder = False)

Training the model

In [17]:
xgboost.fit(X_train, Y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

Predicting targets for test set

In [18]:
xgb_targets = xgboost.predict(X_test)

Calculating accuracy using sklearn

In [19]:
print("Accuracy = ",metrics.accuracy_score(Y_test, xgb_targets))

Accuracy =  0.9409576940957695
