# Capstone Project 1:
## Home Loan Credibility Assessment

## 2. Data Understanding
### 2.1 Import Libraries

In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.preprocessing import LabelEncoder , MinMaxScaler , Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

# pandas column display 
pd.set_option('display.max_columns', 122)

## Version 1: Baseline models on complete rows only (rows with missing values dropped)

In [2]:
# Version 1 missing-value treatment: keep only fully populated rows, i.e.
# discard every record that contains at least one null right after loading.
X = pd.read_csv("./Data_Files/application_train.csv").dropna()

# Pull out the label before it is removed from the feature matrix
y = X['TARGET']

# Drop the row identifier and the label, then one-hot encode the
# categorical columns that remain
X = pd.get_dummies(X.drop(columns=['SK_ID_CURR', 'TARGET']))

# Hold out 30% of the rows for evaluation; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(6021, 233) (2581, 233) (6021,) (2581,)


In [3]:
# Baseline classifier with default hyper-parameters, fitted on the training
# split (fit() returns the estimator, so construction and training chain).
log_reg = LogisticRegression().fit(X_train, y_train)

# Accuracy on the held-out split, reported as a percentage
result = log_reg.score(X_test, y_test)
print("Accuracy of LogisticRegression: %.3f%%" % (100 * result))

Accuracy of LogisticRegression: 93.956%


In [4]:
# Random forest baseline with 100 trees. random_state is pinned to the same
# seed as the train/test split so the reported accuracy is reproducible on a
# fresh Restart & Run All instead of varying from run to run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

# Accuracy on the held-out split, reported as a percentage.
# NOTE(review): the recorded accuracy is identical to LogisticRegression's;
# possibly both models predict only the majority class -- confirm with a
# confusion matrix or ROC AUC before reading anything into this number.
result = rfc.score(X_test, y_test)
print("Accuracy of RandomForestClassifier: %.3f%%" % (result*100))

Accuracy of RandomForestClassifier: 93.956%


## Version 2: Baseline models on all rows with mean-imputed missing values

In [5]:
# Version 2 keeps every row; missing values are left in place at this stage
X = pd.read_csv("./Data_Files/application_train.csv")

# Detach the label column from the frame in a single step
y = X.pop('TARGET')

# Drop the row identifier, then one-hot encode the categorical columns
X = pd.get_dummies(X.drop(columns='SK_ID_CURR'))

# Hold out 30% of the rows for evaluation; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(215257, 244) (92254, 244) (215257,) (92254,)


In [6]:
# Feature names, captured from the training frame's columns before imputation
features = list(X_train.columns)

# Mean imputation of missing values (strategy='mean': each missing entry is
# replaced by its column's mean).
imputer = SimpleImputer(strategy='mean')

# Fit on the training data only, so the fill values are learned without
# looking at the held-out split.
imputer.fit(X_train)

# Transform both training and testing data with the training-set statistics
train = imputer.transform(X_train)
test = imputer.transform(X_test)

print('Training data shape: ', train.shape, 'Testing data shape: ', test.shape)

Training data shape:  (215257, 244) Testing data shape:  (92254, 244)


In [9]:
# Baseline classifier with default hyper-parameters, fitted on the imputed
# training matrix (fit() returns the estimator, so the calls chain).
log_reg = LogisticRegression().fit(train, y_train)

# Accuracy of the predictions on the imputed held-out matrix, as a percentage
result = accuracy_score(y_test, log_reg.predict(test))
print("Accuracy of LogisticRegression: %.3f%%" % (100 * result))

Accuracy of LogisticRegression: 91.963%


In [11]:
# Random forest baseline with 100 trees on the imputed training matrix.
# random_state is pinned to the same seed as the train/test split so the
# reported accuracy is reproducible on a fresh Restart & Run All.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(train, y_train)

# Accuracy on the imputed held-out matrix, reported as a percentage
result = rfc.score(test, y_test)
print("Accuracy of RandomForestClassifier: %.3f%%" % (result*100))

Accuracy of RandomForestClassifier: 91.965%
