# Imports

In [62]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve

from sklearn.ensemble import GradientBoostingClassifier

import pickle

# Data

In [32]:
df_clean = pd.read_csv("../data/train.csv")

In [64]:
df_clean

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [33]:
df_clean = df_clean.drop(["Loan_ID"], axis=1)

In [34]:
df_clean_prep.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


In [35]:
df_clean.dropna(inplace=True)

# Split

In [37]:
X = df_clean.drop(["Loan_Status"], axis=1)
y = df_clean.Loan_Status

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline

In [53]:
numeric_features = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount",
                    "Loan_Amount_Term", "Credit_History"]
numeric_transformer = Pipeline(
    steps=[
        ("scaler", MinMaxScaler())]
)

categorical_features = ["Gender", "Married", "Dependents",
                        "Education", "Self_Employed", "Property_Area"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder())
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [54]:
GBC = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier())]
)

In [55]:
GBC.fit(X_train, y_train)

# Evaluation

In [57]:
print("accuracy train : %.3f"%GBC.score(X_train, y_train))
print("accuracy test : %.3f"%GBC.score(X_test , y_test))

accuracy train : 0.896
accuracy test : 0.812


In [58]:
y_pred_GBC = GBC.predict(X_test) 

In [59]:
print(classification_report(y_test, y_pred_GBC))

              precision    recall  f1-score   support

           N       0.86      0.43      0.57        28
           Y       0.80      0.97      0.88        68

    accuracy                           0.81        96
   macro avg       0.83      0.70      0.73        96
weighted avg       0.82      0.81      0.79        96



In [60]:
y_pred_GBC

array(['Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y'], dtype=object)

In [65]:
X_test

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
92,Male,Yes,2,Not Graduate,No,3273,1820.0,81.0,360.0,1.0,Urban
529,Male,No,0,Not Graduate,No,6783,0.0,130.0,360.0,1.0,Semiurban
505,Male,Yes,2,Graduate,No,3510,4416.0,243.0,360.0,1.0,Rural
358,Male,Yes,0,Not Graduate,No,3000,1666.0,100.0,480.0,0.0,Urban
512,Male,Yes,2,Graduate,No,3283,2035.0,148.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...
281,Male,Yes,0,Graduate,No,3927,800.0,112.0,360.0,1.0,Semiurban
299,Male,Yes,1,Graduate,No,2014,2925.0,113.0,360.0,1.0,Urban
522,Male,Yes,3+,Graduate,Yes,5677,1424.0,100.0,360.0,1.0,Rural
33,Male,Yes,0,Graduate,No,3500,1667.0,114.0,360.0,1.0,Semiurban


# Pickle

In [63]:
# import pickle
with open('../models/model_2.pkl', 'wb') as f:
    pickle.dump(GBC, f)