In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
%matplotlib inline

### Input

In [40]:
df_train = pd.read_csv('./Datasets/train.csv')
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [41]:
df_test = pd.read_csv('./Datasets/test.csv')
df_test.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208,360.0,1.0,Urban


In [42]:
raw_train = df_train.copy() ## Creating copies of OG datasets
raw_test = df_test.copy()

### Basic Description

In [43]:
df_train.shape

(614, 13)

In [None]:
## Number of unique values per feature
df_train.nunique() ## Loan ID is unique identifier

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            362 non-null    object 
 1   Gender             351 non-null    object 
 2   Married            362 non-null    object 
 3   Dependents         353 non-null    object 
 4   Education          362 non-null    object 
 5   Self_Employed      339 non-null    object 
 6   ApplicantIncome    362 non-null    int64  
 7   CoapplicantIncome  362 non-null    int64  
 8   LoanAmount         362 non-null    int64  
 9   Loan_Amount_Term   356 non-null    float64
 10  Credit_History     333 non-null    float64
 11  Property_Area      362 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 34.1+ KB


### Processing Data


In [44]:
## Creating df for target feature
y_train = df_train['Loan_Status'].copy()
df_train.drop(columns=['Loan_Status'], inplace=True)

In [45]:
## Dropping unnecessary column of Loan ID
df_train.drop(columns=['Loan_ID'], inplace=True)
df_test.drop(columns=['Loan_ID'], inplace=True)

In [7]:
## Check for Duplicate Rows
df_train[df_train.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [8]:
## Check for Duplicate Rows
df_test[df_test.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
192,Male,No,0,Graduate,Yes,5833,0,116,360.0,1.0,Urban


In [46]:
## Dropping Duplicates
df_test.drop_duplicates(inplace=True)

In [None]:
## Handling Missing Values
df_train.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [47]:
## Numerical & categorical cols
num_cols = ['LoanAmount','Loan_Amount_Term','ApplicantIncome','CoapplicantIncome']
cat_cols = ['Property_Area','Credit_History','Gender','Married','Dependents','Education','Self_Employed']

In [48]:
## Mode imputation for categorical values
for feature in cat_cols:
  df_train[feature] = df_train[feature].fillna(df_train[feature].mode()[0])
  df_test[feature] = df_test[feature].fillna(df_test[feature].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[feature] = df_test[feature].fillna(df_test[feature].mode()[0], inplace=True)


In [49]:
## Median imputation for numerical values
for feature in num_cols:
  df_train[feature] = df_train[feature].fillna(df_train[feature].median())
  df_test[feature] = df_test[feature].fillna(df_test[feature].median())

In [50]:
df_train.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [None]:
df_test.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [51]:
## Combining Income for applicant and co-applicant
df_train['ApplicantIncome'] = df_train['ApplicantIncome'] + df_train['CoapplicantIncome']
df_test['ApplicantIncome'] = df_test['ApplicantIncome'] + df_test['CoapplicantIncome']

In [52]:
## Dropping 'CoapplicantIncome' feature
df_train.drop(columns=['CoapplicantIncome'], inplace=True)
df_test.drop(columns=['CoapplicantIncome'], inplace=True)

In [53]:
## Removing num cols list
num_cols.remove('CoapplicantIncome')

In [17]:
## Handling Categorical values
df_train[cat_cols].nunique() ## Number of labels per feature

Property_Area     3
Credit_History    2
Gender            2
Married           2
Dependents        4
Education         2
Self_Employed     2
dtype: int64

In [54]:
## Label Encoding catgeorical values
from sklearn.preprocessing import LabelEncoder
for feature in cat_cols:
  le = LabelEncoder()
  df_train[feature] = le.fit_transform(df_train[feature])
  df_test[feature] = le.fit_transform(df_test[feature])

In [55]:
df_train.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,128.0,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2


In [56]:
## Log Transformation of numerical columns
df_train[num_cols] = np.log(df_train[num_cols])
df_test[num_cols] = np.log(df_test[num_cols])

In [57]:
## Scaling of values
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
df_train = minmax.fit_transform(df_train)
df_test = minmax.transform(df_test)

In [58]:
df_train

array([[1.        , 0.        , 0.        , ..., 0.9220137 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.33333333, ..., 0.9220137 , 1.        ,
        0.        ],
       [1.        , 1.        , 0.        , ..., 0.9220137 , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 0.33333333, ..., 0.9220137 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.66666667, ..., 0.9220137 , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.9220137 , 0.        ,
        0.5       ]])

### Building Model

In [59]:
## Creating the train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_train, y_train, test_size=0.3, random_state=0)

In [60]:
print(X_train.shape)
print(y_train.shape)

(429, 10)
(429,)


In [61]:
print(X_test.shape)
print(y_test.shape)

(185, 10)
(185,)


In [62]:
## Using Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [63]:
y_pred = lr.predict(X_test)

In [64]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test,y_pred)
print("Accuracy: ", acc)

Accuracy:  0.827027027027027


### Serialization and Deserialization of Model

In [65]:
import joblib

In [66]:
## Serialization
joblib.dump(lr, "trained_model_v1.pkl")

['trained_model_v1.pkl']

In [67]:
## Deserialization
final_model = joblib.load('./trained_model_v1.pkl')

In [68]:
## Comparing the stored model with the model created above to prove its same
lr.intercept_, lr.coef_

(array([-2.09255385]),
 array([[ 0.02341901,  0.35300139,  0.39406274, -0.4427042 , -0.04868874,
          0.01506398, -0.76181783,  0.24613226,  3.18032767,  0.22546141]]))

In [69]:
final_model.intercept_, final_model.coef_

(array([-2.09255385]),
 array([[ 0.02341901,  0.35300139,  0.39406274, -0.4427042 , -0.04868874,
          0.01506398, -0.76181783,  0.24613226,  3.18032767,  0.22546141]]))