## importing important libraries

In [259]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,plot_confusion_matrix,classification_report,f1_score
from sklearn.metrics import recall_score,accuracy_score,precision_score
from sklearn.metrics import roc_auc_score,roc_curve,precision_recall_curve

import pickle 
import json

## problem statement

## data gathering

In [178]:
df = pd.read_csv('Loan_Data.csv')
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [180]:
df=df.replace(r'^\s+$', np.nan, regex=True)
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


## eda

### gender

In [181]:
df['Gender'].isna().sum()

13

In [182]:
df['Gender']=df['Gender'].fillna(df['Gender'].mode()[0])

In [183]:
df['Gender'].value_counts().to_dict()

{'Male': 502, 'Female': 112}

In [209]:
df['Gender']=df['Gender'].replace({'Male': 1, 'Female': 0})

### Married

In [184]:
df['Married'].isna().sum()

3

In [185]:
df['Married']=df['Married'].fillna(df['Married'].mode()[0])

In [186]:
df['Married'].isna().sum()

0

In [210]:
df['Married'].value_counts().to_dict()

{'Yes': 401, 'No': 213}

In [211]:
df['Married']=df['Married'].replace({'Yes':1, 'No':0})

### Dependents

In [187]:
df['Dependents'].isna().sum()

15

In [188]:
df['Dependents'].value_counts().to_dict()

{'0': 345, '1': 102, '2': 101, '3+': 51}

In [189]:
df['Dependents'].replace({'0': 'zero', '1': 'one', '2': 'two', '3+': 'three_plus'},inplace=True)


In [190]:
df['Dependents']=df['Dependents'].fillna(df['Dependents'].mode()[0])

In [212]:
df['Dependents'].value_counts().to_dict()

{'zero': 360, 'one': 102, 'two': 101, 'three_plus': 51}

### Education

In [191]:
df['Education'].value_counts().to_dict()

{'Graduate': 480, 'Not Graduate': 134}

In [192]:
df['Education'].isna().sum()

0

In [213]:
df['Education']=df['Education'].replace({'Graduate':1, 'Not Graduate':0})

In [214]:
df['Education'].value_counts().to_dict()

{1: 480, 0: 134}

### Self_Employed

In [193]:
df['Self_Employed'].isna().sum()

32

In [194]:
df['Self_Employed'].value_counts().to_dict()

{'No': 500, 'Yes': 82}

In [195]:
df['Self_Employed']=df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

In [215]:
df['Self_Employed']=df['Self_Employed'].replace({'No':0, 'Yes':1})

In [216]:
df['Self_Employed'].value_counts().to_dict()

{0: 532, 1: 82}

### ApplicantIncome

In [196]:
df['ApplicantIncome'].isna().sum()

0

### CoapplicantIncome

In [197]:
df['CoapplicantIncome'].isna().sum()

0

### LoanAmount

In [198]:
df['LoanAmount'].isna().sum()

22

In [199]:
df['LoanAmount']=df['LoanAmount'].fillna(df['LoanAmount'].mean())

### Loan_Amount_Term

In [200]:
df['Loan_Amount_Term'].isna().sum()

14

In [201]:
df['Loan_Amount_Term'].value_counts().to_dict()

{360.0: 512,
 180.0: 44,
 480.0: 15,
 300.0: 13,
 240.0: 4,
 84.0: 4,
 120.0: 3,
 60.0: 2,
 36.0: 2,
 12.0: 1}

In [202]:
df['Loan_Amount_Term']=df['Loan_Amount_Term'].replace(r'^\s+$', np.nan, regex=True)

In [203]:
df['Loan_Amount_Term']=df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean())

### Credit_History

In [204]:
df['Credit_History'].isna().sum()

50

In [205]:
df['Credit_History']=df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [217]:
df['Credit_History'].value_counts().to_dict()

{1.0: 525, 0.0: 89}

### Property_Area

In [206]:
df['Property_Area'].isna().sum()

0

### loan status

In [220]:
df['Loan_Status'].value_counts().to_dict()

{'Y': 422, 'N': 192}

In [221]:
df['Loan_Status']=df['Loan_Status'].replace({'Y':1, 'N':0})

## one hot encoding

In [227]:
df=pd.get_dummies(data=df, columns=['Dependents', 'Property_Area'])

## train test split

In [228]:
x=df.drop(['Loan_ID','Loan_Status'],axis=1)
x

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_one,Dependents_three_plus,Dependents_two,Dependents_zero,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,1,0,1,0,5849,0.0,146.412162,360.0,1.0,0,0,0,1,0,0,1
1,1,1,1,0,4583,1508.0,128.000000,360.0,1.0,1,0,0,0,1,0,0
2,1,1,1,1,3000,0.0,66.000000,360.0,1.0,0,0,0,1,0,0,1
3,1,1,0,0,2583,2358.0,120.000000,360.0,1.0,0,0,0,1,0,0,1
4,1,0,1,0,6000,0.0,141.000000,360.0,1.0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,1,0,2900,0.0,71.000000,360.0,1.0,0,0,0,1,1,0,0
610,1,1,1,0,4106,0.0,40.000000,180.0,1.0,0,1,0,0,1,0,0
611,1,1,1,0,8072,240.0,253.000000,360.0,1.0,1,0,0,0,0,0,1
612,1,1,1,0,7583,0.0,187.000000,360.0,1.0,0,0,1,0,0,0,1


In [229]:
y=df['Loan_Status']
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64

In [254]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=10, stratify=y)
x_train

16

## model instance 

In [231]:
logistic_clf = LogisticRegression()
logistic_clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Evluation

In [232]:
y_pred = logistic_clf.predict(x_test)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1],
      dtype=int64)

In [233]:
cnf_matrix = confusion_matrix(y_test, y_pred)
print("cnf_matrix :\n",cnf_matrix)

cnf_matrix :
 [[ 20  28]
 [  4 102]]


In [234]:
clf_report = classification_report(y_test, y_pred)
print(clf_report)

              precision    recall  f1-score   support

           0       0.83      0.42      0.56        48
           1       0.78      0.96      0.86       106

    accuracy                           0.79       154
   macro avg       0.81      0.69      0.71       154
weighted avg       0.80      0.79      0.77       154



## project data

In [263]:
project_data = {'Gender':{'Male': 1, 'Female': 0},
                  'Married':  {'Yes':1, 'No':0},
                  'Education':  {'Graduate':1, 'Not Graduate':0},
                  'Self_Employed':  {'No':0, 'Yes':1},
                  'columns': x.columns.to_list()
}
project_data

{'Gender': {'Male': 1, 'Female': 0},
 'Married': {'Yes': 1, 'No': 0},
 'Education': {'Graduate': 1, 'Not Graduate': 0},
 'Self_Employed': {'No': 0, 'Yes': 1},
 'columns': ['Gender',
  'Married',
  'Education',
  'Self_Employed',
  'ApplicantIncome',
  'CoapplicantIncome',
  'LoanAmount',
  'Loan_Amount_Term',
  'Credit_History',
  'Dependents_one',
  'Dependents_three_plus',
  'Dependents_two',
  'Dependents_zero',
  'Property_Area_Rural',
  'Property_Area_Semiurban',
  'Property_Area_Urban']}

## testing one instance

In [265]:
Gender='Male'
Married='Yes'
Dependents='one'
Education='Graduate'
Self_Employed='No'
ApplicantIncome=4583
CoapplicantIncome=1508.0
LoanAmount=128.0
Loan_Amount_Term=360.0
Credit_History=1.0
Property_Area='Rural'

Gender=project_data['Gender'][Gender]
Married=project_data['Married'][Married]
Education=project_data['Education'][Education]
Self_Employed=project_data['Self_Employed'][Self_Employed]

Dependents='Dependents_'+Dependents
Dependents_index=np.where(project_data['columns'] == Dependents)[0]

Property_Area='Property_Area_'+Property_Area
Property_Area_index=np.where(project_data['columns'] == Property_Area)[0]

test_array = np.zeros(x.shape[1])
test_array[0]=Gender
test_array[1]=Married
test_array[2]=Education
test_array[3]=Self_Employed
test_array[4]=ApplicantIncome
test_array[5]=CoapplicantIncome
test_array[6]=LoanAmount
test_array[7]=Loan_Amount_Term
test_array[8]=Credit_History
test_array[Dependents_index]=1
test_array[Property_Area_index]=1

Loan_Status= logistic_clf.predict([test_array])[0]
if Loan_Status==1:
    print('Loan_Status : Approved')
else:
    print('Loan_Status : Declined')

Loan_Status : Approved




## creating pickle file

In [266]:
with open('logistic_clf.pkl','wb') as f:
    pickle.dump(logistic_clf,f)

## creating json file

In [267]:
with open('project_data.json','w') as f:
    json.dump(project_data,f)