In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Colab-only step: mount Google Drive so files under "My Drive" become readable.
from google.colab import drive
drive.mount('/content/drive')
# Load the loan dataset into a DataFrame; the path assumes the CSV sits at the Drive root.
file = pd.read_csv("/content/drive/My Drive/loan_data.csv")
# Print column names, non-null counts and dtypes as a first sanity check.
file.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_lengt

# **Analysing DataSet**

In [None]:
# Preview the first rows to sanity-check column names and value formats.
file.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [None]:
# (number of rows, number of columns) of the loaded frame.
file.shape

(45000, 14)

In [None]:
# Distinct values of the target column; bracket access cannot clash with
# DataFrame attribute names the way dotted access can.
file["loan_status"].unique()

array([1, 0])

In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows implausible maxima (person_age 144,
# person_emp_exp 125) — likely outliers worth inspecting before modelling.
file.describe()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
count,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0
mean,27.764178,80319.05,5.410333,9583.157556,11.006606,0.139725,5.867489,632.608756,0.222222
std,6.045108,80422.5,6.063532,6314.886691,2.978808,0.087212,3.879702,50.435865,0.415744
min,20.0,8000.0,0.0,500.0,5.42,0.0,2.0,390.0,0.0
25%,24.0,47204.0,1.0,5000.0,8.59,0.07,3.0,601.0,0.0
50%,26.0,67048.0,4.0,8000.0,11.01,0.12,4.0,640.0,0.0
75%,30.0,95789.25,8.0,12237.25,12.99,0.19,8.0,670.0,0.0
max,144.0,7200766.0,125.0,35000.0,20.0,0.66,30.0,850.0,1.0


The file 'loan_data.csv' contains 45000 rows and 14 columns. The target column, 'loan_status', takes two unique values (0 and 1) and tells whether the loan is approved or not. The remaining columns describe the applicant and the requested loan, such as age, income, loan amount, loan interest rate, and previous loan records.

# **Preprocessing**

In [None]:
# Features: drop the categorical columns, person_age and the target itself,
# which leaves seven numeric predictors. Each column is listed exactly once.
X = file.drop(columns=[
    'person_age',
    'person_gender',
    'person_education',
    'person_home_ownership',
    'previous_loan_defaults_on_file',
    'loan_intent',
    'loan_status',
])
# Target: the binary loan_status column.
Y = file['loan_status']

In [None]:
# Fail fast if any feature value is missing: the element-wise frame displayed
# by the last line is truncated, so on its own it cannot rule out NaNs in the
# rows that are not shown.
assert not X.isnull().values.any(), "X contains missing values"
X.isnull()

Unnamed: 0,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
44995,False,False,False,False,False,False,False
44996,False,False,False,False,False,False,False
44997,False,False,False,False,False,False,False
44998,False,False,False,False,False,False,False


**X.isnull()** marks every cell as True if it holds a NaN value and as False otherwise. Here every displayed value is False, so we don't need to do anything.
If we had found a True in the result, we could have removed the affected rows with **X = X.dropna()**

# **Normalizing Data**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each feature column with min-max scaling (learn the per-column
# range first, then apply it).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows influence the scaling —
# consider fitting on the training rows only.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

# **Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the target labels to integer class codes (learn the label set,
# then map every label to its code).
encoder = LabelEncoder()
encoder.fit(Y)
Y = encoder.transform(Y)

**Splitting the dataset into training and testing sets**

In [None]:
# Keep 90% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the shuffled split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.9,
    shuffle=True,
    random_state=42,
)

# **Model 1: Importing Logistic Regression Model**

In [None]:
# Reference model: scikit-learn's LogisticRegression with default settings
# (fit() returns the estimator, so construction and training can be chained).
m1 = LogisticRegression().fit(X_train, Y_train)
Y_pred = m1.predict(X_test)
# Test-set accuracy expressed in percent.
m1accuracy = 100 * accuracy_score(Y_test, Y_pred)

# **Model 2: Implementing Logistic Regression Model**

In [None]:
class Regression:
  """Binary logistic regression trained with full-batch gradient descent.

  Parameters
  ----------
  l_rate : float
      Step size applied to every gradient update.
  iter : int
      Number of gradient-descent updates performed by ``fit``. The name
      shadows the ``iter`` builtin inside ``__init__``; it is kept so that
      existing callers keep working.

  Attributes set by ``fit``
  -------------------------
  m, n : row and column counts of the training matrix.
  W    : weight vector of shape ``(n,)``, initialised to zeros.
  b    : bias term, initialised to 0.
  X, y : the training matrix and labels exactly as passed in.
  """

  def __init__(self, l_rate, iter):
    self.l_rate = l_rate
    self.iter = iter

  @staticmethod
  def _sigmoid(z):
    """Return ``1 / (1 + exp(-z))`` element-wise without overflowing.

    ``exp`` is only ever evaluated on non-positive values, so very large
    |z| cannot overflow. For ``z >= 0`` the expression is the textbook
    formula; for ``z < 0`` the algebraically equal ``e^z / (1 + e^z)`` is used.
    """
    z = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

  def fit(self, X, y):
    """Train on ``X`` (shape ``(m, n)``) and binary labels ``y`` (``m`` values).

    ``X`` must provide ``.shape``, ``.dot`` and ``.T`` (e.g. a NumPy array).
    Resets ``W`` and ``b``, runs ``self.iter`` gradient updates and returns
    ``self`` so calls can be chained.
    """
    self.m, self.n = X.shape
    self.W = np.zeros(self.n)
    self.b = 0
    self.X = X
    self.y = y

    for _ in range(self.iter):
      self.update_weight()
    return self

  def update_weight(self):
    """Apply one gradient-descent step to ``W`` and ``b``; returns ``self``."""
    A = self._sigmoid(self.X.dot(self.W) + self.b)
    # Flatten the labels so row vectors, column vectors and plain sequences
    # all line up with the (m,) vector of predicted probabilities.
    labels = np.asarray(self.y, dtype=float).reshape(self.m)
    residual = np.reshape(A - labels, self.m)
    # Gradients of the mean log-loss with respect to the weights and the bias.
    dW = np.dot(self.X.T, residual) / self.m
    db = np.sum(residual) / self.m

    self.W = self.W - self.l_rate * dW
    self.b = self.b - self.l_rate * db
    return self

  def pred(self, X):
    """Return 0/1 labels for the rows of ``X``.

    A row is labelled 1 only when its predicted probability is strictly
    greater than 0.5.
    """
    z = self._sigmoid(X.dot(self.W) + self.b)
    Y = np.where(z > 0.5, 1, 0)
    return Y


In [None]:
# Train the from-scratch model (learning rate 0.01, 1000 gradient steps);
# fit() returns the model itself, so it can be chained onto the constructor.
m2 = Regression(0.01, 1000).fit(X_train, Y_train)
y_pred = m2.pred(X_test)
# Number of test rows whose predicted label equals the true label.
correctly_classified = (Y_test == y_pred).sum()
# Share of correct predictions, expressed in percent.
m2accuracy = 100 * (correctly_classified / len(Y_test))

# **Accuracy**

In [None]:
# Report both test accuracies (in percent), one model per line.
print(
    f"Model 1 has the accuracy of {m1accuracy} %",
    f"Model 2 has the accuracy of {m2accuracy} %",
    sep="\n",
)


Model 1 has the accuracy of 81.71111111111111 %
Model 2 has the accuracy of 76.53333333333333 %


# **Analysis**

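Both models are evaluated on the same test data: Model 1 (scikit-learn's LogisticRegression) gives an accuracy of 81.71%, whereas Model 2 (the from-scratch implementation) gives an accuracy of only 76.53%. A likely reason for the gap is that Model 2 performs only 1000 gradient-descent steps with a learning rate of 0.01 and may therefore not have converged.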