# Import data set

In [38]:
import pandas as pd

In [39]:
loan_df = pd.read_csv("./loan.csv")

In [40]:
loan_df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [41]:
# Loan_ID : Unique Loan ID

# Categorical Columns
# Gender : Male/Female
# Married : Applicant married (Y/N)
# Self_Employed : Self employed (Y/N)
# Education : Applicant Education (Graduate/ Not Graduate)
# Property_Area : (Urban/ Semi Urban/ Rural)

# Numerical Columns
# Dependents : Number of dependents
# ApplicantIncome : Applicant income
# CoapplicantIncome : Coapplicant income
# LoanAmount : Loan amount in thousands of dollars
# Loan_Amount_Term : Term of loan in months
# Credit_History : 1 / 0

# Predict
# Loan_Status : Loan approved (Y/N) this is the target variable

### 1. Display Top 5 Rows of The Dataset

In [42]:
loan_df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### 2. Check Last 5 Rows of The Dataset

In [43]:
loan_df.tail(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


### 3. Find Shape of Our Dataset (Number of Rows And Number of Columns)

In [44]:
loan_df.shape

(614, 13)

### 4. Get Information About Our Dataset Like Total Number Rows, Total Number of Columns, Datatypes of Each Column And Memory Requirement

In [45]:
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


### 5. Check Null Values In The Dataset

In [46]:
loan_df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [47]:
# Task get nulls in %
(loan_df.isnull().sum() / len(loan_df)) * 100

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

### 6. Handling The missing Values

In [48]:
# Task 1: Drop the Loan_ID

loan_df = loan_df.drop('Loan_ID', axis=1)   # 1 -> column 0 -> row

In [49]:
loan_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [50]:
# Task 2: Delete the the entire row if the missing data is < 5% in the column
columns = ['Gender', 'Married', 'Dependents', 'LoanAmount', 'Loan_Amount_Term']
loan_df = loan_df.dropna(subset=columns)
loan_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [51]:
(loan_df.isnull().sum() / len(loan_df)) * 100

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [52]:
loan_df['Credit_History'].unique()
loan_df['Self_Employed'].unique()

array(['No', 'Yes', nan], dtype=object)

In [53]:
loan_df['Credit_History'].value_counts()

Credit_History
1.0    434
0.0     71
Name: count, dtype: int64

In [54]:
loan_df['Self_Employed'].value_counts()

Self_Employed
No     451
Yes     72
Name: count, dtype: int64

In [55]:
# Task: Impute Both Credit_History & Self_Employed - mode
from sklearn.impute import SimpleImputer

In [56]:
imputer = SimpleImputer(strategy="most_frequent")
columns = ['Credit_History', 'Self_Employed']
loan_df[columns] = imputer.fit_transform(loan_df[columns])
loan_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_df[columns] = imputer.fit_transform(loan_df[columns])


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [57]:
(loan_df.isnull().sum() / len(loan_df)) * 100

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

### 7. Handling Categorical Columns - Encoding

In [58]:
# Gender : Male/Female
# Married : Applicant married (Y/N)
# Self_Employed : Self employed (Y/N)
# Education : Applicant Education (Graduate/ Not Graduate)
# Property_Area : (Urban/ Semi Urban/ Rural)

In [59]:
loan_df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [60]:

loan_df['Married'].unique()

array(['Yes', 'No'], dtype=object)

In [61]:
loan_df['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [62]:
loan_df['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [63]:
loan_df['Property_Area'].unique()

array(['Rural', 'Urban', 'Semiurban'], dtype=object)

In [64]:
loan_df['Dependents'].unique()

array(['1', '0', '2', '3+'], dtype=object)

In [65]:
loan_df['Dependents'].value_counts()

Dependents
0     316
1      96
2      96
3+     45
Name: count, dtype: int64

In [66]:
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 553 entries, 1 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             553 non-null    object 
 1   Married            553 non-null    object 
 2   Dependents         553 non-null    object 
 3   Education          553 non-null    object 
 4   Self_Employed      553 non-null    object 
 5   ApplicantIncome    553 non-null    int64  
 6   CoapplicantIncome  553 non-null    float64
 7   LoanAmount         553 non-null    float64
 8   Loan_Amount_Term   553 non-null    float64
 9   Credit_History     553 non-null    object 
 10  Property_Area      553 non-null    object 
 11  Loan_Status        553 non-null    object 
dtypes: float64(3), int64(1), object(8)
memory usage: 56.2+ KB


In [67]:
loan_df.replace({
    "Gender": {"Male": 0, "Female": 1},
    "Married": {"No": 0, "Yes": 1},
    "Self_Employed": {"No": 0, "Yes": 1},
    "Education": {"Not Graduate": 0, "Graduate": 1},
    "Property_Area": {"Rural": 0,  "Semiurban": 1, "Urban": 2},
    "Loan_Status": {"N": 0, "Y": 1},
    "Dependents": {"3+": 4},
}, inplace=True)
# Completed Encoding

  loan_df.replace({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_df.replace({


In [68]:
loan_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,0,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,0,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,0,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,0,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,0,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,1,0,0,1,0,2900,0.0,71.0,360.0,1.0,0,1
610,0,1,4,1,0,4106,0.0,40.0,180.0,1.0,0,1
611,0,1,1,1,0,8072,240.0,253.0,360.0,1.0,2,1
612,0,1,2,1,0,7583,0.0,187.0,360.0,1.0,2,1


In [69]:
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 553 entries, 1 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             553 non-null    int64  
 1   Married            553 non-null    int64  
 2   Dependents         553 non-null    object 
 3   Education          553 non-null    int64  
 4   Self_Employed      553 non-null    int64  
 5   ApplicantIncome    553 non-null    int64  
 6   CoapplicantIncome  553 non-null    float64
 7   LoanAmount         553 non-null    float64
 8   Loan_Amount_Term   553 non-null    float64
 9   Credit_History     553 non-null    object 
 10  Property_Area      553 non-null    int64  
 11  Loan_Status        553 non-null    int64  
dtypes: float64(3), int64(7), object(2)
memory usage: 56.2+ KB


In [70]:
# Convert from String
# Dependents, Credit_History
loan_df['Dependents']= loan_df['Dependents'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_df['Dependents']= loan_df['Dependents'].astype('int')


In [71]:
loan_df['Credit_History']= loan_df['Credit_History'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_df['Credit_History']= loan_df['Credit_History'].astype('int')


In [72]:
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 553 entries, 1 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             553 non-null    int64  
 1   Married            553 non-null    int64  
 2   Dependents         553 non-null    int64  
 3   Education          553 non-null    int64  
 4   Self_Employed      553 non-null    int64  
 5   ApplicantIncome    553 non-null    int64  
 6   CoapplicantIncome  553 non-null    float64
 7   LoanAmount         553 non-null    float64
 8   Loan_Amount_Term   553 non-null    float64
 9   Credit_History     553 non-null    int64  
 10  Property_Area      553 non-null    int64  
 11  Loan_Status        553 non-null    int64  
dtypes: float64(3), int64(9)
memory usage: 56.2 KB


# Bar graph - Education vs Loan status  (Graduate?)
# Bar graph - Married vs Loan status (output)

In [73]:
loan_df.groupby(['Education', 'Loan_Status']).size()

Education  Loan_Status
0          0               40
           1               76
1          0              124
           1              313
dtype: int64

In [74]:
loan_df.groupby(['Married', 'Loan_Status']).size()

Married  Loan_Status
0        0               70
         1              124
1        0               94
         1              265
dtype: int64

### 8. Store Feature In X And Response (Target)  y

In [75]:
features_df = loan_df.drop('Loan_Status', axis=1) # X
target_df = loan_df['Loan_Status'] # y

In [76]:
target_df

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 553, dtype: int64

### 9. Feature Scaling - Standardize

In [77]:
from sklearn.preprocessing import StandardScaler  # (-3, 3) -> mean - 0, std - 1
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_df)
features_scaled # minmax - (0, 1)

array([[-0.48127522,  0.73511222,  0.12748113, ...,  0.27996068,
         0.38380061, -1.30735908],
       [-0.48127522,  0.73511222, -0.70189613, ...,  0.27996068,
         0.38380061,  1.25182525],
       [-0.48127522,  0.73511222, -0.70189613, ...,  0.27996068,
         0.38380061,  1.25182525],
       ...,
       [-0.48127522,  0.73511222,  0.12748113, ...,  0.27996068,
         0.38380061,  1.25182525],
       [-0.48127522,  0.73511222,  0.9568584 , ...,  0.27996068,
         0.38380061,  1.25182525],
       [ 2.0778132 , -1.36033653, -0.70189613, ...,  0.27996068,
        -2.6055196 , -0.02776692]], shape=(553, 11))

### 10. Splitting The Dataset Into The Training Set And Test Set 

In [78]:
from sklearn.model_selection import train_test_split

In [79]:
X_train, X_test, y_train, y_test = train_test_split(
    features_scaled, target_df, test_size=0.2, random_state=42
)

### 11. Logistic Regression

In [80]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

In [85]:
predict = model.predict(X_test)

predict

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1])

# 12. Metrics

In [83]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predict)

print(f"Model accuracy is: {accuracy:.2%}")

Model accuracy is: 79.28%
