In [25]:
# import libraries 
import pandas as pd 
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

In [26]:
# Locations of the raw CSV files.
# NOTE(review): these are absolute paths on one machine; consider deriving them
# from a configurable data directory so the notebook runs elsewhere.
TRAIN_CSV_PATH = "/media/danlof/dan files/data_science_codes/udemy_course/packaging/prediction_model/datasets/train.csv"
TEST_CSV_PATH = "/media/danlof/dan files/data_science_codes/udemy_course/packaging/prediction_model/datasets/test.csv"

# Load both datasets; the two reads are independent of each other.
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [27]:
# Dimensions of the raw training frame as (rows, columns).
train_df.shape

(614, 13)

In [28]:
## Checking the number of unique values per column in the training data

train_df.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [29]:
# Work on copies so the frames loaded from disk (train_df / test_df) stay
# untouched by the cleaning steps that follow.
train, test = train_df.copy(), test_df.copy()

In [30]:
# Print per-column dtypes and non-null counts for the training copy.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [31]:
# Print per-column dtypes and non-null counts for the test copy.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [32]:
## Dropping unnecessary columns
# Loan_ID is a row identifier (one distinct value per training row), so it
# carries no signal for the model.
# errors='ignore' together with rebinding (instead of inplace=True) keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
train = train.drop(columns='Loan_ID', errors='ignore')
test = test.drop(columns='Loan_ID', errors='ignore')

In [33]:
# Show the columns that remain in the training frame.
train.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [34]:
## Checking for duplicate rows
# Loan_ID has already been dropped, so rows are compared on the remaining columns only.

train.duplicated().value_counts()

False    614
Name: count, dtype: int64

In [35]:
# Same duplicate count for the test frame (also compared without Loan_ID).
test.duplicated().value_counts()

False    366
True       1
Name: count, dtype: int64

In [36]:
# Keep only the first occurrence of each repeated row in the test frame.
# The comparison happens after Loan_ID was removed, so rows match on feature
# values alone.
# NOTE(review): two distinct applications with identical features would be
# collapsed into one here — confirm that removing a test row is intended.
test = test.drop_duplicates()

In [37]:
## Missing values analysis
# Number of NaN entries per column in the training frame.
train.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [38]:
# Feature groups used by the preprocessing steps.

# Continuous features.
num_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Features treated as categorical. Credit_History is stored as a float column
# but is handled as a category here.
cat_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]

In [39]:
## Imputation
# Categorical columns: fill gaps with the most frequent value of each column.
# The imputer learns those values from the training frame only and is then
# applied unchanged to both frames.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train[cat_cols])

for frame in (train, test):
    frame[cat_cols] = cat_imputer.transform(frame[cat_cols])

In [40]:
# Numerical columns: fill gaps with the column mean learned from the training
# frame, then apply the same fitted imputer to both frames.
num_imputer = SimpleImputer(strategy='mean').fit(train[num_cols])

for frame in (train, test):
    frame[num_cols] = num_imputer.transform(frame[num_cols])

In [41]:
# Check for missing values again after imputation (training frame).

train.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [42]:
# Same post-imputation missing-value check for the test frame.
test.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [43]:
## Preprocessing
# Fold the co-applicant's income into ApplicantIncome, so that column now holds
# the combined income, then remove the separate CoapplicantIncome column.
# The same two steps are applied to train first and test second.
for frame in (train, test):
    frame['ApplicantIncome'] = frame['ApplicantIncome'].add(frame['CoapplicantIncome'])

for frame in (train, test):
    frame.drop(columns='CoapplicantIncome', inplace=True)

In [44]:
# Separate the target: pop returns the Loan_Status column and removes it from
# the training frame in one step, leaving only feature columns behind.
Y_train = train.pop('Loan_Status')

# Show the remaining feature columns.
train.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area'],
      dtype='object')

In [45]:
# CoapplicantIncome was folded into ApplicantIncome and dropped from both
# frames, so it must leave the numeric column list as well.
# The membership guard keeps this cell safe to re-run: list.remove raises
# ValueError when the item is no longer present.
if 'CoapplicantIncome' in num_cols:
    num_cols.remove('CoapplicantIncome')
print("numerical columns :",num_cols)
print("categorical columns :",cat_cols)

numerical columns : ['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical columns : ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']


In [46]:
# Label-encode every categorical column.
# One encoder is fitted per column on the training data and stored in
# `label_encoders`, so the identical category -> integer mapping can be
# re-applied (or inverted) later. Previously each fitted encoder was
# overwritten on the next loop iteration and only the last one survived.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    # transform (not fit_transform) so test reuses the training mapping;
    # LabelEncoder raises ValueError if test holds a category unseen in train.
    test[col] = le.transform(test[col])
    label_encoders[col] = le
    

In [47]:
# Preview the training frame after label encoding.
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,146.412162,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,4941.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


In [48]:
# Apply a natural-log transform to the numerical columns of both frames
# (train first, then test).
# NOTE(review): np.log yields -inf for a value of 0 — confirm no numeric
# column can be zero in either frame.
for frame in (train, test):
    frame[num_cols] = np.log(frame[num_cols])

In [49]:
# Min-max scale all features.
# The scaler learns each column's min and max from the training frame, and the
# same fitted scaler is then applied to both frames.
# Both names are rebound to the arrays returned by the scaler, so they are no
# longer DataFrames after this cell.
# NOTE(review): the scaler is fitted before the train/validation split further
# down, so validation rows influence the scaling — confirm this is acceptable.
minmax = MinMaxScaler().fit(train)

train = minmax.transform(train)
test = minmax.transform(test)

### Building the model

In [50]:
# Hold out 30% of the labelled data for evaluation; the fixed seed makes the
# split reproducible. Note that x_test / y_test come from the labelled training
# data, not from the separate `test` frame.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    Y_train,
    train_size=0.7,
    random_state=42,
)

# Report the shapes of the fitting portion (features, then labels).
for split_part in (x_train, y_train):
    print(split_part.shape)

(429, 10)
(429,)


In [51]:
# Train a logistic regression classifier with default settings on the fitting split.
# fit returns the estimator itself, so construction and fitting can be chained.
log = LogisticRegression().fit(x_train, y_train)

# Display the fitted estimator as the cell output.
log

In [52]:
# Predict labels for the held-out split (x_test) with the fitted model.

y_pred_test = log.predict(x_test)

In [53]:
# Evaluation: fraction of held-out rows whose predicted label matches the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print(f"Accuracy is {acc}")

Accuracy is 0.7837837837837838


In [54]:
# Serialization and deserialization
# Persist the fitted logistic regression to disk. Only the estimator is written
# here; the imputers, label encoders and scaler fitted earlier are not part of this file.
joblib.dump(log,"loan_model_v1.pkl")


['loan_model_v1.pkl']

In [55]:
# Reload the model from disk to verify the save/load round trip.
# joblib.load unpickles the file's contents, so only load files from a trusted source.
final_model = joblib.load("loan_model_v1.pkl")

In [56]:
# Inspect the reloaded model's intercept and coefficients.

final_model.intercept_,final_model.coef_

(array([-2.14800082]),
 array([[-3.09359265e-01,  5.13940243e-01,  3.70881831e-01,
         -2.26763589e-01, -3.23407019e-03,  1.83223111e-01,
         -5.62463467e-01,  4.84682864e-01,  3.23938377e+00,
          6.19681136e-02]]))

In [57]:
# Intercept and coefficients of the in-memory model, for comparison with the reloaded copy.
log.intercept_,log.coef_

(array([-2.14800082]),
 array([[-3.09359265e-01,  5.13940243e-01,  3.70881831e-01,
         -2.26763589e-01, -3.23407019e-03,  1.83223111e-01,
         -5.62463467e-01,  4.84682864e-01,  3.23938377e+00,
          6.19681136e-02]]))