In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import the scikit-learn modules needed for the pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector,make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay



In [47]:

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [48]:
# The data directory was previously spelled two different ways ('./Dataset' for
# the train file, './DataSet' for the test file). That only works on
# case-insensitive filesystems, so both spellings are tried here and whichever
# one exists on disk is used.
DATA_DIR_CANDIDATES = ('./Dataset', './DataSet')


def load_loan_csv(filename):
    """Read one loan-prediction CSV and drop its 'Unnamed: 0' column.

    Parameters
    ----------
    filename : str
        File name inside the data directory, e.g. 'LoanPrediction_TrainSet.csv'.

    Returns
    -------
    pandas.DataFrame
        The file contents without the 'Unnamed: 0' column (presumably a row
        index written out when the CSV was created -- confirm against the file).

    Raises
    ------
    FileNotFoundError
        If the file is not found under any of DATA_DIR_CANDIDATES.
    """
    for data_dir in DATA_DIR_CANDIDATES:
        try:
            raw = pd.read_csv(f'{data_dir}/{filename}')
        except FileNotFoundError:
            # Wrong spelling of the directory on this filesystem; try the next one.
            continue
        return raw.drop(columns=["Unnamed: 0"])
    raise FileNotFoundError(
        f"{filename} not found in any of: {', '.join(DATA_DIR_CANDIDATES)}"
    )


trainset = load_loan_csv('LoanPrediction_TrainSet.csv')
testset = load_loan_csv('LoanPrediction_TestSet.csv')







In [49]:
# Preview the first ten training rows.
trainset.head(n=10)

Unnamed: 0,LoanID,Gender,Married,Dependents,Education,SelfEmployed,ApplicantIncome,CoapplicantIncome,LoanAmount,LoanAmountTerm,CreditHistory,PropertyArea,LoanStatus
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [50]:

# Build a per-feature summary table: one row per column of `trainset`, holding
# the describe() statistics plus the missing-value count and the column dtype.
# describe(include="all") keeps the columns in `trainset` order, so the
# positional arrays assigned below line up with the transposed rows.
ABT_train = (
    trainset.describe(include="all")
    .T
    .rename_axis('Feature')
    .reset_index()
    .assign(
        Missing=trainset.isnull().sum().to_numpy(),
        dtype=trainset.dtypes.to_numpy(),
    )
)
ABT_train


# Some features have the wrong data type (e.g. 'Dependents' and 'CreditHistory'), and several features have missing values.
# The target variable 'LoanStatus' looks imbalanced, and I believe some features are biased as well, e.g. 'Gender' has far more males than females.
# 'Dependents' also contains a disallowed plus sign: '3+' is used as a category, which I will fix.


Unnamed: 0,Feature,count,unique,top,freq,mean,std,min,25%,50%,75%,max,Missing,dtype
0,LoanID,614.0,614.0,LP001002,1.0,,,,,,,,0,object
1,Gender,601.0,2.0,Male,489.0,,,,,,,,13,object
2,Married,611.0,2.0,Yes,398.0,,,,,,,,3,object
3,Dependents,599.0,4.0,0,345.0,,,,,,,,15,object
4,Education,614.0,2.0,Graduate,480.0,,,,,,,,0,object
5,SelfEmployed,582.0,2.0,No,500.0,,,,,,,,32,object
6,ApplicantIncome,614.0,,,,5403.459283,6109.041673,150.0,2877.5,3812.5,5795.0,81000.0,0,int64
7,CoapplicantIncome,614.0,,,,1621.245798,2926.248369,0.0,0.0,1188.5,2297.25,41667.0,0,float64
8,LoanAmount,592.0,,,,146.412162,85.587325,9.0,100.0,128.0,168.0,700.0,22,float64
9,LoanAmountTerm,600.0,,,,342.0,65.12041,12.0,360.0,360.0,360.0,480.0,14,float64


In [51]:
# Save the summary table to an Excel file, leaving out the row index.
ABT_train.to_excel(excel_writer='ABT_train.xlsx', index=False)

In [52]:
# 'Dependents' is read as an object column because of the '3+' bucket.
# Replace that bucket with 3 and cast the column to float (missing values
# stay NaN), applying the same conversion to both datasets.
for loan_frame in (trainset, testset):
    loan_frame['Dependents'] = (
        loan_frame['Dependents'].replace({'3+': 3}).astype(float)
    )


In [53]:
# Split both datasets into features (X) and the binary target (y).
# 'LoanID' is an identifier and 'LoanStatus' is the target, so neither belongs
# in the feature frames. X_test previously kept 'LoanStatus', which left the
# label inside the test features; the same columns are now dropped from both.
NON_FEATURE_COLUMNS = ['LoanID', 'LoanStatus']
# Encode the target as 0/1: 'N' -> 0, 'Y' -> 1.
TARGET_ENCODING = {'N': 0, 'Y': 1}

X_train = trainset.drop(columns=NON_FEATURE_COLUMNS)
y_train = trainset['LoanStatus'].map(TARGET_ENCODING)
X_test = testset.drop(columns=NON_FEATURE_COLUMNS)
y_test = testset['LoanStatus'].map(TARGET_ENCODING)


In [None]:
# Column selectors pick the numeric and the object-typed (categorical) columns
# by dtype, so the split follows whatever frame is passed to fit().
num_select = make_column_selector(dtype_include=np.number)
cat_select = make_column_selector(dtype_include=object)

# Numeric columns: fill missing values with the median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored at transform.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocess = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_select),
    ('cat', cat_pipeline, cat_select)
])

# Investigate the preprocessed data.
preprocess.fit(X_train)
X_train_processed = preprocess.transform(X_train)
feature = preprocess.get_feature_names_out()

# The one-hot encoder emits sparse output, and ColumnTransformer returns a
# scipy sparse matrix when the stacked result's density is below its
# sparse_threshold. Densify explicitly so building the DataFrame does not
# depend on which form came back.
if hasattr(X_train_processed, 'toarray'):
    X_train_dense = X_train_processed.toarray()
else:
    X_train_dense = X_train_processed

# NOTE: `X_train_proccessed` (double "c") is the labelled DataFrame view, while
# `X_train_processed` is the raw transformer output. Both names are kept as-is
# in case other cells refer to them.
X_train_proccessed = pd.DataFrame(
    X_train_dense, columns=feature, index=X_train.index
)

X_train_proccessed.describe().T



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num__Dependents,614.0,2.893089e-18,1.000815,-0.737806,-0.737806,-0.737806,0.25347,2.236021
num__ApplicantIncome,614.0,-4.339634e-18,1.000815,-0.860649,-0.413816,-0.260639,0.064144,12.384623
num__CoapplicantIncome,614.0,4.0503250000000005e-17,1.000815,-0.554487,-0.554487,-0.148005,0.231202,13.696173
num__LoanAmount,614.0,-2.6037800000000002e-17,1.000815,-1.627255,-0.541446,-0.211241,0.226057,6.595146
num__LoanAmountTerm,614.0,5.930833e-17,1.000815,-5.132498,0.273231,0.273231,0.273231,2.137276
num__CreditHistory,614.0,-1.62013e-16,1.000815,-2.42876,0.411733,0.411733,0.411733,0.411733
cat__Gender_Female,614.0,0.1824104,0.386497,0.0,0.0,0.0,0.0,1.0
cat__Gender_Male,614.0,0.8175896,0.386497,0.0,1.0,1.0,1.0,1.0
cat__Married_No,614.0,0.3469055,0.476373,0.0,0.0,0.0,1.0,1.0
cat__Married_Yes,614.0,0.6530945,0.476373,0.0,0.0,1.0,1.0,1.0


In [56]:
# Chain the preprocessing step and a logistic-regression classifier into one
# estimator. The step names are the lowercase class names, i.e. the same names
# make_pipeline would generate.
pipeline = Pipeline(steps=[
    ('columntransformer', preprocess),
    ('logisticregression', LogisticRegression(max_iter=1000)),
])
pipeline.fit(X_train, y_train)
pipeline

0,1,2
,steps,"[('columntransformer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [58]:
# Score the fitted pipeline on the held-out test data (for a classifier this
# is the mean accuracy).
# NOTE(review): an accuracy near 0.99 is unusually high for this kind of
# problem -- confirm how the test-set labels were produced.
pipeline.score(X=X_test, y=y_test)

0.989100817438692