In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv(r"D:\Ultimate Programming\Data Bases\Home Loan DS.csv")
df.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [4]:
df.drop(columns=['Loan_ID'], inplace=True)

In [5]:
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
x = df.drop(columns='Loan_Status')
y = df['Loan_Status']

In [7]:
type(x)

pandas.core.frame.DataFrame

In [8]:
x.head(2)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [10]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [11]:
for col in x.select_dtypes(include='object').columns:
    x[col].fillna(x[col].mode()[0], inplace=True)

In [12]:
for col in x.select_dtypes(include=['int64', 'float64']).columns:
    x[col].fillna(x[col].mean(), inplace=True)

In [13]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [14]:
ct = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(drop='first', dtype=np.int32), [0, 1, 2, 3, 4, 10]),
    ('ft', FunctionTransformer(func=np.log1p, validate=True), [5, 6, 7, 8, 9]),
    ('mms', MinMaxScaler(), [5, 6, 7, 8, 9])
], remainder='passthrough')

In [15]:
x_en = ct.fit_transform(x)
x_en = pd.DataFrame(x_en)

In [16]:
x_en.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.674197,0.0,4.993232,5.888878,0.693147,0.070489,0.0,0.19886,0.74359,1.0
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8.430327,7.319202,4.859812,5.888878,0.693147,0.05483,0.036192,0.172214,0.74359,1.0
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,8.006701,0.0,4.204693,5.888878,0.693147,0.03525,0.0,0.082489,0.74359,1.0


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x_en, y, test_size=0.2, random_state=42)

In [20]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [21]:
print(f"Training Accuracy: {round(lr.score(x_train, y_train)*100, 2)}")
print(f"Testing Accuracy: {round(lr.score(x_test, y_test)*100, 2)}")

Training Accuracy: 81.06
Testing Accuracy: 78.86


## Hyperparameter Tuning

In [22]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [23]:
param_grid = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'dual' : [False, True],
    'C' : [0.01, 0.1, 1, 10],
    'random_state' : [i for i in range(2, 43)],
    'solver': ['liblinear', 'saga'],
}

grid = GridSearchCV(LogisticRegression(), param_grid)
grid.fit(x_train, y_train)

In [24]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'dual': True, 'penalty': 'l2', 'random_state': 3, 'solver': 'liblinear'}
0.8166357452071737


In [25]:
import optuna

In [26]:
def objective(trial):
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    C = trial.suggest_float('C', 0.001, 10.0, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        raise optuna.exceptions.TrialPruned()
    if penalty == 'l2' and solver not in ['lbfgs', 'newton-cg', 'sag', 'saga', 'liblinear']:
        raise optuna.exceptions.TrialPruned()
    clf = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        max_iter=500,
        random_state=42
    )

    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    return score

In [27]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

[I 2025-10-14 23:30:34,496] A new study created in memory with name: no-name-957cfc8c-a477-4de7-98f3-efa430b1b722
[I 2025-10-14 23:30:34,717] Trial 0 finished with value: 0.7886178861788617 and parameters: {'penalty': 'l1', 'C': 9.147361083870521, 'solver': 'saga'}. Best is trial 0 with value: 0.7886178861788617.
[I 2025-10-14 23:30:34,744] Trial 1 finished with value: 0.7886178861788617 and parameters: {'penalty': 'l1', 'C': 0.58882237232492, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7886178861788617.
[I 2025-10-14 23:30:34,783] Trial 2 finished with value: 0.7886178861788617 and parameters: {'penalty': 'l2', 'C': 5.350309418426101, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7886178861788617.
[I 2025-10-14 23:30:34,803] Trial 3 finished with value: 0.7235772357723578 and parameters: {'penalty': 'l1', 'C': 0.03367497847351754, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7886178861788617.
[I 2025-10-14 23:30:34,826] Trial 4 finished with value: 0.7073170

In [33]:
print("Best Parameters:", study.best_params)
print("Best Accuracy:", round(study.best_value, 2)*100)

Best Parameters: {'penalty': 'l1', 'C': 9.147361083870521, 'solver': 'saga'}
Best Accuracy: 79.0


In [31]:
# inp = ['Male',	'No',	0,	'Graduate',	'No', 5849, 0.0, 128.0,	360.0, 1.0,	'Urban']
# enc = ct.transform(inp)

# Cross Validation Technique

In [40]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_en, y, test_size=0.2, random_state=42)

In [51]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

kf = StratifiedKFold()
model = LogisticRegression(max_iter=200)
scores = cross_val_score(model, x_train2, y_train2, cv=kf, scoring='accuracy')*100

scores.sort()
print("Cross-validation scores for each fold:", scores)
print("Mean Accuracy:", np.mean(scores))
print("Standard Deviation:", np.std(scores))

Cross-validation scores for each fold: [78.57142857 78.57142857 80.6122449  81.63265306 85.85858586]
Mean Accuracy: 81.04926819212534
Standard Deviation: 2.6810536008762043
