In [847]:
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings 
warnings.filterwarnings('ignore')



In [848]:
data = pd.read_csv("FinalData.csv")



In [849]:
data.columns

Index(['Unnamed: 0', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area',
       'Loan_Status (Approved)'],
      dtype='object')

In [None]:
'''
Target : Loan_Status (Approved)
    
   1. Loan_Amount_Term has very low correlation with Target
        Better to remove 
   2. LoanAmounnt cannot be remove bcz it not has correlation with Target with with others 
   3. CoapplicantIncome also should not to remove 

'''
data.drop(["Loan_Amount_Term",'Unnamed: 0'],axis=1,inplace=True)





In [851]:
data.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status (Approved)
Gender,1.0,0.349424,0.213867,-0.059245,-0.002761,0.032644,0.098975,0.022447,-0.09494,0.064504
Married,0.349424,1.0,0.372864,-0.001652,0.015674,0.036717,0.183442,0.029095,-0.008139,0.112321
Dependents,0.213867,0.372864,1.0,-0.03008,0.040018,0.137604,0.172061,-0.032292,-0.023588,0.0316
Education,-0.059245,-0.001652,-0.03008,1.0,0.005085,0.131172,0.17278,0.056656,0.007953,0.068437
Self_Employed,-0.002761,0.015674,0.040018,0.005085,1.0,0.170785,0.120389,-0.023568,0.00983,-0.034715
ApplicantIncome,0.032644,0.036717,0.137604,0.131172,0.170785,1.0,0.49531,-0.056152,0.025068,-0.043152
LoanAmount,0.098975,0.183442,0.172061,0.17278,0.120389,0.49531,1.0,-0.040773,0.06243,-0.071753
Credit_History,0.022447,0.029095,-0.032292,0.056656,-0.023568,-0.056152,-0.040773,1.0,0.042091,0.52939
Property_Area,-0.09494,-0.008139,-0.023588,0.007953,0.00983,0.025068,0.06243,0.042091,1.0,0.121813
Loan_Status (Approved),0.064504,0.112321,0.0316,0.068437,-0.034715,-0.043152,-0.071753,0.52939,0.121813,1.0


In [None]:
## Step 1 Creating independent and dependent variable.

x = data[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Credit_History', 'Property_Area']]





y = data['Loan_Status (Approved)']


In [853]:
data['Loan_Status (Approved)'].value_counts() # data is not balanced 

Loan_Status (Approved)
1    332
0    148
Name: count, dtype: int64

## balancing data using SMOTE

In [854]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
x_res,y_res = sm.fit_resample(x,y)


In [855]:
y_res.value_counts()  # balanced

Loan_Status (Approved)
0    332
1    332
Name: count, dtype: int64

In [856]:
## Step 2 Creating training and testing data.

from sklearn.model_selection import train_test_split as tts
x_train,x_test,y_train,y_test = tts(x_res,y_res,test_size=0.25,random_state=42)

In [857]:
## Step 3 Model creation

from sklearn.linear_model import LogisticRegression as LogReg 
model = LogReg()

#  fitting the model
model.fit(x_train,y_train)

In [858]:
## Step 4 Prediction

y_predict = model.predict(x_test)
y_predict.shape

(166,)

In [859]:
y_predict_proba = model.predict_proba(x_test)
y_predict_proba

array([[0.23302201, 0.76697799],
       [0.90993238, 0.09006762],
       [0.28926615, 0.71073385],
       [0.15061248, 0.84938752],
       [0.11737086, 0.88262914],
       [0.27253847, 0.72746153],
       [0.70781574, 0.29218426],
       [0.55410291, 0.44589709],
       [0.9741423 , 0.0258577 ],
       [0.76900617, 0.23099383],
       [0.42241098, 0.57758902],
       [0.272898  , 0.727102  ],
       [0.84019402, 0.15980598],
       [0.3946832 , 0.6053168 ],
       [0.14004952, 0.85995048],
       [0.82933156, 0.17066844],
       [0.21802254, 0.78197746],
       [0.97520437, 0.02479563],
       [0.30511709, 0.69488291],
       [0.12647343, 0.87352657],
       [0.39030758, 0.60969242],
       [0.09142823, 0.90857177],
       [0.5630959 , 0.4369041 ],
       [0.26444655, 0.73555345],
       [0.81448248, 0.18551752],
       [0.12909407, 0.87090593],
       [0.24276427, 0.75723573],
       [0.96033369, 0.03966631],
       [0.23174584, 0.76825416],
       [0.08129988, 0.91870012],
       [0.

In [860]:

from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,classification_report,f1_score


In [861]:
confusion_matrix (y_test,y_predict)


array([[59, 25],
       [18, 64]], dtype=int64)

In [862]:

accuracy_score(y_test,y_predict)

0.7409638554216867

In [863]:
print(recall_score(y_test,y_predict))


0.7804878048780488


In [864]:
print(precision_score(y_test,y_predict))


0.7191011235955056


In [865]:
print(f1_score(y_test,y_predict))

0.7485380116959064


In [866]:

print(classification_report(y_test,y_predict))


              precision    recall  f1-score   support

           0       0.77      0.70      0.73        84
           1       0.72      0.78      0.75        82

    accuracy                           0.74       166
   macro avg       0.74      0.74      0.74       166
weighted avg       0.74      0.74      0.74       166



Class 0:

Precision is good (0.80), but recall is lower (0.70).

It means your model correctly identifies most class 0, but misses some.

Class 1:

Recall is high (0.82), so it's catching most positives.

Precision is 0.73, meaning it's okay but has some false positives.

✅ Overall Accuracy:
Accuracy = 0.76 → Your model is correct 76% of the time.

## Overall Accuracy: 0.76 → The model correctly predicted 76% of the cases