# Task-2: Loan Approval Prediction

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import(classification_report, confusion_matrix, roc_auc_score, roc_curve)

from imblearn.over_sampling import SMOTE

In [None]:
df = pd.read_csv("loan_prediction.csv")

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df['Loan_Status'].value_counts(dropna=False)

In [None]:
df.columns

In [None]:
for col in df.select_dtypes(include='object').columns:
    print(df[col].value_counts().head())
    print("-"*40)

In [None]:
df.rename(columns={'Unnamed:13': 'Loan_Status'},inplace=True)
df.rename(columns={'Loan_Status_x': 'Property_Area'}, inplace=True)

In [None]:
df.columns

In [None]:
df.drop(columns=['Loan_Status'],inplace=True)

In [None]:
df.rename(columns={'Unnamed: 13': 'Loan_Status'},inplace=True)

In [None]:
df.columns

In [None]:
df['Loan_Status'] = df['Loan_Status'].map({'Y':1, 'N':0})

In [None]:
df['Loan_Status'].isna().sum()

In [None]:
x = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

In [None]:
y.value_counts()

In [None]:
num_cols = x.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    x[col]=x[col].fillna(x[col].median())

In [None]:
cat_cols = x.select_dtypes(include=['object']).columns
for col in cat_cols:
    x[col]=x[col].fillna(x[col].mode()[0])

In [None]:
x.isna().sum()

In [None]:
x = pd.get_dummies(x,drop_first=True)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
sm = SMOTE(random_state=42)
x_train_sm,y_train_sm = sm.fit_resample(x_train,y_train)

In [None]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_sm)
x_test_scaled = scaler.transform(x_test)

lr = LogisticRegression(max_iter=1000)
lr.fit(x_train_scaled,y_train_sm)

y_pred = lr.predict(x_test_scaled)
y_prob = lr.predict_proba(x_test_scaled)[:,1]

print(classification_report(y_test,y_pred))
print("ROC-AUC:",roc_auc_score(y_test,y_prob))

In [None]:
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced',max_depth=None)

rf.fit(x_train_sm,y_train_sm)

y_pred_rf = rf.predict(x_test)
y_prob_rf = rf.predict_proba(x_test)[:,1]

print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))

# key Points:
•DataSet was cleaned, corrected, and validated.

•Missing values and Columns arrangement were handle.

•Class imbalance address using SMOTE.

•Two model compared : Ⅰ.Logistic Regression (interpretable)
 Ⅱ.Random Forest Classifier (powerful)

•Evaluation used Recall and ROC-AUC, Accuracy.

# Final Conclusion: 
1. This project succesfully demonstrate a end-to-end machine learnig pipeline for loan approval prediction.

2. As we have above models after initial data exploration we identified a wrongly name column and the data was imbalanced.Machine learing model required one clean target variable. The missname target columns can cause 1. Nan Values 2.Model training errors.

3. Then we did Mapping Target Varible it convert the categorical target labels into numeric form like { 'Y':1 and 'N':0 } , because scikit-learn model cann't work on text varibles that why it was converted to binary.
It required Binary encoding for classification tasks.

4. Use train-test-split model for preventing data from leaking, Ensure fair evaluation.

5. Also use SMOTE it improves the recall values and prevent bias toward majority class. 

6. Logistic Regression model is widely use in finance, it is interpretable and provide strong baseline model performance with excellent recall for approved loan, ensuring minimal rejection of eligible applicants.
              
              •Accuracy: 85%
              •Recall (Approved): 98%
              •ROC-AUC: ~0.80

              •Almost all eligible applicants are approved loan
              •Very few good customers are rejected toward loan

7. Random Forest model it handles non-linear relationships, enchance risk detection by capturing non-linear patterns in borrower behavior.
              
              •Higher recall for rejected loans
              •Improves ROC-AUC
              •Better balance between approval and risk