In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Load the raw loan training data and print the first rows as a quick sanity check.
csv_path = r'D:\Downloads\loan_train.csv'
df = pd.read_csv(csv_path)
print(df.head())
# ---------------------
# 1) Basic imputations
# ---------------------
def _fill_with_mode(series, fallback=None):
    """Return *series* with missing values replaced by its most frequent value.

    ``Series.mode()`` ignores NaN, so it is empty when every value is missing;
    indexing it with ``[0]`` would then raise.  In that case *fallback* is used
    when one is given, otherwise the series is returned unchanged.
    """
    modes = series.mode()
    if not modes.empty:
        return series.fillna(modes.iloc[0])
    if fallback is not None:
        return series.fillna(fallback)
    return series


# Numeric imputations
if 'LoanAmount' in df.columns:
    df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

# Missing incomes are treated as "no income"
if 'ApplicantIncome' in df.columns:
    df['ApplicantIncome'] = df['ApplicantIncome'].fillna(0)

if 'CoapplicantIncome' in df.columns:
    df['CoapplicantIncome'] = df['CoapplicantIncome'].fillna(0)

if 'Loan_Amount_Term' in df.columns:
    df['Loan_Amount_Term'] = _fill_with_mode(df['Loan_Amount_Term'])

# Categorical imputations (use mode or domain default)
if 'Credit_History' in df.columns:
    # Falls back to 1.0 when the column has no observed value at all
    df['Credit_History'] = _fill_with_mode(df['Credit_History'], fallback=1.0)

if 'Self_Employed' in df.columns:
    df['Self_Employed'] = df['Self_Employed'].fillna('No')

# Same empty-mode guard as Credit_History, but with no domain default:
# an entirely missing column is left as-is instead of raising.
for mode_filled_col in ('Gender', 'Married', 'Dependents'):
    if mode_filled_col in df.columns:
        df[mode_filled_col] = _fill_with_mode(df[mode_filled_col])

# ---------------------
# 2) Dependents cleanup
# ---------------------
if 'Dependents' in df.columns:
    # Convert '3+' to '3' so the column can be parsed as a number
    df['Dependents'] = df['Dependents'].astype(str).replace('3+', '3')
    # Anything still non-numeric (including the string 'nan' produced by
    # astype(str) on a missing value) is coerced to NaN
    df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce')
    if df['Dependents'].isnull().any():
        dependents_mode = df['Dependents'].mode()
        # mode() is empty when no value could be parsed; fall back to 0 so the
        # integer cast below cannot fail on NaN
        dependents_fill = dependents_mode.iloc[0] if not dependents_mode.empty else 0
        df['Dependents'] = df['Dependents'].fillna(dependents_fill)
    df['Dependents'] = df['Dependents'].astype(int)

# ---------------------
# 3) Engineered numeric features (after imputations)
# ---------------------
# Both ratio features below need the two income columns; check once.
has_income_cols = {'ApplicantIncome', 'CoapplicantIncome'}.issubset(df.columns)

# Total income
if has_income_cols:
    df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
else:
    df['TotalIncome'] = np.nan

# Income to loan ratio
if 'LoanAmount' in df.columns:
    # Replace any zero with the median to avoid dividing by zero; 1.0 is used
    # only when the column has no values to take a median from
    loan_median = df['LoanAmount'].median() if not df['LoanAmount'].isnull().all() else 1.0
    df['LoanAmount'] = df['LoanAmount'].replace(0, loan_median)
    df['Income_to_Loan_Ratio'] = df['TotalIncome'] / df['LoanAmount']
else:
    df['Income_to_Loan_Ratio'] = np.nan

# Applicant / coapplicant income ratio. Coapplicant income may be zero, so
# those rows take the applicant income itself instead of a division result.
if has_income_cols:
    df['Applicant_to_Coapp_Ratio'] = np.where(
        df['CoapplicantIncome'] == 0,
        df['ApplicantIncome'],
        df['ApplicantIncome'] / df['CoapplicantIncome']
    )
else:
    # Keep the output schema stable, matching the NaN placeholders used for
    # the other engineered columns when their inputs are missing
    df['Applicant_to_Coapp_Ratio'] = np.nan

# ---------------------
# 4) Outlier capping
# ---------------------
# ApplicantIncome: values above the 99th percentile are set to that
# percentile, truncated to a whole number.
if 'ApplicantIncome' in df.columns:
    cap_val = df['ApplicantIncome'].quantile(0.99)
    above_cap = df['ApplicantIncome'] > cap_val
    df.loc[above_cap, 'ApplicantIncome'] = int(cap_val)

# LoanAmount: values outside the 1.5 * IQR fences are pulled onto the fence.
if 'LoanAmount' in df.columns:
    Q1 = df['LoanAmount'].quantile(0.25)
    Q3 = df['LoanAmount'].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # Both masks are taken before any write; a value moved up to `lower`
    # can never exceed `upper`, so the result matches sequential capping.
    below_fence = df['LoanAmount'] < lower
    above_fence = df['LoanAmount'] > upper
    df.loc[below_fence, 'LoanAmount'] = lower
    df.loc[above_fence, 'LoanAmount'] = upper

# ---------------------
# 5) Categorical mappings (clear & consistent)
# ---------------------
# Binary encodings, one entry per column (convention: 1 = positive / Yes / Male).
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
}

for column, encoding in binary_encodings.items():
    if column not in df.columns:
        continue
    encoded = df[column].map(encoding)
    # Values absent from the encoding (including missing ones) become 0
    df[column] = encoded.fillna(0).astype(int)

# ---------------------
# 6) One-hot encode Property_Area (keeps deterministic columns)
# ---------------------
if 'Property_Area' in df.columns:
    area_columns = ['Property_Rural', 'Property_Semiurban', 'Property_Urban']
    prop_dummies = pd.get_dummies(df['Property_Area'], prefix='Property')
    # Force exactly these three indicator columns, in this order: categories
    # absent from the data are added as all-zero columns, then cast to int.
    prop_dummies = prop_dummies.reindex(columns=area_columns, fill_value=0).astype(int)
    # Swap the original text column for its indicator columns
    df = pd.concat([df.drop(columns=['Property_Area']), prop_dummies], axis=1)

# ---------------------
# 7) Target mapping if present (optional)
# ---------------------
if 'Loan_Status' in df.columns:
    # 'Y' -> 1, 'N' -> 0; any other value becomes NaN. Keep this mapping
    # consistent with whatever was used elsewhere.
    status_codes = {'Y': 1, 'N': 0}
    df['Loan_Status'] = df['Loan_Status'].map(status_codes)

# Cleaning is complete at this point; display the first rows.
df.head()

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,TotalIncome,Income_to_Loan_Ratio,Applicant_to_Coapp_Ratio,Property_Rural,Property_Semiurban,Property_Urban
0,LP001002,1,0,0,1,0,5849,0.0,128.0,360.0,1.0,1,5849.0,45.695312,5849.0,0,0,1
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,6091.0,47.585938,3.039125,1,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,3000.0,45.454545,3000.0,0,0,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,4941.0,41.175,1.09542,0,0,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,6000.0,42.553191,6000.0,0,0,1
