In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample


In [2]:
# Load the pre-split training and test sets (paths are relative to the
# notebook's working directory).
train_df = pd.read_csv("data/credit_data_train.csv")
test_df  = pd.read_csv("data/credit_data_test.csv")

# Quick look at the raw training data.
print(train_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
train_df.info()


    Loan_ID  Gender Married Dependents Education Self_Employed  \
0  LP002087  Female      No          0  Graduate            No   
1  LP002114  Female      No          0  Graduate            No   
2  LP002143  Female     Yes          0  Graduate            No   
3  LP002144  Female      No        NaN  Graduate            No   
4  LP002194  Female      No          0  Graduate           Yes   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             2500                0.0        67.0             360.0   
1             4160                0.0        71.0             360.0   
2             2423              505.0       130.0             360.0   
3             3813                0.0       116.0             180.0   
4            15759                0.0        55.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0     Semiurban           Y  
2             1.0     Semiurban       

In [3]:
target_col = "Loan_Status"
gender_col = "Gender"

# Encode the label as 1 ("Y") / 0 ("N").
# The frames are overwritten in place, so running this cell a second time would
# push the already-encoded 0/1 values through the Y/N mapping and turn every
# label into NaN.  Skipping columns that are already numeric keeps the cell
# safe to re-run without changing the result of the first run.
label_codes = {"Y": 1, "N": 0}
for frame in (train_df, test_df):
    if not pd.api.types.is_numeric_dtype(frame[target_col]):
        frame[target_col] = frame[target_col].map(label_codes)


In [4]:
# Separate predictors from the label; Loan_ID is removed together with the
# target so it never reaches the models.
non_feature_cols = [target_col, "Loan_ID"]

X_train, y_train = train_df.drop(columns=non_feature_cols), train_df[target_col]
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df[target_col]


In [5]:
# Column groups for the preprocessing pipeline, chosen purely by dtype.
numeric_dtypes = ["int64", "float64"]
numeric_features = X_train.select_dtypes(include=numeric_dtypes).columns
categorical_features = X_train.select_dtypes(include="object").columns


In [6]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode.  handle_unknown="ignore" lets transform() accept categories that
# were not present when the encoder was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [7]:
# Candidate classifiers, keyed by the label shown in the results table.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("kNN", KNeighborsClassifier(n_neighbors=5)),
    ("Naive Bayes", GaussianNB()),
    ("Random Forest", RandomForestClassifier(random_state=42)),
])


In [8]:
# Metric label -> scoring function, in the column order of the summary table.
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}

results = {}
for name, model in models.items():
    # Same preprocessing for every candidate; only the final estimator changes.
    clf = Pipeline(steps=[("preprocess", preprocessor), ("classifier", model)])
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    results[name] = {
        label: score(y_test, y_pred) for label, score in metric_fns.items()
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)


                     Accuracy  Precision  Recall        F1
Logistic Regression    0.6500   0.588235   1.000  0.740741
kNN                    0.6500   0.590909   0.975  0.735849
Naive Bayes            0.5125   0.512195   0.525  0.518519
Random Forest          0.5750   0.541667   0.975  0.696429


In [9]:
def evaluate_by_gender(X_tr, y_tr, X_te, y_te, model, preprocessor, gender_col):
    """Fit ``preprocessor`` + ``model`` and score the test predictions per group.

    A two-step pipeline (preprocess -> classifier) is fitted on the training
    data, used to predict ``X_te``, and the predictions are then scored
    separately for every distinct value found in ``X_te[gender_col]``.

    Parameters
    ----------
    X_tr, y_tr : training features and labels passed to ``Pipeline.fit``.
    X_te : DataFrame of test features.  It must contain ``gender_col`` (the
        column is needed for grouping even if the preprocessor does not use it).
    y_te : test labels, one per row of ``X_te`` in the same order; a Series or
        any array-like.
    model : estimator used as the final pipeline step.
    preprocessor : transformer used as the first pipeline step.  It is fitted
        in place by this call.
    gender_col : name of the column in ``X_te`` that defines the groups.

    Returns
    -------
    dict
        ``{group_value: {"Recall": float, "F1": float}}`` with one entry per
        distinct value of ``gender_col``.  Rows whose group value is missing
        are reported under a NaN key.
    """
    clf = Pipeline([
        ("preprocess", preprocessor),
        ("classifier", model)
    ])

    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)

    # Work on a copy so the caller's test frame is not modified.
    eval_df = X_te.copy()
    # Assign positionally (as a plain array) so the labels line up with the
    # rows of X_te regardless of the index carried by y_te.
    eval_df["y_true"] = np.asarray(y_te)
    eval_df["y_pred"] = y_pred

    groups = eval_df[gender_col]
    metrics = {}
    for g in groups.unique():
        # unique() reports missing values as one entry, but comparing a column
        # with NaN is False for every row, which used to leave that group with
        # an empty subset.  Select the missing-value rows with isna() instead.
        mask = groups.isna() if pd.isna(g) else groups == g
        subset = eval_df[mask]
        metrics[g] = {
            "Recall": recall_score(subset["y_true"], subset["y_pred"]),
            "F1": f1_score(subset["y_true"], subset["y_pred"])
        }
    return metrics


In [10]:
# Reference run: logistic regression trained on every feature, Gender included.
baseline_gender_results = evaluate_by_gender(
    X_tr=X_train,
    y_tr=y_train,
    X_te=X_test,
    y_te=y_test,
    model=LogisticRegression(max_iter=1000),
    preprocessor=preprocessor,
    gender_col=gender_col,
)

print("Baseline gender-wise results:", baseline_gender_results, sep="\n")


Baseline gender-wise results:
{'Female': {'Recall': 1.0, 'F1': 0.7692307692307693}, 'Male': {'Recall': 1.0, 'F1': 0.7142857142857143}}


In [11]:
# Feature frames with the Gender column removed (the originals are left intact).
X_train_no_gender, X_test_no_gender = (
    frame.drop(columns=[gender_col]) for frame in (X_train, X_test)
)


In [12]:
# Column groups recomputed on the gender-free training frame.
num_no_gender = X_train_no_gender.select_dtypes(include=["int64", "float64"]).columns
cat_no_gender = X_train_no_gender.select_dtypes(include="object").columns

# Same recipe as the baseline preprocessor, built from fresh (unfitted) steps.
numeric_pipe_no_gender = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_pipe_no_gender = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor_no_gender = ColumnTransformer(transformers=[
    ("num", numeric_pipe_no_gender, num_no_gender),
    ("cat", categorical_pipe_no_gender, cat_no_gender),
])


In [13]:
# Train without Gender, but pass the full X_test: evaluate_by_gender needs the
# Gender column to group the predictions.  preprocessor_no_gender was built
# from the gender-free column lists, so Gender is not among the model inputs.
no_gender_results = evaluate_by_gender(
    X_tr=X_train_no_gender,
    y_tr=y_train,
    X_te=X_test,
    y_te=y_test,
    model=LogisticRegression(max_iter=1000),
    preprocessor=preprocessor_no_gender,
    gender_col=gender_col,
)

print("Without gender feature:", no_gender_results, sep="\n")


Without gender feature:
{'Female': {'Recall': 1.0, 'F1': 0.7692307692307693}, 'Male': {'Recall': 1.0, 'F1': 0.7142857142857143}}


In [14]:
# Work on a copy so the resampling below never touches train_df.
train_full = train_df.copy()

# Split the training rows by encoded outcome (1 -> approved, 0 -> rejected).
outcome = train_full[target_col]
approved = train_full[outcome.eq(1)]
rejected = train_full[outcome.eq(0)]

# Draw rejected rows with replacement until there are exactly as many as
# approved rows; the fixed seed keeps the draw reproducible.
resample_options = dict(replace=True, n_samples=len(approved), random_state=42)
rejected_upsampled = resample(rejected, **resample_options)

# Approved rows first, then the resampled rejected rows (original indices kept).
balanced_df = pd.concat([approved, rejected_upsampled])


In [15]:
# Predictors / label for the class-balanced training set.
X_train_bal, y_train_bal = (
    balanced_df.drop(columns=[target_col, "Loan_ID"]),
    balanced_df[target_col],
)


In [16]:
# Column groups recomputed on the balanced training frame.
num_bal = X_train_bal.select_dtypes(include=["int64", "float64"]).columns
cat_bal = X_train_bal.select_dtypes(include="object").columns

# Same recipe as the baseline preprocessor, built from fresh (unfitted) steps.
numeric_pipe_bal = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_pipe_bal = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor_bal = ColumnTransformer(transformers=[
    ("num", numeric_pipe_bal, num_bal),
    ("cat", categorical_pipe_bal, cat_bal),
])


In [17]:
# Train on the class-balanced data, evaluate on the untouched test set.
balanced_results = evaluate_by_gender(
    X_tr=X_train_bal,
    y_tr=y_train_bal,
    X_te=X_test,
    y_te=y_test,
    model=LogisticRegression(max_iter=1000),
    preprocessor=preprocessor_bal,
    gender_col=gender_col,
)

print("Balanced training data:", balanced_results, sep="\n")


Balanced training data:
{'Female': {'Recall': 0.3, 'F1': 0.36363636363636365}, 'Male': {'Recall': 0.75, 'F1': 0.6}}
