In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_selection import chi2, VarianceThreshold, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import kneighbors_graph



In [3]:
df = pd.read_csv("loan_default.csv")

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (255347, 18)


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [5]:
y = df["Default"]
X = df.drop(["LoanID", "Default"], axis=1)

X.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No


In [7]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

print("Numerical columns:", list(num_cols))
print("Categorical columns:", list(cat_cols))

Numerical columns: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
Categorical columns: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']


In [9]:
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

if len(cat_cols) > 0:
    X[cat_cols] = X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

In [11]:
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

X.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,56,85994,50587,520,80,4,15.23,36,0.44,0,0,0,1,1,4,1
1,69,50432,124440,458,15,1,4.81,60,0.68,2,0,1,0,0,4,1
2,46,84208,129188,451,26,3,21.17,24,0.31,2,3,0,1,1,0,0
3,32,31713,44799,743,0,3,7.07,24,0.23,1,0,1,0,0,1,0
4,60,20437,9139,633,8,4,6.51,48,0.73,0,3,0,0,1,0,0


In [13]:
X_std = X.copy()
scaler_std = StandardScaler()
X_std[num_cols] = scaler_std.fit_transform(X_std[num_cols])

X_std.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,0.83399,0.089693,-1.086833,-0.341492,0.590533,1.341937,0.261771,-0.001526,-0.260753,0,0,0,1,1,4,1
1,1.701221,-0.823021,-0.044309,-0.731666,-1.285731,-1.343791,-1.30835,1.412793,0.778585,2,0,1,0,0,4,1
2,0.166888,0.043854,0.022715,-0.775718,-0.968209,0.446694,1.156831,-0.708685,-0.823728,2,3,0,1,1,0,0
3,-0.767053,-1.303452,-1.168538,1.061875,-1.718715,0.446694,-0.967805,-0.708685,-1.170174,1,0,1,0,0,1,0
4,1.10083,-1.592855,-1.671921,0.369631,-1.48779,1.341937,-1.052188,0.705634,0.995114,0,3,0,0,1,0,0


In [15]:
X_mm = X.copy()
scaler_mm = MinMaxScaler()
X_mm[num_cols] = scaler_mm.fit_transform(X_mm[num_cols])

X_mm.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,0.745098,0.525885,0.18607,0.400729,0.672269,1.0,0.575217,0.5,0.425,0,0,0,1,1,4,1
1,1.0,0.262461,0.487512,0.287796,0.12605,0.0,0.122174,1.0,0.725,2,0,1,0,0,4,1
2,0.54902,0.512656,0.506892,0.275046,0.218487,0.666667,0.833478,0.25,0.2625,2,3,0,1,1,0,0
3,0.27451,0.123801,0.162446,0.806922,0.0,0.666667,0.220435,0.25,0.1625,1,0,1,0,0,1,0
4,0.823529,0.040274,0.016894,0.606557,0.067227,1.0,0.196087,0.75,0.7875,0,3,0,0,1,0,0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
pearson_scores = {}

for col in X_std.columns:
    pearson_scores[col] = abs(np.corrcoef(X_std[col], y)[0, 1])

pearson_df = pd.DataFrame.from_dict(
    pearson_scores, orient="index", columns=["Pearson"]
).sort_values(by="Pearson", ascending=False)

pearson_df.head(10)

Unnamed: 0,Pearson
Age,0.167783
InterestRate,0.131273
Income,0.099119
MonthsEmployed,0.097374
LoanAmount,0.086659
EmploymentType,0.04101
HasCoSigner,0.039109
HasDependents,0.034678
CreditScore,0.034166
NumCreditLines,0.02833


In [21]:
chi_scores, _ = chi2(X_mm, y)

chi_df = pd.DataFrame({
    "Feature": X_mm.columns,
    "ChiSquare": chi_scores
}).sort_values(by="ChiSquare", ascending=False)

chi_df.head(10)

Unnamed: 0,Feature,ChiSquare
0,Age,1242.117139
6,InterestRate,733.158786
1,Income,417.948106
4,MonthsEmployed,410.09432
10,EmploymentType,357.415994
2,LoanAmount,320.438017
15,HasCoSigner,195.232513
13,HasDependents,153.451151
9,Education,111.530314
12,HasMortgage,66.695496


In [23]:
vt = VarianceThreshold(threshold=0.0)
vt.fit(X)

variance_df = pd.DataFrame({
    "Feature": X.columns,
    "Variance": vt.variances_
}).sort_values(by="Variance", ascending=False)

variance_df.head(10)

Unnamed: 0,Feature,Variance
2,LoanAmount,244999.0
1,Income,134999.0
3,CreditScore,549.0
4,MonthsEmployed,119.0
0,Age,51.0
7,LoanTerm,48.0
6,InterestRate,23.0
14,LoanPurpose,1.995778
9,Education,1.251766
10,EmploymentType,1.248484


In [25]:
ig_scores = mutual_info_classif(X, y, random_state=42)

ig_df = pd.DataFrame({
    "Feature": X.columns,
    "InformationGain": ig_scores
}).sort_values(by="InformationGain", ascending=False)

ig_df.head(10)

Unnamed: 0,Feature,InformationGain
13,HasDependents,0.036347
15,HasCoSigner,0.034824
12,HasMortgage,0.034449
11,MaritalStatus,0.018282
0,Age,0.015031
10,EmploymentType,0.011974
9,Education,0.011412
14,LoanPurpose,0.010563
5,NumCreditLines,0.009501
6,InterestRate,0.008728


In [27]:
mi_scores = mutual_info_classif(X_std, y, random_state=42)

mi_df = pd.DataFrame({
    "Feature": X.columns,
    "MutualInformation": mi_scores
}).sort_values(by="MutualInformation", ascending=False)

mi_df.head(10)

Unnamed: 0,Feature,MutualInformation
13,HasDependents,0.036347
15,HasCoSigner,0.034824
12,HasMortgage,0.034449
11,MaritalStatus,0.018282
0,Age,0.014877
10,EmploymentType,0.011974
9,Education,0.011412
14,LoanPurpose,0.010563
5,NumCreditLines,0.009424
6,InterestRate,0.00872


In [29]:
def fisher_score(X, y):
    scores = {}
    classes = np.unique(y)

    for col in X.columns:
        mean_all = np.mean(X[col])
        num, den = 0, 0

        for c in classes:
            Xc = X[col][y == c]
            num += len(Xc) * (np.mean(Xc) - mean_all) ** 2
            den += len(Xc) * np.var(Xc)

        scores[col] = num / den if den != 0 else 0

    return scores

fisher_df = pd.DataFrame.from_dict(
    fisher_score(X_std, y), orient="index", columns=["FisherScore"]
).sort_values(by="FisherScore", ascending=False)

fisher_df.head(10)

Unnamed: 0,FisherScore
Age,0.028967
InterestRate,0.017535
Income,0.009922
MonthsEmployed,0.009572
LoanAmount,0.007567
EmploymentType,0.001685
HasCoSigner,0.001532
HasDependents,0.001204
CreditScore,0.001169
NumCreditLines,0.000803


In [31]:
from sklearn.neighbors import kneighbors_graph

# Take a manageable sample (important for large datasets)
X_sample = X_std.sample(n=5000, random_state=42)

# Build k-nearest neighbor graph
W = kneighbors_graph(
    X_sample,
    n_neighbors=5,
    mode="connectivity",
    include_self=True
)

laplacian_scores = {}

for col in X_sample.columns:
    f = X_sample[col].values
    num = np.sum(W.multiply((f[:, None] - f[None, :]) ** 2))
    den = np.var(f)
    laplacian_scores[col] = num / den if den != 0 else 0

# Convert to DataFrame and sort (LOWER score = BETTER feature)
laplacian_df = pd.DataFrame.from_dict(
    laplacian_scores,
    orient="index",
    columns=["LaplacianScore"]
).sort_values(by="LaplacianScore")

laplacian_df.head(10)




Unnamed: 0,LaplacianScore
LoanPurpose,4613.770955
EmploymentType,7102.94428
Education,7223.706699
NumCreditLines,8181.745095
LoanTerm,8488.020894
MonthsEmployed,8562.833744
InterestRate,8674.798189
CreditScore,8697.220827
Income,8743.925067
DTIRatio,8761.749417


In [33]:
def entropy(x):
    probs = np.bincount(x) / len(x)
    probs = probs[probs > 0]
    return -np.sum(probs * np.log2(probs))

su_scores = {}

for col in X.columns:
    ig = mutual_info_classif(X[[col]], y)[0]
    su_scores[col] = 2 * ig / (entropy(X[col]) + entropy(y))

su_df = pd.DataFrame.from_dict(
    su_scores, orient="index", columns=["SymmetricalUncertainty"]
).sort_values(by="SymmetricalUncertainty", ascending=False)

su_df.head(10)

Unnamed: 0,SymmetricalUncertainty
HasCoSigner,0.046449
HasDependents,0.045769
HasMortgage,0.044846
MaritalStatus,0.017076
EmploymentType,0.009803
NumCreditLines,0.008112
Education,0.008069
LoanPurpose,0.006797
LoanTerm,0.00658
Age,0.005


In [35]:
mb_scores = {}

for col in X.columns:
    mb_scores[col] = mutual_info_classif(X[[col]], y)[0]

mb_df = pd.DataFrame.from_dict(
    mb_scores, orient="index", columns=["MarkovBlanketScore"]
).sort_values(by="MarkovBlanketScore", ascending=False)

mb_df.head(10)

Unnamed: 0,MarkovBlanketScore
HasCoSigner,0.035946
HasDependents,0.035327
HasMortgage,0.033962
MaritalStatus,0.018878
Age,0.014169
EmploymentType,0.011529
Education,0.011272
LoanPurpose,0.009844
NumCreditLines,0.009604
LoanTerm,0.008867


In [37]:
pearson_df.to_csv("pearson_results.csv")
chi_df.to_csv("chi_square_results.csv")
variance_df.to_csv("variance_results.csv")
ig_df.to_csv("information_gain_results.csv")
mi_df.to_csv("mutual_information_results.csv")
fisher_df.to_csv("fisher_score_results.csv")
laplacian_df.to_csv("laplacian_score_results.csv")
su_df.to_csv("symmetrical_uncertainty_results.csv")
mb_df.to_csv("markov_blanket_results.csv")

In [39]:
with pd.ExcelWriter("feature_selection_results.xlsx") as writer:
    pearson_df.to_excel(writer, sheet_name="Pearson")
    chi_df.to_excel(writer, sheet_name="ChiSquare", index=False)
    variance_df.to_excel(writer, sheet_name="Variance", index=False)
    ig_df.to_excel(writer, sheet_name="InformationGain", index=False)
    mi_df.to_excel(writer, sheet_name="MutualInformation", index=False)
    fisher_df.to_excel(writer, sheet_name="FisherScore")
    laplacian_df.to_excel(writer, sheet_name="LaplacianScore")
    su_df.to_excel(writer, sheet_name="SymmetricalUncertainty")
    mb_df.to_excel(writer, sheet_name="MarkovBlanket")

In [41]:
all_results = pd.concat({
    "Pearson": pearson_df,
    "ChiSquare": chi_df.set_index("Feature"),
    "Variance": variance_df.set_index("Feature"),
    "InformationGain": ig_df.set_index("Feature"),
    "MutualInformation": mi_df.set_index("Feature"),
    "FisherScore": fisher_df,
    "LaplacianScore": laplacian_df,
    "SymmetricalUncertainty": su_df,
    "MarkovBlanket": mb_df
})

all_results.to_csv("feature_selection_results.csv")