#### Preprocessing home loan dataset.

In [60]:
# Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import warnings
warnings.filterwarnings('ignore')


# Preprocessing Libraries
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder


# Statistical Libraries
from scipy import stats
from scipy.stats import zscore, skew


print("Libraries Imported successfully")




Libraries Imported successfully


In [26]:
home_loan = pd.read_csv("/Users/mac/Desktop/my_tasks/week_11/df.csv")

In [30]:
df = home_loan.copy()
df.head()

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,1
1,1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,1
2,2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,0
3,3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban,1
4,4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,0


In [31]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [32]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,1
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,1
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,0
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban,1
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,0


In [41]:
# Since loan id is just an identifier , let's drop it to compute our features correlation with our target

df.drop('Loan_ID', errors='ignore',axis =1,inplace=True)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,1
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,1
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,0
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban,1
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,0


#### EDA-Based Data Quality Assessment

In [43]:

# 1. Check for missing values (EDA showed no missing values)
print("\n1. Missing Values:")
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found (as expected from EDA)")

# 2. Check for duplicates
print("\n2. Duplicate Rows:")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")

# 3. Check skewness for variables identified in EDA as right-skewed
print("\n3. Skewness Analysis (EDA identified right-skewed variables):")
skewed_vars = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount','Loan_Amount_Term','Credit_History']
for var in skewed_vars:
    if var in df.columns:
        skewness = skew(df[var])
        print(f"{var}: skewness = {skewness:.3f} ({'right-skewed' if skewness > 0.5 else 'negatively skewed'})")

# 4. Check correlation with target (EDA evidence)
print("\n4. Correlation with Quality (EDA Evidence):")
correlations = df.select_dtypes(include=['number']).corr()['Loan_Status'].sort_values(key=abs, ascending=False)
print("High-signal features (|correlation| > 0.2):")
high_signal = correlations[abs(correlations) > 0.2].drop('Loan_Status')
for feature, corr in high_signal.items():
    print(f"  {feature}: {corr:.3f}")

print("\nLow-signal features (|correlation| < 0.1):")
low_signal = correlations[abs(correlations) < 0.1]
for feature, corr in low_signal.items():
    print(f"  {feature}: {corr:.3f}")


1. Missing Values:
No missing values found (as expected from EDA)

2. Duplicate Rows:
Number of duplicate rows: 0

3. Skewness Analysis (EDA identified right-skewed variables):
ApplicantIncome: skewness = 8.407 (right-skewed)
CoapplicantIncome: skewness = 4.240 (right-skewed)
LoanAmount: skewness = 2.230 (right-skewed)
Loan_Amount_Term: skewness = -2.690 (negatively skewed)
Credit_History: skewness = -1.847 (negatively skewed)

4. Correlation with Quality (EDA Evidence):
High-signal features (|correlation| > 0.2):

Low-signal features (|correlation| < 0.1):
  Credit_History: -0.052
  LoanAmount: 0.040
  Loan_Amount_Term: -0.031
  ApplicantIncome: 0.013
  CoapplicantIncome: 0.010


In [7]:
# Let's fix the missing values

home_loan.dropna()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
5,LP001054,Male,Yes,0,Not Graduate,Yes,2165,3422,152.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
361,LP002969,Male,Yes,1,Graduate,No,2269,2167,99.0,360.0,1.0,Semiurban
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


In [None]:
# Let's check for duplicates

duplicates = home_loan.duplicated().sum()
duplicates

# There are no duplicates in our dataset

np.int64(0)

In [None]:
# Check skewness for variables identified in EDA as right skewed
print("\n Skewness Analysis (EDA Identified right-skewed variable):")
skewed_vars = []

#### Based on EDA recommendations
Let's Log-Transform Skewed Variables

In [55]:
# Log transforming skewed variables

skewed_vars = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

for var in skewed_vars:
    min_val = df[var].min()

    if min_val <= 0:
        df[f'{var}_log'] = np.log1p(df[var])
        print(f"✓ {var}: Applied log1p transformation (had {min_val:.3f} minimum value)")

    else:
        df[f'{var}_log'] = np.log(df[var])
        print(f"✓ {var}: Applied log transformation")

        # Check skewness before and after
        original_skew = skew(df[var])
        transformed_skew = skew(df[f'{var}_log'])
        print(f"  Original skewness: {original_skew:.3f} → Transformed skewness: {transformed_skew:.3f}")

print(f"\nDataset shape after log transformation: {df.shape}")
print("New log-transformed columns:", [col for col in df.columns if '_log' in col])




✓ ApplicantIncome: Applied log1p transformation (had 0.000 minimum value)
✓ CoapplicantIncome: Applied log1p transformation (had 0.000 minimum value)
✓ LoanAmount: Applied log transformation
  Original skewness: 2.230 → Transformed skewness: -0.258

Dataset shape after log transformation: (367, 15)
New log-transformed columns: ['ApplicantIncome_log', 'CoapplicantIncome_log', 'LoanAmount_log']


In [50]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,ApplicantIncome_log,CoapplicantIncome_log,LoanAmount_log,Loan_Amount_Term_log,Credit_History_log
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,1,8.651899,0.0,4.70048,5.886104,0.693147
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,1,8.03171,7.313887,4.836282,5.886104,0.693147
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,0,8.517393,7.496097,5.337538,5.886104,0.693147
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban,1,7.758333,7.842671,4.60517,5.886104,0.693147
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,0,8.094684,0.0,4.356709,5.886104,0.693147


In [52]:
df.head()
df.drop('Credit_History_log',axis=1, inplace=True)

In [53]:
df.drop('Loan_Amount_Term_log',axis=1, inplace=True)

In [54]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,ApplicantIncome_log,CoapplicantIncome_log,LoanAmount_log
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,1,8.651899,0.0,4.70048
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,1,8.03171,7.313887,4.836282
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,0,8.517393,7.496097,5.337538
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban,1,7.758333,7.842671,4.60517
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,0,8.094684,0.0,4.356709


#### Outlier Treatment

In [57]:
# Outlier treatment based on EDA recommendations
print("=== OUTLIER TREATMENT (IQR-CAPPING METHOD) ===")
print("EDA recommended IQR-capping for extreme acidity/sulphates to preserve data points")

# Define numerical columns (excluding target)
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if 'Loan_Status' in numerical_cols:
    numerical_cols.remove('Loan_Status')

print(f"Treating outliers in {len(numerical_cols)} numerical features...")

# Apply IQR-capping method
outliers_capped = 0
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Count outliers before capping
    outliers_before = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
    
    if outliers_before > 0:
        # Cap outliers
        df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
        df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
        outliers_capped += outliers_before
        print(f"✓ {col}: Capped {outliers_before} outliers")

print(f"\nTotal outliers capped: {outliers_capped}")
print(f"Dataset shape after outlier treatment: {df.shape}")


=== OUTLIER TREATMENT (IQR-CAPPING METHOD) ===
EDA recommended IQR-capping for extreme acidity/sulphates to preserve data points
Treating outliers in 8 numerical features...

Total outliers capped: 0
Dataset shape after outlier treatment: (367, 15)


In [58]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,ApplicantIncome_log,CoapplicantIncome_log,LoanAmount_log
0,Male,Yes,0,Graduate,No,5720.0,0.0,110.0,360.0,1.0,Urban,1,8.651899,0.0,4.70048
1,Male,Yes,1,Graduate,No,3076.0,1500.0,126.0,360.0,1.0,Urban,1,8.03171,7.313887,4.836282
2,Male,Yes,2,Graduate,No,5000.0,1800.0,208.0,360.0,1.0,Urban,0,8.517393,7.496097,5.337538
3,Male,Yes,2,Graduate,No,2340.0,2546.0,100.0,360.0,1.0,Urban,1,7.758333,7.842671,4.60517
4,Male,No,0,Not Graduate,No,3276.0,0.0,78.0,360.0,1.0,Urban,0,8.094684,0.0,4.356709


#### Feature Selection


In [62]:
#Let's encode all categorical features
categorical_cols = df.select_dtypes(include=['object']).columns

# Encode them with LabelEncoder
label_enc = LabelEncoder()
for col in categorical_cols:
    df[col] = label_enc.fit_transform(df[col])





X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
anova_selector = SelectKBest(score_func=f_classif, k=8)
anova_selector.fit(X_scaled, y)
anova_features = X.columns[anova_selector.get_support()]
print("\nTop Features (ANOVA F-test):")
print(list(anova_features))


selected_features = df[["Married","Dependents","Education","Self_Employed","LoanAmount","Property_Area","ApplicantIncome_log","LoanAmount_log"]]


Top Features (ANOVA F-test):
['Married', 'Dependents', 'Education', 'Self_Employed', 'LoanAmount', 'Property_Area', 'ApplicantIncome_log', 'LoanAmount_log']


In [63]:
selected_features.head()

Unnamed: 0,Married,Dependents,Education,Self_Employed,LoanAmount,Property_Area,ApplicantIncome_log,LoanAmount_log
0,1,0,0,0,110.0,2,8.651899,4.70048
1,1,1,0,0,126.0,2,8.03171,4.836282
2,1,2,0,0,208.0,2,8.517393,5.337538
3,1,2,0,0,100.0,2,7.758333,4.60517
4,0,0,1,0,78.0,2,8.094684,4.356709


####    Data Splitting

In [66]:
df['Loan_Status'].value_counts()

selected_features = [col for col in selected_features if col in X.columns]



X_selected = X[selected_features]


# First splitL: 80% train + val, 20% test

X_temp, X_test,y_temp,y_test = train_test_split(X_selected,y,test_size = 0.2,random_state = 234)


# Second split: 75% train 25% val out of the 80%

X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size = 0.25, random_state=234)



print(f"\nData split results:")
print(f"Training set: {X_train.shape} ({(X_train.shape[0]/len(X_selected))*100:.1f}%)")
print(f"Validation set: {X_val.shape} ({(X_val.shape[0]/len(X_selected))*100:.1f}%)")
print(f"Test set: {X_test.shape} ({(X_test.shape[0]/len(X_selected))*100:.1f}%)")

# Check class distribution in each set (should be similar due to stratification)
print(f"\nClass distribution verification:")
print("Training set quality distribution:")
print(y_train.value_counts().sort_index())
print("\nValidation set quality distribution:")
print(y_val.value_counts().sort_index())
print("\nTest set quality distribution:")
print(y_test.value_counts().sort_index())




Data split results:
Training set: (219, 8) (59.7%)
Validation set: (74, 8) (20.2%)
Test set: (74, 8) (20.2%)

Class distribution verification:
Training set quality distribution:
Loan_Status
0    100
1    119
Name: count, dtype: int64

Validation set quality distribution:
Loan_Status
0    33
1    41
Name: count, dtype: int64

Test set quality distribution:
Loan_Status
0    38
1    36
Name: count, dtype: int64


In [67]:
# Apply StandardScaler as recommended by EDA
print("=== FEATURE SCALING (STANDARD SCALER) ===")
print("EDA recommended StandardScaler for distance-based models")

# Fit scaler on training data only (to avoid data leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# Transform validation and test sets
X_val_scaled = scaler.transform(X_val)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns)

X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

print("✓ Scaling applied successfully!")
print(f"Training set scaled - Mean: {X_train_scaled.mean().mean():.4f}, Std: {X_train_scaled.std().mean():.4f}")
print(f"Validation set scaled - Mean: {X_val_scaled.mean().mean():.4f}, Std: {X_val_scaled.std().mean():.4f}")
print(f"Test set scaled - Mean: {X_test_scaled.mean().mean():.4f}, Std: {X_test_scaled.std().mean():.4f}")

# Verify scaling worked correctly
print(f"\nScaling verification:")
print(f"Training set - Mean ≈ 0: {abs(X_train_scaled.mean().mean()) < 0.01}")
print(f"Training set - Std ≈ 1: {abs(X_train_scaled.std().mean() - 1) < 0.01}")

=== FEATURE SCALING (STANDARD SCALER) ===
EDA recommended StandardScaler for distance-based models
✓ Scaling applied successfully!
Training set scaled - Mean: 0.0000, Std: 1.0023
Validation set scaled - Mean: -0.0137, Std: 1.0418
Test set scaled - Mean: -0.0664, Std: 1.0319

Scaling verification:
Training set - Mean ≈ 0: True
Training set - Std ≈ 1: True


#### Saving Preprocessed Data

In [68]:
print("SAVING PREPROCESSED DATA ")

# Save scaled datasets
X_train_scaled.to_csv('X_train_scaled.csv', index=False)
X_val_scaled.to_csv('X_val_scaled.csv', index=False)
X_test_scaled.to_csv('X_test_scaled.csv', index=False)

# Save target variables
y_train.to_csv('y_train.csv', index=False)
y_val.to_csv('y_val.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

# Save preprocessing objects
import joblib
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(selected_features, 'selected_features.pkl')

# Save preprocessing summary
preprocessing_summary = {
    'original_shape': df.shape,
    'final_shape': df.shape,
    'selected_features': selected_features,
    'train_samples': X_train_scaled.shape[0],
    'val_samples': X_val_scaled.shape[0],
    'test_samples': X_test_scaled.shape[0],
    'scaling_method': 'StandardScaler',
    'outlier_treatment': 'IQR_capping',
}

import json
with open('preprocessing_summary.json', 'w') as f:
    json.dump(preprocessing_summary, f, indent=2)

print("- Preprocessed data saved successfully!")
print("\nFiles created:")
print("- X_train_scaled.csv, X_val_scaled.csv, X_test_scaled.csv")
print("- y_train.csv, y_val.csv, y_test.csv")
print("- scaler.pkl, selected_features.pkl")


SAVING PREPROCESSED DATA 
- Preprocessed data saved successfully!

Files created:
- X_train_scaled.csv, X_val_scaled.csv, X_test_scaled.csv
- y_train.csv, y_val.csv, y_test.csv
- scaler.pkl, selected_features.pkl
