<a href="https://colab.research.google.com/github/AdithGH762/Fundamentals-of-ML/blob/main/Unbalanced_Data_SMOTE_LAB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================
# STEP 1: Install Required Library
# ==========================================
!pip install imbalanced-learn

# ==========================================
# STEP 2: Import Libraries
# ==========================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# ==========================================
# STEP 3: Load Dataset
# ==========================================
# Loan.csv must already be uploaded to the Colab session at this path.
df = pd.read_csv('/content/Loan.csv')

# Preview the raw frame: a heading line followed by the first five rows.
print("First 5 rows:", df.head(), sep="\n")

# ==========================================
# STEP 4: Drop Unnecessary Columns
# ==========================================
# Remove the Loan_ID column in place; errors='ignore' turns this into a no-op
# when the column is not present, so no explicit membership check is needed.
df.drop(columns=['Loan_ID'], errors='ignore', inplace=True)

# ==========================================
# STEP 5: Encode Categorical Variables
# ==========================================
# Feature columns: encode only the observed values and leave missing entries
# as NaN. Casting a column to str first would turn NaN into the literal label
# "nan", which LabelEncoder then treats as a genuine category, so the missing
# values would never reach the imputer in Step 9.
categorical_features = [
    col for col in df.columns
    if df[col].dtype == 'object' and col != 'Loan_Status'
]
for col in categorical_features:
    observed = df[col].notna()
    codes = pd.Series(np.nan, index=df.index, dtype='float64')
    if observed.any():
        codes.loc[observed] = LabelEncoder().fit_transform(
            df.loc[observed, col].astype(str)
        )
    df[col] = codes

# Target column: encoded exactly as before; `le` stays fitted on the target so
# its classes_ can be used to map predictions back to the original labels.
le = LabelEncoder()
if df['Loan_Status'].dtype == 'object':
    df['Loan_Status'] = le.fit_transform(df['Loan_Status'].astype(str))

# ==========================================
# STEP 6: Separate Features & Target
# ==========================================
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

# ==========================================
# STEP 7: Check Original Class Distribution
# ==========================================
class_counts = y.value_counts()
print("\nOriginal Class Distribution:")
print(class_counts)

majority_class = class_counts.idxmax()
minority_class = class_counts.idxmin()

print("\nMajority Class:", majority_class)
print("Minority Class:", minority_class)

# ==========================================
# STEP 8: Train-Test Split
# ==========================================
# Split BEFORE imputing so that the imputation statistics are learned from the
# training rows only and nothing about the test rows leaks into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# ==========================================
# STEP 9: Handle Missing Values (Important for SMOTE)
# ==========================================
# Numeric features get the training mean; encoded categorical features get the
# most frequent training code, so an imputed category is always a real one.
numeric_features = [col for col in X.columns if col not in categorical_features]

# Work on copies so the column assignments below never write through to X.
X_train = X_train.copy()
X_test = X_test.copy()
for cols, strategy in (
    (numeric_features, 'mean'),
    (categorical_features, 'most_frequent'),
):
    if not cols:
        continue
    imputer = SimpleImputer(strategy=strategy)
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Rebuild X as the complete imputed feature table in its original row order,
# so X is still a NaN-free DataFrame for any code that uses it afterwards.
X = pd.concat([X_train, X_test]).loc[X.index]

# ==========================================
# STEP 10: Baseline Model (Imbalanced Data)
# ==========================================
# Reference point: a random forest trained on the training split as-is, with
# no resampling, evaluated on the held-out test split.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

print("\n=== Baseline Model (Imbalanced Data) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ==========================================
# STEP 11: Manual Oversampling (Duplicate Minority)
# ==========================================
# Put features and target back into one table so rows can be picked by class.
train_data = pd.concat([X_train, y_train], axis='columns')
labels = train_data['Loan_Status']

majority_data = train_data.loc[labels == majority_class]
minority_data = train_data.loc[labels == minority_class]

# Draw minority rows with replacement until there are as many as majority rows.
minority_upsampled = minority_data.sample(
    n=majority_data.shape[0], replace=True, random_state=42
)

balanced_data = pd.concat([majority_data, minority_upsampled])

# Split the balanced table back into features and target.
X_train_dup = balanced_data.drop(columns='Loan_Status')
y_train_dup = balanced_data['Loan_Status']

# Same classifier settings as the baseline; only the training rows differ.
model_dup = RandomForestClassifier(random_state=42).fit(X_train_dup, y_train_dup)
y_pred_dup = model_dup.predict(X_test)

print("\n=== Manual Duplication Oversampling ===")
print("Accuracy:", accuracy_score(y_test, y_pred_dup))
print(classification_report(y_test, y_pred_dup))

# ==========================================
# STEP 12: SMOTE Oversampling
# ==========================================
# Resample the training split only; the test split is left untouched so the
# evaluation below is comparable with the previous two models.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE Class Distribution:")
print(pd.Series(y_train_smote).value_counts())

# Same classifier settings as the baseline, trained on the resampled data.
model_smote = RandomForestClassifier(random_state=42).fit(
    X_train_smote, y_train_smote
)
y_pred_smote = model_smote.predict(X_test)

print("\n=== SMOTE Oversampling ===")
print("Accuracy:", accuracy_score(y_test, y_pred_smote))
print(classification_report(y_test, y_pred_smote))


First 5 rows:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2     