<a href="https://colab.research.google.com/github/Adarsh0911/Elements-of-Aiml-LAb-Adarsh-Singh/blob/main/Aiml_exp_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate an imbalanced two-class dataset (1000 samples, 5 features).
# random_state pins the generator so every run produces the same data.
X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.9, 0.1], # 90% majority, 10% minority
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=5,
    n_clusters_per_class=1,
    n_samples=1000,
    random_state=42
)

# Convert to DataFrame: one "Feature_<i>" column per column of X, plus the label
df = pd.DataFrame(X, columns=[f"Feature_{i}" for i in range(X.shape[1])])
df['Target'] = y

# Display basic details.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
print("Dataset Info:")
df.info()

# Show class distribution
print("\nClass Distribution:")
print(df['Target'].value_counts())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Feature_0  1000 non-null   float64
 1   Feature_1  1000 non-null   float64
 2   Feature_2  1000 non-null   float64
 3   Feature_3  1000 non-null   float64
 4   Feature_4  1000 non-null   float64
 5   Target     1000 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 47.0 KB
None

Class Distribution:
Target
0    900
1    100
Name: count, dtype: int64


In [8]:
from imblearn.over_sampling import RandomOverSampler

# Random oversampling applied to the full dataset, purely to show the
# resulting class counts. The same sampler object is re-fit on the training
# split later in the notebook.
ros = RandomOverSampler(random_state=42)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X, y)

print(
    "\nClass Distribution After Random Oversampling:",
    pd.Series(y_resampled_ros).value_counts(),
    sep="\n",
)



Class Distribution After Random Oversampling:
0    900
1    900
Name: count, dtype: int64


In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Random undersampling applied to the full dataset, purely to show the
# resulting class counts. The same sampler object is re-fit on the training
# split later in the notebook.
rus = RandomUnderSampler(random_state=42)
X_resampled_rus, y_resampled_rus = rus.fit_resample(X, y)

print(
    "\nClass Distribution After Random Undersampling:",
    pd.Series(y_resampled_rus).value_counts(),
    sep="\n",
)



Class Distribution After Random Undersampling:
0    100
1    100
Name: count, dtype: int64


In [10]:
from imblearn.over_sampling import SMOTE

# SMOTE applied to the full dataset, purely to show the resulting class
# counts. The same sampler object is re-fit on the training split later in
# the notebook.
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, y)

print(
    "\nClass Distribution After SMOTE:",
    pd.Series(y_resampled_smote).value_counts(),
    sep="\n",
)



Class Distribution After SMOTE:
0    900
1    900
Name: count, dtype: int64


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Train a logistic regression and print its test-set performance
def evaluate_classifier(X_train, X_test, y_train, y_test, class_weight=None):
    """Fit a logistic regression on the training data and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Features and labels the model is fitted on.
    X_test, y_test
        Features and labels the fitted model is scored against.
    class_weight
        Forwarded unchanged to ``LogisticRegression(class_weight=...)``;
        defaults to ``None``.

    Prints the classification report, then the AUC-ROC formatted to four
    decimal places. Nothing is returned.
    """
    clf = LogisticRegression(random_state=42, class_weight=class_weight)
    clf.fit(X_train, y_train)

    predicted_labels = clf.predict(X_test)
    # Keep only column 1 of the probability matrix for the ROC computation
    positive_scores = clf.predict_proba(X_test)[:, 1]

    # Performance Metrics
    print(classification_report(y_test, predicted_labels))
    auc = roc_auc_score(y_test, positive_scores)
    print(f"AUC-ROC: {auc:.4f}")

# Hold out 30% of the original dataset for testing; stratify on y so both
# splits keep the original class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Resample the training split only -- the test split is never resampled
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)        # Oversampled
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)        # Undersampled
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)  # SMOTE

# One row per strategy: (heading, training features, training labels, class_weight)
for heading, fit_X, fit_y, weight_mode in (
    ("\nPerformance on Imbalanced Dataset:", X_train, y_train, None),
    ("\nPerformance After Random Oversampling:", X_train_ros, y_train_ros, None),
    ("\nPerformance After Random Undersampling:", X_train_rus, y_train_rus, None),
    ("\nPerformance After SMOTE:", X_train_smote, y_train_smote, None),
    # Class Weighting: original training split, reweighted inside the model
    ("\nPerformance with Class Weighting:", X_train, y_train, 'balanced'),
):
    print(heading)
    evaluate_classifier(fit_X, X_test, fit_y, y_test, class_weight=weight_mode)



Performance on Imbalanced Dataset:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       270
           1       1.00      1.00      1.00        30

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

AUC-ROC: 1.0000

Performance After Random Oversampling:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       270
           1       0.97      1.00      0.98        30

    accuracy                           1.00       300
   macro avg       0.98      1.00      0.99       300
weighted avg       1.00      1.00      1.00       300

AUC-ROC: 1.0000

Performance After Random Undersampling:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       270
           1       0.97      1.00      0.98        30

    accuracy                     

In [12]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Gather the summary scores for one set of predictions
def get_metrics(y_test, y_pred, y_pred_proba):
    """Return precision, recall, F1 and AUC-ROC for one set of predictions.

    Parameters
    ----------
    y_test
        Reference labels.
    y_pred
        Predicted labels, scored with precision, recall and F1.
    y_pred_proba
        Predicted scores, used only for the AUC-ROC.

    Returns
    -------
    dict
        Keys ``"Precision"``, ``"Recall"``, ``"F1-Score"`` and ``"AUC-ROC"``,
        in that order.
    """
    scores = {}
    scores["Precision"] = precision_score(y_test, y_pred)
    scores["Recall"] = recall_score(y_test, y_pred)
    scores["F1-Score"] = f1_score(y_test, y_pred)
    scores["AUC-ROC"] = roc_auc_score(y_test, y_pred_proba)
    return scores

# Training configuration per strategy: (training features, training labels, class_weight)
methods = {
    "Imbalanced": (X_train, y_train, None),
    "Oversampling": (X_train_ros, y_train_ros, None),
    "Undersampling": (X_train_rus, y_train_rus, None),
    "SMOTE": (X_train_smote, y_train_smote, None),
    "Class Weighting": (X_train, y_train, 'balanced')
}

# Fit one model per strategy and store its test-set scores under the strategy name
metrics = {}
for method, (fit_features, fit_labels, weight) in methods.items():
    model = LogisticRegression(random_state=42, class_weight=weight)
    model.fit(fit_features, fit_labels)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    metrics[method] = get_metrics(y_test, y_pred, y_pred_proba)

# Display metrics: transpose so each strategy is a row and each score a column
metrics_df = pd.DataFrame(metrics).transpose()
print("\nPerformance Metrics Summary:", metrics_df, sep="\n")



Performance Metrics Summary:
                 Precision  Recall  F1-Score  AUC-ROC
Imbalanced        1.000000     1.0  1.000000      1.0
Oversampling      0.967742     1.0  0.983607      1.0
Undersampling     0.967742     1.0  0.983607      1.0
SMOTE             0.967742     1.0  0.983607      1.0
Class Weighting   0.967742     1.0  0.983607      1.0
