# Handling Imbalanced Data in the Abalone Dataset 
This notebook demonstrates how to handle imbalanced data using the Abalone dataset, and compares the performance of a machine learning model before and after dealing with the imbalanced data.
## Import Libraries and Dataset

In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from collections import Counter

# Download the Abalone dataset (repository id 1) through ucimlrepo.
abalone = fetch_ucirepo(id=1)

# Pull the feature table and the target table out of the fetched dataset.
X, y = abalone.data.features, abalone.data.targets

## Check Class Distribution Before Handling Imbalance 

In [None]:
# ucimlrepo exposes the targets as a pandas DataFrame, and iterating a DataFrame
# yields its column labels. Counter(y) would therefore count column names, not
# class values, so flatten the target to plain Python scalars before counting.
print(f"Class distribution before handling imbalance: {Counter(np.ravel(y).tolist())}")

## Split Data and Standardise Features 

In [None]:
# Remember the raw feature names as they were before encoding.
features = X.columns.tolist()

# One-hot encode 'Sex' into columns named after the bare category values, then
# drop the 'I' indicator so the remaining dummy columns are not redundant.
data = pd.get_dummies(X, columns=['Sex'], prefix='', prefix_sep='').drop(columns=['I'])
data

In [None]:
# Split data into training and testing sets.
# The target is flattened to 1-D first: scikit-learn classifiers expect a 1-D
# label array and emit a DataConversionWarning when handed a column vector.
# Flattening does not change which rows land in which split.
X_train, X_test, y_train, y_test = train_test_split(
    data, np.ravel(y), test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Train and Evaluate Model Before Handling Imbalance
A RandomForestClassifier is trained before applying any technique to handle the imbalance, and its performance is evaluated on the test set.

In [None]:
# Baseline model: a random forest fitted on the training split as-is,
# i.e. before any resampling is applied.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_pred_before = rf_model.predict(X_test)

report_before = classification_report(y_test, y_pred_before)
print("Classification Report Before Handling Imbalance:\n", report_before)

matrix_before = confusion_matrix(y_test, y_pred_before)
print("Confusion Matrix Before Handling Imbalance:\n", matrix_before)

## Determine Whether the Model Is Underfitting or Overfitting

In [None]:
def check_dataset(x, y, estimator=None):
    """Plot a cross-validated learning curve for a classifier on ``(x, y)``.

    Runs ``learning_curve`` with ``cv=5`` and draws the mean training and
    validation scores against the training-set size, each with a shaded
    band of one standard deviation across the folds.

    Parameters
    ----------
    x : array-like
        Feature matrix passed to ``learning_curve``.
    y : array-like
        Target labels. A single-column table or column vector is flattened
        to 1-D before use, since estimators expect 1-D labels.
    estimator : estimator object, optional
        Model to evaluate. When omitted, the notebook-level ``rf_model`` is
        used, so the cell defining it must have been run first.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if estimator is None:
        # Fall back to the notebook's baseline model (previous behaviour).
        estimator = rf_model

    # Only squeeze a genuine column vector; leave any other shape untouched.
    labels = np.asarray(y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()

    # generate the learning curve
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, x, labels, cv=5, n_jobs=-1
    )

    # (legend label, per-fold scores, colour) for each curve to draw
    curves = (
        ('Training Score', train_scores, 'blue'),
        ('Validation Score', val_scores, 'orange'),
    )

    plt.figure(figsize=(10, 6))
    for label, scores, colour in curves:
        # aggregate across folds (axis 1) for every training size
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        plt.plot(train_sizes, mean, label=label, color=colour)
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)

    plt.title(f'Learning Curve for {type(estimator).__name__}')
    plt.xlabel('Training Size')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid()
    plt.show()

# Learning curve on the full encoded feature table, before any resampling.
check_dataset(x=data, y=y)

## Handle Imbalance Using Random Oversampling
Random Oversampling is used to balance the class distribution by duplicating samples from the minority classes in the training set.

In [None]:
# Balance the training split with random oversampling.
# NOTE(review): a SMOTE-based variant was tried here earlier and left disabled;
# confirm why before re-enabling it:
#   smote = SMOTE(random_state=42)
#   X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
ros = RandomOverSampler(random_state=42)
# Pass the labels as a 1-D array, the form samplers and estimators expect.
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, np.ravel(y_train))

# Check class distribution after handling imbalance. The labels are flattened to
# plain Python scalars first: Counter over a DataFrame would count its column
# labels instead of the class values.
print(f"Class distribution after Random Oversampling: {Counter(np.ravel(y_train_resampled).tolist())}")

# NOTE(review): rows duplicated by oversampling can end up in both the training
# and validation folds of this curve, so its validation score is likely
# optimistic compared with the held-out test set.
check_dataset(X_train_resampled, y_train_resampled)

## Train and Evaluate Model After Handling Imbalance
After applying Random Oversampling, the model is trained again and its performance is evaluated on the test set.

In [None]:
# Same model configuration as the baseline, fitted on the oversampled
# training data instead.
rf_model_resampled = RandomForestClassifier(random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Score it on the untouched test split so the numbers are comparable with the
# baseline run.
y_pred_after = rf_model_resampled.predict(X_test)

report_after = classification_report(y_test, y_pred_after)
print("Classification Report After Handling Imbalance:\n", report_after)

matrix_after = confusion_matrix(y_test, y_pred_after)
print("Confusion Matrix After Handling Imbalance:\n", matrix_after)