# One-vs-Rest Classification with Oversampling on Iris Dataset
This notebook demonstrates how to perform multi-class classification on the Iris dataset using a one-vs-rest approach. It includes manual oversampling and SMOTE-style oversampling to balance the classes, and compares results for different train/validation/test splits.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the Iris CSV and discard the row-identifier column
df = pd.read_csv("Iris.csv")
df = df.drop(columns=["Id"])

# Features are every remaining column except the label column "Species"
X = df.drop(columns=["Species"])
y = df["Species"]

# Distinct species labels, in order of first appearance
classes = y.unique()

# Split data into train/val/test
def split_data(X, y, train_size, val_size, test_size):
    """Split features and labels into stratified train/validation/test parts.

    The training part is carved off first; the remainder is then divided
    between validation and test in the proportion ``val_size : test_size``.
    Both steps stratify on the labels and use ``random_state=42``.

    Args:
        X: Feature table, passed to ``train_test_split`` unchanged.
        y: Labels aligned with ``X``; also used for stratification.
        train_size: Forwarded as ``train_size`` of the first split.
        val_size: Relative weight of the validation part.
        test_size: Relative weight of the test part.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    first_cut = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=42
    )
    X_train, X_rest, y_train, y_rest = first_cut

    # Share of the held-out remainder that goes to validation.
    val_share = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, train_size=val_share, stratify=y_rest, random_state=42
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

# Oversample binary labels to 100 positives and 100 negatives
def oversample_fixed(X, y_binary, target_size=100):
    """Resample a binary problem to ``target_size`` rows per class.

    Each class is replicated whole as many times as fits into ``target_size``
    and topped up with rows drawn with replacement (``random_state=42``).
    A class that already has more than ``target_size`` rows is reduced to
    ``target_size`` rows by that same draw instead of failing.

    Args:
        X: pandas DataFrame of features.
        y_binary: Labels aligned with ``X``; 1 marks positives, 0 negatives.
        target_size: Number of rows to produce for each class.

    Returns:
        ``(X_bal, y_bal)`` as NumPy arrays: ``target_size`` positive rows
        followed by ``target_size`` negative rows, with matching labels.

    Raises:
        ValueError: If either class has no rows in ``X``.
    """
    def replicate_class(X_class):
        if len(X_class) == 0:
            raise ValueError("cannot resample a class with no samples")
        reps, remainder = divmod(target_size, len(X_class))
        # reps is 0 when the class is larger than target_size; pd.concat
        # rejects an empty list, so only the pieces that exist are collected.
        parts = [X_class] * reps
        if remainder > 0:
            parts.append(X_class.sample(remainder, replace=True, random_state=42))
        return pd.concat(parts)

    X_pos = X[y_binary == 1]
    X_neg = X[y_binary == 0]
    y_pos = pd.Series([1] * target_size)
    y_neg = pd.Series([0] * target_size)

    X_bal = pd.concat([replicate_class(X_pos), replicate_class(X_neg)])
    y_bal = pd.concat([y_pos, y_neg])
    return X_bal.to_numpy(), y_bal.to_numpy()

# Predict using one-vs-rest models
def predict(models, X):
    """Predict one class label per row of ``X`` from one-vs-rest models.

    Every model scores every row; the label whose model gives the highest
    score wins. Labels are taken from the keys of ``models`` so that they
    always line up with the score columns, regardless of any global class
    ordering.

    Args:
        models: Mapping of class label -> fitted model exposing ``predict``.
        X: Feature matrix accepted by each model's ``predict``.

    Returns:
        List with the winning label for each row of ``X``.
    """
    labels = list(models)
    # One column of scores per class, in the same order as ``labels``.
    scores = np.column_stack([models[label].predict(X) for label in labels])
    return [labels[i] for i in np.argmax(scores, axis=1)]

# Define splits: each entry holds the six-part tuple returned by split_data
splits = {
    "80-10-10": split_data(X, y, 0.8, 0.1, 0.1),
    "70-15-15": split_data(X, y, 0.7, 0.15, 0.15),
}

# Validation/test accuracy per split, keyed by split name
results = {}

# Train and evaluate models
for split_name, split_parts in splits.items():
    X_train, X_val, X_test, y_train, y_val, y_test = split_parts
    print(f"\n==== Results for {split_name} split ====")

    # One binary regressor per class, each fitted on its own balanced sample
    models = {}
    for cls in classes:
        y_binary = (y_train == cls).astype(int)
        X_resampled, y_resampled = oversample_fixed(X_train, y_binary)
        model = LinearRegression().fit(X_resampled, y_resampled)
        models[cls] = model

    # Label each held-out row with the class whose regressor scores highest
    y_val_pred = predict(models, X_val.to_numpy())
    y_test_pred = predict(models, X_test.to_numpy())

    val_acc = accuracy_score(y_val, y_val_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    test_report = classification_report(y_test, y_test_pred)

    print(f"Validation Accuracy: {val_acc:.2f}")
    print(f"Test Accuracy: {test_acc:.2f}")
    print("Classification Report (Test):\n", test_report)

    results[split_name] = dict(val_acc=val_acc, test_acc=test_acc)


==== Results for 80-10-10 split ====
Validation Accuracy: 0.80
Test Accuracy: 0.80
Classification Report (Test):
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         5
Iris-versicolor       0.75      0.60      0.67         5
 Iris-virginica       0.67      0.80      0.73         5

       accuracy                           0.80        15
      macro avg       0.81      0.80      0.80        15
   weighted avg       0.81      0.80      0.80        15


==== Results for 70-15-15 split ====
Validation Accuracy: 0.68
Test Accuracy: 0.83
Classification Report (Test):
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         7
Iris-versicolor       0.83      0.62      0.71         8
 Iris-virginica       0.70      0.88      0.78         8

       accuracy                           0.83        23
      macro avg       0.84      0.83      0.83        23
   weighted avg       0.84      0.83      0.82        23

## Data Loading, Preprocessing, and One-vs-Rest Training (Manual Oversampling)
- Loads the Iris dataset and removes the 'Id' column.
- Splits the data into train, validation, and test sets using stratified sampling.
- Implements a function to manually oversample binary labels to a fixed number of positives and negatives.
- Trains one linear regression model per class (one-vs-rest) on the oversampled data; each sample is assigned the class whose model outputs the highest score.
- Evaluates the models on validation and test sets, reporting accuracy and classification metrics for each split.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import NearestNeighbors

# Load dataset: read the CSV, then drop the "Id" column
df = pd.read_csv("Iris.csv")
df = df.drop(columns=["Id"])

# "Species" is the label column; all other columns are features
y = df["Species"]
X = df.drop(columns=["Species"])

# Unique species labels in order of first appearance
classes = y.unique()

# === Data Splitting ===
def split_data(X, y, train_size, val_size, test_size):
    """Produce stratified train, validation and test subsets.

    Two consecutive ``train_test_split`` calls are used, both stratified on
    the labels with ``random_state=42``: the first separates the training
    subset, the second divides what is left into validation and test
    according to the ratio ``val_size / (val_size + test_size)``.

    Args:
        X: Features, handed to ``train_test_split`` as-is.
        y: Labels aligned with ``X``.
        train_size: ``train_size`` argument of the first split.
        val_size: Weight of the validation subset within the remainder.
        test_size: Weight of the test subset within the remainder.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=42
    )

    # Fraction of the hold-out that becomes the validation subset.
    holdout_total = val_size + test_size
    val_fraction = val_size / holdout_total

    second_cut = train_test_split(
        X_holdout,
        y_holdout,
        train_size=val_fraction,
        stratify=y_holdout,
        random_state=42,
    )
    X_val, X_test, y_val, y_test = second_cut
    return X_train, X_val, X_test, y_train, y_val, y_test

# === Fixed Oversampling (100:100) ===
def oversample_fixed(X, y_binary, target_size=100):
    """Balance a binary problem at exactly ``target_size`` rows per class.

    A class is copied whole as often as fits into ``target_size``; any
    shortfall is filled with rows sampled with replacement
    (``random_state=42``). If a class holds more than ``target_size`` rows,
    no whole copy fits and the class is cut down to ``target_size`` rows by
    that same sampling step rather than raising.

    Args:
        X: pandas DataFrame of features.
        y_binary: Labels aligned with ``X``; 1 for positives, 0 for negatives.
        target_size: Rows to emit for each of the two classes.

    Returns:
        ``(X_bal, y_bal)`` as NumPy arrays, positives first, then negatives.

    Raises:
        ValueError: If one of the two classes has no rows in ``X``.
    """
    def replicate_class(X_class):
        n_rows = len(X_class)
        if n_rows == 0:
            raise ValueError("cannot resample a class with no samples")
        reps, remainder = divmod(target_size, n_rows)
        # pd.concat raises on an empty list, which is what ``[X_class] * 0``
        # yields for an oversized class, so gather the pieces first.
        pieces = [X_class] * reps
        if remainder > 0:
            pieces.append(X_class.sample(remainder, replace=True, random_state=42))
        return pd.concat(pieces)

    X_pos, X_neg = X[y_binary == 1], X[y_binary == 0]
    y_pos = pd.Series([1] * target_size)
    y_neg = pd.Series([0] * target_size)

    X_bal = pd.concat([replicate_class(X_pos), replicate_class(X_neg)])
    y_bal = pd.concat([y_pos, y_neg])
    return X_bal.to_numpy(), y_bal.to_numpy()

# === SMOTE-style Oversampling ===
def smote_oversample(X, y, method="random", lam=0.5, random_state=None):
    """Grow the positive class with interpolated rows until it matches the negatives.

    New rows are built as ``lam * a + (1 - lam) * b`` from two positive rows:
    with ``method="random"`` the pair is two distinct random positives, with
    ``method="nearest"`` it is a random positive and its closest other
    positive (second hit of a 2-neighbour query). When the positives already
    number at least as many as the negatives, nothing is synthesised and the
    data is returned regrouped as positives followed by negatives.

    Args:
        X: 2-D array-like of features.
        y: Binary labels aligned with ``X``; 1 marks the class to grow.
        method: ``"random"`` or ``"nearest"``.
        lam: Interpolation weight given to the first row of each pair.
        random_state: Optional seed for reproducible draws. ``None`` keeps
            drawing from NumPy's global random state.

    Returns:
        ``(X_bal, y_bal)``: positives (original then synthetic) stacked on
        top of all negatives, and the matching float labels.

    Raises:
        ValueError: If ``method`` is not recognised, or if synthetic rows are
            required but fewer than two positive rows exist.
    """
    if method not in ("random", "nearest"):
        raise ValueError(
            f"unknown method {method!r}; expected 'random' or 'nearest'"
        )

    X = np.asarray(X)
    y = np.asarray(y)
    X_min, X_maj = X[y == 1], X[y == 0]
    target_size = len(X_maj)
    n_needed = target_size - len(X_min)

    # The module-level functions stay the default source of randomness so
    # unseeded calls behave exactly as they always have.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    synthetic = []
    if n_needed > 0:
        if len(X_min) < 2:
            raise ValueError("need at least two positive samples to interpolate")

        if method == "random":
            for _ in range(n_needed):
                i, j = rng.choice(len(X_min), 2, replace=False)
                synthetic.append(lam * X_min[i] + (1 - lam) * X_min[j])
        else:
            neigh = NearestNeighbors(n_neighbors=2).fit(X_min)
            # Query all positives once rather than once per synthetic row.
            _, idx = neigh.kneighbors(X_min)
            for _ in range(n_needed):
                i = rng.randint(len(X_min))
                synthetic.append(lam * X_min[i] + (1 - lam) * X_min[idx[i][1]])

    # np.vstack cannot stack a 2-D block with an empty list, so skip it when
    # no synthetic rows were required.
    X_syn = np.vstack([X_min, synthetic]) if synthetic else X_min
    y_syn = np.ones(len(X_syn))
    X_bal = np.vstack([X_syn, X_maj])
    y_bal = np.hstack([y_syn, np.zeros(target_size)])
    return X_bal, y_bal

# === Prediction Function (One-vs-Rest) ===
def predict(models, X):
    """Return the one-vs-rest prediction for every row of ``X``.

    Each model's ``predict`` output forms one score column; a row is given
    the label of the column with the largest score. The labels are read from
    the keys of ``models`` itself, which keeps them aligned with the score
    columns even if a global class list is ordered differently.

    Args:
        models: Mapping of class label -> fitted model exposing ``predict``.
        X: Feature matrix accepted by each model's ``predict``.

    Returns:
        List of predicted labels, one per row of ``X``.
    """
    labels = list(models)
    # Column k holds the scores of the model stored under labels[k].
    prob_matrix = np.column_stack([models[label].predict(X) for label in labels])
    winners = np.argmax(prob_matrix, axis=1)
    return [labels[k] for k in winners]

# === Define splits ===
splits = {
    "80-10-10": split_data(X, y, 0.8, 0.1, 0.1),
    "70-15-15": split_data(X, y, 0.7, 0.15, 0.15)
}

# results[split_name][strategy] -> {"val_acc": ..., "test_acc": ...}
# Strategies are "fixed", "smote-random" and "smote-nearest"; keeping the
# numbers here lets the methods be compared after the loop, not only read
# off the printed output.
results = {}

# === Train and evaluate ===
for split_name, (X_train, X_val, X_test, y_train, y_val, y_test) in splits.items():
    print(f"\n==== Results for {split_name} split ====")
    results[split_name] = {}

    # --- Fixed Oversampling ---
    models = {}
    for cls in classes:
        # One-vs-rest target: 1 for the current class, 0 for all others
        y_binary = (y_train == cls).astype(int)
        X_res, y_res = oversample_fixed(X_train, y_binary)
        model = LinearRegression().fit(X_res, y_res)
        models[cls] = model

    y_val_pred = predict(models, X_val.to_numpy())
    y_test_pred = predict(models, X_test.to_numpy())
    val_acc = accuracy_score(y_val, y_val_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    print("\n[Fixed Oversampling]")
    print("Validation Accuracy:", val_acc)
    print("Test Accuracy:", test_acc)
    print("Report:\n", classification_report(y_test, y_test_pred))
    results[split_name]["fixed"] = {"val_acc": val_acc, "test_acc": test_acc}

    # --- SMOTE Oversampling (random & nearest) ---
    # NOTE: smote_oversample is called without a seed here, so these two
    # strategies can give different numbers from run to run.
    for method in ["random", "nearest"]:
        models = {}
        for cls in classes:
            y_binary = (y_train == cls).astype(int)
            X_res, y_res = smote_oversample(X_train.to_numpy(), y_binary.to_numpy(), method=method)
            model = LinearRegression().fit(X_res, y_res)
            models[cls] = model

        y_val_pred = predict(models, X_val.to_numpy())
        y_test_pred = predict(models, X_test.to_numpy())
        val_acc = accuracy_score(y_val, y_val_pred)
        test_acc = accuracy_score(y_test, y_test_pred)
        print(f"\n[SMOTE-{method}]")
        print("Validation Accuracy:", val_acc)
        print("Test Accuracy:", test_acc)
        print("Report:\n", classification_report(y_test, y_test_pred))
        results[split_name][f"smote-{method}"] = {"val_acc": val_acc, "test_acc": test_acc}


==== Results for 80-10-10 split ====

[Fixed Oversampling]
Validation Accuracy: 0.8
Test Accuracy: 0.8
Report:
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         5
Iris-versicolor       0.75      0.60      0.67         5
 Iris-virginica       0.67      0.80      0.73         5

       accuracy                           0.80        15
      macro avg       0.81      0.80      0.80        15
   weighted avg       0.81      0.80      0.80        15


[SMOTE-random]
Validation Accuracy: 0.8
Test Accuracy: 0.8
Report:
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         5
Iris-versicolor       0.75      0.60      0.67         5
 Iris-virginica       0.67      0.80      0.73         5

       accuracy                           0.80        15
      macro avg       0.81      0.80      0.80        15
   weighted avg       0.81      0.80      0.80        15


[SMOTE-nearest]
... (remaining output truncated in this export)

## SMOTE-style Oversampling and Method Comparison
- Loads the Iris dataset and splits it as before.
- Implements manual oversampling and SMOTE-style oversampling (random and nearest neighbor methods) to balance classes.
- Trains one-vs-rest linear regression models for each class using both oversampling strategies.
- Compares validation and test accuracy for each method and split, and prints classification reports.