In [20]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score
import os 
from sklearn.preprocessing import StandardScaler
import mglearn
from sklearn.pipeline import make_pipeline
from sklearn.datasets import fetch_openml
from mglearn.datasets import make_wave
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt

In [5]:
''' 
Q1) Analyze the performance of SVM with respect to accuracy, recall, precision, FPR, and ROC metrics
for iris and wine datasets by using linear, RBF, and polynomial kernels. Hint: the datasets can be
loaded using sklearn.datasets.load function.
'''

def compute_fpr(cm):
    # Compute false positive rate per class and return their average
    fp = cm.sum(axis=0) - np.diag(cm)
    tn = cm.sum() - (cm.sum(axis=1) + cm.sum(axis=0) - np.diag(cm))
    fpr = fp / (fp + tn)
    return np.mean(fpr)

results = []

for name, dataset in [("Iris", datasets.load_iris()), ("Wine", datasets.load_wine())]:
    X, y = dataset.data, dataset.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    y_test_bin = label_binarize(y_test, classes=np.unique(y))
    
    for kernel in ["linear", "rbf", "poly"]:
        clf = SVC(kernel=kernel, probability=True, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_score = clf.predict_proba(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
        cm = confusion_matrix(y_test, y_pred)
        fpr = compute_fpr(cm)
        roc_auc = roc_auc_score(y_test_bin, y_score, average='macro', multi_class='ovr')
        
        results.append({
            "Dataset": name,
            "Kernel": kernel,
            "Accuracy": round(acc, 4),
            "Precision (macro)": round(prec, 4),
            "Recall (macro)": round(rec, 4),
            "FPR (macro)": round(fpr, 4),
            "ROC AUC (macro)": round(roc_auc, 4)
        })

df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Dataset,Kernel,Accuracy,Precision (macro),Recall (macro),FPR (macro),ROC AUC (macro)
0,Iris,linear,1.0,1.0,1.0,0.0,1.0
1,Iris,rbf,0.9556,0.9556,0.9556,0.0222,0.9956
2,Iris,poly,0.9556,0.9556,0.9556,0.0222,0.997
3,Wine,linear,0.9444,0.9522,0.9397,0.0295,0.999
4,Wine,rbf,0.6667,0.4833,0.6111,0.1801,0.8949
5,Wine,poly,0.6667,0.5128,0.6111,0.1818,0.8203


In [7]:
''' 
Q2) Analyze the impact on accuracy of training and testing for iris and wine datasets due to the
parameter tuning (C=0.1, 1.0, 1000 and gamma=0.1, 1.0, 10) of RBF kernel.
'''

results = []

for name, dataset in [("Iris", datasets.load_iris()), ("Wine", datasets.load_wine())]:
    X, y = dataset.data, dataset.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    
    for C in [0.1, 1.0, 1000]:
        for gamma in [0.1, 1.0, 10]:
            # training
            clf = SVC(kernel='rbf', C=C, gamma=gamma, random_state=42)
            clf.fit(X_train, y_train)
            
            y_train_pred = clf.predict(X_train)
            y_test_pred = clf.predict(X_test)
            
            # metrics
            train_acc = accuracy_score(y_train, y_train_pred)
            test_acc = accuracy_score(y_test, y_test_pred)
            
            results.append({
                "Dataset": name,
                "C": C,
                "Gamma": gamma,
                "Train Accuracy": round(train_acc, 4),
                "Test Accuracy": round(test_acc, 4)
            })

df = pd.DataFrame(results)
df

Unnamed: 0,Dataset,C,Gamma,Train Accuracy,Test Accuracy
0,Iris,0.1,0.1,0.9429,0.8889
1,Iris,0.1,1.0,0.981,0.9556
2,Iris,0.1,10.0,0.8952,0.7333
3,Iris,1.0,0.1,0.981,0.9556
4,Iris,1.0,1.0,0.981,0.9556
5,Iris,1.0,10.0,1.0,0.9333
6,Iris,1000.0,0.1,1.0,0.9333
7,Iris,1000.0,1.0,1.0,0.8889
8,Iris,1000.0,10.0,1.0,0.9111
9,Wine,0.1,0.1,0.4032,0.3889


In [21]:
'''
Q3) Compare the linear and SVM (linear, rbf, polynomial kernel) regressions on the basis of R
2 and mean relative error for wave, RAM prices, and Boston Housing datasets. Use train-test split of
60:40. Apply the log transformation to preprocess the data.
'''

def mean_relative_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true))


def load_wave():
    X, y = mglearn.datasets.make_wave()
    return X.reshape(-1, 1), y


def load_ram_prices():
    ram = pd.read_csv(
        os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv")
    )
    ram['date'] = pd.to_datetime(ram['date'])
    X = ram['date'].map(pd.Timestamp.toordinal).values.reshape(-1, 1)
    y = ram['price'].values
    return X, y


def load_boston():
    boston = fetch_openml(name='boston', version=1, as_frame=True)
    X = boston.data.values
    y = boston.target.astype(float).values
    return X, y

# Dictionary of dataset loaders
datasets = {
    'wave': load_wave,
    'ram_prices': load_ram_prices,
    'boston_housing': load_boston
}


data = []

for name, loader in datasets.items():
    X, y = loader()

    X = np.array(X, dtype=float)
    y = np.array(y, dtype=float)

    # Log-transform features/target if all strictly positive
    if np.all(X > 0):
        X = np.log(X)
    if np.all(y > 0):
        y = np.log(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0
    )

    models = {
        'LinearRegression': LinearRegression(),
        'SVR_linear': make_pipeline(StandardScaler(), SVR(kernel='linear')),
        'SVR_rbf': make_pipeline(StandardScaler(), SVR(kernel='rbf')),
        'SVR_poly': make_pipeline(StandardScaler(), SVR(kernel='poly', degree=3))
    }

    # Fit each model, predict, and compute metrics
    for model_name, model in models.items():
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        mre = mean_relative_error(y_test, y_pred)
        data.append({
            'Dataset': name,
            'Model': model_name,
            'R2': r2,
            'MRE': mre
        })

df_results = pd.DataFrame(data)
df_results


Unnamed: 0,Dataset,Model,R2,MRE
0,wave,LinearRegression,0.623021,0.809431
1,wave,SVR_linear,0.596814,0.771936
2,wave,SVR_rbf,0.670189,0.973148
3,wave,SVR_poly,0.406952,0.830989
4,ram_prices,LinearRegression,-0.012652,1.909637
5,ram_prices,SVR_linear,-0.02611,1.744954
6,ram_prices,SVR_rbf,-0.02611,1.744954
7,ram_prices,SVR_poly,-0.02611,1.744954
8,boston_housing,LinearRegression,0.744991,0.051435
9,boston_housing,SVR_linear,0.735596,0.050275


In [None]:
''' 
Q5) Analyze the impact on mean relative error for wave, RAM prices, and Boston Housing datasets
due to the parameter tuning (C=0.1, 1.0, 1000 and gamma=0.1, 1.0, 10) of RBF kernel in SVM. Use
train-test split of 60:40. Apply the log transformation to preprocess the data.
'''

Cs = [0.1, 1.0, 1000]
gammas = [0.1, 1.0, 10]

results = []

for name, loader in datasets.items():
    X, y = loader()
    X = X.astype(float)
    y = y.astype(float)

    # log transform if strictly positive
    if np.all(X > 0):
        X = np.log(X)
    if np.all(y > 0):
        y = np.log(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, random_state=0
    )

    for C in Cs:
        for gamma in gammas:
            model = make_pipeline(
                StandardScaler(),
                SVR(kernel='rbf', C=C, gamma=gamma)
            )
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            mre = mean_relative_error(y_test, y_pred)
            results.append({
                'Dataset': name,
                'C': C,
                'gamma': gamma,
                'MRE': mre
            })

df_results = pd.DataFrame(results)
df_results.pivot_table(
    index=['Dataset'],
    columns=['C', 'gamma'],
    values='MRE'
)

C,0.1,0.1,0.1,1.0,1.0,1.0,1000.0,1000.0,1000.0
gamma,0.1,1.0,10.0,0.1,1.0,10.0,0.1,1.0,10.0
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
boston_housing,0.05278,0.08538,0.105991,0.041759,0.066791,0.103264,0.050665,0.067076,0.103137
ram_prices,1.744954,1.744954,1.744954,1.744954,1.744954,1.744954,1.744954,1.744954,1.744954
wave,0.821112,0.792907,0.736348,0.780858,0.973148,0.886573,1.064935,0.825018,1.164682
