In [24]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix,
)
import mglearn
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [None]:
''' 
Q1) Compare the performance of random forest, SVM, logistic regression, and voting (hard & soft)
classifiers with respect to accuracy, recall, precision, FPR, and ROC metrics for iris and wine
datasets. Hint: the datasets can be loaded using sklearn.datasets.load function.
'''

def compute_fpr(cm):
    '''false positive rate per class and return their average'''
    fp = cm.sum(axis=0) - np.diag(cm)
    tn = cm.sum() - (cm.sum(axis=1) + cm.sum(axis=0) - np.diag(cm))
    fpr = fp / (fp + tn)
    return np.mean(fpr)

results = []

classifiers = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42)),
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
]
voting_hard = VotingClassifier(classifiers, voting='hard')
voting_soft = VotingClassifier(classifiers, voting='soft')
all_classifiers = classifiers + [
    ('voting_hard', voting_hard),
    ('voting_soft', voting_soft),
]

# Evaluate on both Iris and Wine datasets
for name, dataset in [("Iris", datasets.load_iris()), ("Wine", datasets.load_wine())]:
    X, y = dataset.data, dataset.target
    y_bin = label_binarize(y, classes=np.unique(y))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    y_test_bin = label_binarize(y_test, classes=np.unique(y))

    for clf_name, clf in all_classifiers:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

    
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
        cm = confusion_matrix(y_test, y_pred)
        fpr = compute_fpr(cm)

        results.append({
            'Dataset': name,
            'Classifier': clf_name,
            'Accuracy': round(acc, 4),
            'Precision (macro)': round(prec, 4),
            'Recall (macro)': round(rec, 0),
            'FPR (macro)': round(fpr, 4),
        })


df = pd.DataFrame(results)
df

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Dataset,Classifier,Accuracy,Precision (macro),Recall (macro),FPR (macro)
0,Iris,rf,0.8889,0.8981,1.0,0.0556
1,Iris,svm,0.9556,0.9556,1.0,0.0222
2,Iris,lr,0.9333,0.9345,1.0,0.0333
3,Iris,voting_hard,0.9333,0.9345,1.0,0.0333
4,Iris,voting_soft,0.9333,0.9345,1.0,0.0333
5,Wine,rf,1.0,1.0,1.0,0.0
6,Wine,svm,0.6667,0.4833,1.0,0.1801
7,Wine,lr,0.9815,0.9848,1.0,0.0101
8,Wine,voting_hard,0.9815,0.9848,1.0,0.0101
9,Wine,voting_soft,0.9815,0.9848,1.0,0.0101


In [11]:
''' 
2) Analyze the impact on accuracy of training and testing for iris and wine datasets by keeping an
ensemble of 10, 50, and 100 decision trees in bagging classifier.
'''

ensemb = [10, 50, 100]
results = []

for name, loader in [("Iris", datasets.load_iris()), ("Wine", datasets.load_wine())]:
    X, y = dataset.data, dataset.target
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    
    for n in ensemb:
        bag = BaggingClassifier(
            estimator=DecisionTreeClassifier(random_state=42),
            n_estimators=n,
            random_state=42
        )

        bag.fit(X_train, y_train)
        train_acc = accuracy_score(y_train, bag.predict(X_train))
        test_acc  = accuracy_score(y_test,  bag.predict(X_test))
        
        results.append({
            "Dataset": name,
            "n_trees": n,
            "Train Accuracy": train_acc,
            "Test Accuracy": test_acc
        })

df = pd.DataFrame(results)
df

Unnamed: 0,Dataset,n_trees,Train Accuracy,Test Accuracy
0,Iris,10,0.991935,0.981481
1,Iris,50,1.0,1.0
2,Iris,100,1.0,1.0
3,Wine,10,0.991935,0.981481
4,Wine,50,1.0,1.0
5,Wine,100,1.0,1.0


In [None]:
'''
Q5) Compare the linear and random forest regressions on the basis of R
2
and mean relative error for
wave, RAM prices, and Boston Housing datasets. Use train-test split of 60:40. Apply the log
transformation to preprocess the data.
'''
def mean_relative_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true))


def load_wave():
    X, y = mglearn.datasets.make_wave()
    return X.reshape(-1, 1), y


def load_ram_prices():
    ram = pd.read_csv(
        os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv")
    )
    ram['date'] = pd.to_datetime(ram['date'])
    X = ram['date'].map(pd.Timestamp.toordinal).values.reshape(-1, 1)
    y = ram['price'].values
    return X, y


def load_boston():
    boston = fetch_openml(name='boston', version=1, as_frame=True)
    X = boston.data.values
    y = boston.target.astype(float).values
    return X, y


datasets = {
    'wave': load_wave,
    'ram_prices': load_ram_prices,
    'boston_housing': load_boston
}

results = []

for name, loader in datasets.items():
    X, y = loader()

    # y for wave, rest log trans
    if name == 'wave':
        y_trans = y
        invert = lambda z: z
    else:
        y_trans = np.log1p(y)
        invert = np.expm1    

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_trans, test_size=0.4, random_state=0
    )

    for model_name, model in [
        ('LinearRegression', LinearRegression()),
        ('RandomForest', RandomForestRegressor(random_state=0))
    ]:

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)

        y_test_orig = invert(y_test)
        y_pred_orig = invert(y_pred)
        mre = mean_relative_error(y_test_orig, y_pred_orig)

        results.append({
            'dataset': name,
            'model': model_name,
            'r2': r2,
            'mean_rel_error': mre
        })

df = pd.DataFrame(results)
df


Unnamed: 0,dataset,model,r2,mean_rel_error
0,wave,LinearRegression,0.623021,0.809431
1,wave,RandomForest,0.690468,0.86911
2,ram_prices,LinearRegression,-0.008742,363.349768
3,ram_prices,RandomForest,-0.011309,346.528298
4,boston_housing,LinearRegression,0.744852,0.148574
5,boston_housing,RandomForest,0.845858,0.111571


In [25]:
''' 
Q6) Compare the performance of random forest, adaboost, and gradient boosting classifiers with
respect to accuracy, recall, precision, FPR, and ROC metrics for iris and wine datasets.
'''

def compute_fpr(y_true, y_pred, labels):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fp = cm.sum(axis=0) - np.diag(cm)
    tn = cm.sum() - (cm.sum(axis=1) + cm.sum(axis=0) - np.diag(cm))
    fpr = fp / (fp + tn)
    return np.mean(fpr)

datasets = {
    'iris': load_iris,
    'wine': load_wine
}

classifiers = {
    'RandomForest': RandomForestClassifier(random_state=0),
    'AdaBoost': AdaBoostClassifier(random_state=0),
    'GradientBoosting': GradientBoostingClassifier(random_state=0)
}

results = []

for ds_name, loader in datasets.items():
    data = loader()
    X, y = data.data, data.target
    labels = np.unique(y)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0, stratify=y
    )
    
    y_test_onehot = pd.get_dummies(y_test).values
    
    for clf_name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        acc   = accuracy_score(y_test, y_pred)
        prec  = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec   = recall_score(y_test, y_pred, average='macro', zero_division=0)
        fpr   = compute_fpr(y_test, y_pred, labels)
        
        results.append({
            'dataset': ds_name,
            'classifier': clf_name,
            'accuracy': acc,
            'precision_macro': prec,
            'recall_macro': rec,
            'fpr_macro': fpr,
        })

df = pd.DataFrame(results)
df

Unnamed: 0,dataset,classifier,accuracy,precision_macro,recall_macro,fpr_macro
0,iris,RandomForest,0.95,0.95071,0.95,0.025
1,iris,AdaBoost,0.95,0.95071,0.95,0.025
2,iris,GradientBoosting,0.966667,0.966667,0.966667,0.016667
3,wine,RandomForest,0.972222,0.974617,0.974617,0.014696
4,wine,AdaBoost,0.902778,0.912361,0.889292,0.051993
5,wine,GradientBoosting,0.861111,0.894824,0.847626,0.075904
