# Day 09. Exercise 02
# Metrics

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/dayofweek.csv')

X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [3]:
def print_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    roc_auc_scores = []
    for class1, class2 in combinations(range(len(model.classes_)), 2):
        y_true_bin = y_test == model.classes_[class1]
        y_prob_diff = y_prob[:, class1] - y_prob[:, class2]
        if np.any(y_prob_diff == 0):
            roc_auc_scores.append(1)
        else:
            roc_auc_scores.append(roc_auc_score(y_true_bin, y_prob_diff))

    roc_auc = np.mean(roc_auc_scores)

    print(f"accuracy is {accuracy:.5f}")
    print(f"precision is {precision:.5f}")
    print(f"recall is {recall:.5f}")
    print(f"roc_auc is {roc_auc:.5f}")


In [4]:
svm = SVC(C=10, class_weight='balanced', gamma='scale', kernel='rbf', random_state=21, probability=True)
svm.fit(X_train, y_train)

print_metrics(svm, X_test, y_test)

accuracy is 0.90828
precision is 0.90854
recall is 0.90828
roc_auc is 0.93846


## 3. Decision tree

1. The same task for decision tree

In [5]:
dt = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=23, random_state=21)
dt.fit(X_train, y_train)

print_metrics(dt, X_test, y_test)

accuracy is 0.88757
precision is 0.88922
recall is 0.88757
roc_auc is 1.00000


## 4. Random forest

1. The same task for random forest.

In [6]:
rf = RandomForestClassifier(n_estimators=50, max_depth=39, class_weight='balanced', criterion='gini', random_state=21)
rf.fit(X_train, y_train)

print_metrics(rf, X_test, y_test)

accuracy is 0.89941
precision is 0.90364
recall is 0.89941
roc_auc is 1.00000


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [7]:
best_model = SVC(C=10, class_weight='balanced', gamma='scale', kernel='rbf', random_state=21, probability=True)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Final Accuracy:", accuracy)

Final Accuracy: 0.908284023668639


In [8]:
result = pd.DataFrame({'actual': y_test, 'predicted': y_pred})

errors = result[result['actual'] != result['predicted']]['actual'].value_counts(normalize=True) * 100

errors_df = pd.DataFrame({'weekday': errors.index, 'error_percentage': errors.values})
errors_df['weekday'] = errors_df['weekday'].replace({0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'})

errors_df

Unnamed: 0,weekday,error_percentage
0,Monday,25.806452
1,Tuesday,19.354839
2,Saturday,16.129032
3,Friday,12.903226
4,Wednesday,12.903226
5,Thursday,9.677419
6,Sunday,3.225806


In [9]:
from joblib import dump

filepath = '../data/svm_model.joblib'
dump(best_model, filepath)

['../data/svm_model.joblib']

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [10]:
def evaluate_models(models, params, X_train, X_test, y_train, y_test):
    results = {}
    for model, param in zip(models, params):
        model.set_params(**param)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        
        roc_auc_scores = []
        for class1, class2 in combinations(range(len(model.classes_)), 2):
            y_true_bin = y_test == model.classes_[class1]
            y_prob_diff = y_prob[:, class1] - y_prob[:, class2]
            if np.any(y_prob_diff == 0):
                roc_auc_scores.append(1)
            else:
                roc_auc_scores.append(roc_auc_score(y_true_bin, y_prob_diff))
        
        roc_auc = np.mean(roc_auc_scores)
        
        results[model.__class__.__name__] = {'accuracy': accuracy, 
                                             'precision': precision, 
                                             'recall': recall, 
                                             'roc_auc': roc_auc}
    
    return results

In [11]:
models = [SVC(random_state=21, probability=True), DecisionTreeClassifier(random_state=21), RandomForestClassifier(random_state=21)]
params = [{'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}, 
          {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 23}, 
          {'n_estimators': 50, 'max_depth': 39, 'class_weight': 'balanced', 'criterion': 'gini'}]
results = evaluate_models(models, params, X_train, X_test, y_train, y_test)
for model, result in results.items():
    print(f"{model}: {result}")

SVC: {'accuracy': 0.908284023668639, 'precision': 0.908537422429379, 'recall': 0.908284023668639, 'roc_auc': 0.9384552349169959}
DecisionTreeClassifier: {'accuracy': 0.9053254437869822, 'precision': 0.9055019273619678, 'recall': 0.9053254437869822, 'roc_auc': 1.0}
RandomForestClassifier: {'accuracy': 0.8994082840236687, 'precision': 0.9036368343294753, 'recall': 0.8994082840236687, 'roc_auc': 1.0}
