# Day 09. Exercise 02
# Metrics

## 0. Imports

In [5]:
import pandas as pd 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [6]:
df=pd.read_csv("../data/day-of-week-not-scaled.csv", sep=',')
df['dayofweek']=pd.read_csv("../data/dayofweek.csv", usecols=['dayofweek'])
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,uid_user_18,uid_user_19,uid_user_2,uid_user_20,uid_user_21,uid_user_22,uid_user_23,uid_user_24,uid_user_25,uid_user_26,uid_user_27,uid_user_28,uid_user_29,uid_user_3,uid_user_30,uid_user_31,uid_user_4,uid_user_6,uid_user_7,uid_user_8,labname_code_rvw,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [7]:
X=df.loc[:, df.columns!='dayofweek']
y=df['dayofweek']

In [8]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=21,stratify=y)

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [9]:
model_svc=SVC(C=10, class_weight=None, gamma='auto',kernel='rbf',probability=True, random_state=21)
model_svc.fit(X_train, y_train)
predict_prob_svc=model_svc.predict_proba(X_test)
predict_svc=model_svc.predict(X_test)

In [11]:
accuracy_svc=accuracy_score(y_test, predict_svc)
precision_svc=precision_score(y_test, predict_svc, average='weighted')
recall_svc=recall_score(y_test, predict_svc, average='weighted')
roc_auc_svc=roc_auc_score(y_test, predict_prob_svc,multi_class='ovo', average='weighted')

print(f'accuracy is {accuracy_svc:.5f}')
print(f'precision is {precision_svc:.5f}')
print(f'recall is {recall_svc:.5f}')
print(f'roc_auc is {roc_auc_svc:.5f}')

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

In [12]:
model_tree=DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=22, random_state=21)
model_tree.fit(X_train, y_train)
predict_tree=model_tree.predict(X_test)
predict_proba_tree=model_tree.predict_proba(X_test)

In [14]:
acc_tree=accuracy_score(y_test, predict_tree)
prec_tree=precision_score(y_test, predict_tree, average='weighted')
recall_tree=recall_score(y_test, predict_tree, average='weighted')
roc_auc__tree=roc_auc_score(y_test, predict_proba_tree, multi_class='ovo',average='weighted' )

print(f"accuracy : {acc_tree:.5f}\nprecision : {prec_tree:.5f}\nrecall : {recall_tree:.5f}\nroc-auc : {roc_auc__tree:.5f}")

accuracy : 0.89053
precision : 0.89262
recall : 0.89053
roc-auc : 0.93664


## 4. Random forest

1. The same task for random forest.

In [15]:
model_forest = RandomForestClassifier(class_weight=None, criterion='gini', max_depth=28, n_estimators=50, random_state=42)
model_forest.fit(X_train, y_train)
predict_forest = model_forest.predict(X_test)
proba_forest = model_forest.predict_proba(X_test)

In [16]:
acc_foresct=accuracy_score(y_test, predict_forest)
prec_foresct=precision_score(y_test, predict_forest, average='weighted')
recall_foresct=recall_score(y_test, predict_forest, average='weighted')
roc_auc__foresct=roc_auc_score(y_test, proba_forest, multi_class='ovo',average='weighted' )

print(f"accuracy : {acc_foresct:.5f}\nprecision : {prec_foresct:.5f}\nrecall : {recall_foresct:.5f}\nroc-auc : {roc_auc__foresct:.5f}")

accuracy : 0.92604
precision : 0.92684
recall : 0.92604
roc-auc : 0.98824


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [18]:
cm=confusion_matrix(y_test, predict_forest)
print("Errors per weekday:")
for i in range(7):
    total=cm[i].sum()
    correct=cm[i][i]
    errors=total-correct
    errors_p_class=errors/total*100 
    print(f'{i}: {errors_p_class:.5f}%')

Errors per weekday:
0: 25.92593%
1: 10.90909%
2: 6.66667%
3: 2.50000%
4: 14.28571%
5: 5.55556%
6: 2.81690%


In [20]:
df_temp=pd.read_csv('../data/checker_submits.csv')

df_test=X_test.copy()
df_test['true_res']=y_test
df_test['pred_res']=predict_forest
df_test['error']=df_test['true_res']!=df_test['pred_res']

df_test['user']=df_temp['user']
df_test['labname']=df_temp['labname']



FileNotFoundError: [Errno 2] No such file or directory: '../data/checker_submits.csv'

In [19]:
errors_per_lab=df_test.groupby('labname')['error'].mean()
errors_per_lab.sort_values(ascending=False, inplace= True)

NameError: name 'df_test' is not defined

In [None]:
errors_per_user=df_test.groupby('user')['errors'].mean()
errors_per_user.sort_values(ascenging=False, inpace = True)

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [30]:
def metrics_for_models(models, params, X_train, X_test, y_train, y_test):
    if len(models)!= len(params):
        raise ValueError("amount of models and params should be equel")
    result={}

    for model_name, param in zip(models, params):
        model=model_name(**param)
        model.fit(X_train, y_train)

        prediction=model.predict(X_test)
        proba=model.predict_proba(X_test)

        result[type(model).__name__]={
            'accuracy' : accuracy_score(prediction, y_test),
            'precise' : precision_score(prediction, y_test, average='weighted'),
            'recall' : recall_score(prediction, y_test, average='weighted'),
            'roc_auc' : roc_auc_score(y_test,proba, multi_class='ovo', average='weighted')      
        }
    return result

In [31]:
params_svc = {'C':10, 'class_weight':None, 'gamma':'auto', 'kernel':'rbf', 'probability':True, 'random_state':21}
params_tree = {'class_weight':'balanced', 'criterion':'gini', 'max_depth':22, 'random_state':21}
params_forest = {'class_weight':None, 'criterion':'gini', 'max_depth':28, 'random_state':21, 'n_estimators': 50}

parametrs_ = [params_svc, params_tree, params_forest]
models_ = [SVC, DecisionTreeClassifier, RandomForestClassifier]


In [32]:
pd.DataFrame(metrics_for_models(models_, parametrs_, X_train, X_test, y_train, y_test))

Unnamed: 0,SVC,DecisionTreeClassifier,RandomForestClassifier
accuracy,0.887574,0.890533,0.928994
precise,0.893693,0.891669,0.933847
recall,0.887574,0.890533,0.928994
roc_auc,0.978779,0.936635,0.990327
