In [32]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler

In [34]:
results = {
  'experiment': [],
  'algorithm': [],
	'tn'       : [],
	'fp'       : [],
	'fn'       : [],
	'tp'       : [],
  'tpr'      : [],
	'tnr'      : [],
	'fnr'      : [],
	'fpr'      : [],
	'accuracy' : [],
	'precision': [],
	'recall'   : [],
	'f1'       : [],
	'auc'      : []
}

In [35]:
from google.colab import drive
drive.mount('/content/drive')
cur_path = "drive/MyDrive/Mafaulda_organizado"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
experiments_dir = cur_path + '/runs'

In [37]:
experiments = next(os.walk(experiments_dir))[1]

In [38]:
for experiment in experiments:

  algorithms = [SVC(probability=True), XGBClassifier(), LGBMClassifier(), CatBoostClassifier(), DecisionTreeClassifier(), KNeighborsClassifier(), LogisticRegression(), RandomForestClassifier() ]

  df_train = pd.read_csv(cur_path + '/runs/'+experiment+'/train.csv', sep=";")
  df_test = pd.read_csv(cur_path + '/runs/'+experiment+'/test.csv', sep=";")

  df_train['target'] = df_train['target'].astype(int)
  df_test['target'] = df_test['target'].astype(int)

  train_target2 = df_train['target2']
  test_target2 = df_test['target2']

  y_train = df_train['target']
  X_train = df_train.drop(columns=['target', 'target2'])

  y_test = df_test['target']
  X_test = df_test.drop(columns=['target', 'target2'])

  scaler = MinMaxScaler()

  columns = X_train.columns
  X_train = scaler.fit_transform(X_train)
  X_train = pd.DataFrame(X_train, columns=columns)

  columns = X_test.columns
  X_test = scaler.transform(X_test)
  X_test = pd.DataFrame(X_test, columns=columns)

  for algorithm in algorithms:
    model = algorithm
    algorithm_name = model.__class__.__name__
    print(algorithm_name)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    # Calculate True Positive Rate (TPR), True Negative Rate (TNR), False Negative Rate (FNR), False Positive Rate (FPR)
    tpr = tp / (tp + fn)
    tnr = tn / (tn + fp)
    fnr = fn / (fn + tp)
    fpr = fp / (fp + tn)

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Calculate Precision
    precision = precision_score(y_test, y_pred)

    # Calculate Recall (Sensitivity or True Positive Rate)
    recall = recall_score(y_test, y_pred)

    # Calculate F1-Score
    f1 = f1_score(y_test, y_pred)

    # Calculate Area Under the Curve (AUC)
    auc = roc_auc_score(y_test, y_pred)


    results['experiment'].append(experiment)
    results['algorithm'].append(algorithm_name)
    results['tn'].append(tn)
    results['fp'].append(fp)
    results['fn'].append(fn)
    results['tp'].append(tp)
    results['tpr'].append(tpr)
    results['tnr'].append(tnr)
    results['fnr'].append(fnr)
    results['fpr'].append(fpr)
    results['accuracy'].append(accuracy)
    results['precision'].append(precision)
    results['recall'].append(recall)
    results['f1'].append(f1)
    results['auc'].append(auc)

    df_result = pd.DataFrame(y_proba, columns=model.classes_)
    df_result['pred'] = y_pred
    df_result['target'] = y_test
    df_result['target2'] = test_target2

    df_result.to_csv(cur_path + '/results/proba/'+ experiment +'_'+ algorithm_name + '.csv',index=False)

    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap='Blues');
    ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels');
    ax.set_title(experiment + ' - ' + algorithm_name);
    fig.savefig(cur_path + '/results/confusion_matrix/'+ experiment +'_'+ algorithm_name + '.png')
    plt.close('all')

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
40:	learn: 0.3581376	total: 1.15s	remaining: 26.9s
41:	learn: 0.3533528	total: 1.18s	remaining: 26.9s
42:	learn: 0.3481437	total: 1.2s	remaining: 26.8s
43:	learn: 0.3434512	total: 1.23s	remaining: 26.7s
44:	learn: 0.3395584	total: 1.26s	remaining: 26.7s
45:	learn: 0.3335433	total: 1.29s	remaining: 26.8s
46:	learn: 0.3295277	total: 1.32s	remaining: 26.8s
47:	learn: 0.3241532	total: 1.35s	remaining: 26.7s
48:	learn: 0.3206265	total: 1.37s	remaining: 26.6s
49:	learn: 0.3155838	total: 1.4s	remaining: 26.5s
50:	learn: 0.3111047	total: 1.42s	remaining: 26.5s
51:	learn: 0.3066041	total: 1.45s	remaining: 26.4s
52:	learn: 0.3024835	total: 1.47s	remaining: 26.3s
53:	learn: 0.2975549	total: 1.51s	remaining: 26.5s
54:	learn: 0.2931601	total: 1.54s	remaining: 26.4s
55:	learn: 0.2887553	total: 1.57s	remaining: 26.4s
56:	learn: 0.2850234	total: 1.59s	remaining: 26.3s
57:	learn: 0.2819707	total: 1.62s	remaining: 26.3s
58:	learn: 

In [39]:
results_csv = pd.DataFrame.from_dict(results)
results_csv.to_csv(cur_path + '/results/run_2.csv',index=False)