# Libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import seaborn as sns
from catboost import CatBoostClassifier
from keras.layers import Dense
from keras.models import Sequential
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier  # scikit-learn exposes no `AdaBoost` name
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, recall_score, f1_score)
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from tqdm import tqdm

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Dataset

In [None]:
# NOTE(review): this statement looks unfinished — it only references `pl.read`
# and never calls a reader. Presumably it should become a concrete Polars
# reader call (e.g. pl.read_csv(...) or pl.read_parquet(...)); confirm the
# data source and complete it.
dataset = pl.read

# Data Preparation

# Fine-Tuning

## GridSearch

In [None]:
# Hyperparameter search spaces for the grid search.
#
# `names`, `classifiers` and `param_grids` must stay aligned: position k of
# `names` labels position k of `classifiers`, and every entry of `names` must
# be a key of `param_grids` (the search loop looks grids up by name).

names = ["Random Forrest", "Decision Tree", 'Catboost', 'AdaBoost']

# Base estimators, listed in the same order as `names`.
# NOTE(review): the previous list held a LogisticRegression that had neither a
# name nor a parameter grid; it was replaced by the AdaBoost estimator that
# `names` and `param_grids` both refer to. Re-add it with its own name and
# grid if it is meant to be tuned as well.
classifiers = [
    RandomForestClassifier(n_estimators=100, criterion='entropy'),
    DecisionTreeClassifier(),
    CatBoostClassifier(iterations=500, learning_rate=0.1, verbose=0, auto_class_weights='Balanced'),
    AdaBoostClassifier(random_state=0),
]

# Candidate values per estimator, keyed by the labels in `names`.
param_grids = {
    "Random Forrest": {
        'n_estimators': [10, 30, 50],
        'criterion': ['entropy', 'gini', 'log_loss'],
        "random_state": [42],
        # None (the object, not the string 'None') means "no class re-weighting".
        "class_weight": ['balanced', None],
    },
    "Decision Tree": {
        "max_depth": [4, 5, 7, 8],
        "min_samples_split": [2, 3],
        "min_samples_leaf": [1, 2, 4],
        "criterion": ['gini', 'entropy', 'log_loss'],
        "class_weight": ['balanced', None],
    },
    "Catboost": {
        'iterations': [400, 800],
        'learning_rate': [0.05, 0.1],
        'depth': [4, 8],
        'l2_leaf_reg': [1, 5],
        'bagging_temperature': [0.1, 1, 8],
        'auto_class_weights': ['Balanced'],
    },
    "AdaBoost": {
        # NOTE(review): starting values only — the grid was previously empty,
        # which the grid search rejects. Tune these ranges as needed.
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.1, 0.5, 1.0],
        "random_state": [42],
    },
}

# Fail early if the three structures drift apart again.
if len(names) != len(classifiers) or set(names) != set(param_grids):
    raise ValueError("names, classifiers and param_grids are out of sync")

In [None]:
# Run a grid search for every (name, estimator) pair and keep each fitted
# GridSearchCV object in `mods`, keyed by the model name.
mods = {}
# Passing `total` lets tqdm show real progress: zip() has no length of its own.
for name, clf in tqdm(zip(names, classifiers), total=len(names)):
    # cv=integer specifies the number of folds in a (Stratified)KFold;
    # candidates are ranked by macro-averaged F1.
    grid = GridSearchCV(clf, param_grid=param_grids[name], cv=5, scoring='f1_macro')
    grid.fit(x_train, y_train_)
    mods[name] = grid
    # Display results
    print(f"Best parameters for {name}: {grid.best_params_}")
    print(f"Best cross-validated macro F1 for {name}: {grid.best_score_:.2f}\n")

## Randomized Search

In [None]:
# Parameters to tune with RandomizedSearchCV

In [None]:
# API reminder for the randomized search (signature as pasted from the
# scikit-learn documentation; kept as a comment because it is not executable
# Python):
#   RandomizedSearchCV(estimator, param_distributions, *, n_iter=10,
#                      scoring=None, n_jobs=None, refit=True, cv=None,
#                      verbose=0, pre_dispatch='2*n_jobs', random_state=None,
#                      error_score=nan, return_train_score=False)
from sklearn.model_selection import RandomizedSearchCV

# Training

In [None]:
# Feature selection?

# Test

In [None]:
# Evaluate every tuned classifier on the held-out test set: refit on the
# selected features, compute metrics, store them in `results` and plot the
# confusion matrix (plus the tree diagram for the decision tree).
#
# NOTE(review): x_train, x_test, y_train_, y_test_, selected_features_ig and
# Best_Classifiers are not defined in this cell; they must be provided by
# earlier cells.
results = {}  # Evaluation metrics for each model, keyed by model name

# The feature subset is the same for every model, so slice it once.
x_train_selected = x_train[selected_features_ig]
x_test_selected = x_test[selected_features_ig]

for name, clf in tqdm(zip(names, Best_Classifiers), total=len(names)):
    # Fit the model again using only the selected features (if necessary)
    model = clf.fit(x_train_selected, y_train_)
    test_predict = model.predict(x_test_selected)

    # Calculate metrics (accuracy reuses the predictions instead of predicting twice)
    f1 = f1_score(y_test_, test_predict)
    accuracy = accuracy_score(y_test_, test_predict)
    precision = precision_score(y_test_, test_predict, average='macro')
    conf_matrix = confusion_matrix(y_test_, test_predict)

    # Unpacking four cells assumes a binary (2x2) confusion matrix.
    tn, fp, fn, tp = conf_matrix.ravel()
    # True-positive and true-negative rates; 0 when a class has no true samples.
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Balanced accuracy is the mean of the two per-class recalls.
    balanced_accuracy = (tpr + tnr) / 2

    # Store the computed values (not the metric functions) in the results dictionary
    results[name] = {
        'F1 Score': f1,
        'Accuracy': accuracy,
        'Precision': precision,
        'Confusion Matrix': conf_matrix,
        'Balanced Accuracy': balanced_accuracy,
    }

    # Print metrics
    print('Model:', name, ' F1:', f1, ' Accuracy:', accuracy, ' Precision:', precision)
    print("Balanced Accuracy ", balanced_accuracy)

    # Plot the fitted tree for the decision-tree model, identified by type
    # rather than by its position in the list.
    if isinstance(model, DecisionTreeClassifier):
        plt.figure(figsize=(20, 10))  # Set a larger figure for better visualization
        plot_tree(model, feature_names=selected_features_ig, class_names=['0', '1'], filled=True)
        plt.savefig('Decision Tree schema_ig.png')
        plt.show()

    # Confusion matrix visualization
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Greens", cbar=False)
    plt.title(f'Confusion Matrix for {name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.savefig(f"Confusion Matrix_{name}_IG.png")
    plt.show()