**Training Routine**

Inserire il file di training nella stessa cartella in cui si trova questo notebook e assegnare alla variabile `train_filename` il nome di tale file.

In [1]:
# Name of the training CSV file; it is handed to pd.read_csv when the dataset is loaded.
train_filename = 'train.csv'

import sklearn
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

from joblib import dump
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Seed NumPy's global random number generator.
# This has to be a call: assigning to `np.random.seed` only rebinds the
# attribute to an integer, leaves the generator unseeded and makes any later
# `np.random.seed(...)` call fail.
np.random.seed(123)

In [3]:
# Read the training dataset from the CSV file.
data = pd.read_csv(train_filename, sep=",")
# Features: every row of every column except the last one.
# Labels: every row of the last column.
x_tr, y_tr = data.iloc[:, :-1], data.iloc[:, -1]

In [4]:
# Split the feature columns by dtype. Only object / int64 / float64 columns are
# listed here; a column of any other dtype is left to the ColumnTransformer's
# passthrough remainder and reaches the model untouched.
column_dtypes = x_tr.dtypes

# Categorical variables
categorical_columns = [name for name, dtype in column_dtypes.items() if dtype == "object"]

# Numerical variables
numerical_columns = [name for name, dtype in column_dtypes.items() if dtype in ["int64", "float64"]]

# Numerical preprocessing: fill missing values with the column mean, then scale.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", RobustScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical preprocessing: fill missing values with the most frequent value,
# then one-hot encode (categories unseen during fit are ignored at transform time).
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Recompose the dataset from the two transformed column groups.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_columns),
        ('categorical', categorical_transformer, categorical_columns),
    ],
    remainder='passthrough',
)

# Fit the preprocessor on the training features and replace them with the
# transformed matrix.
x_tr = preprocessor.fit_transform(x_tr)

# Class balancing: resample the training data with SMOTE.
sampler = SMOTE(random_state=42)
# fit_resample is given the labels as a flat 1-D array.
labels_flat = pd.DataFrame(y_tr).to_numpy().ravel()
x_tr, y_tr = sampler.fit_resample(x_tr, labels_flat)

# Anomaly detection: fit an IsolationForest on the resampled training matrix
# and keep only the rows it labels as inliers.
# A fixed random_state makes the inlier mask - and therefore the data the
# classifier is trained on - identical on every run, consistent with the
# seeded SMOTE and GradientBoostingClassifier steps.
anomaly_detector = IsolationForest(random_state=123)
anomaly_detector.fit(x_tr)
# predict() returns +1 for inliers and -1 for outliers.
is_inlier = anomaly_detector.predict(x_tr)

# Drop the outlier rows from both the labels and the feature matrix.
inlier_mask = is_inlier == 1
y_tr = y_tr[inlier_mask]
x_tr = x_tr[inlier_mask, :]

# Training: fit the gradient-boosting classifier on the filtered training data.
classificator = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.5,
    loss='exponential',
    max_depth=2,
    random_state=0,
)
classificator.fit(x_tr, y_tr)

# Accuracy is measured on the same data the model was fitted on, so it is a
# training-set score, not an estimate of generalization.
train_predictions = classificator.predict(x_tr)
train_accuracy = accuracy_score(y_tr, train_predictions)
print("Accuracy training set {}".format(train_accuracy))

# Serialize the fitted classifier and the fitted preprocessor to disk with joblib.
# The second dump is the cell's last expression, so its return value
# (the list of written file names) is what the notebook displays.
dump(classificator, 'classificator.joblib') 
dump(preprocessor, 'preprocessing.joblib')

Accuracy training set 0.9266657212892248


['preprocessing.joblib']