In [None]:
# DATASET IMPORT
# Mount Google Drive inside the Colab runtime and change the working directory
# to the project folder, so the relative file paths used later in the notebook
# are resolved from there.
from google.colab import drive
drive.mount("/content/drive")
%cd /content/drive/MyDrive/Project\ ML

In [None]:
# PACKAGE IMPORTS
!pip install mlxtend

import sklearn
import pandas as pd
import numpy as np
import joblib
import sys

# Register the standalone `joblib` package under the module path
# `sklearn.externals.joblib`, so that any later `import sklearn.externals.joblib`
# resolves to it.
# NOTE(review): presumably needed by a dependency imported below that still
# uses the old path (mlxtend?) - confirm it is still required.
sys.modules['sklearn.externals.joblib'] = joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.ensemble import IsolationForest
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import f_classif, mutual_info_classif, SelectKBest, SelectFromModel
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn import tree
import graphviz
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Seed NumPy's global random generator. `np.random.seed` is a function and
# must be *called*: assigning an integer to it seeds nothing and replaces the
# function for the rest of the session.
np.random.seed(123)

In [None]:
# DATASET LOADING
# Column labels applied to the loaded table; the last one ("earns") is the
# classification target.
# NOTE(review): read_csv treats the first line of the file as a header before
# the labels are overwritten - confirm the CSV really has a header row,
# otherwise the first record is lost.
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "earns",
]

data = pd.read_csv("test_set.csv")
data.columns = column_names

# Quick overview: first rows, number of rows, target distribution.
print(data.head())
print(data.shape[0])
earns_values = data["earns"].to_numpy()
print(np.unique(earns_values, return_counts=True))

# Missing values per column.
print("# of NaN values for each column:")
nan_per_column = data.isnull().sum(axis=0)
print(nan_per_column)

In [None]:
# TRAINING SET / TEST SET PREPARATION
def encode_income_labels(labels):
  """Encode the income labels as integers: "<=50K" -> 0 and ">50K" -> 1.

  Surrounding whitespace in the raw labels is ignored.

  Parameters
  ----------
  labels : array-like of str
      Raw target values.

  Returns
  -------
  numpy.ndarray
      1-D integer array with one entry per input label.

  Raises
  ------
  ValueError
      If a label is neither "<=50K" nor ">50K"; the unexpected values are
      reported instead of being passed through silently.
  """
  cleaned = pd.Series(labels).astype(str).str.strip()
  encoded = cleaned.map({"<=50K": 0, ">50K": 1})
  unmapped = encoded.isna()
  if unmapped.any():
    raise ValueError("Unexpected income labels: {}".format(
        sorted(cleaned[unmapped].unique())))
  return encoded.to_numpy(dtype=int)


# Features are every column but the last; the target is the last column.
dataset_values = data.to_numpy()
x = dataset_values[:, :-1]
y = dataset_values[:, -1]
print(x[:10,:])

# Stratified 60/40 split. random_state is fixed so the same split (and hence
# the same downstream results) is obtained on every run.
x_tr, x_ts, y_tr, y_ts = train_test_split(
    x, y, test_size=0.4, stratify=y, random_state=123)
print("Shape of x_tr {}".format(x_tr.shape))
print("Shape of y_tr {}".format(y_tr.shape))
print("Shape of x_ts {}".format(x_ts.shape))
print("Shape of y_ts {}".format(y_ts.shape))
# Class proportions in each split (should match thanks to stratification).
print(np.unique(y_tr, return_counts=True)[1]/len(y_tr))
print(np.unique(y_ts, return_counts=True)[1]/len(y_ts))

# Convert the training and test targets to integers. The result is a 1-D
# integer vector, built explicitly rather than via DataFrame.replace, which
# yields an (n, 1) column whose dtype depends on pandas' implicit downcasting.
y_tr_processed = encode_income_labels(y_tr)
print(np.unique(y_tr, return_counts=True))
print(np.unique(y_tr_processed, return_counts=True))
y_ts_processed = encode_income_labels(y_ts)
print(np.unique(y_ts, return_counts=True))
print(np.unique(y_ts_processed, return_counts=True))

In [None]:
# The dataset mixes continuous and categorical variables, so two pipelines are
# built that treat them differently.
# Continuous variables are imputed with KNNImputer (the training data has no
# NaN among the continuous variables, but the step is kept in case the test
# data does) and then scaled with RobustScaler.
#
# Categorical variables first have their missing values (marked with " ?")
# imputed by SimpleImputer with the most-frequent strategy, and are then
# one-hot encoded to turn them into numeric variables.

# Feature columns in dataset order (target excluded); shared by both frames.
feature_names = ["age","workclass","fnlwgt","education","education-num",
                 "marital-status","occupation","relationship","race","sex",
                 "capital-gain","capital-loss","hours-per-week",
                 "native-country"]

# Pipeline for the numerical variables.
numerical = ["age","fnlwgt","education-num","capital-gain","capital-loss",
             "hours-per-week"] # names of the numerical variables
numerical_transformer = Pipeline(steps=[
                                        ('imputer', KNNImputer(
                                            missing_values=np.nan,
                                            n_neighbors=3,
                                            weights="uniform")),
                                        ('scaler', RobustScaler())])

# Pipeline for the categorical variables.
categorical_one_hot = ["workclass","education","marital-status","occupation",
                       "relationship","race","sex",
                       "native-country"] # names of the categorical variables
# `fill_value` is only used by strategy='constant', so it is not passed here.
onehot_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(
                                        strategy='most_frequent',
                                        missing_values=" ?")),
                                    ('encoder', OneHotEncoder(
                                        handle_unknown='ignore'))])

# Route each column to its pipeline. sparse_threshold=0 makes the transformer
# always return a dense array, so no `.toarray()` call is needed (that call
# would fail whenever the stacked output happened to be dense already).
preprocessing = ColumnTransformer(
    transformers=[
                  ('num', numerical_transformer, numerical),
                  ('onehot', onehot_transformer, categorical_one_hot)
                  ],
    sparse_threshold=0)

x_tr_pd = pd.DataFrame(x_tr, columns=feature_names)
x_ts_pd = pd.DataFrame(x_ts, columns=feature_names)

# Fit the preprocessing on the training set only, then apply it to both sets.
x_tr_processed = preprocessing.fit_transform(x_tr_pd)
x_ts_processed = preprocessing.transform(x_ts_pd)
print(x_tr_processed)
print(x_ts_processed)

# The mean of the starting distribution is computed on the numerical columns
# only: averaging the whole frame fails on the string-valued columns with
# recent pandas versions.
print("Media distribuzione di partenza: {}".format(
    x_tr_pd[numerical].astype(float).mean(axis=0)))
print("Media distribuzione scalata: {}".format(np.mean(x_tr_processed, axis=0)))
print("Media distribuzione di partenza: {}".format(
    x_ts_pd[numerical].astype(float).mean(axis=0)))
print("Media distribuzione scalata: {}".format(np.mean(x_ts_processed, axis=0)))

In [None]:
# BALANCING

# Seed shared by the stochastic steps of this cell so that the removed
# anomalies and the synthetic samples are the same on every run.
BALANCING_SEED = 42


def count_per_class(labels, n_classes):
  """Return the number of samples labelled 0, 1, ..., n_classes - 1."""
  return [len(labels[labels==i]) for i in range(n_classes)]


def plot_class_pie(sizes):
  """Show a pie chart of the class sizes; classes are named 'A', 'B', ..."""
  class_names = [chr(ord('A')+i) for i in range(len(sizes))]
  fig, ax = plt.subplots()
  ax.pie(sizes, labels=class_names, autopct='%1.1f%%', shadow=True,
         startangle=90)
  ax.axis("equal")
  plt.show()


# Work on a 1-D label vector: the boolean masks and SMOTE below expect one,
# and an (n, 1) column vector would only be accepted with a conversion warning.
y_tr_flat = np.ravel(y_tr_processed)

# Visualisation of the class imbalance in the training set.
n_classes = len(np.unique(y_tr_flat))
sizes = count_per_class(y_tr_flat, n_classes)
print(sizes)
plot_class_pie(sizes)


# ANOMALY DETECTION WITH ISOLATION FOREST (balancing method used = SMOTE)
print("Initial distribution")
print(len(x_tr_pd))
print(np.unique(y_tr_flat, return_counts=True))
print(np.unique(y_tr_flat, return_counts=True)[1]/len(y_tr_flat))

# predict() returns +1 for inliers and -1 for outliers.
anomaly_detector = IsolationForest(random_state=BALANCING_SEED)
anomaly_detector.fit(x_tr_processed)
is_inlier = anomaly_detector.predict(x_tr_processed)

print(np.unique(is_inlier, return_counts=True))
x_tr_not_anomalous = x_tr_processed[is_inlier==1,:]
y_tr_not_anomalous = y_tr_flat[is_inlier==1]

print("Distribution after anomaly detection")
print(x_tr_not_anomalous.shape[0])
print(np.unique(y_tr_not_anomalous, return_counts=True)[1]/len(
    y_tr_not_anomalous))

# Oversample the remaining (non-anomalous) training samples with SMOTE.
balancer = SMOTE(random_state=BALANCING_SEED)
x_tr_balanced, y_tr_balanced = balancer.fit_resample(
    x_tr_not_anomalous, y_tr_not_anomalous)

print("Distribution after balancing")
print(x_tr_balanced.shape[0])
print(np.unique(y_tr_balanced, return_counts=True)[1]/len(y_tr_balanced))


# Visualisation of the balanced dataset.
sizes = count_per_class(y_tr_balanced, n_classes)
plot_class_pie(sizes)

In [None]:
# Fits the chosen classification model on the training set and prints the
# confusion matrix, accuracy and F1 score for both the training and test set.
def train_evaluate(model, x_tr, y_tr, x_ts, y_ts):
  """Train `model` and print its evaluation on the training and test data.

  Parameters
  ----------
  model : object exposing `fit(x, y)` and `predict(x)`
      Classifier to train; it is fitted in place on the training data.
  x_tr, y_tr : training features and training labels.
  x_ts, y_ts : test features and test labels.

  Returns
  -------
  None. Everything is reported through `print`.
  """
  model.fit(x_tr, y_tr)
  predicted_tr = model.predict(x_tr)
  predicted_ts = model.predict(x_ts)

  # Confusion matrices: header line followed by the matrix itself.
  for header, truth, predicted in (
      ("Confusion matrix on training set", y_tr, predicted_tr),
      ("Confusion matrix on test set", y_ts, predicted_ts),
  ):
    print(header)
    print(confusion_matrix(truth, predicted))

  # Scalar scores, reported in the order: accuracy (train, test), F1 (train, test).
  for template, metric, truth, predicted in (
      ("Accuracy on training set: {}", accuracy_score, y_tr, predicted_tr),
      ("Accuracy on test set: {}", accuracy_score, y_ts, predicted_ts),
      ("F1 score on training set: {}", f1_score, y_tr, predicted_tr),
      ("F1 score on test set: {}", f1_score, y_ts, predicted_ts),
  ):
    print(template.format(metric(truth, predicted)))

In [None]:
# CLASSIFICATION
# Gradient boosting with 100 trees of depth at most 5, trained on the balanced
# training set and evaluated on the (untouched) processed test set.
# random_state is fixed so the reported scores are reproducible across runs.
print("Gradient Boosting")
classifier = GradientBoostingClassifier(
    n_estimators=100, max_depth=5, random_state=42)
print("---RESULT---")
train_evaluate(
    classifier, x_tr_balanced, y_tr_balanced, x_ts_processed, y_ts_processed)