<a href="https://colab.research.google.com/github/DunkleCat/ia-esame/blob/master/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r requirements.txt

# Caricamento del dataframe

In [None]:
import pandas as pd

def io_load_multiple_csv(csv_path_list):
  """Load several CSV files and return their dataframes.

  Args:
    csv_path_list: iterable of CSV paths, each forwarded to `io_load_csv`.

  Returns:
    A list holding one dataframe per path, in the same order as the input.
  """
  return [io_load_csv(csv_path) for csv_path in csv_path_list]

def io_load_csv(csv_path):
  """Read a single CSV into a pandas DataFrame with pandas' default options.

  Args:
    csv_path: anything `pd.read_csv` accepts as its first argument.

  Returns:
    The dataframe produced by `pd.read_csv`.
  """
  loaded_dataframe = pd.read_csv(csv_path)
  return loaded_dataframe

# Analisi del Dataset


## Normalizzazione valori nulli

Come primo passaggio normalizziamo i valori che rappresentano attributi mancanti trasformandoli tutti in np.nan


In [None]:
import numpy as np

def normalize_nan(dataframe, nan_list):
  """Replace every missing-value placeholder with np.nan, in place.

  Args:
    dataframe: DataFrame to clean; it is modified in place.
    nan_list: values that stand for "missing" in the raw data.

  Returns:
    The same dataframe object that was passed in.
  """
  for placeholder in nan_list:
    dataframe.replace(to_replace=placeholder, value=np.nan, inplace=True)
  return dataframe

## Analisi dataframe

Passiamo ora ad analizzare i dati in ingresso

In [None]:
def _sorted_unique_values(series):
  """Return the distinct values of `series`, sorted whenever they can be ordered."""
  values = series.unique()
  try:
    return np.sort(values)
  except TypeError:
    # Strings mixed with NaN cannot be compared with each other; this is the
    # shape a text column takes once its placeholders are replaced by np.nan.
    pass
  try:
    present = np.sort(series.dropna().unique())
  except TypeError:
    # Genuinely heterogeneous values: fall back to order of appearance.
    return values
  # List the missing marker last, as np.sort does for NaN in float arrays.
  return np.append(present, np.nan)


def print_infos(dataframe):
  """Print a textual overview of the dataframe.

  The report contains, in order: row and column counts, the column names,
  the number of unique values per column, the output of `DataFrame.info()`
  and, for every column, its distinct values.

  Distinct values are printed sorted. Columns whose values cannot be compared
  (for example strings together with NaN) no longer raise TypeError: their
  non-missing values are sorted and NaN is listed last.

  Args:
    dataframe: the DataFrame to describe; it is not modified.
  """
  print("Dataframe miscellaneous:\n")
  print("Rows     : {}".format(dataframe.shape[0]) )
  print("Columns  : {}".format(dataframe.shape[1]))
  print("\nFeatures :\n{}".format(dataframe.columns.tolist()))
  print("\nUnique values :\n{}".format(dataframe.nunique()))

  print("\nDataframe info:")
  dataframe.info()

  for elem in dataframe:
    print(elem, ': ', _sorted_unique_values(dataframe[elem]))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

def print_feature_plots(dataframe, feature_list, feature_target):
  """Show a count plot of the target for each selected feature.

  Columns are visited in dataframe order; only those named in `feature_list`
  are plotted, with `feature_target` on the x axis and one facet per value of
  the feature (seaborn `catplot` with `kind='count'`).

  Args:
    dataframe: DataFrame holding both the features and the target column.
    feature_list: names of the columns to plot; all other columns are skipped.
    feature_target: name of the column counted on the x axis.
  """
  for elem in dataframe:
    if elem not in feature_list:
      continue
    sb.catplot(x=feature_target,
               col=elem,
               data=dataframe,
               kind='count')
    # Render only when a figure was actually drawn for this column.
    plt.show()

Dati un dataframe ed una lista di feature, elimina dal dataframe tutte le feature presenti all'interno della lista.

In [None]:
def clean_useless(dataframe, column_list):
  """Remove the listed columns from the dataframe, in place.

  Args:
    dataframe: DataFrame to prune; it is modified in place.
    column_list: names of the columns to remove.

  Returns:
    The same dataframe object that was passed in.

  Raises:
    KeyError: if a listed column is not present in the dataframe.
  """
  for column_name in column_list:
    del dataframe[column_name]
  return dataframe

## Altre informazioni temporali

In [None]:
from datetime import date

def add_diffence_scheduled_appointment_day(dataframe):
  """Add two date-derived columns to the dataframe, in place.

  New columns:
    AppointmentWeekDay: result of `get_weekday` on each 'AppointmentDay'.
    DaysFromScheduledToAppointment: result of `get_diff_days` from
      'ScheduledDay' to 'AppointmentDay' for each row.

  Rows are read by position rather than by index label, so the function also
  works when the index is not the default 0..n-1 range (for example after
  filtering or shuffling rows).

  Args:
    dataframe: DataFrame with string columns 'ScheduledDay' and
      'AppointmentDay' in the layout expected by `get_date`.

  Returns:
    The same dataframe object, with the two columns added.
  """
  scheduled_days = dataframe['ScheduledDay'].tolist()
  appointment_days = dataframe['AppointmentDay'].tolist()

  # Assigning plain lists is positional, so no index alignment takes place.
  dataframe['AppointmentWeekDay'] = [
      get_weekday(appointment_day) for appointment_day in appointment_days]
  dataframe['DaysFromScheduledToAppointment'] = [
      get_diff_days(scheduled_day, appointment_day)
      for scheduled_day, appointment_day in zip(scheduled_days, appointment_days)]
  return dataframe

# Expected layout of string_date: YYYY-MM-DD, optionally followed by anything
def get_date(string_date):
  """Build a `date` from the first ten characters of a date string.

  The year, month and day are taken from fixed character positions
  (0-3, 5-6 and 8-9); the separators themselves are not inspected.

  Args:
    string_date: text starting with a YYYY-MM-DD style date.

  Returns:
    The corresponding `datetime.date`.
  """
  year_text = string_date[0:4]
  month_text = string_date[5:7]
  day_text = string_date[8:10]
  return date(int(year_text), int(month_text), int(day_text))

def get_weekday(string_date):
  """Return the weekday index of a date string, as given by `date.weekday()`.

  Args:
    string_date: text in the layout accepted by `get_date`.

  Returns:
    An integer where Monday is 0 and Sunday is 6.
  """
  parsed_day = get_date(string_date)
  return parsed_day.weekday()

def get_diff_days(string_date_start, string_date_end):
  """Return the number of days from the start date to the end date.

  Args:
    string_date_start: start date, in the layout accepted by `get_date`.
    string_date_end: end date, in the layout accepted by `get_date`.

  Returns:
    The difference in whole days; negative when the end precedes the start.
  """
  end_day = get_date(string_date_end)
  start_day = get_date(string_date_start)
  elapsed = end_day - start_day
  return elapsed.days

# Modello

Vengono sfruttate le possibilità offerte da una pipeline di sklearn per creare ed utilizzare il modello.

Come classificatore viene utilizzato l'algoritmo di Random Forest. Esso è un algoritmo di tipo Ensemble, e quindi sfrutta la combinazione di altri algoritmi più deboli ma opportunamente organizzati ed utilizzati per ottenere dei risultati soddisfacenti. Nel caso specifico abbiamo una combinazione di classificatori di tipo Decision tree.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

def create_model(dataframe):
  """Build the untrained model: column preprocessing followed by a random forest.

  Args:
    dataframe: feature dataframe used by `create_preprocessor` to decide which
      columns are numeric and which are categorical.

  Returns:
    A sklearn `Pipeline` with the steps 'preprocessor' and 'classifier'.
  """
  pipeline_steps = [
      ('preprocessor', create_preprocessor(dataframe)),
      ('classifier', RandomForestClassifier()),
  ]
  return Pipeline(steps=pipeline_steps)

def create_preprocessor(dataframe):
  """Build the column-wise preprocessing step of the model.

  Numeric columns are imputed with the median and standardised; categorical
  columns are imputed with the most frequent value and one-hot encoded, with
  categories unseen during fitting ignored.

  Args:
    dataframe: feature dataframe whose columns are split into the two groups
      by `get_numeric_features` and `get_categorical_features`.

  Returns:
    A sklearn `ColumnTransformer` with the transformers 'num' and 'cat'.
  """
  numeric_pipeline = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='median')),
      ('scaler', StandardScaler()),
  ])
  categorical_pipeline = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='most_frequent')),
      ('onehot', OneHotEncoder(handle_unknown='ignore')),
  ])

  column_transformers = [
      ('num', numeric_pipeline, get_numeric_features(dataframe)),
      ('cat', categorical_pipeline, get_categorical_features(dataframe)),
  ]
  return ColumnTransformer(transformers=column_transformers)

def get_numeric_features(dataframe):
  """List the columns that `is_numeric` accepts, in dataframe order.

  Args:
    dataframe: the feature dataframe to inspect.

  Returns:
    A list of column labels.
  """
  return [column for column in dataframe if is_numeric(dataframe[column])]

def get_categorical_features(dataframe):
  """List the columns that `is_categorical` accepts, in dataframe order.

  Args:
    dataframe: the feature dataframe to inspect.

  Returns:
    A list of column labels.
  """
  return [column for column in dataframe if is_categorical(dataframe[column])]

def is_numeric(elem):
  """Tell whether a column holds numeric data.

  Args:
    elem: a pandas Series (as passed by `get_numeric_features`), or a dtype /
      Python type such as `float` or `int`.

  Returns:
    True when the dtype is numeric according to pandas (integers, floats,
    and also booleans), False otherwise.
  """
  # Decide from the dtype: comparing a Series with the `float` / `int` type
  # objects by identity is never true, so no column would count as numeric.
  return bool(pd.api.types.is_numeric_dtype(elem))

def is_categorical(elem):
  """Tell whether a column is categorical, i.e. rejected by `is_numeric`.

  Args:
    elem: the value forwarded to `is_numeric`.

  Returns:
    True when `is_numeric(elem)` is false, False otherwise.
  """
  if is_numeric(elem):
    return False
  return True

In [None]:
from sklearn.model_selection import GridSearchCV

def train_model(model, dataframe_train, dataframe_train_target):
  """Fit the model on the training data.

  Args:
    model: object exposing `fit(X, y)`.
    dataframe_train: training features.
    dataframe_train_target: training labels.

  Returns:
    Whatever `model.fit` returns.
  """
  fitted_model = model.fit(dataframe_train, dataframe_train_target)
  return fitted_model

In [None]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report, plot_confusion_matrix

def print_test_model(model, dataframe_test, dataframe_test_target):
  """Print the model's score and classification report on the test data.

  Args:
    model: fitted object exposing `predict(X)` and `score(X, y)`.
    dataframe_test: test features.
    dataframe_test_target: true test labels.
  """
  preds = model.predict(dataframe_test)

  test_score = model.score(dataframe_test, dataframe_test_target)
  print('Test score: {}'.format(test_score))

  report = classification_report(dataframe_test_target, preds)
  print(report)

# Esecuzione

In [None]:
## MISCELLANEOUS ###############################################################

# Print NumPy floats with 3 decimals and without scientific notation
np.set_printoptions(suppress=True, precision=3)

################################################################################

In [None]:
## MANUAL INTERVENTION #########################################################

# Locations of the training and test CSV files
dataframe_train_path = "train.csv"
dataframe_test_path = "test.csv"

################################################################################

## EXECUTION ###################################################################

# Load both files in one call; the results come back in the order of the paths
dataframe_train, dataframe_test = io_load_multiple_csv(
    [dataframe_train_path, dataframe_test_path]
)

################################################################################

In [None]:
# Preview the first rows of the training set; as the cell's last expression
# its value is what the notebook displays.
dataframe_train.head()

In [None]:
## MANUAL INTERVENTION #########################################################

# Every spelling under which a missing value appears inside the dataframe
nan_list = ["", " ", "?", "unknown"]

# Name of the column to predict
feature_target = "No-show"

################################################################################

## EXECUTION ###################################################################

# Turn every placeholder listed above into np.nan, in both dataframes
dataframe_test = normalize_nan(dataframe_test, nan_list)
dataframe_train = normalize_nan(dataframe_train, nan_list)

# Print an overview of the training dataframe
print_infos(dataframe_train)

################################################################################

In [None]:
## MANUAL INTERVENTION #########################################################

# All the attributes that are meaningless for the prediction.
# Each name needs its own comma: adjacent string literals are concatenated
# into a single (non-existent) column name.
useless_list = [
    "AppointmentID",
    "ScheduledDay",
]

################################################################################

## EXECUTION ###################################################################

# Add the date-derived features first: they are computed from ScheduledDay,
# which is removed by the clean-up below.
dataframe_train = add_diffence_scheduled_appointment_day(dataframe_train)
dataframe_test = add_diffence_scheduled_appointment_day(dataframe_test)

# Clean the dataframes from meaningless features
dataframe_train = clean_useless(dataframe_train, useless_list)
dataframe_test = clean_useless(dataframe_test, useless_list)

################################################################################


In [None]:
from sklearn.preprocessing import LabelEncoder

## MANUAL INTERVENTION #########################################################

# Name of the column to predict
feature_target = "No-show"

################################################################################

## TARGET PREPARATION ##########################################################

# Detach the target column from both feature dataframes
dataframe_train_target = dataframe_train.pop(feature_target)
dataframe_test_target = dataframe_test.pop(feature_target)

# Learn the label encoding on the training target, then reuse it for the test
le = LabelEncoder()
dataframe_train_target = le.fit_transform(dataframe_train_target)
dataframe_test_target = le.transform(dataframe_test_target)

################################################################################

################################################################################

In [None]:
## CLASSIFIER ##################################################################

# Build the pipeline from the training columns and fit it on the training data
classifier = train_model(create_model(dataframe_train),
                         dataframe_train,
                         dataframe_train_target)

# Print the score and the classification report on the test data
print_test_model(classifier, dataframe_test, dataframe_test_target)

################################################################################

Test score: 0.8048063330506079
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     14119
           1       0.82      0.04      0.08      3566

    accuracy                           0.80     17685
   macro avg       0.81      0.52      0.48     17685
weighted avg       0.81      0.80      0.73     17685

