In [487]:
# !pip install -r requirements.txt

# Caricamento del dataframe

In [488]:
import pandas as pd

def io_load_multiple_csv(csv_path_list):
  """Load every CSV listed in `csv_path_list`.

  Each path is read through `io_load_csv`; the returned list holds one
  DataFrame per path, in the same order as the input.
  """
  return [io_load_csv(path) for path in csv_path_list]

def io_load_csv(csv_path):
  """Read the CSV at `csv_path` with pandas' default options and return the DataFrame."""
  loaded = pd.read_csv(csv_path)
  return loaded

# Analisi del Dataset


## Normalizzazione valori nulli

Come primo passaggio normalizziamo i valori che rappresentano attributi mancanti, trasformandoli tutti in `np.nan`.


In [489]:
import numpy as np

def normalize_nan(dataframe, nan_list):
  """Turn every placeholder listed in `nan_list` into `np.nan`.

  The replacement is applied in place, one placeholder at a time, and the
  same (mutated) DataFrame object is returned.
  """
  for placeholder in nan_list:
    dataframe.replace(to_replace=placeholder, value=np.nan, inplace=True)
  return dataframe

## Analisi dataframe

Passiamo ora ad analizzare i dati in ingresso

In [490]:
def print_infos(dataframe):
  """Print a textual overview of `dataframe`.

  Output, in order: row and column counts, the list of column names, the
  number of unique values per column, the `DataFrame.info()` summary and,
  for every column, its sorted unique values.

  Columns that mix strings with missing values (for instance once the
  missing-value placeholders have been replaced by `np.nan`) cannot be
  sorted directly; for those the non-missing values are sorted and `nan`
  is appended at the end instead of raising `TypeError`.
  """
  print("Dataframe miscellaneous:\n")
  print("Rows     : {}".format(dataframe.shape[0]) )
  print("Columns  : {}".format(dataframe.shape[1]))
  print("\nFeatures :\n{}".format(dataframe.columns.tolist()))
  print("\nUnique values :\n{}".format(dataframe.nunique()))

  print("\nDataframe info:")
  dataframe.info()

  for elem in dataframe:
    print(elem, ': ', _sorted_unique_values(dataframe[elem]))


def _sorted_unique_values(column):
  """Return the unique values of `column` sorted, tolerating unsortable mixes."""
  try:
    # Fast path: identical to a plain np.sort whenever the values are comparable.
    return np.sort(column.unique())
  except TypeError:
    # Typically a string column containing NaN (a float): sort what is present.
    present = column.dropna().unique()

  try:
    present = np.sort(present)
  except TypeError:
    # Genuinely heterogeneous values: fall back to ordering by their text form.
    present = np.array(sorted(present, key=str), dtype=object)

  if column.isna().any():
    # Keep the missing marker visible, placed last like np.sort does for floats.
    present = np.append(present, np.nan)
  return present

In [491]:
import matplotlib.pyplot as plt
import seaborn as sb

# def print_feature_plots(dataframe, feature_list, feature_target):
#   for elem in dataframe:
#     if elem in feature_list:
#       plot = sb.catplot(x = feature_target, 
#                         col = elem, 
#                         data = dataframe, 
#                         kind = 'count')
#     plt.show()

def print_feature_plots(dataframe, feature_target):
  """Draw one seaborn figure per column of `dataframe`, split by `feature_target`.

  The categorical and numeric column lists (as returned by
  `get_categorical_features` and `get_numeric_features`) are printed first.
  Then, column by column: a categorical column gets a count plot of
  `feature_target` faceted by that column, a numeric column gets a
  distribution plot coloured by `feature_target`. `plt.show()` is called
  after every column.
  """
  categorical_columns = get_categorical_features(dataframe)
  print(categorical_columns)
  numeric_columns = get_numeric_features(dataframe)
  print(numeric_columns)

  for column in dataframe:
    if column in categorical_columns:
      print("Categorical:")
      sb.catplot(data=dataframe, x=feature_target, col=column, kind='count')
    elif column in numeric_columns:
      print("Numeric:")
      sb.displot(data=dataframe, x=column, hue=feature_target)

    # Render the figure for this column before moving on to the next one.
    plt.show()

Dati un dataframe ed una lista di feature, `clean_useless` elimina dal dataframe tutte le feature presenti nella lista.

In [492]:
def clean_useless(dataframe, column_list):
  """Remove the columns named in `column_list` from `dataframe`.

  Columns are deleted in place, one after the other, and the same DataFrame
  object is returned. A name that is not a column raises `KeyError`.
  """
  for column_name in column_list:
    del dataframe[column_name]
  return dataframe

# Modello

Vengono sfruttate le possibilità offerte da una pipeline di sklearn per creare ed utilizzare il modello.

Come classificatore viene utilizzato l'algoritmo di Random Forest. Esso è un algoritmo di tipo Ensemble, e quindi sfrutta la combinazione di altri algoritmi più deboli ma opportunamente organizzati ed utilizzati per ottenere dei risultati soddisfacenti. Nel caso specifico abbiamo una combinazione di classificatori di tipo Decision tree.

In [494]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

def create_model(dataframe, random_state=None):
  """Build the unfitted preprocessing + Random Forest pipeline.

  Args:
    dataframe: feature frame; `create_preprocessor` inspects it to decide
      which columns go through the numeric or the categorical branch.
    random_state: forwarded to `RandomForestClassifier`. The default `None`
      keeps the previous unseeded behaviour; pass an integer to make
      training reproducible across runs.

  Returns:
    A `Pipeline` with the steps 'preprocessor' and 'classifier'.
  """
  preprocessor = create_preprocessor(dataframe)
  classifier = RandomForestClassifier(random_state=random_state)

  return Pipeline(
      steps = [('preprocessor', preprocessor),
               ('classifier', classifier)])

def create_preprocessor(dataframe):
  """Build the unfitted `ColumnTransformer` used as the first pipeline step.

  Columns reported by `get_numeric_features` are imputed with the median and
  standardised; columns reported by `get_categorical_features` are imputed
  with the most frequent value and one-hot encoded (categories unseen at fit
  time are ignored).
  """
  numeric_columns = get_numeric_features(dataframe)
  categorical_columns = get_categorical_features(dataframe)

  median_then_scale = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='median')),
      ('scaler', StandardScaler()),
  ])
  mode_then_onehot = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='most_frequent')),
      ('onehot', OneHotEncoder(handle_unknown='ignore')),
  ])

  return ColumnTransformer(transformers=[
      ('num', median_then_scale, numeric_columns),
      ('cat', mode_then_onehot, categorical_columns),
  ])

def get_numeric_features(dataframe):
  """Return the names of the columns treated as numeric.

  A column is numeric when its first non-missing value is not a string
  (see `is_numeric`). A column without any non-missing value is reported
  as numeric.
  """
  return [elem for elem in dataframe
          if is_numeric(_first_valid_value(dataframe[elem]))]

def get_categorical_features(dataframe):
  """Return the names of the columns treated as categorical.

  A column is categorical when its first non-missing value is a string
  (see `is_categorical`).
  """
  return [elem for elem in dataframe
          if is_categorical(_first_valid_value(dataframe[elem]))]

def _first_valid_value(column):
  """Return the first non-missing value of `column`, or None if it has none.

  The lookup is positional, so it does not require the index to contain the
  label 0, and a leading NaN does not hide the real type of the column.
  """
  valid = column.dropna()
  return valid.iloc[0] if len(valid) else None

def is_numeric(elem):
  """Return True when `elem` is not a string (complement of `is_categorical`)."""
  return not is_categorical(elem)

def is_categorical(elem):
  """Return True when `elem` is a string (str subclasses included)."""
  return isinstance(elem, str)

In [495]:
from sklearn.model_selection import GridSearchCV

def train_model(model, dataframe_train, dataframe_train_target):
  """Fit `model` on the training data, print its training score and return it.

  The score is whatever `model.score` returns for the same data the model
  was just fitted on. The returned object is the `model` that was passed in.
  """
  model.fit(dataframe_train, dataframe_train_target)
  training_score = model.score(dataframe_train, dataframe_train_target)
  print('Training score: {}'.format(training_score))
  return model

In [496]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report, plot_confusion_matrix

def print_test_model(model, dataframe_test, dataframe_test_target):
  """Evaluate a fitted `model` on the test data and print the results.

  Prints the value of `model.score` on the test set, followed by the
  `classification_report` comparing the true targets with the predictions.
  Nothing is returned.
  """
  predictions = model.predict(dataframe_test)
  test_score = model.score(dataframe_test, dataframe_test_target)
  print('Test score: {}'.format(test_score))
  report = classification_report(dataframe_test_target, predictions)
  print(report)

# Preparazione

In [497]:
## LIBRARIES ###################################################################

import pandas as pd
from sklearn.model_selection import train_test_split

################################################################################

## FUNCTIONS ###################################################################

def io_load_csv(csv_path):
  """Read the CSV at `csv_path` with pandas' default options and return the DataFrame."""
  # NOTE(review): this repeats the io_load_csv already defined in the loading cell.
  loaded = pd.read_csv(csv_path)
  return loaded

def split_train_test(dataframe, test_size=0.2, random_state=None):
  """Split `dataframe` into a train and a test frame.

  Args:
    dataframe: the full dataset.
    test_size: forwarded to `train_test_split`; defaults to the previously
      hard-coded 0.2.
    random_state: forwarded to `train_test_split`. The default `None` keeps
      the previous unseeded behaviour; pass an integer for a reproducible
      split.

  Returns:
    The tuple `(train, test)`. Each frame carries a `name` attribute
    ('train' / 'test') that `io_save_dataframe_to_file` uses to build the
    output file name.
  """
  train, test = train_test_split(dataframe,
                                 test_size=test_size,
                                 random_state=random_state)
  # A plain `train.name = ...` goes through pandas' __setattr__, which assigns
  # to the *column* when the dataset has a column called "name" and would
  # overwrite its data. Setting the instance attribute directly avoids that.
  object.__setattr__(train, 'name', 'train')
  object.__setattr__(test, 'name', 'test')
  return train, test

def io_save_dataframe_to_file(dataframe):
  """Write `dataframe` to "<dataframe.name>.csv" without the index column.

  The frame must carry a string `name` attribute; it is used verbatim as the
  file name stem.
  """
  target_path = dataframe.name + ".csv"
  dataframe.to_csv(target_path, index=False)

def io_save_multiple_dataframes_to_file(dataframe_list):
  """Save every frame in `dataframe_list` through `io_save_dataframe_to_file`.

  Frames are written in iteration order; nothing is returned.
  """
  for frame in dataframe_list:
    io_save_dataframe_to_file(frame)

################################################################################

## MANUAL INTERVENTION #########################################################

# Path of the full dataset to split (value to adjust by hand, per the
# MANUAL INTERVENTION banner above).
csv_path = "./dataset.csv"

################################################################################

## EXECUTION ###################################################################

# Read the whole dataset.
dataframe = io_load_csv(csv_path)

# Split into (train, test) and write each part to "<name>.csv"; the names
# 'train' and 'test' are attached to the frames by split_train_test.
dataframe_list = split_train_test(dataframe)
io_save_multiple_dataframes_to_file(dataframe_list)

################################################################################

# Esecuzione

In [498]:
## MISCELLANEOUS ###############################################################

# Readable floats: NumPy output uses a precision of 3 and fixed-point
# (non-scientific) notation.
np.set_printoptions(precision=3, suppress=True)

################################################################################

In [499]:
## MANUAL INTERVENTION #########################################################

# Locations of the two CSV files to load (same file names that the
# preparation cell writes via io_save_dataframe_to_file).
dataframe_train_path = "train.csv"
dataframe_test_path = "test.csv"

################################################################################

## EXECUTION ###################################################################

# io_load_multiple_csv returns the frames in the order of the paths given,
# so the unpacking below yields (train, test).
dataframe_train, dataframe_test = io_load_multiple_csv([dataframe_train_path, 
                                                        dataframe_test_path])

################################################################################

In [None]:
# Display the first rows of the training frame.
dataframe_train.head()

In [None]:
## MANUAL INTERVENTION #########################################################

nan_list = [
            "", 
            " ", 
            "?",
            "unknown"
] # All the ways a nan element appears inside the dataframe

# Name of the target column. It is passed to print_feature_plots below and
# popped from both frames in the target-preparation cell.
# NOTE(review): left as an empty string here — presumably a placeholder to be
# filled in with a real column name before running; confirm.
feature_target = ""

################################################################################

## EXECUTION ###################################################################

# Clean the dataframe from nan values
# (normalize_nan replaces the placeholders in place and returns the same frame)
dataframe_train = normalize_nan(dataframe_train, nan_list)
dataframe_test = normalize_nan(dataframe_test, nan_list)

# Print dataframe's infographic
print_infos(dataframe_train)

# One figure per column of the training frame, split by the target column
print_feature_plots(dataframe_train, feature_target)

################################################################################

In [503]:
## MANUAL INTERVENTION #########################################################

useless_list = [
                # "salary"  
                # "ScheduledDay"     
] # All the meaningless attributes in relation to the prediction 

################################################################################

## EXECUTION ###################################################################

# Clean the dataframe from meaningless features
# (clean_useless removes the columns in place; with an empty list it is a no-op)
dataframe_train = clean_useless(dataframe_train, useless_list)
dataframe_test = clean_useless(dataframe_test, useless_list)

# Add some useful features
# NOTE(review): the helper referenced below is not defined in the visible
# notebook; the calls are left disabled.
# dataframe_train = add_diffence_scheduled_appointment_day(dataframe_train)
# dataframe_test = add_diffence_scheduled_appointment_day(dataframe_test)

################################################################################


In [504]:
from sklearn.preprocessing import LabelEncoder

## TARGET PREPARATION ##########################################################

# Encoder turning the target labels into integer codes.
le = LabelEncoder()

# Fit the encoder on the training labels, then reuse the same mapping for the
# test labels. `pop` also removes the target column from both frames, so
# running this cell a second time without reloading the data raises KeyError.
dataframe_train_target = le.fit_transform(dataframe_train.pop(feature_target))
dataframe_test_target = le.transform(dataframe_test.pop(feature_target))

################################################################################

In [None]:
## CLASSIFIER ##################################################################

# Create the classifier (unfitted pipeline; the column split into numeric and
# categorical is derived from the training frame)
classifier = create_model(dataframe_train)

# Train the classifier (also prints the training score)
classifier = train_model(classifier, dataframe_train, dataframe_train_target)

# Test the classifier (prints the test score and the classification report)
print_test_model(classifier, dataframe_test, dataframe_test_target)

################################################################################