<a href="https://colab.research.google.com/github/DunkleCat/ia-esame/blob/master/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Caricamento del dataframe

In [129]:
import pandas as pd

def io_load_multiple_csv(csv_path_list):
  """Load several CSV files and return their dataframes in input order.

  Args:
    csv_path_list: Iterable of CSV locations; each one is handed to
      io_load_csv.

  Returns:
    A list holding one loaded dataframe per entry of csv_path_list.
  """
  return [io_load_csv(csv_path) for csv_path in csv_path_list]

def io_load_csv(csv_path):
  """Read one CSV into a pandas dataframe.

  Args:
    csv_path: Location of the CSV, forwarded unchanged to pd.read_csv.

  Returns:
    The dataframe produced by pd.read_csv with its default options.
  """
  loaded = pd.read_csv(csv_path)
  return loaded

# Analisi del Dataset


## Pulizia valori nulli

Come primo passaggio cerchiamo di sfruttare le potenzialità di sklearn per recuperare tutte quelle righe che presentano valori nulli e renderle valide.


In [130]:
#from sklearn.impute import SimpleImputer
import numpy as np

def normalize_nan(dataframe, nan_list):
  """Turn every placeholder for a missing value into np.nan.

  The dataframe is modified in place and the very same object is returned,
  so callers may either keep their reference or rebind the result.

  Args:
    dataframe: pandas dataframe to clean (mutated).
    nan_list: Values that stand for "missing" in the raw data.

  Returns:
    The input dataframe, with each value of nan_list replaced by np.nan.
  """
  # No imputation happens here: a SimpleImputer(strategy='mean') step was
  # sketched for this function but is left disabled.
  for placeholder in nan_list:
    dataframe.replace(to_replace=placeholder, value=np.nan, inplace=True)
  return dataframe

## Pulizia colonne poco significative

Passiamo ora ad analizzare i dati in ingresso e a trovare le colonne che molto probabilmente non ci aiuteranno nella previsione.

In [131]:
def print_infos(dataframe):
  """Print a short textual overview of a dataframe.

  The report lists the row and column counts, the column names, the number
  of unique values per column and finally the output of dataframe.info().

  Args:
    dataframe: pandas dataframe to describe.
  """
  row_count, column_count = dataframe.shape

  print("Dataframe miscellaneous:\n")
  print(f"Rows     : {row_count}")
  print(f"Columns  : {column_count}")
  print(f"\nFeatures :\n{dataframe.columns.tolist()}")
  print(f"\nUnique values :\n{dataframe.nunique()}")

  print("\nDataframe info:")
  # info() writes its own report instead of returning a string.
  dataframe.info()

In [132]:
import matplotlib.pyplot as plt
import seaborn as sb

def print_feature_plots(dataframe, feature_list, feature_target):
  """Draw one count plot per selected feature.

  Columns are visited in dataframe order; only those named in feature_list
  are plotted, and names in feature_list that are not columns are ignored.

  Args:
    dataframe: pandas dataframe holding both the features and the target.
    feature_list: Names of the columns to plot.
    feature_target: Name of the column placed on the x axis of each plot.
  """
  for column in dataframe:
    if column not in feature_list:
      continue
    sb.catplot(x=feature_target,
               col=column,
               data=dataframe,
               kind='count')
    # Show the figure only when one was actually built; previously
    # plt.show() ran for every column, including those that were skipped.
    plt.show()

In [133]:
def clean_useless(dataframe, useless_list):
  """Remove the listed columns from the dataframe, in place.

  Args:
    dataframe: pandas dataframe to prune (mutated).
    useless_list: Names of the columns to delete, processed in order.

  Returns:
    The input dataframe, without the listed columns.
  """
  for column_name in useless_list:
    del dataframe[column_name]
  return dataframe

## Da semplici dati a numeri utili

Per ogni colonna presente nel dataset, ogni suo elemento viene trasformato in un intero corrispondente ad una classe della colonna stessa, se non è già un numero.

In [134]:
from sklearn.preprocessing import LabelEncoder

def attributes_to_float(dataframe):
  """Label-encode every column whose values are strings.

  Each such column is replaced, in place, by the class indices produced by
  a dedicated LabelEncoder; columns without string values are left alone.

  Args:
    dataframe: pandas dataframe to encode (mutated).

  Returns:
    A (dataframe, le_dict) tuple: the input dataframe and a dict mapping
    each encoded column name to the LabelEncoder fitted on it.
  """
  le_dict = {}
  for elem in dataframe:
    column = dataframe[elem]
    # Decide from all the non-missing values instead of peeking at index
    # label 1: that lookup raised KeyError when label 1 was absent (single
    # row, custom index) and misjudged columns whose row 1 was NaN.
    present = column.dropna()
    if len(present) == 0:
      continue
    if all(isinstance(value, str) for value in present):
      encoder = LabelEncoder()
      dataframe[elem] = encoder.fit_transform(column)
      le_dict[elem] = encoder

  return dataframe, le_dict

# Modello

In [135]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import Ridge 
from sklearn.ensemble import RandomForestClassifier

# TODO ordine alfabetico

def create_model(dataframe):
  """Assemble the unfitted preprocessing + classification pipeline.

  Columns are split by get_numeric_features / get_categorical_features:
  numeric ones get median imputation followed by standard scaling,
  categorical ones get most-frequent imputation followed by one-hot
  encoding that ignores unknown categories. The combined output goes
  through a TruncatedSVD and then into a RandomForestClassifier.

  Args:
    dataframe: Feature dataframe used only to decide which columns are
      numeric and which are categorical.

  Returns:
    A Pipeline with the steps 'preprocessor', 'reduce_dim', 'classifier'.
  """
  numeric_columns = get_numeric_features(dataframe)
  categorical_columns = get_categorical_features(dataframe)

  numeric_steps = [
      ('imputer', SimpleImputer(strategy='median')),
      ('scaler', StandardScaler()),
  ]
  # Alternatives that were tried and left disabled: a constant 'missing'
  # fill for the imputer, and OneHotEncoder(drop='first',
  # handle_unknown='error') for the encoder.
  categorical_steps = [
      ('imputer', SimpleImputer(strategy='most_frequent')),
      ('onehot', OneHotEncoder(handle_unknown='ignore')),
  ]

  column_transformers = [
      ('num', Pipeline(steps=numeric_steps), numeric_columns),
      ('cat', Pipeline(steps=categorical_steps), categorical_columns),
  ]

  pipeline_steps = [
      ('preprocessor', ColumnTransformer(transformers=column_transformers)),
      ('reduce_dim', TruncatedSVD()),
      ('classifier', RandomForestClassifier()),
  ]
  return Pipeline(pipeline_steps)

def get_numeric_features(dataframe):
  """Return the names of the columns that is_numeric accepts.

  Args:
    dataframe: pandas dataframe whose columns are classified.

  Returns:
    A list of column names, in dataframe order.
  """
  return [name for name in dataframe if is_numeric(dataframe[name])]

def get_categorical_features(dataframe):
  """Return the names of the columns that is_categorical accepts.

  Args:
    dataframe: pandas dataframe whose columns are classified.

  Returns:
    A list of column names, in dataframe order.
  """
  return [name for name in dataframe if is_categorical(dataframe[name])]

def is_numeric(elem):
  """Tell whether a column (or dtype) holds numeric data.

  The previous check, `elem is float`, compared identity with the float
  type itself, so it was False for every dataframe column and all columns
  ended up being treated as categorical.

  Args:
    elem: A pandas Series (as passed by get_numeric_features) or a dtype.

  Returns:
    True for integer/float data, False otherwise. Booleans are reported as
    non-numeric so they are not routed to the median/scaling branch.
  """
  dtype_checks = pd.api.types
  return (dtype_checks.is_numeric_dtype(elem)
          and not dtype_checks.is_bool_dtype(elem))

def is_categorical(elem):
  """Tell whether elem is categorical, i.e. not accepted by is_numeric.

  Args:
    elem: Value forwarded unchanged to is_numeric.

  Returns:
    The boolean negation of is_numeric(elem).
  """
  numeric = is_numeric(elem)
  return not numeric

In [136]:
from sklearn import set_config

def print_model(model):
  """Print the textual representation of the model.

  A bare `model` expression inside a function displays nothing (only the
  last expression of a notebook cell is echoed), so the model is printed
  explicitly.

  Args:
    model: Object to show; its printed form is whatever print() emits.
  """
  # Left disabled from the original: switch to sklearn's diagram display.
  #set_config(display='diagram')
  print(model)

In [137]:
from sklearn.model_selection import GridSearchCV

def train_model(model, dataframe_train, dataframe_train_target):
  """Fit the model on the training data.

  Args:
    model: Object exposing fit(X, y).
    dataframe_train: Training features, passed as X.
    dataframe_train_target: Training targets, passed as y.

  Returns:
    Whatever model.fit returns.
  """
  # NOTE: a GridSearchCV over 'reduce_dim__n_components' (1..10) and
  # 'regressor__alpha' (2.0**-6 .. 2.0**5) was sketched here but is left
  # disabled; only a plain fit is performed.
  fitted = model.fit(dataframe_train, dataframe_train_target)
  return fitted

In [138]:
def print_test_model(model, dataframe_test, dataframe_test_target):
  """Print the score the model obtains on the test data.

  Args:
    model: Object exposing score(X, y).
    dataframe_test: Test features, passed as X.
    dataframe_test_target: Test targets, passed as y.
  """
  test_score = model.score(dataframe_test, dataframe_test_target)
  print(f'Test Score: {test_score}')
  # Left disabled: reporting model.best_params_ (only meaningful for a
  # search object such as GridSearchCV).

# Esecuzione

In [139]:
## MISCELLANEOUS ###############################################################

# Make floating-point numbers easier to read when numpy prints them:
# 3 digits of precision, small values not shown in scientific notation.
np.set_printoptions(precision=3, suppress=True)

################################################################################

In [140]:
## MANUAL INTERVENTION #########################################################

# Locations of the training and test CSV files (relative to the working
# directory). The commented pair below is the alternative location on a
# mounted Google Drive.
dataframe_train_path = "./dataframe_train.csv"
dataframe_test_path = "./dataframe_test.csv"
#dataframe_train_path = "/content/drive/My Drive/datasets/dataframe_train.csv"
#dataframe_test_path = "/content/drive/My Drive/datasets/dataframe_test.csv"

################################################################################

## EXECUTION ###################################################################


# Single-file loading, kept for reference:
#dataframe_train = io_load_csv(dataframe_train_path)
#dataframe_test = io_load_csv(dataframe_test_path)

# Load both files in one call; the returned list follows the order of the
# paths, so it unpacks as (train, test).
dataframe_train, dataframe_test = io_load_multiple_csv([dataframe_train_path, 
                                                        dataframe_test_path])

################################################################################

In [141]:
# Quick look at the training data: per-column non-null counts and dtypes,
# followed by the first rows.
dataframe_train.info()
dataframe_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class_edible              6499 non-null   object
 1   cap-shape                 6499 non-null   object
 2   cap-surface               6499 non-null   object
 3   cap-color                 6499 non-null   object
 4   bruises                   6499 non-null   object
 5   odor                      6499 non-null   object
 6   gill-attachment           6499 non-null   object
 7   gill-spacing              6499 non-null   object
 8   gill-size                 6499 non-null   object
 9   gill-color                6499 non-null   object
 10  stalk-shape               6499 non-null   object
 11  stalk-root                6499 non-null   object
 12  stalk-surface-above-ring  6499 non-null   object
 13  stalk-surface-below-ring  6499 non-null   object
 14  stalk-color-above-ring  

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,p,f,c,f,c,n,p,e,b,s,s,w,w,p,w,o,p,k,v,d
1,e,x,y,y,t,a,f,c,b,w,e,c,s,s,w,w,p,w,o,p,n,s,m
2,p,f,y,e,f,y,f,c,n,b,t,?,k,k,w,p,p,w,o,e,w,v,p
3,p,f,s,n,f,y,f,c,n,b,t,?,k,k,w,p,p,w,o,e,w,v,d
4,e,b,f,w,f,n,f,w,b,g,e,?,s,k,w,w,p,w,t,p,w,s,g


In [142]:
## MANUAL INTERVENTION #########################################################

nan_list = [
            "", 
            " ", 
            "?",
            "unknown"
] # All the ways a nan element appears inside the dataframe

# Left empty here, so print_feature_plots below draws nothing.
feature_list = [

] # All the useful features to plot 

# Column to put on the x axis of the plots; unused while feature_list is empty.
feature_target = ""

################################################################################

## EXECUTION ###################################################################

# TODO: investigate using map, which returns values, _or_ a substitution
#       that acts automatically (in place) without needing a = a.sub(b)

# Clean the dataframe from nan values
dataframe_train = normalize_nan(dataframe_train, nan_list)
dataframe_test = normalize_nan(dataframe_test, nan_list)

# Print dataframe's infographic
print_infos(dataframe_train)
print_feature_plots(dataframe_train, feature_list, feature_target)

################################################################################

Dataframe miscellaneous:

Rows     : 6499
Columns  : 23

Features :
['class_edible', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

Unique values :
class_edible                 2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   4
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color 

In [143]:
## MANUAL INTERVENTION #########################################################

# Left empty here, so no column is removed below.
# NOTE(review): the "Unique values" report lists veil-type with a single
# value; it looks like a candidate for this list - confirm before adding.
useless_list = [
                
] # All the meaningless attributes in relation to the prediction 

################################################################################

## EXECUTION ###################################################################

# Clean the dataframe from meaningless attributes
dataframe_train = clean_useless(dataframe_train, useless_list)
dataframe_test = clean_useless(dataframe_test, useless_list)

# Transform every column in a float value (disabled: encoding is done by the
# model pipeline instead)
# dataframe_train, label_encoder_dict = attributes_to_float(dataframe_train)

################################################################################

In [144]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder


## MANUAL INTERVENTION #########################################################

# Name of the column to predict; it is removed from both dataframes below.
target_label = "class_edible"

################################################################################

## EXECUTION ###################################################################

# A LabelBinarizer is a possible alternative to the LabelEncoder used here:
# lb = LabelBinarizer()
le = LabelEncoder()

# dataframe_train_target = lb.fit_transform(dataframe_train.pop(target_label))
# dataframe_test_target = lb.transform(dataframe_test.pop(target_label))

# Fit the encoder on the training labels only and reuse that mapping for the
# test labels. Re-fitting on the test set (fit_transform) could silently
# assign different integers to the classes whenever the two sets do not
# contain exactly the same labels, making the test score meaningless.
dataframe_train_target = le.fit_transform(dataframe_train.pop(target_label))
dataframe_test_target = le.transform(dataframe_test.pop(target_label))

# Create the model
model = create_model(dataframe_train)
print_model(model)

# Train the model
model = train_model(model, dataframe_train, dataframe_train_target)

# Test the model
print_test_model(model, dataframe_test, dataframe_test_target)

################################################################################

Test Score: 0.9661538461538461
