<a href="https://colab.research.google.com/github/AndreaBertoglio/MLDM/blob/master/Pre-processing/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import

In [None]:
import pip
import sys
#if not 'sklearn' in sys.modules.keys():
#    pip.main(['install', 'sklearn'])
#if not 'kaggle' in sys.modules.keys():
#    pip.main(['install', 'kaggle'])
import random
import shelve

print("Random number with seed 2020")
# Seed Python's built-in `random` module so that code relying on it is repeatable.
# NOTE(review): NumPy and scikit-learn keep their own random state - confirm whether they should be seeded too.
random.seed(2020)

Random number with seed 2020


In [None]:
import numpy as np
import pandas as pd
import graphviz

from pylab import *
from numpy import *

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

### Funzioni di caricamento e salvataggio del training set

In [None]:
def loadTrainingSet(changeQuality=True, filePath='https://raw.githubusercontent.com/serivan/mldmlab/master/Datasets/Kaggle2020/train.csv'):
  """Load the training set from a CSV source.

  Parameters:
    changeQuality: when True, the "Quality" column is overwritten with 1 where
      ``str.contains("Good")`` is truthy and 0 elsewhere.
    filePath: location of the CSV file, handed to ``pd.read_csv``
      (defaults to the course file hosted on GitHub).

  Returns:
    pandas DataFrame containing the training set.
  """
  dataset = pd.read_csv(filePath)
  if not changeQuality:
    return dataset

  # Encode the textual quality label as a 0/1 flag.
  is_good = dataset["Quality"].str.contains("Good")
  dataset["Quality"] = np.where(is_good, 1, 0)
  return dataset

In [None]:
def exportTrainingSet(training_set, fileName):
  """Export a re-processed training set to a CSV file.

  Parameters:
    training_set: pandas DataFrame to export.
    fileName: name of the destination file, handed to ``DataFrame.to_csv``.

  The DataFrame index is not written to the file.
  """
  training_set.to_csv(path_or_buf=fileName, index=False)

# Funzioni di Pre-processing

### Unità di misura

In [None]:
def scaleMeasureUnit(training_set, scale_chlorides=False):
  """Rewrite part of the data, in place, so measurement units are uniform.

  Per the original notes: volatile acidity is rescaled from milligrams to
  grams, density from kg/m3 to g/cm3 and, if requested, chlorides from mg to g.
  A value is assumed to be in the larger unit, and is divided by 1000, when it
  exceeds a per-column threshold (1 for 'volatile.acidity' and 'chlorides',
  1.5 for 'density'). Missing values are left untouched.

  Parameters:
    training_set: pandas DataFrame to process; it is modified in place.
    scale_chlorides: bool, whether the 'chlorides' column is rescaled too.

  Returns:
    None.
  """
  # (column, threshold above which the value is divided by 1000)
  rescale_rules = [('volatile.acidity', 1)]
  if scale_chlorides:
    rescale_rules.append(('chlorides', 1))
  rescale_rules.append(('density', 1.5))

  for column, limit in rescale_rules:
    values = training_set[column]
    # Assign the whole column back instead of using chained indexing
    # (training_set[column][i] = ...): chained assignment raises
    # SettingWithCopyWarning, is not guaranteed to write through, and the
    # positional lookup only works with a default 0..n-1 index.
    training_set[column] = values.mask(values > limit, values / 1000)

### Missing Values

In [None]:
def handleMissingValue(training_set, imputer='simple'):
  """Replace the missing values (NaN) of a data set using an imputer.

  Parameters:
    training_set: pandas DataFrame to process; it is not modified.
    imputer: 'simple' (SimpleImputer with strategy 'mean', the default) or
      'iterative' (IterativeImputer with initial_strategy 'mean'). Any
      non-string object exposing ``fit`` and ``transform`` is used as is.

  Returns:
    A new pandas DataFrame with the missing values replaced. It has the same
    columns, in the same order, as ``training_set`` and a fresh 0..n-1 index.

  Raises:
    ValueError: if ``imputer`` is a string other than 'simple' or 'iterative'.
  """
  if isinstance(imputer, str):
    if imputer == 'simple':
      estimator = SimpleImputer(missing_values=np.nan, strategy='mean')
    elif imputer == 'iterative':
      estimator = IterativeImputer(missing_values=np.nan, initial_strategy='mean')
    else:
      raise ValueError("imputer must be 'simple' or 'iterative', got " + repr(imputer))
  else:
    # Keep accepting a ready-made imputer object.
    estimator = imputer

  filled = estimator.fit(training_set).transform(training_set)

  # Take the column names from the input instead of a hard-coded list, so the
  # result is labelled correctly whatever columns the frame carries.
  # NOTE(review): an imputer that drops columns (e.g. entirely-NaN ones) makes
  # the shapes disagree and this constructor raise - confirm that is acceptable.
  return pd.DataFrame(np.asarray(filled), columns=training_set.columns)

### Scalatura

In [None]:
def scaler(training_set, columns):
  """Standardise the given columns of a data set, in place.

  Every value d of a listed column is replaced with (d - u) / v, where u is
  the column mean and v its standard deviation (``Series.std``, i.e. the
  sample standard deviation).

  Parameters:
    training_set: pandas DataFrame to process; it is modified in place.
    columns: list of strings, names of the columns to standardise.

  Returns:
    None.
  """
  for feature in columns:
    values = training_set[feature]
    center = values.mean()
    spread = values.std()
    # Whole-column assignment replaces the per-row chained indexing
    # (training_set[feature][i] = ...), which raises SettingWithCopyWarning
    # and only works with a default 0..n-1 index.
    training_set[feature] = (values - center) / spread

### Gestione valori fuori scala

In [None]:
def featureOutlierSubstitution(training_set, columns, threeshold=1, sub=np.nan, verbose=False):
  """Find and replace, in place, the per-feature outliers of a data set.

  Every value gets a score = |value - u| / v, where u is the mean and v the
  standard deviation (``Series.std``) of its feature, both computed before
  any replacement in that feature. Values whose score is greater than or
  equal to ``threeshold`` are replaced with ``sub``.

  Parameters:
    training_set: pandas DataFrame to process; it is modified in place.
    columns: list of strings, names of the features to analyse.
    threeshold: number, outlier threshold expressed in standard deviations
      from the mean (default 1).
    sub: value written in place of each outlier (default NaN).
    verbose: bool, whether to print a description of the run (default False).

  Returns:
    None.
  """
  total_replaced = 0

  if verbose:
    print('start')

  for feature in columns:
    values = training_set[feature]
    scores = (values - values.mean()).abs() / values.std()
    # Comparisons with NaN are False, so missing values are never flagged.
    is_outlier = scores >= threeshold

    if verbose:
      for label, score in scores[is_outlier].items():
        print('elem ' + str(label) + ' in ' + feature + ' outlier with score: ' + str(score))

    replaced = int(is_outlier.sum())
    total_replaced += replaced
    if replaced:
      # Whole-column assignment instead of chained indexing
      # (training_set[feature][i] = sub), which raises SettingWithCopyWarning
      # and only works with a default 0..n-1 index.
      training_set[feature] = values.mask(is_outlier, sub)

    if verbose:
      print(feature + ': ' + str(replaced))

  if verbose:
    print('end, numero elementi: ' + str(total_replaced))

### Matrici delle distanze

In [None]:
def quadraticDistance(training, columns, targetClass, targetValue):
  """Compute the pairwise squared Euclidean distances within one class.

  For every pair of distinct rows whose ``targetClass`` value equals
  ``targetValue``, the entry is the sum over ``columns`` of the squared
  differences (no square root is taken). Every other entry, including the
  diagonal, is -1.

  Parameters:
    training: pandas DataFrame to process.
    columns: list of strings, names of the features used for the distance.
    targetClass: string, name of the feature used to select the rows.
    targetValue: value ``targetClass`` must have for a row to be selected.

  Returns:
    numpy array of shape (rows, rows) with the distances; rows and columns
    follow the row order of ``training``.
  """
  n_rows = training.shape[0]
  distances = np.full((n_rows, n_rows), -1.0)

  # Work on row positions rather than index labels, so the function also
  # handles frames whose index is not 0..n-1 (e.g. after rows were dropped).
  positions = np.flatnonzero(training[targetClass].to_numpy() == targetValue)
  if positions.size == 0:
    return distances

  # Accumulate one feature at a time: this keeps the summation order of the
  # original element-by-element loop and bounds the extra memory to one
  # (selected x selected) array per step.
  block = np.zeros((positions.size, positions.size))
  for feature in columns:
    values = training[feature].to_numpy(dtype=float)[positions]
    diff = values[:, None] - values[None, :]
    block += diff ** 2

  # A point is not compared with itself.
  np.fill_diagonal(block, -1.0)
  distances[np.ix_(positions, positions)] = block
  return distances

In [None]:
def computeDistanceMatrixes(training_set):
  """Compute the distance matrices for Quality = good (1) and Quality = bad (0).

  The eleven feature columns are first passed to ``scaler`` on
  ``training_set`` itself, then one ``quadraticDistance`` matrix is built for
  each Quality value.

  Parameters:
    training_set: pandas DataFrame to process.

  Returns:
    A two-element list: index 0 = matrix for good, index 1 = matrix for bad.
  """
  feature_names = [
      'fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar',
      'chlorides', 'free.sulfur.dioxide', 'total.sulfur.dioxide', 'density',
      'pH', 'sulphates', 'alcohol',
  ]

  scaler(training_set, feature_names)

  # Same evaluation order as before: the "bad" matrix first, then the "good" one.
  matrices = {}
  for label, quality_value in (('bad', 0), ('good', 1)):
    matrices[label] = quadraticDistance(training=training_set, columns=feature_names,
                                        targetClass='Quality', targetValue=quality_value)

  return [matrices['good'], matrices['bad']]

In [None]:
def saveMatrixes(matrix_good, matrix_bad, fileName='matrix'):
  """Save the two distance matrices into a shelve file.

  One matrix is stored under the key 'matrix_good', the other under
  'matrix_bad'. The shelf is opened with the name ``fileName + '.dat'``.

  Parameters:
    matrix_good: matrix to store as 'matrix_good'.
    matrix_bad: matrix to store as 'matrix_bad'.
    fileName: string, name of the file without the '.dat' extension
      (default 'matrix').

  Returns:
    None.
  """
  name = fileName + '.dat'

  # The context manager closes the shelf even when storing a matrix fails,
  # so the underlying database file is never left open.
  with shelve.open(name) as store:
    store['matrix_good'] = matrix_good
    store['matrix_bad'] = matrix_bad

In [None]:
def loadMatrixes(fileName):
  """Load two distance matrices from a shelve file.

  The shelf is opened with the name ``fileName + '.dat'`` and must contain
  the keys 'matrix_good' and 'matrix_bad'.

  Parameters:
    fileName: string, name of the file without the '.dat' extension.

  Returns:
    A two-element list: index 0 = Good matrix, index 1 = Bad matrix.

  Raises:
    KeyError: if one of the two keys is missing from the shelf.
  """
  name = fileName + '.dat'

  # SECURITY: shelve unpickles the stored objects; only open files that come
  # from a trusted source.
  # The context manager closes the shelf even when a key is missing.
  with shelve.open(name) as store:
    matrix_good = store['matrix_good']
    matrix_bad = store['matrix_bad']

  return [matrix_good, matrix_bad]

### Processing degli outliers

In [None]:
def getOutliers(distanceMatrix, threeshold, verbose=False):
  """Find the outlier elements given a distance matrix.

  For every row the mean of its non-negative entries is computed (negative
  entries are ignored); a row without any non-negative entry gets the value
  -1. Rows whose value is greater than or equal to ``threeshold`` are
  reported as outliers.

  Parameters:
    distanceMatrix: matrix holding the distance of every pair of elements.
    threeshold: number, threshold used to identify the outliers.
    verbose: bool, whether to print a description of the run.

  Returns:
    List of the outlier row positions (integers).
  """
  flagged = []

  for position in range(distanceMatrix.shape[0]):
    total = 0
    n_valid = 0
    for distance in distanceMatrix[position]:
      if not distance >= 0:
        continue
      n_valid += 1
      total += distance

    # No usable distance at all: use -1 as the row value.
    mean_distance = total / n_valid if n_valid else -1

    if mean_distance < threeshold:
      continue
    if verbose:
      print('Elemento ' + str(position) + ' è un outlier, con valore: ' + str(mean_distance))
    flagged.append(position)

  if verbose:
    print('Outliers trovati con soglia '+ str(threeshold)+ ': ' + str(len(flagged)))

  return flagged

In [None]:
def dropOutliers(outliersList, training_set):
  """Remove the rows regarded as outliers from a data set.

  Parameters:
    outliersList: list of integers, index labels of the outliers to remove.
    training_set: pandas DataFrame to process; it is not modified.

  Returns:
    pandas DataFrame without the outlier rows.
  """
  return training_set.drop(index=outliersList)

In [None]:
def outliersProcessing(training_set, distance_matrix_good, distance_matrix_bad, threeshold, verbose=False):
  """Remove the global outliers of a data set, starting from the distance matrices.

  Parameters:
    training_set: pandas DataFrame to process.
    distance_matrix_good: matrix with the distances of the elements having
      Quality=Good (i.e. 1).
    distance_matrix_bad: matrix with the distances of the elements having
      Quality=Bad (i.e. 0).
    threeshold: threshold that defines the outliers.
    verbose: bool, whether to print a description of the run.

  Returns:
    pandas DataFrame without the rows reported by ``getOutliers`` for either
    matrix.
  """
  # Both searches run before anything is dropped, "bad" matrix first.
  outlier_lists = [
      getOutliers(distanceMatrix=matrix, threeshold=threeshold, verbose=verbose)
      for matrix in (distance_matrix_bad, distance_matrix_good)
  ]

  remaining = training_set
  for outliers in outlier_lists:
    remaining = dropOutliers(outliersList=outliers, training_set=remaining)

  return remaining

# Computazione
Contiene esempi di utilizzo delle funzioni riportate sopra

### Load train set

---



In [None]:
# Load the data (default source, Quality encoded as 0/1)
train = loadTrainingSet(changeQuality=True)

# Rescale values so that the measurement units are uniform
scaleMeasureUnit(training_set=train, scale_chlorides=False)
# Replace the missing values with the 'simple' strategy
train = handleMissingValue(training_set=train, imputer='simple')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Load the training set that already carries the 'Cluster' column and preview it.
# NOTE(review): 'TrainingWithCluster.csv' is a local file that no cell shown here creates - confirm where it comes from.
train = loadTrainingSet(filePath='TrainingWithCluster.csv', changeQuality=False)
train

Unnamed: 0,Id,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,Quality,Cluster
0,3940,6.4,0.39,0.21,1.2,41.00,35.0,136.0,0.99225,3.15,0.46,10.2,0,2
1,1655,7.5,305.00,0.40,18.9,59.00,44.0,170.0,1.00000,2.99,0.46,9.0,0,5
2,1867,6.3,0.28,0.30,3.1,39.00,24.0,115.0,0.99420,3.05,0.43,8.6,0,2
3,4476,7.4,0.18,0.30,10.4,45.00,44.0,174.0,0.99660,3.11,0.57,9.7,1,4
4,453,6.9,0.20,0.36,1.5,31.00,38.0,147.0,0.99310,3.35,0.56,11.0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,2768,6.9,0.14,0.38,1.0,41.00,22.0,81.0,0.99043,3.03,0.54,11.4,1,0
3485,4347,9.0,0.20,0.33,3.5,49.00,10.0,40.0,0.99440,3.14,0.36,9.8,1,0
3486,1870,7.6,0.29,0.26,6.5,42.00,32.0,160.0,0.99440,3.14,0.47,10.7,0,4
3487,613,7.5,0.17,0.32,1.7,0.04,51.0,148.0,0.99160,3.21,0.44,11.5,1,4


In [None]:
# Load the data
train = loadTrainingSet(filePath='TrainingWithCluster.csv', changeQuality=False)

# Rescale values so that the measurement units are uniform
scaleMeasureUnit(training_set=train, scale_chlorides=False)
# Replace the missing values with the 'simple' strategy
train = handleMissingValue(training_set=train, imputer='simple')
train

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Id,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,Quality,Cluster
0,3940.0,6.4,0.390,0.21,1.2,41.00,35.0,136.0,0.99225,3.15,0.46,10.2,0.0,2.0
1,1655.0,7.5,0.305,0.40,18.9,59.00,44.0,170.0,1.00000,2.99,0.46,9.0,0.0,5.0
2,1867.0,6.3,0.280,0.30,3.1,39.00,24.0,115.0,0.99420,3.05,0.43,8.6,0.0,2.0
3,4476.0,7.4,0.180,0.30,10.4,45.00,44.0,174.0,0.99660,3.11,0.57,9.7,1.0,4.0
4,453.0,6.9,0.200,0.36,1.5,31.00,38.0,147.0,0.99310,3.35,0.56,11.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,2768.0,6.9,0.140,0.38,1.0,41.00,22.0,81.0,0.99043,3.03,0.54,11.4,1.0,0.0
3485,4347.0,9.0,0.200,0.33,3.5,49.00,10.0,40.0,0.99440,3.14,0.36,9.8,1.0,0.0
3486,1870.0,7.6,0.290,0.26,6.5,42.00,32.0,160.0,0.99440,3.14,0.47,10.7,0.0,4.0
3487,613.0,7.5,0.170,0.32,1.7,0.04,51.0,148.0,0.99160,3.21,0.44,11.5,1.0,4.0


### feature outliers sub

In [None]:
# Columns in which outliers are searched
cols = ['fixed.acidity','volatile.acidity','citric.acid','residual.sugar','chlorides','free.sulfur.dioxide','total.sulfur.dioxide','density','pH','sulphates','alcohol']
# Replace the per-feature outliers (score >= 3 standard deviations) with NaN, in place
featureOutlierSubstitution(training_set=train, columns=cols, threeshold=3, sub=np.nan, verbose=False)
train

Unnamed: 0,Id,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,Quality,Cluster
0,3940.0,6.4,0.390,0.21,1.2,41.00,35.0,136.0,0.99225,3.15,0.46,10.2,0.0,2.0
1,1655.0,7.5,0.305,0.40,18.9,59.00,44.0,170.0,1.00000,2.99,0.46,9.0,0.0,5.0
2,1867.0,6.3,0.280,0.30,3.1,39.00,24.0,115.0,0.99420,3.05,0.43,8.6,0.0,2.0
3,4476.0,7.4,0.180,0.30,10.4,45.00,44.0,174.0,0.99660,3.11,0.57,9.7,1.0,4.0
4,453.0,6.9,0.200,0.36,1.5,31.00,38.0,147.0,0.99310,3.35,0.56,11.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,2768.0,6.9,0.140,0.38,1.0,41.00,22.0,81.0,0.99043,3.03,0.54,11.4,1.0,0.0
3485,4347.0,9.0,0.200,0.33,3.5,49.00,10.0,40.0,0.99440,3.14,0.36,9.8,1.0,0.0
3486,1870.0,7.6,0.290,0.26,6.5,42.00,32.0,160.0,0.99440,3.14,0.47,10.7,0.0,4.0
3487,613.0,7.5,0.170,0.32,1.7,0.04,51.0,148.0,0.99160,3.21,0.44,11.5,1.0,4.0


In [None]:
# Fill the missing values (the former outliers) again, this time with the 'iterative' strategy
train = handleMissingValue(training_set=train, imputer='iterative')

train

Unnamed: 0,Id,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,Quality,Cluster
0,3940.0,6.4,0.390,0.21,1.2,41.00,35.0,136.0,0.99225,3.15,0.46,10.2,0.0,2.0
1,1655.0,7.5,0.305,0.40,18.9,59.00,44.0,170.0,1.00000,2.99,0.46,9.0,0.0,5.0
2,1867.0,6.3,0.280,0.30,3.1,39.00,24.0,115.0,0.99420,3.05,0.43,8.6,0.0,2.0
3,4476.0,7.4,0.180,0.30,10.4,45.00,44.0,174.0,0.99660,3.11,0.57,9.7,1.0,4.0
4,453.0,6.9,0.200,0.36,1.5,31.00,38.0,147.0,0.99310,3.35,0.56,11.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,2768.0,6.9,0.140,0.38,1.0,41.00,22.0,81.0,0.99043,3.03,0.54,11.4,1.0,0.0
3485,4347.0,9.0,0.200,0.33,3.5,49.00,10.0,40.0,0.99440,3.14,0.36,9.8,1.0,0.0
3486,1870.0,7.6,0.290,0.26,6.5,42.00,32.0,160.0,0.99440,3.14,0.47,10.7,0.0,4.0
3487,613.0,7.5,0.170,0.32,1.7,0.04,51.0,148.0,0.99160,3.21,0.44,11.5,1.0,4.0


### export

In [None]:
# Export the training set after per-feature outlier handling to a CSV file.
exportTrainingSet(training_set=train,fileName='TrainingOutlier_Feature3_globali40_cluster.csv')

### Scaling

In [None]:
cols = ['fixed.acidity','volatile.acidity','citric.acid','residual.sugar','chlorides','free.sulfur.dioxide','total.sulfur.dioxide','density','pH','sulphates','alcohol','Cluster']

# Standardise the data so that every feature has the same range of values
# and the resulting distances are comparable
scaler(training_set=train, columns=cols)
train

Unnamed: 0,Id,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,Quality,Cluster
0,3940.0,-0.556359,1.374662,-1.131122,-1.054165,0.143413,0.001752,-0.068256,-0.628092,-0.237056,-0.241880,-0.239950,0.0,-0.449490
1,1655.0,0.809226,0.377659,0.698818,2.495080,1.159882,0.580515,0.748305,2.054904,-1.330853,-0.241880,-1.216740,0.0,0.609914
2,1867.0,-0.680503,0.084423,-0.264308,-0.673173,0.030472,-0.705624,-0.572603,0.046985,-0.920679,-0.528179,-1.542337,0.0,-0.449490
3,4476.0,0.685082,-1.088521,-0.264308,0.790640,0.369295,0.580515,0.844371,0.877848,-0.510505,0.807883,-0.646946,1.0,0.256779
4,453.0,0.064361,-0.853932,0.313568,-0.994009,-0.421292,0.194673,0.195925,-0.333828,1.130190,0.712450,0.411244,1.0,0.256779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,2768.0,0.064361,-1.557699,0.506193,-1.094270,0.143413,-0.834238,-1.389165,-1.258163,-1.057403,0.521584,0.736841,1.0,-1.155759
3485,4347.0,2.671387,-0.853932,0.024630,-0.592964,0.595177,-1.605922,-2.373842,0.116223,-0.305418,-1.196210,-0.565546,1.0,-1.155759
3486,1870.0,0.933370,0.201718,-0.649559,0.008603,0.199884,-0.191169,0.508140,0.116223,-0.305418,-0.146447,0.167046,0.0,0.256779
3487,613.0,0.809226,-1.205815,-0.071683,-0.953904,-2.169618,1.030663,0.219942,-0.853117,0.173118,-0.432746,0.818240,1.0,0.256779


### distance Matrix

In [None]:
# Build the good/bad distance matrices.
# NOTE(review): computeDistanceMatrixes calls scaler on `train` again, although the previous cell already scaled it - confirm this is intended.
matrixes = computeDistanceMatrixes(training_set=train)

In [None]:
# Save the two distance matrices
matrixGood=matrixes[0] # matrix for Quality good
matrixBad=matrixes[1] # matrix for Quality bad

# Saved locally; per the original note the files are deleted when the notebook is closed
saveMatrixes(matrix_good=matrixGood, matrix_bad=matrixBad, fileName='distanceMatrixDopoF3_cluster')

In [None]:
# Load the distance matrices
# NOTE(review): this reads 'distanceMatrixDopoF4,5', not the 'distanceMatrixDopoF3_cluster'
# file written by the save cell - confirm which file is intended.
matrixes = loadMatrixes(fileName='distanceMatrixDopoF4,5')
distanceMatrixGood = matrixes[0]
distanceMatrixBad = matrixes[1]

### delete outlier

In [None]:
# Remove the global outliers (rows whose mean distance is >= 40)
train = outliersProcessing(training_set=train, distance_matrix_good=distanceMatrixGood, distance_matrix_bad=distanceMatrixBad, threeshold=40, verbose=True)

train

Elemento 22 è un outlier, con valore: 90.31381227692957
Elemento 52 è un outlier, con valore: 81.66592352310829
Elemento 112 è un outlier, con valore: 133.34305257421005
Elemento 121 è un outlier, con valore: 66.65591977875766
Elemento 164 è un outlier, con valore: 47.91627894996523
Elemento 165 è un outlier, con valore: 62.813999499073404
Elemento 275 è un outlier, con valore: 42.73344663152081
Elemento 374 è un outlier, con valore: 59.08204957692033
Elemento 375 è un outlier, con valore: 62.41023737353894
Elemento 513 è un outlier, con valore: 45.32637263323703
Elemento 632 è un outlier, con valore: 54.934229442051034
Elemento 728 è un outlier, con valore: 57.650829418779544
Elemento 879 è un outlier, con valore: 43.22974407175502
Elemento 907 è un outlier, con valore: 41.6180475298674
Elemento 1056 è un outlier, con valore: 51.45999397612314
Elemento 1066 è un outlier, con valore: 46.73531784351397
Elemento 1099 è un outlier, con valore: 164.24015352628578
Elemento 1142 è un outlier

Unnamed: 0,Id,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,Quality,Cluster
0,3940.0,-0.556359,1.374662,-1.131122,-1.054165,0.143413,0.001752,-0.068256,-0.628092,-0.237056,-0.241880,-0.239950,0.0,-0.449490
1,1655.0,0.809226,0.377659,0.698818,2.495080,1.159882,0.580515,0.748305,2.054904,-1.330853,-0.241880,-1.216740,0.0,0.609914
2,1867.0,-0.680503,0.084423,-0.264308,-0.673173,0.030472,-0.705624,-0.572603,0.046985,-0.920679,-0.528179,-1.542337,0.0,-0.449490
3,4476.0,0.685082,-1.088521,-0.264308,0.790640,0.369295,0.580515,0.844371,0.877848,-0.510505,0.807883,-0.646946,1.0,0.256779
4,453.0,0.064361,-0.853932,0.313568,-0.994009,-0.421292,0.194673,0.195925,-0.333828,1.130190,0.712450,0.411244,1.0,0.256779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,2768.0,0.064361,-1.557699,0.506193,-1.094270,0.143413,-0.834238,-1.389165,-1.258163,-1.057403,0.521584,0.736841,1.0,-1.155759
3485,4347.0,2.671387,-0.853932,0.024630,-0.592964,0.595177,-1.605922,-2.373842,0.116223,-0.305418,-1.196210,-0.565546,1.0,-1.155759
3486,1870.0,0.933370,0.201718,-0.649559,0.008603,0.199884,-0.191169,0.508140,0.116223,-0.305418,-0.146447,0.167046,0.0,0.256779
3487,613.0,0.809226,-1.205815,-0.071683,-0.953904,-2.169618,1.030663,0.219942,-0.853117,0.173118,-0.432746,0.818240,1.0,0.256779


## Test set

In [None]:
# Load the data
test_set = loadTrainingSet(changeQuality=False, filePath='TestWithCluster.csv')

# Rescale values so that the measurement units are uniform
scaleMeasureUnit(training_set=test_set, scale_chlorides=False)

cols = ['fixed.acidity','volatile.acidity','citric.acid','residual.sugar','chlorides','free.sulfur.dioxide','total.sulfur.dioxide','density','pH','sulphates','alcohol', 'Cluster']

# Standardise the data so that every feature has the same range of values
# and the resulting distances are comparable
# NOTE(review): scaler uses the test set's own mean and standard deviation, not the training set's - confirm this is intended.
scaler(training_set=test_set, columns=cols)

# Export the test set
exportTrainingSet(training_set=test_set, fileName='test_set_cluster_scalato.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
