<a href="https://colab.research.google.com/github/AndreaBertoglio/MLDM/blob/master/Pre-processing/ClusterASFeature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import

In [None]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# The version is compared numerically as (major, minor): a plain string
# comparison is lexicographic and would, for example, rank "0.9" above "0.20".
import sklearn
_sklearn_version = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sklearn_version >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(2020)

In [None]:
import pandas as pd
import graphviz
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans

### Preparazione e merge dei dati

In [None]:
# Read the train and test CSV files directly from their remote location.
train = pd.read_csv(
    'https://raw.githubusercontent.com/serivan/mldmlab/master/Datasets/Kaggle2020/train.csv'
)
test = pd.read_csv(
    'https://raw.githubusercontent.com/serivan/mldmlab/master/Datasets/Kaggle2020/test.csv'
)
# Encode the class as an integer: 1 = Good, 0 = Disappointing.
# Any Quality value whose text contains "Good" maps to 1, everything else to 0.
is_good = train["Quality"].astype(str).str.contains("Good")
train["Quality"] = np.where(is_good, 1, 0)


In [None]:

# y is the class label, i.e. Quality
train_y = train.Quality
# Columns used as predictors; the Id column is left out because it is not of interest
predictor_cols = [
    'fixed.acidity',
    'volatile.acidity',
    'citric.acid',
    'residual.sugar',
    'chlorides',
    'free.sulfur.dioxide',
    'total.sulfur.dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

train_X = train[predictor_cols]
test_X = test[predictor_cols]

# Fill in missing values with IterativeImputer (not a simple mean replacement).
# The imputer is fitted on the training predictors only.
imp = IterativeImputer(missing_values=np.nan, max_iter=30)
imp = imp.fit(train_X)
train_X_imp = imp.transform(train_X)
# Run the test predictors through the same fitted imputer, so that the stacked
# matrix below cannot contain NaN even if the test file has missing values.
test_X_imp = imp.transform(test_X)

# Stack training rows first, then test rows, so that a single clustering is
# computed consistently over both datasets.
conc = np.concatenate((train_X_imp, test_X_imp), axis=0)

### Clustering

In [None]:
# Clustering
# random_state is fixed so that re-running this cell on its own yields the same
# clusters instead of depending on how much of the global NumPy random stream
# has already been consumed.
# n_init is set explicitly to 10 (the long-standing default) because newer
# scikit-learn releases changed the default value — see the KMeans documentation.
km = KMeans(n_clusters=10, n_init=10, random_state=2020)
pred = km.fit_predict(conc)

# labels
labels = km.labels_
pred

array([7, 3, 7, ..., 7, 5, 0], dtype=int32)

In [None]:
# Split the cluster labels back into their training and test parts.
# The training rows come first in the clustered matrix, so the first
# len(train) labels belong to the training set and the remainder to the test set.
# The row count is derived from the data rather than hard-coded.
n_train = len(train)
pred_train = pred[:n_train]
pred_test = pred[n_train:]

In [None]:
# Attach the cluster label of each row as a new 'Cluster' column on both datasets.
for frame, cluster_ids in ((train, pred_train), (test, pred_test)):
    frame['Cluster'] = cluster_ids

Unnamed: 0,Id,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,Quality,Cluster
0,3940,6.4,0.39,0.21,1.2,41.00,35.0,136.0,0.99225,3.15,0.46,10.2,0,7
1,1655,7.5,305.00,0.40,18.9,59.00,44.0,170.0,1.00000,2.99,0.46,9.0,0,3
2,1867,6.3,0.28,0.30,3.1,39.00,24.0,115.0,0.99420,3.05,0.43,8.6,0,7
3,4476,7.4,0.18,0.30,10.4,45.00,44.0,174.0,0.99660,3.11,0.57,9.7,1,6
4,453,6.9,0.20,0.36,1.5,31.00,38.0,147.0,0.99310,3.35,0.56,11.0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,2768,6.9,0.14,0.38,1.0,41.00,22.0,81.0,0.99043,3.03,0.54,11.4,1,0
3485,4347,9.0,0.20,0.33,3.5,49.00,10.0,40.0,0.99440,3.14,0.36,9.8,1,0
3486,1870,7.6,0.29,0.26,6.5,42.00,32.0,160.0,0.99440,3.14,0.47,10.7,0,6
3487,613,7.5,0.17,0.32,1.7,0.04,51.0,148.0,0.99160,3.21,0.44,11.5,1,6


### Esportazione dei dataset

In [None]:
# Write both datasets to CSV files in the working directory, omitting the row index.
for frame, csv_path in ((train, "TrainCluster2.csv"), (test, "TestCluster2.csv")):
    frame.to_csv(csv_path, index=False)