In [54]:
import pandas as pd
import numpy as np

# Load the semicolon-delimited wine data set and preview its first rows.
# The raw file marks missing entries with "?", which is why some numeric
# columns initially come back with dtype object.
wine_csv_path = "data/wine.csv"
df = pd.read_csv(wine_csv_path, sep=";")
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,color,quality
0,7.4,0.7,0.0,1.9,E,11.0,34.0,0.9978,3.51,medium,9.4,red,5
1,7.8,0.88,0.0,2.6,C,25.0,67.0,0.9968,3.2,high,9.8,red,5
2,7.8,0.76,0.04,2.3,C,15.0,54.0,0.997,3.26,high,9.8,?,5
3,11.2,0.28,0.56,1.9,E,17.0,60.0,0.998,3.16,medium,9.8,red,6
4,7.4,0.7,0.0,1.9,E,11.0,34.0,0.9978,3.51,medium,9.4,red,5


In [55]:
# Inspect dtypes and non-null counts of the raw frame. Columns expected to be
# numeric that show up as `object` (e.g. pH) point to placeholder strings in
# the data rather than real numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
fixed acidity           6497 non-null float64
volatile acidity        6497 non-null float64
citric acid             6497 non-null float64
residual sugar          6497 non-null float64
chlorides               6497 non-null object
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null object
sulphates               6497 non-null object
alcohol                 6497 non-null float64
color                   6497 non-null object
quality                 6497 non-null int64
dtypes: float64(8), int64(1), object(4)
memory usage: 659.9+ KB


In [56]:
# "?" is the file's marker for a missing value: turn it into a real NaN so
# pandas/scikit-learn can recognise it as missing.
df = df.replace("?", np.nan)

# With the placeholder gone, pH can be stored as a proper float column.
df = df.assign(pH=df["pH"].astype("float64"))
df.info()

# Target: the quality score (kept as a one-column frame).
# Features: every remaining column.
y = df[["quality"]]
X = df.drop(columns="quality")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
fixed acidity           6497 non-null float64
volatile acidity        6497 non-null float64
citric acid             6497 non-null float64
residual sugar          6497 non-null float64
chlorides               6497 non-null object
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      5855 non-null float64
sulphates               6497 non-null object
alcohol                 6497 non-null float64
color                   5787 non-null object
quality                 6497 non-null int64
dtypes: float64(9), int64(1), object(3)
memory usage: 659.9+ KB


In [57]:
from sklearn.model_selection import train_test_split
# Split features and target into train/test parts using the default split
# proportions; the fixed random_state makes the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0)


# IMPUTE MISSING VALUES

In [58]:
# Missing colour labels become a third category, "unknown".
# `assign` returns a fresh frame, so the split frames are independent copies
# and the column writes below cannot touch the data they were sliced from.
X_train = X_train.assign(color=X_train["color"].fillna("unknown"))
X_test = X_test.assign(color=X_test["color"].fillna("unknown"))

# Missing pH values are replaced by the mean pH of the *training* rows.
# The imputer is fitted on the training split only and then reused for the
# test split, so no information from the test rows leaks into the statistic.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy="mean")
X_train["pH"] = imp.fit_transform(X_train[["pH"]]).ravel()
X_test["pH"] = imp.transform(X_test[["pH"]]).ravel()

# Confirm that no feature column has missing entries any more.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4872 entries, 3731 to 2732
Data columns (total 12 columns):
fixed acidity           4872 non-null float64
volatile acidity        4872 non-null float64
citric acid             4872 non-null float64
residual sugar          4872 non-null float64
chlorides               4872 non-null object
free sulfur dioxide     4872 non-null float64
total sulfur dioxide    4872 non-null float64
density                 4872 non-null float64
pH                      4872 non-null float64
sulphates               4872 non-null object
alcohol                 4872 non-null float64
color                   4872 non-null object
dtypes: float64(9), object(3)
memory usage: 494.8+ KB


# Column transformation:
- Convert categorical/text columns into one-hot encoded 0/1 indicator columns
- Scale numerical columns

In [59]:
# Boolean mask indexed by feature name: True for columns stored with dtype
# object (the text/categorical ones), False for the numeric columns.
categories = X.dtypes.eq(object)
categories

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides                True
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates                True
alcohol                 False
color                    True
dtype: bool

In [60]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


# One preprocessing step for all feature columns:
#   * text columns (mask `categories`)  -> one-hot indicator columns
#   * numeric columns (`~categories`)   -> standardised with StandardScaler
#
# handle_unknown="ignore": a category level that happens to occur only in the
# test split is encoded as all zeros instead of making transform() raise.
#
# sparse_threshold=0: always return a dense array. This replaces the encoder's
# `sparse=False` argument, which newer scikit-learn releases renamed to
# `sparse_output` and then removed, so the cell runs on old and new versions.
tf = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categories),
    (StandardScaler(), ~categories),
    remainder="passthrough",  # the two masks already cover every column
    sparse_threshold=0)

# Learn category levels and scaling statistics from the training split only,
# then apply the same fitted transformation to both splits.
tf.fit(X_train)
X_train = tf.transform(X_train)
X_test  = tf.transform(X_test)

#now, X_train,X_test are not Pandas dataframes anymore, but are numpy arrays
print("type=", type(X_train))
X_train[:5]

type= <class 'numpy.ndarray'>


array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        , -0.08802117, -0.72820076,  0.00517689,  2.48517455,
         1.02972262,  0.744625  ,  1.75251273,  1.01481219, -0.75281497],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        , -0.78538358, -1.09072305,  0.07390119, -0.91964663,
        -0.53015235, -0.59665306, -1.25026338,  0.68577368,  0.34073703],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        , -0.39796002, -0.60736   , -0.13227171, -0.32703535,
         4.14947258,  1.1152413 , -0.94657352, -1.15684202,  0.67721458],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  

In [61]:
from sklearn.ensemble import AdaBoostRegressor
# Fit a small AdaBoost ensemble (20 estimators, fixed seed) on the
# preprocessed training data.
regr = AdaBoostRegressor(random_state=0, n_estimators=20)

# y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
# otherwise emits a DataConversionWarning before flattening it itself.
# Flattening explicitly gives the same fit without the warning.
regr.fit(X_train, np.ravel(y_train))

# score() reports R^2 for a regressor; compare train vs. test to gauge
# how well the model generalises.
print("train", regr.score(X_train, y_train)  )
print("test ", regr.score(X_test, y_test)  )

  y = column_or_1d(y, warn=True)


train 0.3214006392493878
test  0.28792648610559846


In [62]:
# Importance the fitted regressor assigns to each column of the transformed
# feature matrix (one value per transformed column).
# NOTE(review): positions refer to the transformer output (the one-hot
# indicator columns come first, followed by the scaled numeric columns), not
# to the original DataFrame columns — map them back to names before
# interpreting individual values.
regr.feature_importances_  

array([0.        , 0.        , 0.04221051, 0.        , 0.00248454,
       0.02485741, 0.        , 0.        , 0.        , 0.00191911,
       0.        , 0.0225236 , 0.1435816 , 0.05857267, 0.05772124,
       0.16386258, 0.0835261 , 0.06418548, 0.00761266, 0.3269425 ])