In [1]:
import os, sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from dataClass import dataMethods

In [2]:
d = dataMethods(data_path="data")

Loaded: White Wine Data	Length: 1599
Loaded: Red Wine Data	Length: 4898

Unified Data, with Total Length: 6497


## Load Raw Data

In [3]:
raw_white = pd.read_csv("data\winequality-red.csv",sep=";")
raw_white["type"] = "white" #Add identifier

raw_red = pd.read_csv("data\winequality-white.csv",sep=";")
raw_red["type"] = "red" #Add identifier

raw_join = pd.concat([raw_white,raw_red])
raw_join.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,white
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,white
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,white
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,white
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,white


# Normalizing / Scaling Data
This step is not required for all regression methods, most do not require it at all. Howwever it is a good step to take on principle, as it can be useful for other purposes. E.g. Clustering.

Two options exist, either a 0-1 normalisation of each feature, or normalising around 0 with common standard deviations. 
The better option depends on the regression approch taken.

If we decide to do regression including both red and white wine, it is important to digitise the "type" feature. As there are only two groups, a simple binary true/false flag would work. 

In [4]:
d.digitizeType()

In [5]:
def scaleData(data,type = "standard"):
    if type == "standard":
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
    elif type == "maxmin":
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
    scaler.fit(raw_join)
    data_out = scaler.transform(raw_join)
    if isinstance(data,pd.DataFrame):
        data_out = pd.DataFrame(data_out, columns=raw_join.columns)
    return(data_out)

In [6]:
## Transform features by scaling each feature to zero and one
minmax_join = scaleData(raw_join,type = "maxmin")

## Standardize features by removing the mean and scaling to unit variance.
standard_join = scaleData(raw_join,type = "standard")


In [7]:
from sklearn.model_selection import train_test_split


In [8]:
from sklearn.ensemble import GradientBoostingRegressor

regr = GradientBoostingRegressor(random_state=0)

X = raw_white.drop(columns=["quality","type"]).to_numpy()
y = raw_white.quality.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


regr.fit(X_train, y_train)
regr.score(X_test, y_test)

0.3924097475504815

In [9]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(random_state=0)

regr.fit(X_train, y_train)
regr.score(X_test, y_test)

0.46464124206764834

In [10]:
from sklearn.ensemble import HistGradientBoostingRegressor

regr = HistGradientBoostingRegressor(random_state=0)

regr.fit(X_train, y_train)
regr.score(X_test, y_test)

0.4310691812588745

In [11]:
from sklearn.ensemble import AdaBoostRegressor

regr = AdaBoostRegressor(random_state=0)

regr.fit(X_train, y_train)
regr.score(X_test, y_test)

0.3519916614194514