In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn import metrics

In [4]:
import statsmodels.api as sm

## Read in the wine data

In [5]:
# Load the red and white wine tables; each file is parsed with ';' as the
# field separator and no column is promoted to the index.
df_red, df_white = (
    pd.read_csv(csv_path, sep=';', index_col=None)
    for csv_path in (
        './dataset/winequality-red.csv',
        './dataset/winequality-white.csv',
    )
)

# Use OLS regression

First use AIC for model selection, then use 5-fold cross-validation to estimate the performance of the selected model.

In [6]:
# Number of rows (samples) in the red-wine table.
df_red.shape[0]

1599

In [7]:
# Split each table into a feature matrix (every column except the last) and a
# target vector (the last column), taken as the underlying array values.
X_red, y_red = df_red.iloc[:, :-1].values, df_red.iloc[:, -1].values
X_white, y_white = df_white.iloc[:, :-1].values, df_white.iloc[:, -1].values

In [8]:
# Add an intercept column to both design matrices. With statsmodels' defaults
# the constant is prepended (it becomes column 0) and an already-present
# constant is skipped, so re-running this cell does not add a second one.
X_red, X_white = (sm.add_constant(features) for features in (X_red, X_white))

In [40]:
# Compare two OLS fits for red wine by AIC: the full design matrix versus a
# hand-picked subset of its columns (indices refer to columns of X_red).
# After the loop, `model` holds the last (alternative) fit.
for label, design in (
    ("Full", X_red),
    ("Alternative", X_red[:, [0, 1, 3, 4, 6, 8, 9]]),
):
    model = sm.OLS(y_red, design).fit()
    print(f"{label} model AIC:", model.aic)
    # mse_resid is the in-sample residual mean square:
    # sum of squared residuals / residual degrees of freedom.
    print("MSE of the model:", model.mse_resid)

Full model AIC: 3162.2765525826335
MSE of the model: 0.4199185257637249
Alternative model AIC: 3530.1060744491874
MSE of the model: 0.5301744977678724


For red wine, the full model has the smaller AIC (3162.28 vs. 3530.11), so it is preferred. Its residual MSE (in-sample, from `mse_resid`) is 0.4199185257637249.

In [41]:
# Compare two OLS fits for white wine by AIC: the full design matrix versus
# the same matrix with columns 3, 5 and 7 removed.
# After the loop, `model` holds the last (alternative) fit.
for label, design in (
    ("Full", X_white),
    ("Alternative", np.delete(X_white, obj=(3, 5, 7), axis=1)),
):
    model = sm.OLS(y_white, design).fit()
    print(f"{label} model AIC:", model.aic)
    # mse_resid is the in-sample residual mean square:
    # sum of squared residuals / residual degrees of freedom.
    print("MSE of the model:", model.mse_resid)

Full model AIC: 11111.48027072974
MSE of the model: 0.5645371675232171
Alternative model AIC: 11106.287753625897
MSE of the model: 0.5642837748574745


For white wine, the alternative model has the smaller AIC (11106.29 vs. 11111.48), so it is preferred. Its residual MSE (in-sample, from `mse_resid`) is 0.5642837748574745.