### Importing the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Show floats with 3 decimal places in DataFrame output.
# The fully-qualified key is used on purpose: set_option treats its argument as
# a pattern, and the bare 'precision' also matches 'styler.format.precision' on
# pandas versions that define it, which makes set_option raise OptionError.
pd.set_option('display.precision', 3)

### Importing the dataset

In [None]:
# Snake-case column names applied to the CSV in place of its own header row.
labels = (
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
)
# The file is semicolon-delimited. header=0 together with names= makes pandas
# consume the file's first row as a header and replace it with `labels`.
df_red = pd.read_csv(
    'Datasets/winequality-red.csv',
    sep=';',
    header=0,
    names=labels,
)
# White-wine counterpart, currently disabled:
# df_white = pd.read_csv('Datasets/winequality-white.csv', header = 0, names = labels,sep = ';')

### Display the dataset

In [None]:
# Peek at the first two rows to confirm the columns were parsed and renamed.
df_red.head(n=2)

In [None]:
def drop_constant_column(dataframe):
    """Return ``dataframe`` without the columns whose value never changes.

    A column is kept when at least one of its cells differs from the cell in
    the first row. Two missing values (NaN/None) are treated as equal, so a
    column that is missing everywhere counts as constant and is dropped.

    Args:
        dataframe: The pandas DataFrame to filter. It is not modified.

    Returns:
        A new DataFrame holding only the non-constant columns. A frame with
        no rows is returned as an unchanged copy, because there is no first
        row to compare against.
    """
    # Without this guard ``iloc[0]`` raises IndexError on a frame with no rows.
    if len(dataframe) == 0:
        return dataframe.copy()

    first_row = dataframe.iloc[0]
    differs = dataframe.ne(first_row, axis='columns')
    # NaN != NaN evaluates to True, so a plain comparison would flag an
    # all-missing column as varying; cancel out cells where both are missing.
    both_missing = dataframe.isna() & first_row.isna()
    varying = (differs & ~both_missing).any()
    return dataframe.loc[:, varying]

def cleanDf(df):
    """Prune unusable columns and duplicate rows, printing a one-line report.

    Steps, in order: drop columns that are entirely NaN, drop the columns
    removed by ``drop_constant_column``, then drop duplicate rows.

    Args:
        df: Input DataFrame. Every step returns a new frame, so the caller's
            object is not modified in place.

    Returns:
        The cleaned DataFrame.
    """
    # Original width, kept so the report can say how many columns were removed.
    columns_len = df.shape[1]

    # Column pruning: all-NaN columns first, then the "singular" ones.
    df = drop_constant_column(df.dropna(axis='columns', how='all'))

    # Row pruning: remember the row count before de-duplicating for the report.
    len_before_drop = df.shape[0]
    df = df.drop_duplicates()

    print(f"Loaded {len_before_drop} records, removed {len_before_drop - len(df)} duplicates, left {len(df)}, removed {columns_len-len(df.columns)} of {columns_len} columns")
    return df

#### Dropping Duplicate Rows and Empty/Constant Columns

In [None]:
# Rebind df_red to its cleaned version; cleanDf prints what it removed.
df_red = cleanDf(df=df_red)
# White-wine counterpart, currently disabled:
# df_white = cleanDf(df_white)

## Exploratory Data Analysis

In [None]:
def get_features(df, label, threshold):
    """Select the columns whose correlation with ``label`` is strong enough.

    Args:
        df: DataFrame containing the candidate columns and the ``label`` column.
        label: Name of the target column.
        threshold: A column is selected when the absolute value of its
            correlation with ``label`` is strictly greater than this number.

    Returns:
        A 3-tuple ``(df_filtered, features, correlations)``:
        ``df_filtered`` is ``df`` restricted to the selected columns followed
        by ``label``; ``features`` is the list of selected column names;
        ``correlations`` is the Series of every column's correlation with
        ``label`` (the label's own entry removed), as computed by ``df.corr()``.
    """
    # Take the label's column of the correlation matrix, minus its self-correlation.
    label_corr = df.corr()[label]
    correlations = label_corr.drop(label)

    # Keep the names whose correlation magnitude clears the threshold.
    is_strong = correlations.abs() > threshold
    features = correlations.loc[is_strong].index.tolist()

    # Selected predictors first, target last.
    df_filtered = df[[*features, label]]
    return df_filtered, features, correlations

In [None]:
# Target column for every model in this notebook.
label = 'quality'
# Minimum absolute correlation with the target for a column to be used as a feature.
CORR_THRESHOLD = 0.1
# The third return value gets a real name instead of `_`: at notebook top level
# `_` is IPython's "last output" variable, and assigning to it shadows that.
df_red_features, features, red_correlations = get_features(df_red, label, CORR_THRESHOLD)

In [None]:
# Wide canvas so the annotated correlation values stay readable.
plt.figure(figsize=(18, 6))
# heatmap() returns the Axes it drew on, so the title is set directly on it;
# the trailing semicolon suppresses the notebook's repr output.
sns.heatmap(data=df_red_features.corr(), annot=True).set_title('Correlation Matrix of Red Wine for Quality');

# Compare Predictions for Red Wine

### Splitting into Train and Test sets

In [None]:
from sklearn.model_selection import train_test_split

# Predictors: the columns chosen by the correlation filter. Target: the label column.
X = df_red.loc[:, features]
y = df_red.loc[:, label]
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# First two rows of the feature matrix.
X.head(n=2)

In [None]:
# First two values of the target vector.
y.head(n=2)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Baseline model: ordinary least squares with default settings, fitted on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# Predict on both splits and print the first 15 predictions next to the true scores.
# The regressor outputs continuous values, so they are rounded to the nearest
# integer before the int cast; a bare astype(int) truncates (5.9 would print as 5)
# and makes the model look worse than it is.
# .iloc makes the "first 15 rows" selection explicitly positional.
lr_train_pred = lr.predict(X_train)
print(f'Train Prediction: {np.rint(lr_train_pred[:15]).astype(int)}\n   Actual values: {y_train.iloc[:15].values}')
lr_test_pred = lr.predict(X_test)
print(f'Test  Prediction: {np.rint(lr_test_pred[:15]).astype(int)}\n   Actual values: {y_test.iloc[:15].values}')

In [None]:
# Root-mean-squared error of the linear model on each split.
# Arguments follow scikit-learn's (y_true, y_pred) order, matching the MAE call
# later in the notebook. MSE is symmetric, so the numbers are unchanged.
lr_rmse_train = metrics.mean_squared_error(y_train, lr_train_pred) ** 0.5
print(f'LinearRegression train RMSE {lr_rmse_train:.2f}')

lr_rmse_test = metrics.mean_squared_error(y_test, lr_test_pred) ** 0.5
print(f'LinearRegression test  RMSE {lr_rmse_test:.2f}')

#### Using AutoML (auto-sklearn)

In [None]:
import autosklearn
from sklearn.metrics import mean_absolute_error
from autosklearn.regression import AutoSklearnRegressor
from autosklearn.metrics import mean_absolute_error as auto_mean_absolute_error
print(f'autosklearn: {autosklearn.__version__}')

In [None]:
# Configure the auto-sklearn search; nothing is trained until fit() is called.
model_mae = AutoSklearnRegressor(
    time_left_for_this_task=6 * 60,      # overall search budget
    per_run_time_limit=60,               # cap for a single candidate model
    metric=auto_mean_absolute_error,     # objective the search optimises
    n_jobs=-1,
)

In [None]:
# Run the search on the training split (bounded by the time budget set above).
model_mae.fit(X=X_train, y=y_train)

In [None]:
# Report what the search did.
search_report = model_mae.sprint_statistics()
print(search_report)

# Score the fitted model on both splits; the train predictions are kept for
# the RMSE comparison that follows.
ar_train_pred = model_mae.predict(X_train)
ar_test_pred = model_mae.predict(X_test)

# Mean absolute error on the held-out split.
mae = mean_absolute_error(y_true=y_test, y_pred=ar_test_pred)
print(f"MAE: {mae:.2f}")

In [None]:
# Root-mean-squared error of the auto-sklearn model on each split, for direct
# comparison with the linear-regression RMSE.
# Arguments follow scikit-learn's (y_true, y_pred) order, matching the MAE call
# in the previous cell. MSE is symmetric, so the numbers are unchanged.
ar_rmse_train = metrics.mean_squared_error(y_train, ar_train_pred) ** 0.5
print(f'AutoSklearnRegressor train RMSE {ar_rmse_train:.2f}')

ar_rmse_test = metrics.mean_squared_error(y_test, ar_test_pred) ** 0.5
print(f'AutoSklearnRegressor test  RMSE {ar_rmse_test:.2f}')

**CONCLUSION:** The best results