In [1]:
# Importing Libraries

# Data Handling
import pandas as pd
import numpy as np


# Data Visualization
import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from IPython.display import display
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# Statistics & Mathematics
import scipy.stats as stats
from scipy.stats import shapiro, skew
import math

from sklearn.feature_selection import RFECV

# Machine Learning Pipeline & process
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing data
from sklearn.preprocessing import RobustScaler, StandardScaler, QuantileTransformer, FunctionTransformer

from sklearn.compose import ColumnTransformer

# Model Selection for Cross Validation
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

# Machine Learning metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, cohen_kappa_score, make_scorer

# ML regressors
from sklearn.linear_model import HuberRegressor,RANSACRegressor, TheilSenRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor, AdaBoostRegressor, RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# ML classifiers
from sklearn.ensemble import HistGradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.cluster import KMeans

import optuna

import random

# Encoder of categorical variables
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Hiding warnings 
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_data = pd.read_csv('train.csv', sep = ';')
train_data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,white
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,red
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,red
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,white
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.5,9.1,5,white


In [3]:
def feat_eng(df):
    df.columns = df.columns.str.replace(' ', '_')
    df['total_acidity'] = df['fixed_acidity'] + df['volatile_acidity'] + df['citric_acid']
    df['acidity_to_pH_ratio'] = df['total_acidity'] / df['pH']
    df['free_sulfur_dioxide_to_total_sulfur_dioxide_ratio'] = df['free_sulfur_dioxide'] / df['total_sulfur_dioxide']
    df['alcohol_to_acidity_ratio'] = df['alcohol'] / df['total_acidity']
    df['residual_sugar_to_citric_acid_ratio'] = df['residual_sugar'] / df['citric_acid']
    df['alcohol_to_density_ratio'] = df['alcohol'] / df['density']
    df['total_alkalinity'] = df['pH'] + df['alcohol']
    df['total_minerals'] = df['chlorides'] + df['sulphates'] + df['residual_sugar']
    
    # Cleaning inf or null values that may result from the operations above
    df = df.replace([np.inf, -np.inf], 0)
    df = df.dropna()
    
    return df

In [4]:
train_data = feat_eng(train_data)
train_data = train_data.drop(columns = 'type')
train_data.head(5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,total_acidity,acidity_to_pH_ratio,free_sulfur_dioxide_to_total_sulfur_dioxide_ratio,alcohol_to_acidity_ratio,residual_sugar_to_citric_acid_ratio,alcohol_to_density_ratio,total_alkalinity,total_minerals
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,7.26,2.41196,0.34127,1.570248,3.333333,11.504693,14.41,1.865
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,8.46,2.495575,0.290323,1.134752,7.307692,9.632751,12.99,2.602
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,9.25,2.863777,0.165354,0.994595,8.285714,9.222133,12.43,3.606
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,8.23,2.478916,0.229008,1.324423,15.151515,10.963589,14.22,5.485
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.5,9.1,5,6.85,2.160883,0.22293,1.328467,80.8,9.10182,12.27,20.783


In [5]:
def X_y_split(df, target_variable):
    
    '''
    This function takes a dataframe and a target variable to create an X (predictors) dataframe and a y Series
    '''
    
    X, y = df.drop([target_variable], axis = 1), df[target_variable] 

    #Printing info on X and y
    print(f'\nX shape: {X.shape}\n')
    print(f'\n{len(X)} Samples \n')
    print(f'\n{len(X.columns)} Attributes \n')
    display(X.head(10))
    print('\n')
    print(f'\ny shape: {y.shape}\n')
    print(f'\n{len(y)} Samples \n')
    display(y.head(10))
    
    return X, y

In [6]:
X, y = X_y_split(train_data, 'quality')


X shape: (6714, 19)


6714 Samples 


19 Attributes 



Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,total_acidity,acidity_to_pH_ratio,free_sulfur_dioxide_to_total_sulfur_dioxide_ratio,alcohol_to_acidity_ratio,residual_sugar_to_citric_acid_ratio,alcohol_to_density_ratio,total_alkalinity,total_minerals
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,7.26,2.41196,0.34127,1.570248,3.333333,11.504693,14.41,1.865
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,8.46,2.495575,0.290323,1.134752,7.307692,9.632751,12.99,2.602
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,9.25,2.863777,0.165354,0.994595,8.285714,9.222133,12.43,3.606
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,8.23,2.478916,0.229008,1.324423,15.151515,10.963589,14.22,5.485
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.5,9.1,6.85,2.160883,0.22293,1.328467,80.8,9.10182,12.27,20.783
5,7.6,0.3,0.52,13.2,0.042,61,148,0.9984,2.98,0.47,9.1,8.42,2.825503,0.412162,1.08076,25.384615,9.114583,12.08,13.712
6,6.4,0.3,0.4,6.2,0.04,46,169,0.9953,3.15,0.46,9.3,7.1,2.253968,0.272189,1.309859,15.5,9.343916,12.45,6.7
7,6.6,0.3,0.36,1.7,0.038,22,101,0.9912,3.29,0.57,11.6,7.26,2.206687,0.217822,1.597796,4.722222,11.702986,14.89,2.308
8,7.6,0.3,0.25,9.5,0.03,15,136,0.9937,3.1,0.44,12.1,8.15,2.629032,0.110294,1.484663,38.0,12.176713,15.2,9.97
9,7.6,0.3,0.49,20.2,0.06,30,145,1.002,3.01,0.44,8.5,8.39,2.787375,0.206897,1.013111,41.22449,8.483034,11.51,20.7





y shape: (6714,)


6714 Samples 



0    6
1    5
2    5
3    6
4    5
5    6
6    6
7    6
8    6
9    5
Name: quality, dtype: int64

In [7]:
transformer = QuantileTransformer(output_distribution='normal')
X_transformed = transformer.fit_transform(X)
X_transformed = pd.DataFrame(X_transformed, columns=X.columns)
X_transformed

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,total_acidity,acidity_to_pH_ratio,free_sulfur_dioxide_to_total_sulfur_dioxide_ratio,alcohol_to_acidity_ratio,residual_sugar_to_citric_acid_ratio,alcohol_to_density_ratio,total_alkalinity,total_minerals
0,-0.469603,-0.056486,0.457033,-1.529558,-0.947580,0.756333,0.171458,-1.194628,-1.365257,0.830234,0.758004,-0.433482,0.073608,0.622818,0.774557,-1.337127,0.776528,0.618059,-1.337962
1,0.656486,0.928105,-0.517790,-0.593940,0.558392,-1.269664,-1.266860,0.492137,1.093272,0.892038,-0.475214,0.694311,0.278374,0.212353,-0.839661,-0.401995,-0.475214,-0.353487,-0.576075
2,1.111702,0.928105,0.391137,-0.013801,0.862544,-0.396560,0.191865,0.880888,0.144780,0.830234,-1.123415,1.144896,1.048636,-1.068870,-1.441971,-0.296431,-1.148124,-1.016672,-0.013212
3,0.486480,0.564267,0.227777,0.266584,-0.188033,0.100535,0.262686,-0.194422,0.659602,-0.608973,0.434861,0.529301,0.241509,-0.397706,-0.006790,0.087752,0.432448,0.493553,0.253776
4,-0.726606,-0.949547,-0.615025,2.877846,1.102440,0.358833,0.728240,1.824556,-0.239380,-0.090452,-1.315958,-0.907080,-0.709469,-0.460403,0.007112,1.979084,-1.387410,-1.272478,2.880212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6709,0.204660,-0.949547,-1.044409,0.613509,-0.188033,1.250242,1.044409,0.231641,-0.239380,-1.202347,-0.604448,-0.037646,0.031432,0.298001,-0.307111,1.072678,-0.579039,-0.692715,0.600346
6710,-0.346817,-0.056486,0.313694,0.584984,-0.851675,0.567211,0.130848,-1.077446,-1.513589,-1.994971,1.368450,-0.332198,0.191068,0.403356,1.151502,0.423517,1.356022,1.204935,0.558175
6711,-0.469603,-0.056486,-0.710366,0.048948,-1.040090,0.042668,-0.333524,-1.449682,-0.736442,-0.959440,1.292465,-0.561327,-0.241963,0.230301,1.248549,0.018820,1.306042,1.215372,0.012036
6712,0.880888,-0.949547,0.036391,0.334851,0.080379,-0.232930,-0.374936,-0.503497,-0.736442,-0.786745,0.434861,0.744694,0.834501,-0.221679,-0.190714,0.250957,0.441766,0.348150,0.323000


In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_transformed)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [9]:
cv = StratifiedKFold(n_splits = 5,shuffle = True, random_state = 42)
cv_splits = list(cv.split(X,y))

In [10]:
def quadratic_weighted_kappa(y_true, y_pred):
    '''
    This function returns the evaluation metric of this competition
    '''
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

In [11]:
estimator = RandomForestClassifier(random_state= 42)
rfe = RFECV(estimator=estimator, cv=cv, scoring=make_scorer(quadratic_weighted_kappa))
rfe.fit(X, y)

selected_features = []

for i, feature in enumerate(X_scaled.columns):
    if rfe.support_[i]:
        selected_features.append(feature)

print(f'{len(selected_features)} features selected out of {len(X_scaled.columns)}.')
print(f'\nSelected Features: \n')
for feature in selected_features:
    print(feature)

19 features selected out of 19.

Selected Features: 

fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
free_sulfur_dioxide
total_sulfur_dioxide
density
pH
sulphates
alcohol
total_acidity
acidity_to_pH_ratio
free_sulfur_dioxide_to_total_sulfur_dioxide_ratio
alcohol_to_acidity_ratio
residual_sugar_to_citric_acid_ratio
alcohol_to_density_ratio
total_alkalinity
total_minerals


In [12]:
class CustomQuantileTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, random_state=None):
        self.random_state = random_state
        self.quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=self.random_state)

    def fit(self, X_train, y=None):
        self.quantile_transformer.fit(X_train)
        return self

    def transform(self, X):
        X_transformed = self.quantile_transformer.transform(X)
        X = pd.DataFrame(X_transformed, columns=X.columns)
        return X

In [13]:
class CustomStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X_train, y=None):
        self.scaler.fit(X_train)
        return self

    def transform(self, X):
        X_transformed = self.scaler.transform(X)
        X = pd.DataFrame(X_transformed, columns=X.columns)
        return X

In [14]:
class KMeansTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, n_clusters=3, random_state=42):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
   
    def fit(self, X_train, y=None):
        self.kmeans.fit(X_train)
        return self
    
    def transform(self, X):
        X_clustered = pd.DataFrame(X.copy())
        cluster_labels = self.kmeans.predict(X)
        X_clustered['Cluster'] = cluster_labels
        return X_clustered

In [15]:
pipeline = Pipeline([
    ('Feature Engineering', FunctionTransformer(feat_eng)),
    ('Transforming Distribution', CustomQuantileTransformer()),
    ('Standard Scaler', CustomStandardScaler()),
    ('Clustering', KMeansTransformer()),
    ('Model', None)
])

In [16]:
def objective2(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, step=0.01),
        'iterations': trial.suggest_int('iterations', 100, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10, step=0.1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10, step=0.1),
        'verbose': False,
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bernoulli', 'MVS']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'leaf_estimation_method': trial.suggest_categorical('leaf_estimation_method', ['Newton', 'Gradient']),
        'eval_metric': 'Accuracy'
    }
    
    pipeline.set_params(Model = CatBoostClassifier(**params, random_state = 42))
    scores = []
    
    for i, (train_index, val_index) in enumerate(cv_splits):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)
        
        kappa = cohen_kappa_score(y_val, y_pred, weights = 'quadratic')
        
        scores.append(kappa)
    
    if i == len(cv_splits) - 1:
        mean_score = np.mean(scores)
        
        
    return mean_score        
         

study2 = optuna.create_study(direction='maximize')
study2.optimize(objective2, n_trials = 15, show_progress_bar = True)

[I 2024-02-12 01:05:07,293] A new study created in memory with name: no-name-bca794a9-ccc2-4126-b8dd-54dff058aeb9


  0%|          | 0/15 [00:00<?, ?it/s]

[I 2024-02-12 01:05:52,830] Trial 0 finished with value: 0.6835215081877039 and parameters: {'learning_rate': 0.3, 'iterations': 450, 'max_depth': 7, 'subsample': 0.4, 'l2_leaf_reg': 4.7, 'min_data_in_leaf': 4, 'random_strength': 0.9, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Lossguide', 'leaf_estimation_method': 'Gradient'}. Best is trial 0 with value: 0.6835215081877039.
[I 2024-02-12 01:06:52,608] Trial 1 finished with value: 0.5787842240316667 and parameters: {'learning_rate': 0.05, 'iterations': 200, 'max_depth': 8, 'subsample': 0.6, 'l2_leaf_reg': 6.2, 'min_data_in_leaf': 23, 'random_strength': 1.0, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'leaf_estimation_method': 'Gradient'}. Best is trial 0 with value: 0.6835215081877039.
[I 2024-02-12 01:07:58,222] Trial 2 finished with value: 0.6952697478092638 and parameters: {'learning_rate': 0.5, 'iterations': 200, 'max_depth': 8, 'subsample': 0.7000000000000001, 'l2_leaf_reg': 7.3, 'min_data_in_leaf': 16, 'rand

In [18]:
best_params2 = study2.best_params
best_rmse_score2 = study2.best_value
print(f'\n Best RMSE score = {best_rmse_score2} \n')
print(f'\n Best Params = {best_params2} \n')


 Best RMSE score = 0.7138798507120192 


 Best Params = {'learning_rate': 0.13, 'iterations': 650, 'max_depth': 10, 'subsample': 1.0, 'l2_leaf_reg': 9.9, 'min_data_in_leaf': 50, 'random_strength': 4.1, 'bootstrap_type': 'MVS', 'grow_policy': 'SymmetricTree', 'leaf_estimation_method': 'Gradient'} 



In [34]:
pipeline.set_params(Model = CatBoostClassifier(**best_params2,random_state = 42, verbose = False))

In [41]:
print('\nTuned CatBoostClassifier Cross-Validation:')
scores = []
feature_importance = []
print('\n')

for i, (train_index, val_index) in enumerate(cv_splits):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
        
    kappa = cohen_kappa_score(y_val, y_pred, weights = 'quadratic')
        
    print(f'Fold {i + 1}:\n')
    print(f'  Quadratic Weighted Kappa = {kappa:.4f}')
        
    scores.append(kappa)
    feature_importance.append(pipeline[-1].feature_importances_)
        
    print('===================================================')
    
    if i == len(cv_splits) - 1:
        mean_score = np.mean(scores)
        fold_std = np.std(scores)
        
        print(f'\n  Mean Quadratic Weighted Kappa = = {mean_score:.4f} \u00B1 {fold_std:.4f}')


Tuned CatBoostClassifier Cross-Validation:


Fold 1:

  Quadratic Weighted Kappa = 0.7025
Fold 2:

  Quadratic Weighted Kappa = 0.7028
Fold 3:

  Quadratic Weighted Kappa = 0.7330
Fold 4:

  Quadratic Weighted Kappa = 0.7295
Fold 5:

  Quadratic Weighted Kappa = 0.7016

  Mean Quadratic Weighted Kappa = = 0.7139 ± 0.0142


In [42]:
test = pd.read_csv('test.csv', sep = ';')

In [43]:
test_id = test['id']
test_id

0      1257
1      6409
2       136
3      1631
4      6084
       ... 
815    4646
816     734
817    5090
818    1579
819    5688
Name: id, Length: 820, dtype: int64

In [44]:
test = test.drop(columns = 'id', axis = 1) # Removing 'Id' feature
test

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,7.2,0.25,0.37,2.5,0.063,11.0,41.0,0.99439,3.52,0.80,12.4,red
1,8.2,0.27,0.39,7.8,0.039,49.0,208.0,0.99760,3.31,0.51,9.5,white
2,8.9,0.32,0.49,1.6,0.050,17.0,131.0,0.99560,3.13,0.34,9.4,white
3,7.4,0.16,0.30,13.7,0.056,33.0,168.0,0.99825,2.90,0.44,8.7,white
4,6.4,0.28,0.56,1.7,0.156,49.0,106.0,0.99354,3.10,0.37,9.2,white
...,...,...,...,...,...,...,...,...,...,...,...,...
815,6.1,0.30,0.56,2.7,0.046,46.0,184.0,0.99240,3.31,0.57,10.9,white
816,6.7,0.33,0.34,6.6,0.067,35.0,156.0,0.99542,3.11,0.48,9.3,white
817,8.3,0.26,0.37,1.4,0.076,8.0,23.0,0.99740,3.26,0.70,9.6,red
818,6.3,0.29,0.29,3.3,0.037,32.0,140.0,0.98950,3.17,0.36,12.8,white


In [45]:
test = test.drop('type', axis = 1)

In [46]:
y_pred = pipeline.predict(test)
y_pred 

array([[7],
       [6],
       [5],
       [7],
       [6],
       [5],
       [5],
       [6],
       [7],
       [6],
       [5],
       [6],
       [7],
       [6],
       [7],
       [6],
       [6],
       [6],
       [5],
       [5],
       [6],
       [6],
       [5],
       [7],
       [6],
       [5],
       [5],
       [6],
       [5],
       [6],
       [6],
       [6],
       [5],
       [6],
       [7],
       [5],
       [6],
       [6],
       [6],
       [7],
       [5],
       [7],
       [5],
       [5],
       [5],
       [5],
       [5],
       [5],
       [5],
       [6],
       [5],
       [5],
       [5],
       [5],
       [6],
       [6],
       [5],
       [6],
       [6],
       [5],
       [6],
       [6],
       [5],
       [5],
       [5],
       [6],
       [6],
       [6],
       [5],
       [5],
       [5],
       [6],
       [5],
       [6],
       [5],
       [6],
       [5],
       [5],
       [5],
       [6],
       [5],
       [5],
       [6],
    

In [47]:
predictions = pd.DataFrame({
    'id': test_id,
    'quality': np.squeeze(y_pred)
})
predictions

Unnamed: 0,id,quality
0,1257,7
1,6409,6
2,136,5
3,1631,7
4,6084,6
...,...,...
815,4646,7
816,734,6
817,5090,6
818,1579,6


In [48]:
predictions.to_csv('results3.csv', index = False)