In [35]:
import pandas as pd
import numpy as np
import random

In [36]:
SEED = 42

In [None]:
import sys
sys.path.insert(0, '..')
from preprocessing import clean_data, add_new_features

In [37]:
random.seed(SEED)
np.random.seed(SEED)

In [38]:
data = pd.read_csv('../../data/beer_train.csv', index_col=['id'])

In [39]:
df = clean_data(data)
df = add_new_features(df)

# Evaluation

## Feature selection

In [40]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [41]:
def rmse(y_true, y_pred):
  se = (y_true - y_pred) ** 2
  mse = se.mean()
  rmse = np.sqrt(mse)
  return rmse


rmse_score = make_scorer(rmse)

In [42]:
target = ['ibu']
cat_features = ['available', 'glass']
label_features = ['isOrganic']
num_features = ['originalGravity',
                'abv',
                'srm',
                'abv_mul_grav',
                'abv_mul_srm',
                'srm_div_abv',
                'srm_mull_grav',
                'srm_mull_grav_div_abv']

Create train and val dataset

In [43]:
df_train, df_val = train_test_split(df, test_size=0.2, random_state=SEED)
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)

In [44]:
y_train = df_train['ibu'].values
y_val = df_val['ibu'].values


In [45]:
scaler = MinMaxScaler()


## TargetEncoding

Target Encoding is also known as likelihood encoding or mean encoding. It is basically, creating a new feature from existing features and the target variable.

We can take mean, mode, standard deviation or percentiles to create new features.

In [46]:
glass_target_encode = df_train.groupby(by='glass').agg({'ibu' : 'std'}).round(3)\
                                                               .to_dict()['ibu']

In [47]:
avail_target_encode = df_train.groupby(by='available').agg({'ibu' : 'std'}).round(3)\
                                                                    .to_dict()['ibu']

In [48]:
df_train = df_train.replace({'glass' : glass_target_encode, 'available' : avail_target_encode})

In [49]:
df_val = df_val.replace({'glass' : glass_target_encode, 'available' : avail_target_encode})

Normalize numeric features

In [50]:
df_train[num_features] = scaler.fit_transform(df_train[num_features])
df_val[num_features] = scaler.transform(df_val[num_features])

In [51]:
train_features = num_features + label_features + cat_features

In [52]:
X_train = df_train[train_features]
X_val = df_val[train_features]

### Ridge

In [53]:
ridge = Ridge()
selector = SequentialFeatureSelector(ridge)

In [54]:
selector.fit(X_train, y_train)

SequentialFeatureSelector(estimator=Ridge())

Best features.

In [55]:
selector.get_feature_names_out(X_train.columns)

array(['originalGravity', 'srm', 'abv_mul_grav', 'abv_mul_srm', 'glass'],
      dtype=object)

### RandomForestRegressor

In [56]:
rfr = RandomForestRegressor()

In [57]:
rfr.fit(X_train, y_train)

RandomForestRegressor()

In [58]:
list(zip(X_train.columns, rfr.feature_importances_.round(3)))

[('originalGravity', 0.457),
 ('abv', 0.04),
 ('srm', 0.014),
 ('abv_mul_grav', 0.117),
 ('abv_mul_srm', 0.084),
 ('srm_div_abv', 0.052),
 ('srm_mull_grav', 0.046),
 ('srm_mull_grav_div_abv', 0.095),
 ('isOrganic', 0.005),
 ('available', 0.046),
 ('glass', 0.044)]

### ExtraTreesRegressor

In [59]:
etr = ExtraTreesRegressor()

In [60]:
etr.fit(X_train, y_train)

ExtraTreesRegressor()

In [61]:
list(zip(X_train.columns, etr.feature_importances_.round(3)))

[('originalGravity', 0.256),
 ('abv', 0.171),
 ('srm', 0.049),
 ('abv_mul_grav', 0.154),
 ('abv_mul_srm', 0.061),
 ('srm_div_abv', 0.056),
 ('srm_mull_grav', 0.057),
 ('srm_mull_grav_div_abv', 0.057),
 ('isOrganic', 0.009),
 ('available', 0.06),
 ('glass', 0.07)]

### Summury

Even without hyperparameters tuning ensemble models provide better score.

Most important features:

- originalGravity 

- abv

- abv_mul_grav

- abv_mul_srm

- srm

- srm_div_abv

- srm_mull_grav

- srm_mull_grav_div_abv

## Tuning hyperparameters

In [62]:
from sklearn.model_selection import GridSearchCV

In spite of low importances of categorical features removing them lead to low metric.

Therefore, I save it in training data.

### Ridge

In [63]:
ridge_params = {
              'alpha' : [0.1, 0.5, 1, 2, 5],
              'max_iter' : [1000, 1500, 3000],
              'random_state' : [SEED]
              }
ridge = Ridge()       

In [64]:
%%time
ridge_gscv = GridSearchCV(ridge, ridge_params, scoring=rmse_score)
ridge_gscv.fit(X_train, y_train)

Wall time: 236 ms


GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0.1, 0.5, 1, 2, 5],
                         'max_iter': [1000, 1500, 3000], 'random_state': [42]},
             scoring=make_scorer(rmse))

In [65]:
print('Best parameters:', ridge_gscv.best_params_)

Best parameters: {'alpha': 5, 'max_iter': 1000, 'random_state': 42}


In [66]:
ridge = ridge_gscv.best_estimator_

### RandomForest

In [67]:
rfr_params = {
              'n_estimators' : [100, 200],
              'max_depth' : [30, 50],
              'min_samples_split' : [5, 10],
              'max_features' : ['log2', 'sqrt', None],
              'random_state' : [SEED]
              }
rfr = RandomForestRegressor()

In [68]:
%%time
rfr_gscv = GridSearchCV(rfr, rfr_params, scoring=rmse_score)
rfr_gscv.fit(X_train, y_train)

Wall time: 1min 55s


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [30, 50],
                         'max_features': ['log2', 'sqrt', None],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 200], 'random_state': [42]},
             scoring=make_scorer(rmse))

In [69]:
print('Best parameters:', rfr_gscv.best_params_)

Best parameters: {'max_depth': 30, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 42}


In [70]:
rfr = rfr_gscv.best_estimator_

### ExtraTree

In [71]:
etr_params = {
              'n_estimators' : [100, 200],
              'max_depth' : [30, 50, None],
              'min_samples_split' : [5, 10],
              'max_features' : ['log2', 'sqrt', None],
              'random_state' : [SEED]
              }
etr = ExtraTreesRegressor()

In [72]:
%%time
etr_gscv = GridSearchCV(etr, etr_params, scoring=rmse_score)
etr_gscv.fit(X_train, y_train)

Wall time: 1min 33s


GridSearchCV(estimator=ExtraTreesRegressor(),
             param_grid={'max_depth': [30, 50, None],
                         'max_features': ['log2', 'sqrt', None],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 200], 'random_state': [42]},
             scoring=make_scorer(rmse))

In [73]:
print('Best parameters:', etr_gscv.best_params_)

Best parameters: {'max_depth': 30, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 42}


In [74]:
etr = etr_gscv.best_estimator_

### Validation

In [75]:
ridge.fit(X_train, y_train)
print('Ridge rmse: ',rmse(y_val, ridge.predict(X_val)))

Ridge rmse:  20.653845370310325


In [76]:
rfr.fit(X_train, y_train)
print('RandomForest rmse: ',rmse(y_val, rfr.predict(X_val)))

RandomForest rmse:  17.053296268547953


In [77]:
etr.fit(X_train, y_train)
print('ExtraTree rmse:', rmse(y_val, etr.predict(X_val)))

ExtraTree rmse: 17.628369254391444


After tuned hyperparameters RandomForest showed best result on validation.