In [1]:
import pandas as pd
import numpy as np
import random
from preprocessing import clean_data, add_new_features

In [2]:
SEED = 42

In [3]:
random.seed(SEED)
np.random.seed(SEED)

In [4]:
data = pd.read_csv('../../data/beer_train.csv', index_col=['id'])

In [5]:
df = clean_data(data)
df = add_new_features(df)

# Evaluation

## Feature selection

In [6]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [7]:
def rmse(y_true, y_pred):
  se = (y_true - y_pred) ** 2
  mse = se.mean()
  rmse = np.sqrt(mse)
  return rmse


rmse_score = make_scorer(rmse)

In [8]:
target = ['ibu']
cat_features = ['available', 'glass']
label_features = ['isOrganic']
num_features = ['originalGravity',
                'abv',
                'srm',
                'abv_mul_grav',
                'abv_mul_srm',
                'srm_div_abv',
                'srm_mull_grav',
                'srm_mull_grav_div_abv']

Create train and val dataset

In [9]:
df_train, df_val = train_test_split(df, test_size=0.2, random_state=SEED)
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)

In [10]:
y_train = df_train['ibu'].values
y_val = df_val['ibu'].values


In [11]:
scaler = MinMaxScaler()
dv = DictVectorizer()

Normalize numeric features

In [12]:
df_train[num_features] = scaler.fit_transform(df_train[num_features])
df_val[num_features] = scaler.transform(df_val[num_features])

In [13]:
train_features = num_features + label_features + cat_features

#### OneHotEncoding

In [14]:
train_dict = df_train[train_features].to_dict('records')
val_dict = df_val[train_features].to_dict('records')

In [15]:
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

### Ridge

Feature selection.

In [16]:
ridge = Ridge()
selector = SequentialFeatureSelector(ridge)

In [17]:
selector.fit(X_train, y_train)

SequentialFeatureSelector(estimator=Ridge())

Best features.

In [18]:
selector.get_feature_names_out(dv.get_feature_names_out())

array(['abv', 'abv_mul_grav', 'abv_mul_srm',
       'available=Available at the same time of year, every year.',
       'available=Available during the fall months.',
       'available=Available year round as a staple beer.', 'glass=Flute',
       'glass=Nglass', 'glass=Oversized Wine Glass', 'glass=Pilsner',
       'glass=Pint', 'glass=Weizen', 'glass=Willi', 'originalGravity',
       'srm'], dtype=object)

### RandomForestRegressor

In [20]:
rfr = RandomForestRegressor()

In [22]:
rfr.fit(X_train, y_train)

RandomForestRegressor()

In [23]:
list(zip(dv.get_feature_names_out(), rfr.feature_importances_.round(3)))

[('abv', 0.04),
 ('abv_mul_grav', 0.115),
 ('abv_mul_srm', 0.081),
 ('available=Available at the same time of year, every year.', 0.012),
 ('available=Available during the fall months.', 0.002),
 ('available=Available during the spring months.', 0.002),
 ('available=Available during the summer months.', 0.003),
 ('available=Available during the winter months.', 0.003),
 ('available=Available year round as a staple beer.', 0.017),
 ('available=Beer is not available.', 0.006),
 ('available=Limited availability.', 0.011),
 ('glass=Flute', 0.001),
 ('glass=Goblet', 0.003),
 ('glass=Mug', 0.002),
 ('glass=Nglass', 0.01),
 ('glass=Oversized Wine Glass', 0.0),
 ('glass=Pilsner', 0.003),
 ('glass=Pint', 0.015),
 ('glass=Snifter', 0.005),
 ('glass=Stange', 0.002),
 ('glass=Thistle', 0.0),
 ('glass=Tulip', 0.006),
 ('glass=Weizen', 0.003),
 ('glass=Willi', 0.0),
 ('isOrganic', 0.005),
 ('originalGravity', 0.455),
 ('srm', 0.014),
 ('srm_div_abv', 0.05),
 ('srm_mull_grav', 0.042),
 ('srm_mull_gra

### ExtraTreesRegressor

In [24]:
etr = ExtraTreesRegressor()

In [26]:
etr.fit(X_train, y_train)

ExtraTreesRegressor()

In [27]:
list(zip(dv.get_feature_names_out(), etr.feature_importances_.round(3)))

[('abv', 0.151),
 ('abv_mul_grav', 0.167),
 ('abv_mul_srm', 0.055),
 ('available=Available at the same time of year, every year.', 0.015),
 ('available=Available during the fall months.', 0.005),
 ('available=Available during the spring months.', 0.004),
 ('available=Available during the summer months.', 0.005),
 ('available=Available during the winter months.', 0.005),
 ('available=Available year round as a staple beer.', 0.025),
 ('available=Beer is not available.', 0.008),
 ('available=Limited availability.', 0.016),
 ('glass=Flute', 0.002),
 ('glass=Goblet', 0.004),
 ('glass=Mug', 0.002),
 ('glass=Nglass', 0.017),
 ('glass=Oversized Wine Glass', 0.001),
 ('glass=Pilsner', 0.003),
 ('glass=Pint', 0.021),
 ('glass=Snifter', 0.007),
 ('glass=Stange', 0.002),
 ('glass=Thistle', 0.001),
 ('glass=Tulip', 0.01),
 ('glass=Weizen', 0.006),
 ('glass=Willi', 0.001),
 ('isOrganic', 0.008),
 ('originalGravity', 0.253),
 ('srm', 0.045),
 ('srm_div_abv', 0.053),
 ('srm_mull_grav', 0.052),
 ('srm_

### Summury

Even without hyperparameters tuning ensemble models provide better score.

Most important features:

- originalGravity 

- abv

- abv_mul_grav

- abv_mul_srm

- srm

- srm_div_abv

- srm_mull_grav

- srm_mull_grav_div_abv

## Tuning hyperparameters

In [28]:
from sklearn.model_selection import GridSearchCV

In spite of low importances of categorical features removing them lead to low metric.

Therefore, I save it in training data.

### Ridge

In [29]:
ridge_params = {
              'alpha' : [0.1, 0.5, 1, 2, 5],
              'max_iter' : [1000, 1500, 3000],
              'random_state' : [SEED]
              }
ridge = Ridge()              

In [30]:
%%time
ridge_gscv = GridSearchCV(ridge, ridge_params, scoring=rmse_score)
ridge_gscv.fit(X_train, y_train)

Wall time: 542 ms


GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0.1, 0.5, 1, 2, 5],
                         'max_iter': [1000, 1500, 3000], 'random_state': [42]},
             scoring=make_scorer(rmse))

In [45]:
print('Best parameters:', ridge_gscv.best_params_)

Best parameters: {'alpha': 5, 'max_iter': 1000, 'random_state': 42}


In [None]:
ridge = ridge_gscv.best_estimator_

### RandomForest

In [32]:
rfr_params = {
              'n_estimators' : [100, 200],
              'max_depth' : [30, 50],
              'min_samples_split' : [5, 10],
              'max_features' : ['log2', 'sqrt', None],
              'random_state' : [SEED]
              }
rfr = RandomForestRegressor()

In [33]:
%%time
rfr_gscv = GridSearchCV(rfr, rfr_params, scoring=rmse_score)
rfr_gscv.fit(X_train, y_train)

Wall time: 8min 9s


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [30, 50],
                         'max_features': ['log2', 'sqrt', None],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 200], 'random_state': [42]},
             scoring=make_scorer(rmse))

In [44]:
print('Best parameters:', rfr_gscv.best_params_)

Best parameters: {'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 42}


In [35]:
rfr = rfr_gscv.best_estimator_

### ExtraTree

In [36]:
etr_params = {
              'n_estimators' : [100, 200],
              'max_depth' : [30, 50, None],
              'min_samples_split' : [5, 10],
              'max_features' : ['log2', 'sqrt', None],
              'random_state' : [SEED]
              }
etr = ExtraTreesRegressor()

In [37]:
%%time
etr_gscv = GridSearchCV(etr, etr_params, scoring=rmse_score)
etr_gscv.fit(X_train, y_train)

Wall time: 13min 33s


GridSearchCV(estimator=ExtraTreesRegressor(),
             param_grid={'max_depth': [30, 50, None],
                         'max_features': ['log2', 'sqrt', None],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 200], 'random_state': [42]},
             scoring=make_scorer(rmse))

In [43]:
print('Best parameters:', etr_gscv.best_params_)

Best parameters: {'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 100, 'random_state': 42}


In [39]:
etr = etr_gscv.best_estimator_

### Validation

Train and validate best estimaters.

In order to evaluate models I'm validating it on 20% of data.

I'm not using cross validation becouse this algorithm have be done in Grig Search CV.

In [40]:
ridge.fit(X_train, y_train)
print('Ridge rmse: ',rmse(y_val, ridge.predict(X_val)))

Ridge rmse:  20.20181841910326


In [41]:
rfr.fit(X_train, y_train)
print('RandomForest rmse: ',rmse(y_val, rfr.predict(X_val)))

RandomForest rmse:  17.17280871614406


In [42]:
etr.fit(X_train, y_train)
print('ExtraTree rmse:', rmse(y_val, etr.predict(X_val)))

ExtraTree rmse: 18.010300256801102


After tuned hyperparameters RandomForest showed best result on validation.