In [3]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('./data_csv/binned.csv', index_col=0)

In [5]:
df.head()

Unnamed: 0,year,month,type,city,average_price,category_vol
0,2015,January,conventional,Albany,1.22,medium
1,2015,January,organic,Albany,1.79,small
2,2015,January,conventional,Atlanta,1.0,wholesale
3,2015,January,organic,Atlanta,1.76,small
4,2015,January,conventional,Baltimore/Washington,1.08,wholesale


### Splitting Data

In [4]:
X = df.drop(columns=['average_price'])
y = df['average_price']

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
X_train.columns

Index(['type', 'year', 'month', 'city', 'category_vol'], dtype='object')

## 1. Hyperparameter LinearRegression dengan Polynomial preprocessing

In [45]:
from sklearn.linear_model import LinearRegression

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV

In [46]:
transform = ColumnTransformer([
    ("encode", OneHotEncoder(), ['type', 'city', 'month', 'category_vol'])
], remainder = 'passthrough')

In [49]:
poly = Pipeline([
    ("transform", transform),
    ("poly", PolynomialFeatures())
])

lin_poly = Pipeline([
    ("prep_poly", poly),
    ("algo", LinearRegression())
])

In [50]:
lin_poly.get_params()

{'memory': None,
 'steps': [('prep_poly', Pipeline(steps=[('transform',
                    ColumnTransformer(remainder='passthrough',
                                      transformers=[('encode', OneHotEncoder(),
                                                     ['type', 'city', 'month',
                                                      'category_vol'])])),
                   ('poly', PolynomialFeatures())])),
  ('algo', LinearRegression())],
 'verbose': False,
 'prep_poly': Pipeline(steps=[('transform',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('encode', OneHotEncoder(),
                                                   ['type', 'city', 'month',
                                                    'category_vol'])])),
                 ('poly', PolynomialFeatures())]),
 'algo': LinearRegression(),
 'prep_poly__memory': None,
 'prep_poly__steps': [('transform',
   ColumnTransformer(remainder='passthrough',
  

In [52]:
param_poly = {
    'prep_poly__poly__degree': [2, 3],
    "prep_poly__poly__include_bias" : ['True', 'False']
}

In [53]:
poly_gs = GridSearchCV(lin_poly, param_poly, cv=5, n_jobs=-1, verbose=1)

In [54]:
poly_gs.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:   17.0s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   17.6s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prep_poly',
                                        Pipeline(steps=[('transform',
                                                         ColumnTransformer(remainder='passthrough',
                                                                           transformers=[('encode',
                                                                                          OneHotEncoder(),
                                                                                          ['type',
                                                                                           'city',
                                                                                           'month',
                                                                                           'category_vol'])])),
                                                        ('poly',
                                                         PolynomialFeatures(

In [55]:
poly_gs.best_params_

{'prep_poly__poly__degree': 2, 'prep_poly__poly__include_bias': 'True'}

In [56]:
preprocess = Pipeline([
    ("transform", transform),
    ("poly", PolynomialFeatures(degree=2, include_bias='True'))
])

model_poly = Pipeline([
    ("prep", preprocess),
    ("algo", LinearRegression())
])

In [57]:
model_poly.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 Pipeline(steps=[('transform',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('encode',
                                                                   OneHotEncoder(),
                                                                   ['type',
                                                                    'city',
                                                                    'month',
                                                                    'category_vol'])])),
                                 ('poly',
                                  PolynomialFeatures(include_bias='True'))])),
                ('algo', LinearRegression())])

In [58]:
y_predPoly = model_poly.predict(X_test)

In [59]:
## evaluation

from sklearn.metrics import r2_score

In [60]:
r2_score(y_test, y_predPoly)

0.6536250250867395

## 2. Hyperparameter DecisionTree

In [8]:
from sklearn.tree import DecisionTreeRegressor

### Preprocessing

- numerical features
    karena hanya 1 parameter numerik di variabel X - 'year', maka tidak perlu dilakukan preprocessing untuk variabel numerik
- categorical features: OneHotEncoder

In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18679 entries, 14976 to 26606
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          18679 non-null  object
 1   year          18679 non-null  int64 
 2   month         18679 non-null  object
 3   city          18679 non-null  object
 4   category_vol  18679 non-null  object
dtypes: int64(1), object(4)
memory usage: 875.6+ KB


In [11]:
hype_DT = Pipeline([
    ("encode", transform),
    ("algo", DecisionTreeRegressor())
])

In [12]:
hype_DT.get_params()

{'memory': None,
 'steps': [('encode',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('encode', OneHotEncoder(),
                                    ['type', 'city', 'month', 'category_vol'])])),
  ('algo', DecisionTreeRegressor())],
 'verbose': False,
 'encode': ColumnTransformer(remainder='passthrough',
                   transformers=[('encode', OneHotEncoder(),
                                  ['type', 'city', 'month', 'category_vol'])]),
 'algo': DecisionTreeRegressor(),
 'encode__n_jobs': None,
 'encode__remainder': 'passthrough',
 'encode__sparse_threshold': 0.3,
 'encode__transformer_weights': None,
 'encode__transformers': [('encode',
   OneHotEncoder(),
   ['type', 'city', 'month', 'category_vol'])],
 'encode__verbose': False,
 'encode__encode': OneHotEncoder(),
 'encode__encode__categories': 'auto',
 'encode__encode__drop': None,
 'encode__encode__dtype': numpy.float64,
 'encode__encode__handle_unknown': 'error',
 'encode__encode__sparse':

In [13]:
param_DT = {
    'algo__max_depth': [None, 3, 5, 7],
    'algo__min_samples_leaf': range(1, 10, 2),
    'algo__min_samples_split': range(2, 20, 2),
    'algo__max_features': [None, 'log2', 'sqrt']
}

In [14]:
DT_GS = GridSearchCV(hype_DT, param_DT, cv=5, verbose=1)

In [15]:
DT_GS.fit(X_train, y_train)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2700 out of 2700 | elapsed:  2.0min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('encode',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('encode',
                                                                         OneHotEncoder(),
                                                                         ['type',
                                                                          'city',
                                                                          'month',
                                                                          'category_vol'])])),
                                       ('algo', DecisionTreeRegressor())]),
             param_grid={'algo__max_depth': [None, 3, 5, 7],
                         'algo__max_features': [None, 'log2', 'sqrt'],
                         'algo__min_samples_leaf': range(1, 10, 2),
                         'algo__min_samples_split': range

In [16]:
DT_GS.best_params_

{'algo__max_depth': None,
 'algo__max_features': None,
 'algo__min_samples_leaf': 1,
 'algo__min_samples_split': 2}

In [25]:
model_DT = Pipeline([
    ("encode", transform),
    ("algo", DecisionTreeRegressor(max_depth=None, max_features=None, min_samples_leaf= 1, min_samples_split= 2))
])

In [26]:
model_DT.fit(X_train, y_train)

Pipeline(steps=[('encode',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encode', OneHotEncoder(),
                                                  ['type', 'city', 'month',
                                                   'category_vol'])])),
                ('algo', DecisionTreeRegressor())])

In [27]:
y_predDT = model_DT.predict(X_test)

In [29]:
r2_score(y_test, y_predDT)

0.816091509366985

## 3. Hyperparameter dengan RandomForest

In [22]:
from sklearn.ensemble import RandomForestRegressorPolynomialFeatures

In [23]:
hype_RF = Pipeline([
    ("encode", transform),
    ("algo", RandomForestRegressor())
])

In [30]:
hype_RF.get_params()

{'memory': None,
 'steps': [('encode',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('encode', OneHotEncoder(),
                                    ['type', 'city', 'month', 'category_vol'])])),
  ('algo', RandomForestRegressor())],
 'verbose': False,
 'encode': ColumnTransformer(remainder='passthrough',
                   transformers=[('encode', OneHotEncoder(),
                                  ['type', 'city', 'month', 'category_vol'])]),
 'algo': RandomForestRegressor(),
 'encode__n_jobs': None,
 'encode__remainder': 'passthrough',
 'encode__sparse_threshold': 0.3,
 'encode__transformer_weights': None,
 'encode__transformers': [('encode',
   OneHotEncoder(),
   ['type', 'city', 'month', 'category_vol'])],
 'encode__verbose': False,
 'encode__encode': OneHotEncoder(),
 'encode__encode__categories': 'auto',
 'encode__encode__drop': None,
 'encode__encode__dtype': numpy.float64,
 'encode__encode__handle_unknown': 'error',
 'encode__encode__sparse':

In [31]:
params_RF = {
    'algo__n_estimators': range(100, 200, 50),
    'algo__max_depth': [None, 3, 5, 7],
    'algo__min_samples_leaf': range(1, 10, 2),
    'algo__min_samples_split': range(2, 20, 4),
    'algo__max_features': [None, 'sqrt']
}

In [32]:
RF_gs = GridSearchCV(hype_RF, params_RF, cv=5, n_jobs=-1, verbose=1)

In [33]:
RF_gs.fit(X_train, y_train)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 16.1min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('encode',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('encode',
                                                                         OneHotEncoder(),
                                                                         ['type',
                                                                          'city',
                                                                          'month',
                                                                          'category_vol'])])),
                                       ('algo', RandomForestRegressor())]),
             n_jobs=-1,
             param_grid={'algo__max_depth': [None, 3, 5, 7],
                         'algo__max_features': [None, 'sqrt'],
                         'algo__min_samples_leaf': range(1, 10, 2),
                         'algo__min_sampl

In [34]:
RF_gs.best_params_

{'algo__max_depth': None,
 'algo__max_features': None,
 'algo__min_samples_leaf': 1,
 'algo__min_samples_split': 2,
 'algo__n_estimators': 150}

In [35]:
model_RF = Pipeline([
    ("encode", transform),
    ("algo", RandomForestRegressor(max_depth=None, max_features=None, min_samples_leaf= 1, min_samples_split= 2, n_estimators=150))
])

In [36]:
model_RF.fit(X_train, y_train)

Pipeline(steps=[('encode',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encode', OneHotEncoder(),
                                                  ['type', 'city', 'month',
                                                   'category_vol'])])),
                ('algo',
                 RandomForestRegressor(max_features=None, n_estimators=150))])

In [37]:
y_predRF = model_RF.predict(X_test)

In [38]:
r2_score(y_test, y_predRF)

0.8348297843992323

=========================================================================================

## evaluation

In [39]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [62]:
summary = pd.DataFrame()

In [63]:
summary['model'] = ['Polynomial Regression', 'Decision Tree', 'Random Forest']

In [64]:
summary['MAE'] = [mean_absolute_error(y_test, y_predPoly), mean_absolute_error(y_test, y_predDT), mean_absolute_error(y_test, y_predRF)]
summary['MSE'] = [mean_squared_error(y_test, y_predPoly), mean_squared_error(y_test, y_predDT), mean_squared_error(y_test, y_predRF)]
summary['RMSE'] = [np.sqrt(mean_squared_error(y_test, y_predPoly)), np.sqrt(mean_squared_error(y_test, y_predDT)), np.sqrt(mean_squared_error(y_test, y_predRF))]
summary['r2_score'] = [r2_score(y_test, y_predPoly), r2_score(y_test, y_predDT), r2_score(y_test, y_predRF)]

In [65]:
summary

Unnamed: 0,model,MAE,MSE,RMSE,r2_score
0,Polynomial Regression,0.17199,0.052045,0.228133,0.653625
1,Decision Tree,0.111627,0.027633,0.166233,0.816092
2,Random Forest,0.10807,0.024818,0.157537,0.83483


=========================================================================================

## Summary

Model dengan r2_score paling baik adalah Random Forest dengan parameter-parameter sbb:
- 'algo__max_depth': None,
- 'algo__max_features': None,
- 'algo__min_samples_leaf': 1,
- 'algo__min_samples_split': 2,
- 'algo__n_estimators': 150