# Configuration

In [12]:
!pip install dask

Collecting dask
  Downloading dask-2022.9.0-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 3.7 MB/s eta 0:00:01
[?25hCollecting cloudpickle>=1.1.1
  Downloading cloudpickle-2.2.0-py3-none-any.whl (25 kB)
Collecting partd>=0.3.10
  Downloading partd-1.3.0-py3-none-any.whl (18 kB)
Collecting pyyaml>=5.3.1
  Using cached PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (701 kB)
Collecting toolz>=0.8.2
  Downloading toolz-0.12.0-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 6.7 MB/s  eta 0:00:01
[?25hCollecting fsspec>=0.6.0
  Downloading fsspec-2022.8.2-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 51.9 MB/s eta 0:00:01
Collecting locket
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: cloudpickle, locket, toolz, partd, pyyaml, fsspec, dask
Successfully installed cloudpickle-2.2.0 dask-2022.9.0 fsspec-2022

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

from dask.diagnostics import ProgressBar

%matplotlib inline

plt.rcParams["figure.figsize"] = (10, 6)

# Importing Dataset

In [2]:
stand_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)
stand_df.shape

(36733, 39)

In [3]:
countries = ['Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden']

In [4]:
dev_df = stand_df.loc[:24487]
dev_df = dev_df.drop(columns = list(countries + ['TEY', 'CDP']))
dev_df.shape

(24488, 10)

In [5]:
std_df = dev_df.loc[dev_df['CO'] < 4.5, :]
std_df.shape

(21652, 10)

In [6]:
ext_df = dev_df.loc[dev_df['CO'] >= 4.5, :]
ext_df.shape

(2836, 10)

In [7]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

max_features = ['sqrt','log2', None]

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 4, 6]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['sqrt', 'log2', None], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [8]:
poly = PolynomialFeatures(degree = 3)

### All Regressor

In [9]:
X = dev_df.drop(columns='CO')
y = dev_df['CO']

X = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [14]:
all_rf = RandomForestRegressor()
all_rf_random = RandomizedSearchCV(estimator= all_rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=4, random_state=42, n_jobs = -1)

with ProgressBar():
    all_rf_random.fit(X,y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 2/3] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=400;, score=0.698 total time= 1.7min
[CV 1/3] END bootstrap=True, max_depth=80, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.782 total time=13.2min
[CV 1/3] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=600;, score=0.804 total time= 2.9min
[CV 1/3] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=800;, score=0.804 total time= 4.5min
[CV 2/3] END bootstrap=False, max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.578 total time=10.3min
[CV 2/3] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=2, min_samples_split=6, n_estimators=800;, score=0.693 total time=48.5min
[CV 1/3] END bo



[CV 3/3] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=900;, score=0.799 total time= 6.9min
[CV 3/3] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=200;, score=0.788 total time=  44.7s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.660 total time=21.6min
[CV 3/3] END bootstrap=True, max_depth=90, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1000;, score=0.792 total time= 2.5min
[CV 3/3] END bootstrap=False, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400;, score=0.643 total time=40.9min
[CV 3/3] END bootstrap=True, max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=800;, score=0.787 total time= 1.7min
[CV 3/3] END bootstrap=False, max_depth=80, max_features=None, min_samples_lea

In [24]:
best = all_rf_random.best_estimator_
best

In [22]:
best = all_rf_random.best_estimator_
best.fit(X_train, y_train)
pred = best.predict(X_test)
mse(y_test,pred)


1.1651780150253086

In [20]:
type(X_test.shape),type(y_train.shape)

((18366, 220), (18366,))

### Std Regressor

In [46]:
X_std = std_df.drop(columns='CO')
y_std = std_df['CO']

X_std = poly.fit_transform(X_std)

X_std_train, X_std_test, y_std_train, y_std_test = train_test_split(X_std, y_std, test_size = 0.25, random_state=42)

In [20]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, random_state=42, n_jobs = -1)

rf_random.fit(X_std, y_std)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [21]:
rf_random.best_params_

{'n_estimators': 700,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': None,
 'max_depth': 30,
 'bootstrap': True}

### Ext Regressor

In [10]:
X_ext = ext_df.drop(columns='CO')
y_ext = ext_df['CO']

X_ext = poly.fit_transform(X_ext)

X_ext_train, X_ext_test, y_ext_train, y_ext_test = train_test_split(X_ext, y_ext, test_size = 0.25, random_state=42)

In [None]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose =3, random_state=42, n_jobs = -1)

rf_random.fit(X_ext, y_ext)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [27]:
rf_random.best_params_

{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [26]:
best_random = rf_random.best_estimator_
y_predict_random = best_random.predict(X_test)
mse(y_test, y_predict_random)

4.099295900845886