# Configuration

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline

plt.rcParams["figure.figsize"] = (10, 6)

# Importing Dataset

In [3]:
stand_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)
stand_df.shape

(36733, 39)

In [4]:
countries = ['Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden']

In [5]:
dev_df = stand_df.loc[:24487]
dev_df = dev_df.drop(columns = list(countries + ['TEY', 'CDP']))
dev_df.shape

(24488, 10)

In [6]:
std_df = dev_df.loc[dev_df['CO'] < 4.5, :]
std_df.shape

(21652, 10)

In [7]:
ext_df = dev_df.loc[dev_df['CO'] >= 4.5, :]
ext_df.shape

(2836, 10)

In [8]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

max_features = ['sqrt','log2', None]

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 4, 6]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['sqrt', 'log2', None], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [9]:
poly = PolynomialFeatures(degree = 3)

### All Regressor

In [45]:
X = dev_df.drop(columns='CO')
y = dev_df['CO']

X = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [28]:
all_rf = RandomForestRegressor()
all_rf_random = RandomizedSearchCV(estimator= all_rf, param_distributions = random_grid, n_iter = 100, cv = 3, random_state=42, n_jobs = -1)
all_rf_random.fit(X,y)

KeyboardInterrupt: 

### Std Regressor

In [46]:
X_std = std_df.drop(columns='CO')
y_std = std_df['CO']

X_std = poly.fit_transform(X_std)

X_std_train, X_std_test, y_std_train, y_std_test = train_test_split(X_std, y_std, test_size = 0.25, random_state=42)

In [20]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, random_state=42, n_jobs = -1)

rf_random.fit(X_std, y_std)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [21]:
rf_random.best_params_

{'n_estimators': 700,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': None,
 'max_depth': 30,
 'bootstrap': True}

### Ext Regressor

In [10]:
X_ext = ext_df.drop(columns='CO')
y_ext = ext_df['CO']

X_ext = poly.fit_transform(X_ext)

X_ext_train, X_ext_test, y_ext_train, y_ext_test = train_test_split(X_ext, y_ext, test_size = 0.25, random_state=42)

In [None]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose =3, random_state=42, n_jobs = -1)

rf_random.fit(X_ext, y_ext)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [27]:
rf_random.best_params_

{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [26]:
best_random = rf_random.best_estimator_
y_predict_random = best_random.predict(X_test)
mse(y_test, y_predict_random)

4.099295900845886