# Configuration 

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as metrics

# Default size (width, height) applied to every matplotlib figure created below.
# The inline backend is already enabled by the `%matplotlib inline` magic earlier
# in this cell, so it is not repeated here.
plt.rcParams["figure.figsize"] = (10, 6)

# Importing Dataset

In [2]:
# Load the preprocessed dataset; the first CSV column becomes the row index.
std_df = pd.read_csv(
    'processed_dataset/std_dataset.csv',
    index_col=0,
)
std_df.shape

(36733, 39)

In [3]:
# Development split: label-based slice of std_df. `.loc` slicing includes the end
# label, so this keeps every row up to and including index label 24487.
# NOTE(review): assumes the index read from the CSV is a sorted, 0-based integer
# index (the 24488-row result is consistent with that) — confirm.
dev_df = std_df.loc[:24487]
dev_df.shape

(24488, 39)

In [4]:
# Evaluation split: the remaining rows, from index label 24488 to the end of std_df.
# NOTE(review): same index assumption as the development split (sorted integer
# labels) — confirm.
eval_df = std_df.loc[24488:]
eval_df.shape

(12245, 39)

In [21]:
# Feature matrix: every column of the development split except 'CO', which is
# the column the class labels are derived from.
features_df = dev_df.drop(columns=['CO'])

In [19]:
def labelling_func(value, threshold=4.5):
    """Turn a numeric reading into a binary class label.

    Parameters
    ----------
    value : int or float
        The reading to classify.
    threshold : int or float, default 4.5
        Cut-off separating the two classes. The default keeps the original
        hard-coded behaviour, so ``Series.apply(labelling_func)`` is unchanged.

    Returns
    -------
    int
        ``0`` when ``value < threshold``, ``1`` when ``value >= threshold``.

    Raises
    ------
    ValueError
        If ``value`` is not comparable to ``threshold`` in either direction
        (i.e. it is NaN). Previously this case fell through both branches and
        silently returned ``None``, which would corrupt the label column.
    """
    if value >= threshold:
        return 1
    if value < threshold:
        return 0
    # Only reachable for NaN: both comparisons above evaluate to False.
    raise ValueError(f"cannot label value {value!r}: not comparable to threshold {threshold!r}")

# Derive the binary target by labelling each 'CO' value element-wise,
# then show how many rows fall into each class.
co_values = dev_df['CO']
labels_df = co_values.map(labelling_func)
labels_df.value_counts()

0    21652
1     2836
Name: CO, dtype: int64

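Labelling completed correctly: every row of `dev_df` received a label (21652 + 2836 = 24488), with class 1 (`CO` ≥ 4.5) being the minority class.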

# Baseline 

In [75]:
# Hold out 25% of the development rows for testing; the fixed seed makes the
# split repeatable.
# NOTE(review): the two classes are imbalanced (see the value counts above);
# consider passing stratify=labels_df so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    labels_df,
    test_size=0.25,
    random_state=42,
)

In [76]:
# Baseline model: a random forest with default hyperparameters and a fixed seed,
# trained on the training split (fit returns the estimator itself).
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
y_predict = rfc.predict(X_test)

In [77]:
# Baseline quality on the test split. The result is a tuple of four arrays —
# precision, recall, F-score and support — each with one entry per class (0, 1).
metrics(y_true=y_test, y_pred=y_predict)

(array([0.97274903, 0.86685962]),
 array([0.98288372, 0.80187416]),
 array([0.97779012, 0.83310153]),
 array([5375,  747], dtype=int64))

In [87]:
# Print each feature name next to the importance the baseline forest assigned to it,
# in the column order the model was fitted with.
feature_importance_pairs = zip(rfc.feature_names_in_, rfc.feature_importances_)
for feature_name, importance in feature_importance_pairs:
    print(feature_name, importance)

YEAR 0.03091810060334125
AT 0.04642827828674638
AP 0.034520674742257326
AH 0.04033415058362883
AFDP 0.09424814604457554
GTEP 0.11989639519506173
TIT 0.2015032714576101
TAT 0.05007955738618992
TEY 0.1333143206988655
CDP 0.11172835493885978
NOX 0.09274056727200096
Austria 0.001613919906819862
Belgium 0.0015837566500369398
Bulgaria 0.0015737534385336105
Croatia 0.0019785855316970633
Czech Republic 0.001588918989747298
Denmark 0.0014160096359585796
Estonia 0.0017016819483784637
Finland 0.0015713543434230338
France 0.0016333654391027517
Germany 0.0024896322504785262
Greece 0.0012150710846580365
Hungary 0.002019718062580387
Ireland 0.0015089305582199656
Italy 0.0012607631231612523
Latvia 0.0014548279972770705
Lithuania 0.001872190291652429
Luxembourg 0.0013867219527468621
Malta 0.0021049197549367054
Netherlands 0.0018685458466020052
Poland 0.0015136830971629483
Portugal 0.0016589365675381614
Republic of Cyprus 0.0014058220596192752
Romania 0.0014954205676634543
Slovakia 0.0013586508578671244

In [86]:
# Show the full hyperparameter set of the baseline forest (library defaults plus
# the random_state set when it was constructed) as a reference point for tuning.
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [80]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter sampled by the randomized search.

# Forest size: 200, 400, ..., 2000 trees.
n_estimators = list(range(200, 2001, 200))

# How many features are considered when looking for the best split.
max_features = ['sqrt', 'log2']

# Tree depth limit: 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [*range(10, 111, 10), None]

# Smallest number of samples a node needs before it may be split.
min_samples_split = [2, 5, 10]

# Smallest number of samples allowed in a leaf.
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample (True) or on all rows (False).
bootstrap = [True, False]

# Assemble the search space, keyed by RandomForestClassifier parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [81]:
# Base model to tune. It is seeded like the baseline forest: without a
# random_state every candidate fit would differ from run to run, so the selected
# best_params_ (and the scores below) could not be reproduced.
rf = RandomForestClassifier(random_state=42)

# Randomized search over random_grid: 100 sampled combinations, each evaluated
# with 3-fold cross-validation (300 fits in total), using all available cores.
# random_state here fixes which combinations are sampled.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score; given the class imbalance, consider an explicit
# metric such as scoring='f1' — confirm with the author.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [82]:
# Hyperparameter combination that achieved the best cross-validated score in the search.
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': None,
 'bootstrap': False}

In [83]:
# Take the best estimator found by the search and evaluate it on the same
# held-out test split used for the baseline.
best_random = rf_random.best_estimator_
y_predict_random = best_random.predict(X_test)

# Per-class precision, recall, F-score and support of the tuned model.
metrics(y_true=y_test, y_pred=y_predict_random)

(array([0.97450111, 0.85774648]),
 array([0.9812093 , 0.81526104]),
 array([0.9778437 , 0.83596431]),
 array([5375,  747], dtype=int64))

In [88]:
# Baseline metrics shown again for a side-by-side comparison with the tuned model:
# per-class precision, recall, F-score and support.
metrics(y_true=y_test, y_pred=y_predict)

(array([0.97274903, 0.86685962]),
 array([0.98288372, 0.80187416]),
 array([0.97779012, 0.83310153]),
 array([5375,  747], dtype=int64))