# Configuration

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

from regressors import stats

# Default size, in inches, of every matplotlib figure drawn in this notebook.
# (`%matplotlib inline` is already enabled once at the top of this cell.)
plt.rcParams["figure.figsize"] = (10, 6)

# Importing Dataset 

In [10]:
# Load `std_dataset.csv`; its first column becomes the DataFrame index.
# The path is relative, so it is resolved against the kernel's working directory.
std_df = pd.read_csv(
    'processed_dataset/std_dataset.csv',
    index_col=0,
)
std_df.shape

FileNotFoundError: [Errno 2] No such file or directory: 'processed_dataset/std_dataset.csv'

In [None]:
# Development subset: every row from the start of `std_df` up to and including
# index label 24487 (`.loc` slices by label and keeps the end label).
# NOTE(review): 24487 is a hard-coded boundary; presumably the index is a
# 0-based integer range, which would make this the first 24488 rows — confirm.
dev_df = std_df.loc[:24487]
dev_df.shape

In [None]:
# Evaluation subset: every row from index label 24488 (inclusive) to the end
# of `std_df`; complements the `.loc[:24487]` development slice.
# NOTE(review): the boundary is a hard-coded index label — confirm it matches
# the intended development/evaluation split.
eval_df = std_df.loc[24488:]
eval_df.shape

In [None]:
# Column-name groups used to assemble candidate predictor sets.
ambiental_pred = ['AT', 'AP', 'AH']
process_pred = ['AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP']

# Country-name columns (presumably one indicator column per country — confirm
# against the dataset).
location_pred = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia',
    'Czech Republic', 'Denmark', 'Estonia', 'Finland',
    'France', 'Germany', 'Greece', 'Hungary',
    'Ireland', 'Italy', 'Latvia', 'Lithuania',
    'Luxembourg', 'Malta', 'Netherlands', 'Poland',
    'Portugal', 'Republic of Cyprus', 'Romania', 'Slovakia',
    'Slovenia', 'Spain', 'Sweden',
]

# Single-column groups, kept as lists so they can be concatenated with `+`.
nox = ['NOX']
year = ['YEAR']

# Random Forest

In [None]:
def include_features(model, df, features_lists, poly=None, target_feature='CO'):
    """Fit ``model`` on several feature subsets of ``df`` and collect hold-out results.

    For each entry of ``features_lists`` the rows of ``df`` are split 75/25
    with ``train_test_split(..., test_size=0.25, random_state=42)``, ``model``
    is refit on the training part, and its predictions on the test part are
    scored with mean squared error. The shapes of the two design matrices and
    the MSE are printed for every subset.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and, once fitted,
        ``feature_importances_``. The same instance is refit on every
        iteration, so after the call it holds the fit of the last subset.
    df : pandas.DataFrame
        Data containing ``target_feature`` and every requested predictor.
    features_lists : iterable
        Each element is either a sequence of column names or the string
        ``'all'``, meaning every column of ``df`` except ``target_feature``.
    poly : transformer, optional
        Object exposing ``fit``, ``transform`` and ``get_feature_names_out``.
        When given, it is fitted on the training split only and applied to
        both splits before the model is trained.
    target_feature : str, default 'CO'
        Name of the column to predict.

    Returns
    -------
    dict
        Maps a tuple of feature names (the selected columns, or the names
        reported by ``poly`` when it is used) to a tuple
        ``(y_pred, feature_importances_, mse)``.
    """
    # The design matrix and target do not depend on the subset: build them once.
    X = df.drop(columns=target_feature)
    y = df[target_feature]

    scores = {}

    for features in features_lists:
        # Only a plain string can be the 'all' sentinel; comparing an
        # array-like entry to a string would be element-wise.
        if isinstance(features, str) and features == 'all':
            features = X.columns
        features = list(features)

        X_train, X_test, y_train, y_test = train_test_split(
            X[features], y, test_size=0.25, random_state=42
        )

        if poly is not None:
            # Fit the expansion on the training split only, then reuse it
            # unchanged on the test split.
            X_train = poly.fit_transform(X_train)
            X_test = poly.transform(X_test)

        print(f'X_train shape = {X_train.shape}, X_test shape = {X_test.shape}')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        error = mse(y_test, y_pred)

        # Key by the names of the columns the model was actually trained on.
        if poly is None:
            key = tuple(features)
        else:
            key = tuple(poly.get_feature_names_out())

        scores[key] = (y_pred, model.feature_importances_, error)
        print(error)

    return scores

In [None]:
# Candidate predictor sets, in evaluation order. Other cells slice the last
# four entries with `features_lists[-4:]`, so the order must be preserved.
features_lists = [
    # every available predictor
    'all',
    # one group at a time
    ambiental_pred,
    process_pred,
    # 'ambiental' group extended with NOX and/or YEAR
    ambiental_pred + nox,
    ambiental_pred + year,
    ambiental_pred + nox + year,
    # process group extended with NOX and/or YEAR
    process_pred + nox,
    process_pred + year,
    process_pred + nox + year,
    # both groups together, optionally extended with NOX and/or YEAR
    ambiental_pred + process_pred,
    ambiental_pred + process_pred + nox,
    ambiental_pred + process_pred + year,
    ambiental_pred + process_pred + nox + year,
]

In [13]:
# Baseline run: a seeded random forest with otherwise default settings,
# evaluated on every feature set in `features_lists`.
rf = RandomForestRegressor(random_state=42)
scores = include_features(model=rf, df=dev_df, features_lists=features_lists)

X_train shape = (18366, 38), X_test shape = (6122, 38)
1.3997627681382145
X_train shape = (18366, 3), X_test shape = (6122, 3)
4.8165523238941494
X_train shape = (18366, 6), X_test shape = (6122, 6)
1.5763313317709273
X_train shape = (18366, 4), X_test shape = (6122, 4)
3.0161905500238664
X_train shape = (18366, 4), X_test shape = (6122, 4)
3.936715055349334
X_train shape = (18366, 5), X_test shape = (6122, 5)
2.1804312242414654
X_train shape = (18366, 7), X_test shape = (6122, 7)
1.5266375790224977
X_train shape = (18366, 7), X_test shape = (6122, 7)
1.5471398664544946
X_train shape = (18366, 8), X_test shape = (6122, 8)
1.4599313279295645
X_train shape = (18366, 9), X_test shape = (6122, 9)
1.4159094685200502
X_train shape = (18366, 10), X_test shape = (6122, 10)
1.3913342194327116
X_train shape = (18366, 10), X_test shape = (6122, 10)
1.402681243838447
X_train shape = (18366, 11), X_test shape = (6122, 11)
1.3826777045898533


In [14]:
# Same forest, trained on a degree-2 polynomial expansion of the last four
# feature sets in `features_lists`.
poly = PolynomialFeatures(degree=2)
scores_poly = include_features(
    model=rf,
    df=dev_df,
    features_lists=features_lists[-4:],
    poly=poly,
)

X_train shape = (18366, 55), X_test shape = (6122, 55)
1.3584735660364193
X_train shape = (18366, 66), X_test shape = (6122, 66)
1.3243092849372338
X_train shape = (18366, 66), X_test shape = (6122, 66)
1.3143580042498377
X_train shape = (18366, 78), X_test shape = (6122, 78)
1.314777698872248


# Extreme (CO > 4.5)

In [15]:
# Development rows whose CO value is strictly above 4.5.
extreme_df = dev_df.loc[dev_df['CO'] > 4.5]

# NOTE(review): this rebinds `scores_poly`, which the polynomial run on the
# full development set also assigns, so those earlier results are no longer
# reachable by name. The other subset runs use dedicated names
# (`scores_extreme`, `scores_std_poly`) — confirm the overwrite is intended.
scores_poly = include_features(
    model=rf,
    df=extreme_df,
    features_lists=features_lists[-4:],
    poly=poly,
)

X_train shape = (2126, 55), X_test shape = (709, 55)
3.3160416188719015
X_train shape = (2126, 66), X_test shape = (709, 66)
3.321139374169403
X_train shape = (2126, 66), X_test shape = (709, 66)
3.236979673157915
X_train shape = (2126, 78), X_test shape = (709, 78)
3.255428222753458


In [16]:
# Extreme subset again, this time on the raw (non-expanded) features.
scores_extreme = include_features(
    model=rf,
    df=extreme_df,
    features_lists=features_lists[-4:],
    poly=None,
)

X_train shape = (2126, 9), X_test shape = (709, 9)
3.2337039539382517
X_train shape = (2126, 10), X_test shape = (709, 10)
3.37212020333645
X_train shape = (2126, 10), X_test shape = (709, 10)
3.280655782292754
X_train shape = (2126, 11), X_test shape = (709, 11)
3.4325254259300024


# Standard (CO <= 4.5)

In [3]:
# Development rows whose CO value is at most 4.5 (the complement of the
# `CO > 4.5` subset).
standard_df = dev_df.loc[dev_df['CO'] <= 4.5]
scores_std_poly = include_features(
    model=rf,
    df=standard_df,
    features_lists=features_lists[-4:],
    poly=poly,
)

NameError: name 'dev_df' is not defined

In [19]:
# Standard subset on the raw (non-expanded) features.
scores_std = include_features(
    model=rf,
    df=standard_df,
    features_lists=features_lists[-4:],
    poly=None,
)

X_train shape = (16239, 9), X_test shape = (5414, 9)
0.25812435622910435
X_train shape = (16239, 10), X_test shape = (5414, 10)
0.24410320734089236
X_train shape = (16239, 10), X_test shape = (5414, 10)
0.2308473839726596
X_train shape = (16239, 11), X_test shape = (5414, 11)
0.21770281234826516
