In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (10, 6)

In [21]:
# Read the pre-processed dataset; the first CSV column is used as the row index.
standard_df = pd.read_csv(
    "processed_dataset/std_dataset.csv",
    index_col=0,
)
# Preview the first rows.
standard_df.head()

Unnamed: 0_level_0,YEAR,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,...,Malta,Netherlands,Poland,Portugal,Republic of Cyprus,Romania,Slovakia,Slovenia,Spain,Sweden
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5,0.961592,-0.964193,-1.524936,-0.724114,-0.769905,-0.564983,0.563634,-0.872408,-0.813978,...,0,0,0,0,0,0,0,0,0,0
1,5,-0.455031,1.708863,0.2757,-0.837779,-1.31253,-2.518071,-1.827585,-1.499082,-1.254529,...,0,0,0,0,0,1,0,0,0,0
2,5,-0.827587,-0.608815,0.036992,1.274906,0.732639,0.714232,-0.988535,0.880742,0.848897,...,0,0,0,0,0,0,0,0,0,1
3,5,1.052456,-1.551338,-0.007405,-0.894806,-1.145422,-1.101797,0.55192,-1.323511,-1.317071,...,0,0,0,0,0,0,0,0,0,0
4,5,-0.568511,-0.299791,1.282019,0.349324,-0.314415,0.268791,0.570956,0.016982,-0.124179,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Names of the per-country indicator columns present in the dataset.
# Kept as a plain list because later cells concatenate it with other column names.
countries = [
    'Austria', 'Belgium', 'Bulgaria',
    'Croatia', 'Czech Republic', 'Denmark',
    'Estonia', 'Finland', 'France',
    'Germany', 'Greece', 'Hungary',
    'Ireland', 'Italy', 'Latvia',
    'Lithuania', 'Luxembourg', 'Malta',
    'Netherlands', 'Poland', 'Portugal',
    'Republic of Cyprus', 'Romania', 'Slovakia',
    'Slovenia', 'Spain', 'Sweden',
]

In [23]:
# Development split: every row whose index label is <= 24487 (`.loc` slices are
# label-based and include the end label), with the country indicators and the
# TEY / CDP columns removed.
dev_df = (
    standard_df
    .loc[:24487]
    .drop(columns=countries + ['TEY', 'CDP'])
)
dev_df.shape

(24488, 10)

In [5]:
# Evaluation split: rows from index label 24488 onwards, with the same columns
# removed as for the development split (country indicators, TEY, CDP).
eval_df = standard_df.loc[24488:].drop(columns=[*countries, 'TEY', 'CDP'])
eval_df.shape

(12245, 10)

In [24]:
# Polynomial feature expander of degree 3; it is only constructed here and is
# fitted when it is first applied to the predictors.
poly = PolynomialFeatures(degree=3)

In [25]:
# Predictors are every development column except the CO target.
predictors = dev_df.drop(columns='CO')
# Fit the polynomial expander on the predictors and expand them in one step.
X = poly.fit_transform(predictors)
X.shape

(24488, 220)

In [26]:
# Hold out 25% of the development rows for testing; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    dev_df['CO'],
    test_size=0.25,
    random_state=42,
)

In [27]:
# Random forest configuration (seeded for reproducibility).
# NOTE(review): these hyper-parameters look like the result of a search that is
# not part of this notebook — confirm their provenance.
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=110,
    max_features='sqrt',
    min_samples_leaf=2,
    bootstrap=False,
    random_state=42,
)

In [28]:
# Train the forest on the training portion of the development split.
rf.fit(X_train, y_train)

In [43]:
# Predict on the held-out rows and report the mean squared error against the
# true CO values.
y_pred = rf.predict(X_test)
mse(y_true=y_test, y_pred=y_pred)

1.1770907951478295

In [59]:
# Select the polynomial features whose importance in the fitted forest exceeds
# 0.0015, and report how much of the total importance they account for.
cum_sum = 0      # running total of every feature's importance
imp_sum = 0      # running total of the retained features' importance
features = []    # names of the retained features
for name, importance in zip(poly.get_feature_names_out(), rf.feature_importances_):
    cum_sum += importance
    if not (importance > 0.0015):
        # Below (or at) the cut-off: counted in the total but not retained.
        continue
    features.append(name)
    imp_sum += importance
print(cum_sum, imp_sum)
print(features)

1.0000000000000004 0.89424663638431
['YEAR', 'AFDP', 'GTEP', 'TIT', 'TAT', 'NOX', 'YEAR^2', 'YEAR AT', 'YEAR AFDP', 'YEAR GTEP', 'YEAR TIT', 'YEAR TAT', 'AT GTEP', 'AT TIT', 'AH TAT', 'AFDP^2', 'AFDP GTEP', 'AFDP TIT', 'AFDP TAT', 'AFDP NOX', 'GTEP^2', 'GTEP TIT', 'GTEP TAT', 'GTEP NOX', 'TIT^2', 'TIT TAT', 'TIT NOX', 'TAT^2', 'TAT NOX', 'YEAR^3', 'YEAR^2 AT', 'YEAR^2 AFDP', 'YEAR^2 GTEP', 'YEAR^2 TIT', 'YEAR^2 TAT', 'YEAR^2 NOX', 'YEAR AT GTEP', 'YEAR AT TIT', 'YEAR AT TAT', 'YEAR AH TAT', 'YEAR AFDP^2', 'YEAR AFDP GTEP', 'YEAR AFDP TIT', 'YEAR AFDP TAT', 'YEAR AFDP NOX', 'YEAR GTEP^2', 'YEAR GTEP TIT', 'YEAR GTEP TAT', 'YEAR GTEP NOX', 'YEAR TIT^2', 'YEAR TIT TAT', 'YEAR TIT NOX', 'YEAR TAT^2', 'YEAR TAT NOX', 'AT^2 TIT', 'AT^2 TAT', 'AT AH TAT', 'AT AFDP^2', 'AT AFDP TAT', 'AT GTEP^2', 'AT TIT^2', 'AT TIT TAT', 'AT TIT NOX', 'AT TAT^2', 'AT TAT NOX', 'AP^2 GTEP', 'AP^2 TIT', 'AP^2 TAT', 'AP TIT NOX', 'AP TAT^2', 'AH^2 TIT', 'AH^2 TAT', 'AH TIT^2', 'AH TIT TAT', 'AH TAT^2', 'AFDP^3',

In [60]:
# Wrap the expanded feature matrix in a DataFrame so columns can be selected by
# name. The original row index is carried over explicitly: without it the frame
# gets a fresh 0..n-1 index while the CO target keeps the dataset's ID index, and
# the two would only agree by position rather than by label.
df_poly = pd.DataFrame(
    X,
    columns=poly.get_feature_names_out(),
    index=dev_df.index,
)
# Same 75/25 split and seed as the array-based split, now yielding DataFrames.
X_train, X_test, y_train, y_test = train_test_split(
    df_poly, dev_df['CO'], test_size=0.25, random_state=42
)

In [61]:
# Baseline forest (default hyper-parameters) trained only on the retained
# high-importance features, to compare against the full-feature model.
# The seed is fixed so that the reported error is reproducible, consistent with
# the other estimator and the splits in this notebook.
rf_feat = RandomForestRegressor(random_state=42)
rf_feat.fit(X_train.loc[:, features], y_train)
y_pred_feat = rf_feat.predict(X_test.loc[:, features])
mse(y_test, y_pred_feat)

1.2149431073661583

In [15]:
# Expand the evaluation predictors with the transformer that was already fitted
# on the development data. Use `transform`, not `fit_transform`: a preprocessing
# step must never be re-fitted on evaluation data, and re-fitting would also
# silently overwrite the fitted state that the feature names are read from.
evaluation = poly.transform(eval_df.drop(columns='CO'))
# Predict CO for the evaluation rows with the full-feature forest.
pred = rf.predict(evaluation)


In [16]:
# Start from an empty frame that already has the two submission columns;
# the values are filled in by a later cell.
submission_df = pd.DataFrame(data=None, columns=['Id', 'Predicted'])
submission_df

Unnamed: 0,Id,Predicted


In [17]:
# Turn the evaluation frame's ID index into an ordinary, 0-based Series so that
# it lines up row-for-row with the prediction array.
eval_ids = eval_df.reset_index()['ID']
submission_df['Id'] = eval_ids
submission_df['Predicted'] = pred
submission_df.head()

Unnamed: 0,Id,Predicted
0,24488,0.608116
1,24489,4.893186
2,24490,1.502217
3,24491,3.83566
4,24492,4.388692


In [19]:
# Write the submission columns to CSV, omitting the DataFrame's row index.
# NOTE(review): the "submission/" directory is assumed to exist already — confirm,
# since nothing in this notebook creates it.
submission_df.to_csv("submission/submission_no_outliers.csv", index=False)