# Configuration 

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline

# Default (width, height) applied to every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (10, 6)

# Utils

In [2]:
def include_features(model, df, features_lists, poly = None, target_feature = 'CO'):
    """Fit ``model`` on several feature subsets of ``df`` and score each one.

    For every entry of ``features_lists`` the data is split 75/25 with a fixed
    random state, the model is (re)fitted on the training part and evaluated
    on the held-out part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``predict(X)`` and, once fitted,
        ``coef_``. The same instance is refitted for every feature subset.
    df : pandas.DataFrame
        Data holding both the predictors and the ``target_feature`` column.
    features_lists : iterable
        Each item is either a list of column names or the string ``'all'``,
        which stands for every column of ``df`` except the target.
    poly : transformer, optional
        When given, it is fitted on the training predictors and applied to
        both splits before the model is fitted. It must expose ``fit``,
        ``transform`` and ``get_feature_names_out``.
    target_feature : str, default 'CO'
        Name of the column to predict.

    Returns
    -------
    dict
        Maps a tuple of feature names (the expanded names when ``poly`` is
        used) to ``(y_pred, coef, mse)`` computed on the test split.
    """
    # The predictor/target separation does not depend on the subset, so do it once.
    X = df.drop(columns = target_feature)
    y = df[target_feature]

    scores = {}

    for features in features_lists:
        # isinstance guard: `==` against a string is element-wise (and therefore
        # ambiguous in an `if`) when the subset is an Index or an array.
        if isinstance(features, str) and features == 'all':
            features = X.columns

        X_train, X_test, y_train, y_test = train_test_split(
            X[features], y, test_size = 0.25, random_state = 42
        )

        # Capture the names before any transform turns the frame into an array.
        feature_names = tuple(X_train.columns)

        if poly is not None:
            poly.fit(X_train)
            X_train, X_test = poly.transform(X_train), poly.transform(X_test)
            feature_names = tuple(poly.get_feature_names_out())

        print(f'X_train shape = {X_train.shape}, X_test shape = {X_test.shape}')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        scores[feature_names] = (y_pred, model.coef_, mse(y_test, y_pred))

    return scores
        

In [3]:
def print_features_coef(features, coef, only_non_van):
    """Print feature/coefficient pairs under a ``Coefficients:`` header.

    Parameters
    ----------
    features : iterable
        Feature names.
    coef : iterable
        Coefficients, paired positionally with ``features``; pairing stops at
        the shorter of the two.
    only_non_van : bool
        If truthy, only pairs whose coefficient is non-zero (non-vanishing)
        are printed; otherwise every pair is printed.
    """
    print('Coefficients:')
    for name, value in zip(features, coef):
        # Truthiness test instead of `== True` / `== False`, so any truthy or
        # falsy flag selects a branch rather than silently printing nothing.
        if not only_non_van or value != 0:
            print(name, value)

In [4]:
def print_scores(scores_dict, only_mse = False, print_coef = False):
    '''Print a short report for every entry of ``scores_dict``.

    scores_dict => (columns): (predictions, coefficients, mse_score)

    A blank line precedes each entry. With ``print_coef`` the non-zero
    coefficients are listed first; with ``only_mse`` just the MSE is shown,
    otherwise the column names are shown together with the MSE.
    '''
    for feature_names, entry in scores_dict.items():
        _, coefficients, error = entry
        print()

        if print_coef:
            print_features_coef(feature_names, coefficients, only_non_van = True)

        report = f'mse:{error}' if only_mse else f'Columns: \n {feature_names}, \n MSE: {error}'
        print(report)

# Importing Dataset

In [5]:
# Load the preprocessed dataset; the first CSV column becomes the DataFrame index.
# NOTE(review): the "std" prefix presumably means the features are standardized — confirm.
std_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)
std_df.shape

(36733, 39)

In [6]:
# Development subset. `.loc` slices by index label and includes the end label,
# so this keeps every row up to and including label 24487.
dev_df = std_df.loc[:24487]
dev_df.shape

(24488, 39)

In [7]:
# Evaluation subset: every row from index label 24488 onwards (label-based slice).
eval_df = std_df.loc[24488:]
eval_df.shape

(12245, 39)

In [8]:
# List the columns available in the development subset.
dev_df.columns

Index(['YEAR', 'AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP',
       'NOX', 'CO', 'Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden'],
      dtype='object')

In [9]:
# Named groups of predictor columns, combined below into the feature subsets to evaluate.
ambiental_pred = ['AT', 'AP', 'AH']
process_pred = ['AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP']
# One column per country; these names match the country columns of the dataset.
location_pred = ['Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden']
# Single-column groups are kept as lists so they can be concatenated with `+`.
nox = ['NOX']
year = ['YEAR']

In [10]:
# Quick check: `+` concatenates the predictor groups into one flat list of column names.
ambiental_pred+process_pred+nox

['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'NOX']

# Complete dataset test

In [11]:
# Feature subsets to evaluate. 'all' is a sentinel that include_features expands
# to every non-target column; it is the only entry that brings in the country columns.
features_lists = [
    'all',
    ambiental_pred,
    process_pred,
    ambiental_pred+nox,
    ambiental_pred+year,
    ambiental_pred+nox+year,
    process_pred+nox,
    process_pred+year,
    process_pred+nox+year,
    ambiental_pred+process_pred,
    ambiental_pred+process_pred+nox,
    ambiental_pred+process_pred+year,
    ambiental_pred+process_pred+nox+year,
]

## Lasso

In [12]:
# Lasso with its default hyperparameters, fitted and scored on every feature subset.
default_lasso = Lasso()
scores = include_features(default_lasso, dev_df, features_lists)

X_train shape = (18366, 38), X_test shape = (6122, 38)
X_train shape = (18366, 3), X_test shape = (6122, 3)
X_train shape = (18366, 6), X_test shape = (6122, 6)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 5), X_test shape = (6122, 5)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 8), X_test shape = (6122, 8)
X_train shape = (18366, 9), X_test shape = (6122, 9)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 11), X_test shape = (6122, 11)


In [13]:
# For each feature subset, show only the non-zero coefficients and the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
mse:5.179609728446949

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
mse:5.179609728446949

Coefficients:
mse:5.179609728446949

Coefficients:
mse:5.179609728446949

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054


## Poly

In [14]:
# Same Lasso instance, refitted on degree-2 polynomial expansions of each feature subset.
# Note that `scores` from the previous run is overwritten here.
poly = PolynomialFeatures(degree = 2)
scores = include_features(default_lasso, dev_df, features_lists, poly = poly)

X_train shape = (18366, 780), X_test shape = (6122, 780)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 28), X_test shape = (6122, 28)
X_train shape = (18366, 15), X_test shape = (6122, 15)
X_train shape = (18366, 15), X_test shape = (6122, 15)
X_train shape = (18366, 21), X_test shape = (6122, 21)
X_train shape = (18366, 36), X_test shape = (6122, 36)
X_train shape = (18366, 36), X_test shape = (6122, 36)
X_train shape = (18366, 45), X_test shape = (6122, 45)
X_train shape = (18366, 55), X_test shape = (6122, 55)
X_train shape = (18366, 66), X_test shape = (6122, 66)
X_train shape = (18366, 66), X_test shape = (6122, 66)
X_train shape = (18366, 78), X_test shape = (6122, 78)


In [15]:
# For each expanded feature subset, show only the non-zero coefficients and the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR^2 -0.01810505343297445
YEAR TIT -0.29749713342734685
YEAR NOX 0.035004405965254885
TIT^2 0.18433905696919756
NOX^2 0.02839130476537028
mse:2.518763121908493

Coefficients:
mse:5.179609728446949

Coefficients:
TIT -0.10106135969478873
TIT^2 0.5529454687067908
mse:2.964870916937598

Coefficients:
NOX^2 0.3220124088685397
mse:4.007002217097597

Coefficients:
AH YEAR 0.001374962043172018
YEAR^2 -0.032682064452630805
mse:5.008478731204563

Coefficients:
NOX^2 0.30350796648652095
NOX YEAR 0.015451625893011287
YEAR^2 -0.03141866077553242
mse:3.864590972219884

Coefficients:
TIT -0.05465359238205227
TIT^2 0.45837254625038776
NOX^2 0.15712724278496637
mse:2.863577494603666

Coefficients:
TIT^2 0.2167750041090352
TIT YEAR -0.3050551517374847
YEAR^2 -0.014793536961593572
mse:2.588654174887868

Coefficients:
TIT^2 0.1843015865535613
TIT YEAR -0.2975136301932111
NOX^2 0.028388991880048092
NOX YEAR 0.03500793570356147
YEAR^2 -0.01810437061465407
mse:2.518817178796338

Coefficient

For the default Lasso model (without polynomial features), all the coefficients are 0 except for TIT.

## LinearRegression

In [16]:
# Ordinary least squares on each feature subset, without polynomial expansion.
# NOTE(review): in the saved output the 'all' run assigns every country column a
# coefficient near -4.9e10, which looks like collinearity among those columns
# (they may be one-hot dummies) — confirm, and consider dropping one of them.
linear = LinearRegression()
scores = include_features(linear, dev_df, features_lists, poly = None)

X_train shape = (18366, 38), X_test shape = (6122, 38)
X_train shape = (18366, 3), X_test shape = (6122, 3)
X_train shape = (18366, 6), X_test shape = (6122, 6)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 5), X_test shape = (6122, 5)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 8), X_test shape = (6122, 8)
X_train shape = (18366, 9), X_test shape = (6122, 9)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 11), X_test shape = (6122, 11)


In [17]:
# For each feature subset, show the non-zero coefficients and the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.547227516125773
AT 0.3407072547288764
AP 0.05082457119038675
AH 0.16203728005253024
AFDP 0.013790256544022328
GTEP -0.10524757098708659
TIT 0.0664594885859136
TAT -1.4423427447623975
TEY -2.106994925677867
CDP 0.03481151710423558
NOX 0.7731356654984214
Austria -49133021362.44072
Belgium -49133021362.37091
Bulgaria -49133021362.3297
Croatia -49133021362.41589
Czech Republic -49133021362.36856
Denmark -49133021362.404305
Estonia -49133021362.44497
Finland -49133021362.44816
France -49133021362.37344
Germany -49133021362.40586
Greece -49133021362.43714
Hungary -49133021362.30928
Ireland -49133021362.323524
Italy -49133021362.41601
Latvia -49133021362.42598
Lithuania -49133021362.1996
Luxembourg -49133021362.38038
Malta -49133021362.36542
Netherlands -49133021362.37328
Poland -49133021362.3734
Portugal -49133021362.4558
Republic of Cyprus -49133021362.41351
Romania -49133021362.344185
Slovakia -49133021362.43214
Slovenia -49133021362.42989
Spain -49133021362.32533
Sw

## Poly LinearRegression

In [22]:
# Ordinary least squares on degree-2 polynomial expansions of each feature subset.
# NOTE(review): the saved output for the 'all' run shows coefficients of order 1e10,
# which suggests an ill-conditioned fit — confirm before interpreting them.
poly = PolynomialFeatures(degree = 2)
scores = include_features(linear, dev_df, features_lists, poly = poly)

X_train shape = (18366, 780), X_test shape = (6122, 780)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 28), X_test shape = (6122, 28)
X_train shape = (18366, 15), X_test shape = (6122, 15)
X_train shape = (18366, 15), X_test shape = (6122, 15)
X_train shape = (18366, 21), X_test shape = (6122, 21)
X_train shape = (18366, 36), X_test shape = (6122, 36)
X_train shape = (18366, 36), X_test shape = (6122, 36)
X_train shape = (18366, 45), X_test shape = (6122, 45)
X_train shape = (18366, 55), X_test shape = (6122, 55)
X_train shape = (18366, 66), X_test shape = (6122, 66)
X_train shape = (18366, 66), X_test shape = (6122, 66)
X_train shape = (18366, 78), X_test shape = (6122, 78)


In [23]:
# For each expanded feature subset, show the non-zero coefficients and the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
1 18693981.156273294
YEAR 46843006782.55306
AT 23709154500.341434
AP 13635415465.253437
AH -10833356818.821938
AFDP 19675468101.53833
GTEP 6491257526.225056
TIT 6151670967.421697
TAT 2647426630.6668763
TEY 1696096706.4041348
CDP -3081980920.1056604
NOX -5284613188.287186
Austria 14541121680.279602
Belgium 36643622302.68297
Bulgaria -24732604652.128338
Croatia -9220006503.736261
Czech Republic -16763529512.87221
Denmark 47116812796.298004
Estonia -8191408140.736651
Finland 17482816415.642895
France 324509753.5686446
Germany 3342827828.542617
Greece -26978626242.430096
Hungary -10427197002.970253
Ireland -12053344518.271753
Italy -2470585572.1838417
Latvia -6372405959.630107
Lithuania 9814854726.42383
Luxembourg 1234357916.0367618
Malta 674332837.2042174
Netherlands 468027713.9807173
Poland 4747930967.581699
Portugal -9938092796.521933
Republic of Cyprus 3872506017.511536
Romania -9186622929.475746
Slovakia 4263452351.982781
Slovenia -1070694732.3638208
Spain -3564868611.9

## Ridge

In [20]:
# Ridge regression with its default hyperparameters on each feature subset,
# without polynomial expansion.
ridge = Ridge()
scores = include_features(ridge, dev_df, features_lists, poly = None)

X_train shape = (18366, 38), X_test shape = (6122, 38)
X_train shape = (18366, 3), X_test shape = (6122, 3)
X_train shape = (18366, 6), X_test shape = (6122, 6)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 5), X_test shape = (6122, 5)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 8), X_test shape = (6122, 8)
X_train shape = (18366, 9), X_test shape = (6122, 9)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 11), X_test shape = (6122, 11)


In [21]:
# For each feature subset, show the non-zero coefficients and the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.5465261935438185
AT 0.3457975036038305
AP 0.05154093338622446
AH 0.16218980015980242
AFDP 0.013923406523780749
GTEP -0.10644356368432772
TIT 0.05524613040774399
TAT -1.4396149159765301
TEY -2.076133793967133
CDP 0.017467045313408917
NOX 0.7737148680720023
Austria -0.05891513395116588
Belgium 0.010938441297156364
Bulgaria 0.0518896028477832
Croatia -0.0339785182997561
Czech Republic 0.013135001818795207
Denmark -0.022593467043213855
Estonia -0.06303193983654559
Finland -0.0664271732582875
France 0.008344078807285571
Germany -0.024167551364272774
Greece -0.055180899693082555
Hungary 0.07229758675471058
Ireland 0.05819547222946756
Italy -0.03404836313617696
Latvia -0.04427508665004112
Lithuania 0.18192608930697934
Luxembourg 0.001452884774552782
Malta 0.016450166256771693
Netherlands 0.00831485325063166
Poland 0.008380570770583106
Portugal -0.0739121933815518
Republic of Cyprus -0.03167285867080196
Romania 0.037280481342998786
Slovakia -0.050243814622414874
Slovenia

## Poly Ridge

In [24]:
# Same Ridge instance, refitted on degree-2 polynomial expansions of each feature subset.
poly = PolynomialFeatures(degree = 2)
scores = include_features(ridge, dev_df, features_lists, poly = poly)

X_train shape = (18366, 780), X_test shape = (6122, 780)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 28), X_test shape = (6122, 28)
X_train shape = (18366, 15), X_test shape = (6122, 15)
X_train shape = (18366, 15), X_test shape = (6122, 15)
X_train shape = (18366, 21), X_test shape = (6122, 21)
X_train shape = (18366, 36), X_test shape = (6122, 36)
X_train shape = (18366, 36), X_test shape = (6122, 36)
X_train shape = (18366, 45), X_test shape = (6122, 45)
X_train shape = (18366, 55), X_test shape = (6122, 55)
X_train shape = (18366, 66), X_test shape = (6122, 66)
X_train shape = (18366, 66), X_test shape = (6122, 66)
X_train shape = (18366, 78), X_test shape = (6122, 78)


In [25]:
# For each expanded feature subset, show the non-zero coefficients and the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.1861183543133879
AT 0.47147636183857494
AP 0.0895879085239707
AH 0.47203803501143526
AFDP -0.1392480420841778
GTEP 1.141864197689548
TIT -4.098755553889657
TAT -0.3133253709622185
TEY -0.7477268895406446
CDP 2.112836527531948
NOX 0.6173238943230387
Austria -0.040230290016007655
Belgium 0.032628759516587155
Bulgaria 0.09910337067751929
Croatia 0.059450034017030376
Czech Republic 0.023441808255004366
Denmark 0.030756433598632754
Estonia -0.05402539994011466
Finland -0.11261090803649516
France 0.09712304013589658
Germany -0.2546917759023468
Greece 0.04154252830598036
Hungary 0.129608466739926
Ireland -0.0426530493124376
Italy -0.16215869434518454
Latvia 0.010210055641579025
Lithuania 0.10224520011056354
Luxembourg 0.002972018388014043
Malta -0.0346966159859153
Netherlands -0.03418428001357734
Poland -0.028387359182476736
Portugal -0.029025041358193573
Republic of Cyprus -0.013439544205678729
Romania 0.10683886326083093
Slovakia -0.1290297836314768
Slovenia 0.0053073