# Configuration 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline

plt.rcParams["figure.figsize"] = (10, 6)

# Utils

In [2]:
def include_features(model, df, features_lists, poly = None, target_feature = 'CO'):
    """Fit ``model`` on several feature subsets of ``df`` and collect the results.

    For every entry of ``features_lists`` the data is split 75/25 with a fixed
    seed (``random_state=42``), optionally expanded with ``poly``, used to fit
    ``model``, and scored with the mean squared error on the held-out part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``coef_``. It is re-fitted
        in place for every feature subset.
    df : pandas.DataFrame
        Data containing the predictors and the ``target_feature`` column.
    features_lists : iterable
        Each item is either a list of column names or the string ``'all'``,
        which selects every column except the target.
    poly : transformer, optional
        Object exposing ``fit``, ``transform`` and ``get_feature_names_out``
        (e.g. ``PolynomialFeatures``). It is fitted on the training split only
        and then applied to both splits.
    target_feature : str, default 'CO'
        Name of the column to predict.

    Returns
    -------
    dict
        Maps the tuple of feature names seen by the model to
        ``(y_pred, coef, mse)`` computed on the test split.
    """
    # Predictors and target do not depend on the subset, so build them once.
    X = df.drop(columns = target_feature)
    y = df[target_feature]

    scores = {}

    for features in features_lists:
        # Only the literal string 'all' is a shortcut; lists are used as given.
        if isinstance(features, str) and features == 'all':
            features = X.columns

        X_train, X_test, y_train, y_test = train_test_split(
            X[features], y, test_size = 0.25, random_state=42
        )

        if poly is not None:
            # Fit the expansion on the training split only, then apply to both.
            poly.fit(X_train)
            X_train, X_test = poly.transform(X_train), poly.transform(X_test)

        print(f'X_train shape = {X_train.shape}, X_test shape = {X_test.shape}')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Key each result by the names of the columns the model was fitted on.
        if poly is None:
            feature_names = model.feature_names_in_
        else:
            feature_names = poly.get_feature_names_out()

        scores[tuple(feature_names)] = (y_pred, model.coef_, mse(y_test, y_pred))

    return scores
        

In [3]:
def print_features_coef(features, coef, only_non_van):
    """Print every feature name next to its coefficient.

    Parameters
    ----------
    features : iterable
        Feature names, in the same order as ``coef``.
    coef : iterable
        Coefficient values paired positionally with ``features``.
    only_non_van : bool
        When truthy, features whose coefficient is exactly 0 are skipped;
        when falsy, every feature is printed.
    """
    print('Coefficients:')
    for name, value in zip(features, coef):
        # Rely on truthiness instead of `== True` / `== False`, so the flag
        # always selects one of the two modes.
        if only_non_van and value == 0:
            continue
        print(name, value)

In [4]:
def print_scores(scores_dict, only_mse = False, print_coef = False):
    """Print a short report for every entry of ``scores_dict``.

    ``scores_dict`` maps a tuple of column names to a
    ``(predictions, coefficients, mse_score)`` triple. Each entry is preceded
    by a blank line. With ``print_coef`` the non-zero coefficients are listed
    first; ``only_mse`` switches between printing just the MSE and printing
    the column names together with the MSE.
    """
    for columns, entry in scores_dict.items():
        _pred, coef, mse_score = entry
        print()

        if print_coef:
            print_features_coef(columns, coef, only_non_van = True)

        report = f'mse:{mse_score}' if only_mse else f'Columns: \n {columns}, \n MSE: {mse_score}'
        print(report)

# Importing Dataset

In [5]:
# Load the pre-processed dataset; the first CSV column becomes the row index.
std_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)
std_df.shape

(36733, 39)

In [14]:
# Development split: rows labelled up to 24487. `.loc` slices by label and
# includes the end label (24488 rows in the recorded output).
# NOTE(review): assumes the index is a sorted integer range — confirm.
dev_df = std_df.loc[:24487]
dev_df.shape

(24488, 39)

In [15]:
# Evaluation split: rows labelled 24488 and above (12245 rows in the recorded
# output, i.e. the remainder of the 36733-row dataset).
eval_df = std_df.loc[24488:]
eval_df.shape

(12245, 39)

In [16]:
# Keep only the development rows whose 'CO' value is below 4.5.
# NOTE(review): the 4.5 threshold is hard-coded here; its origin is not shown.
standard_emission_df = dev_df.loc[dev_df['CO'] < 4.5, :]
standard_emission_df.shape

(21652, 39)

In [17]:
# Inspect the available columns: the predictors plus the 'CO' target.
dev_df.columns

Index(['YEAR', 'AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP',
       'NOX', 'CO', 'Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden'],
      dtype='object')

In [18]:
# Named groups of predictor columns, combined later into feature subsets.
ambiental_pred = ['AT', 'AP', 'AH']
process_pred = ['AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP']

# Country columns, listed in the same order as in the dataframe.
location_pred = [
    'Austria',
    'Belgium',
    'Bulgaria',
    'Croatia',
    'Czech Republic',
    'Denmark',
    'Estonia',
    'Finland',
    'France',
    'Germany',
    'Greece',
    'Hungary',
    'Ireland',
    'Italy',
    'Latvia',
    'Lithuania',
    'Luxembourg',
    'Malta',
    'Netherlands',
    'Poland',
    'Portugal',
    'Republic of Cyprus',
    'Romania',
    'Slovakia',
    'Slovenia',
    'Spain',
    'Sweden',
]

# Single-column groups, kept as lists so they can be concatenated with `+`.
nox = ['NOX']
year = ['YEAR']

In [19]:
# Preview one combined subset; list concatenation keeps the group order.
ambiental_pred+process_pred+nox

['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'NOX']

# Complete dataset test

In [20]:
# Feature subsets to evaluate: every non-target column ('all'), then the
# ambiental and process groups on their own and together, each optionally
# extended with NOX and/or YEAR.
features_lists = [
    'all',
    ambiental_pred,
    process_pred,
    ambiental_pred+nox,
    ambiental_pred+year,
    ambiental_pred+nox+year,
    process_pred+nox,
    process_pred+year,
    process_pred+nox+year,
    ambiental_pred+process_pred,
    ambiental_pred+process_pred+nox,
    ambiental_pred+process_pred+year,
    ambiental_pred+process_pred+nox+year,
]

## Lasso

In [22]:
# Lasso with its default settings, fitted on every feature subset.
# NOTE(review): the recorded MSE is identical for all subsets and no non-zero
# coefficient is printed, which suggests the default penalty is too strong
# for this data — confirm.
default_lasso = Lasso()
scores = include_features(default_lasso, standard_emission_df, features_lists)

X_train shape = (16239, 38), X_test shape = (5413, 38)
X_train shape = (16239, 3), X_test shape = (5413, 3)
X_train shape = (16239, 6), X_test shape = (5413, 6)
X_train shape = (16239, 4), X_test shape = (5413, 4)
X_train shape = (16239, 4), X_test shape = (5413, 4)
X_train shape = (16239, 5), X_test shape = (5413, 5)
X_train shape = (16239, 7), X_test shape = (5413, 7)
X_train shape = (16239, 7), X_test shape = (5413, 7)
X_train shape = (16239, 8), X_test shape = (5413, 8)
X_train shape = (16239, 9), X_test shape = (5413, 9)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 11), X_test shape = (5413, 11)


In [23]:
# For each feature subset: non-zero coefficients, then the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616


## Poly

In [24]:
# Same Lasso instance, now fitted on degree-2 polynomial expansions of each
# feature subset (the transformer is fitted on the training split only).
poly = PolynomialFeatures(degree = 2)
scores = include_features(default_lasso, standard_emission_df, features_lists, poly = poly)

X_train shape = (16239, 780), X_test shape = (5413, 780)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 28), X_test shape = (5413, 28)
X_train shape = (16239, 15), X_test shape = (5413, 15)
X_train shape = (16239, 15), X_test shape = (5413, 15)
X_train shape = (16239, 21), X_test shape = (5413, 21)
X_train shape = (16239, 36), X_test shape = (5413, 36)
X_train shape = (16239, 36), X_test shape = (5413, 36)
X_train shape = (16239, 45), X_test shape = (5413, 45)
X_train shape = (16239, 55), X_test shape = (5413, 55)
X_train shape = (16239, 66), X_test shape = (5413, 66)
X_train shape = (16239, 66), X_test shape = (5413, 66)
X_train shape = (16239, 78), X_test shape = (5413, 78)


In [25]:
# For each polynomial feature set: non-zero coefficients, then the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR^2 -0.017592123866028146
YEAR TIT -0.08738036068583735
YEAR TEY -0.0027086092813127646
mse:0.6351030187961823

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
YEAR^2 -0.023172067074814798
mse:0.8654663650817597

Coefficients:
YEAR^2 -0.023172067074814798
mse:0.8654663650817597

Coefficients:
mse:0.9503916687522616

Coefficients:
TIT YEAR -0.08738007648785356
TEY YEAR -0.0027088176956595166
YEAR^2 -0.017592383363612574
mse:0.6351026561948127

Coefficients:
TIT YEAR -0.08738007648785356
TEY YEAR -0.0027088176956595166
YEAR^2 -0.017592383363612574
mse:0.6351026561948127

Coefficients:
mse:0.9503916687522616

Coefficients:
mse:0.9503916687522616

Coefficients:
TIT YEAR -0.08738007648785356
TEY YEAR -0.0027088176956595166
YEAR^2 -0.017592383363612574
mse:0.6351026561948127

Coefficients:
TIT YEAR -0.08738007648785356
TEY YEAR -0.0027088176956595166
YEAR^2 -0.017592383363612574
mse:0.6351026561

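With the default Lasso, all coefficients are 0 on the plain features. With the degree-2 polynomial features, the only non-zero coefficients are the YEAR terms (YEAR^2, and the YEAR·TIT and YEAR·TEY interactions), so TIT only contributes through its interaction with YEAR.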

## LinearRegression

In [26]:
# Ordinary least squares (no regularisation) on every feature subset.
# NOTE(review): in the recorded output every country column gets an almost
# identical coefficient of about -1.45e10, which suggests those columns are
# collinear with the intercept — confirm.
linear = LinearRegression()
scores = include_features(linear, standard_emission_df, features_lists, poly = None)

X_train shape = (16239, 38), X_test shape = (5413, 38)
X_train shape = (16239, 3), X_test shape = (5413, 3)
X_train shape = (16239, 6), X_test shape = (5413, 6)
X_train shape = (16239, 4), X_test shape = (5413, 4)
X_train shape = (16239, 4), X_test shape = (5413, 4)
X_train shape = (16239, 5), X_test shape = (5413, 5)
X_train shape = (16239, 7), X_test shape = (5413, 7)
X_train shape = (16239, 7), X_test shape = (5413, 7)
X_train shape = (16239, 8), X_test shape = (5413, 8)
X_train shape = (16239, 9), X_test shape = (5413, 9)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 11), X_test shape = (5413, 11)


In [27]:
# For each feature subset: non-zero coefficients, then the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.3640510610650432
AT -0.21413419206231882
AP 0.01259797645757165
AH -0.0528942601900799
AFDP 0.05599805877606802
GTEP 0.11892404188394874
TIT 0.7082875788603988
TAT -0.8211245757129673
TEY -1.4486905318998313
CDP -0.7036638100543395
NOX 0.07410636163845788
Austria -14518984801.35266
Belgium -14518984801.301115
Bulgaria -14518984801.287447
Croatia -14518984801.395655
Czech Republic -14518984801.30968
Denmark -14518984801.332033
Estonia -14518984801.324299
Finland -14518984801.373787
France -14518984801.328814
Germany -14518984801.316187
Greece -14518984801.307013
Hungary -14518984801.362686
Ireland -14518984801.27645
Italy -14518984801.347578
Latvia -14518984801.35304
Lithuania -14518984801.344683
Luxembourg -14518984801.32545
Malta -14518984801.317986
Netherlands -14518984801.301197
Poland -14518984801.372978
Portugal -14518984801.317505
Republic of Cyprus -14518984801.361288
Romania -14518984801.32253
Slovakia -14518984801.375544
Slovenia -14518984801.296
Spain -

## Poly LinearRegression

In [33]:
# Ordinary least squares on degree-2 polynomial expansions of each subset.
# NOTE(review): the recorded coefficients are of order 1e9-1e10, so this fit
# looks numerically ill-conditioned — confirm.
poly = PolynomialFeatures(degree = 2)
scores = include_features(linear, standard_emission_df, features_lists, poly = poly)

X_train shape = (16239, 780), X_test shape = (5413, 780)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 28), X_test shape = (5413, 28)
X_train shape = (16239, 15), X_test shape = (5413, 15)
X_train shape = (16239, 15), X_test shape = (5413, 15)
X_train shape = (16239, 21), X_test shape = (5413, 21)
X_train shape = (16239, 36), X_test shape = (5413, 36)
X_train shape = (16239, 36), X_test shape = (5413, 36)
X_train shape = (16239, 45), X_test shape = (5413, 45)
X_train shape = (16239, 55), X_test shape = (5413, 55)
X_train shape = (16239, 66), X_test shape = (5413, 66)
X_train shape = (16239, 66), X_test shape = (5413, 66)
X_train shape = (16239, 78), X_test shape = (5413, 78)


In [34]:
# For each polynomial feature set: non-zero coefficients, then the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
1 -7279583.97347924
YEAR -7212181572.610636
AT -574435670.9099393
AP 11080625232.852757
AH 1192096030.3537006
AFDP -3718692847.4837537
GTEP 79005990.93945245
TIT -516909493.5591416
TAT -3411392243.4255967
TEY 1834677554.5925887
CDP -1162353046.5926118
NOX -93507203.81990588
Austria 6260027791.57411
Belgium -2325254006.811634
Bulgaria -58263889396.39826
Croatia -15820272005.370975
Czech Republic -3684798852.953826
Denmark -12163053737.037783
Estonia -8816117335.687416
Finland -1462597565.6870754
France -8110092957.918589
Germany -5050114284.952569
Greece -13884202692.730343
Hungary -4423339753.046688
Ireland -11150208831.555023
Italy -6743768078.930648
Latvia -6097352948.073507
Lithuania -11238834449.848272
Luxembourg -9433224436.468464
Malta -2718827257.3039184
Netherlands -8904646814.230667
Poland -4489220389.159781
Portugal -6485103455.765259
Republic of Cyprus -1914009677.1857984
Romania -5678559733.040464
Slovakia -6301071588.510575
Slovenia -2003719119.1110144
Spain

## Ridge

In [29]:
# Ridge regression with its default settings on every feature subset.
ridge = Ridge()
scores = include_features(ridge, standard_emission_df, features_lists, poly = None)

X_train shape = (16239, 38), X_test shape = (5413, 38)
X_train shape = (16239, 3), X_test shape = (5413, 3)
X_train shape = (16239, 6), X_test shape = (5413, 6)
X_train shape = (16239, 4), X_test shape = (5413, 4)
X_train shape = (16239, 4), X_test shape = (5413, 4)
X_train shape = (16239, 5), X_test shape = (5413, 5)
X_train shape = (16239, 7), X_test shape = (5413, 7)
X_train shape = (16239, 7), X_test shape = (5413, 7)
X_train shape = (16239, 8), X_test shape = (5413, 8)
X_train shape = (16239, 9), X_test shape = (5413, 9)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 11), X_test shape = (5413, 11)


In [30]:
# For each feature subset: non-zero coefficients, then the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.3629805897948946
AT -0.21028527580425949
AP 0.013073513526217736
AH -0.05270890133352983
AFDP 0.05599991355415749
GTEP 0.11671537896414426
TIT 0.6936324672479491
TAT -0.8155362639089828
TEY -1.4286636966743855
CDP -0.703918694567788
NOX 0.07503471041135187
Austria -0.020264441891558644
Belgium 0.031192512540841207
Bulgaria 0.044816751597681614
Croatia -0.06319930332704642
Czech Republic 0.022742819071570827
Denmark 0.0003442623699613304
Estonia 0.008021377940621062
Finland -0.04139006113657274
France 0.0035979826514902684
Germany 0.01623148634959174
Greece 0.025358235738223142
Hungary -0.030415531680544883
Ireland 0.05575263146563392
Italy -0.015209706694490031
Latvia -0.0206985956010182
Lithuania -0.01230099814858591
Luxembourg 0.007046925843475959
Malta 0.014457692236171677
Netherlands 0.031031812144192202
Poland -0.04045674189420847
Portugal 0.01495452809181851
Republic of Cyprus -0.028801117346244475
Romania 0.009753000147923241
Slovakia -0.042908190214050186

## Poly Ridge

In [31]:
# Same Ridge instance, fitted on degree-2 polynomial expansions of each subset.
poly = PolynomialFeatures(degree = 2)
scores = include_features(ridge, standard_emission_df, features_lists, poly = poly)

X_train shape = (16239, 780), X_test shape = (5413, 780)
X_train shape = (16239, 10), X_test shape = (5413, 10)
X_train shape = (16239, 28), X_test shape = (5413, 28)
X_train shape = (16239, 15), X_test shape = (5413, 15)
X_train shape = (16239, 15), X_test shape = (5413, 15)
X_train shape = (16239, 21), X_test shape = (5413, 21)
X_train shape = (16239, 36), X_test shape = (5413, 36)
X_train shape = (16239, 36), X_test shape = (5413, 36)
X_train shape = (16239, 45), X_test shape = (5413, 45)
X_train shape = (16239, 55), X_test shape = (5413, 55)
X_train shape = (16239, 66), X_test shape = (5413, 66)
X_train shape = (16239, 66), X_test shape = (5413, 66)
X_train shape = (16239, 78), X_test shape = (5413, 78)


In [32]:
# For each polynomial feature set: non-zero coefficients, then the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.19759938743031966
AT -0.1429004738442585
AP 0.1072995622673153
AH 0.06556600575829002
AFDP -0.3373534239809471
GTEP 1.4784509996543365
TIT -1.7337944561237861
TAT 0.4720534215618567
TEY -0.8638097300081202
CDP 0.4033753643363804
NOX 0.10110462771153461
Austria 0.0019235092685594127
Belgium -0.004392788012800396
Bulgaria 0.04816348751719152
Croatia -0.03004035567346682
Czech Republic 0.031574105728515545
Denmark -0.0014444919158123786
Estonia 0.0033214459507117785
Finland -0.0070262763040832635
France -0.003309334757402565
Germany -0.0566806258110701
Greece 0.06332962941625782
Hungary -0.059667710972397196
Ireland 0.0074982918589850235
Italy -0.04280710086286586
Latvia 0.03963140674608589
Lithuania -0.03476142547100431
Luxembourg 0.02642393702323701
Malta -0.014649638800202851
Netherlands 0.020394289536384034
Poland -0.0008472541890367819
Portugal 0.04792192549977409
Republic of Cyprus -0.003353355223195595
Romania 0.023262535282328727
Slovakia -0.0016684996944359