# Configuration 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline

plt.rcParams["figure.figsize"] = (10, 6)

# Utils

In [3]:
def include_features(model, df, features_lists, poly = None, target_feature = 'CO'):
    '''Fit ``model`` on several feature subsets of ``df`` and collect test results.

    For every entry of ``features_lists`` the rows are split 75% / 25% into
    train and test parts (fixed ``random_state=42``), the optional ``poly``
    transformer is fitted on the training part and applied to both parts, and
    ``model`` is re-fitted on the training part and scored on the test part.
    The train/test shapes are printed for every subset.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``coef_``. The same instance
        is re-fitted for every subset. When ``poly`` is None it must also
        expose ``feature_names_in_`` after fitting.
    df : pandas.DataFrame
        Data holding both the predictors and the ``target_feature`` column.
    features_lists : iterable
        Each entry is either a list of column names or the string ``'all'``,
        which stands for every column except ``target_feature``.
    poly : transformer, optional
        Object exposing ``fit``, ``transform`` and ``get_feature_names_out``
        (e.g. ``PolynomialFeatures``). It is re-fitted for every subset.
    target_feature : str, default 'CO'
        Name of the column to predict.

    Returns
    -------
    dict
        Maps the tuple of feature names the model was fitted on to
        ``(y_pred, coef, mse)`` computed on the test part.
    '''
    # The predictor/target separation does not depend on the subset, so it is
    # done once, and it honours `target_feature` instead of a hard-coded name.
    X = df.drop(columns = target_feature)
    y = df[target_feature]

    scores = {}

    for features in features_lists:
        # Only a string can be the 'all' sentinel; comparing an Index or an
        # array with `==` is element-wise and cannot be used as a condition.
        if isinstance(features, str) and features == 'all':
            features = X.columns

        X_train, X_test, y_train, y_test = train_test_split(
            X[features], y, test_size = 0.25, random_state=42
        )

        if poly is not None:
            # Fit on the training part only, then expand both parts.
            poly.fit(X_train)
            X_train, X_test = poly.transform(X_train), poly.transform(X_test)

        print(f'X_train shape = {X_train.shape}, X_test shape = {X_test.shape}')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Key the result by the names of the columns the model actually saw.
        if poly is None:
            names = model.feature_names_in_
        else:
            names = poly.get_feature_names_out()

        scores[tuple(names)] = (y_pred, model.coef_, mse(y_test, y_pred))

    return scores
        

In [4]:
def print_features_coef(features, coef, only_non_van):
    '''Print every feature name next to its coefficient, one pair per line.

    Parameters
    ----------
    features : iterable
        Feature names.
    coef : iterable
        Coefficients, paired positionally with ``features`` (pairing stops at
        the shorter of the two).
    only_non_van : bool
        When truthy, pairs whose coefficient equals 0 are skipped, so only
        the non-vanishing coefficients are printed. When falsy, every pair
        is printed.
    '''
    print('Coefficients:')
    for name, value in zip(features, coef):
        # Plain truthiness instead of `== True` / `== False`, so a flag such
        # as None behaves like False rather than suppressing all output.
        if only_non_van and value == 0:
            continue
        print(name, value)

In [5]:
def print_scores(scores_dict, only_mse = False, print_coef = False):
    '''Print a short report for every entry of ``scores_dict``.

    ``scores_dict`` maps a tuple of column names to a 3-tuple
    ``(predictions, coefficients, mse_score)``. Each entry is preceded by a
    blank line. With ``print_coef`` the non-zero coefficients are listed
    first; with ``only_mse`` only the MSE is shown, otherwise the column
    names are printed together with the MSE.
    '''
    for feature_names, result in scores_dict.items():
        # The predictions are part of each entry but are not reported here.
        _, coefficients, error = result

        print()
        if print_coef:
            print_features_coef(feature_names, coefficients, only_non_van = True)

        report = f'mse:{error}' if only_mse else f'Columns: \n {feature_names}, \n MSE: {error}'
        print(report)

# Importing Dataset

In [6]:
# Load the pre-processed dataset; the first CSV column becomes the row index.
# NOTE(review): presumably the standardised version of the data (going by the
# `std_` prefix) - confirm against the preprocessing step.
std_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)
std_df.shape

(36733, 39)

In [7]:
# Development split. `.loc` slices by index label and includes the end label,
# so this keeps every row up to and including label 24487.
dev_df = std_df.loc[:24487]
dev_df.shape

(24488, 39)

In [8]:
# Evaluation split: every row from index label 24488 onwards (label-based
# slice, start label included).
eval_df = std_df.loc[24488:]
eval_df.shape

(12245, 39)

In [9]:
# Keep only the development rows whose CO value is at least 4.5.
# NOTE(review): the 4.5 cut-off is applied to CO as stored in std_df; whether
# that column is in original units or rescaled is not visible here - confirm.
extreme_emission_df = dev_df.loc[dev_df['CO'] >= 4.5, :]
extreme_emission_df.shape

(2836, 39)

In [10]:
dev_df.columns

Index(['YEAR', 'AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP',
       'NOX', 'CO', 'Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden'],
      dtype='object')

In [11]:
# Column groups used to build the feature subsets tried in this notebook.
# NOTE(review): presumably AT/AP/AH are ambient measurements and the second
# group are turbine process measurements - confirm against the data dictionary.
ambiental_pred = ['AT', 'AP', 'AH']
process_pred = ['AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP']
# One column per country name.
location_pred = ['Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden']
# Single-column groups kept as lists so they can be concatenated with `+`.
nox = ['NOX']
year = ['YEAR']

In [12]:
ambiental_pred+process_pred+nox

['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'NOX']

# Complete dataset test

In [13]:
# Feature subsets to evaluate. The string 'all' is a sentinel that
# include_features replaces with every column except the target; the other
# entries are combinations of the ambient / process groups with NOX and YEAR.
features_lists = [
    'all',
    ambiental_pred,
    process_pred,
    ambiental_pred+nox,
    ambiental_pred+year,
    ambiental_pred+nox+year,
    process_pred+nox,
    process_pred+year,
    process_pred+nox+year,
    ambiental_pred+process_pred,
    ambiental_pred+process_pred+nox,
    ambiental_pred+process_pred+year,
    ambiental_pred+process_pred+nox+year,
]

## Lasso

In [15]:
# Lasso with its default constructor arguments, fitted on every feature subset
# using only the rows with CO >= 4.5 (extreme_emission_df).
default_lasso = Lasso()
scores = include_features(default_lasso, extreme_emission_df, features_lists)

X_train shape = (2127, 38), X_test shape = (709, 38)
X_train shape = (2127, 3), X_test shape = (709, 3)
X_train shape = (2127, 6), X_test shape = (709, 6)
X_train shape = (2127, 4), X_test shape = (709, 4)
X_train shape = (2127, 4), X_test shape = (709, 4)
X_train shape = (2127, 5), X_test shape = (709, 5)
X_train shape = (2127, 7), X_test shape = (709, 7)
X_train shape = (2127, 7), X_test shape = (709, 7)
X_train shape = (2127, 8), X_test shape = (709, 8)
X_train shape = (2127, 9), X_test shape = (709, 9)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 11), X_test shape = (709, 11)


In [16]:
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
TAT -1.247972089532729
NOX 0.2697916068796103
mse:5.504684666441961

Coefficients:
mse:9.627945979322952

Coefficients:
TAT -1.6211107112814456
mse:5.59769230706908

Coefficients:
NOX 0.6873007497978818
mse:6.905107346876137

Coefficients:
mse:9.627945979322952

Coefficients:
NOX 0.6873007497978818
mse:6.905107346876137

Coefficients:
TAT -1.247972089532729
NOX 0.2697916068796103
mse:5.504684666441961

Coefficients:
TAT -1.6211107112814456
mse:5.59769230706908

Coefficients:
TAT -1.247972089532729
NOX 0.2697916068796103
mse:5.504684666441961

Coefficients:
TAT -1.6211107112814456
mse:5.59769230706908

Coefficients:
TAT -1.247972089532729
NOX 0.2697916068796103
mse:5.504684666441961

Coefficients:
TAT -1.6211107112814456
mse:5.59769230706908

Coefficients:
TAT -1.247972089532729
NOX 0.2697916068796103
mse:5.504684666441961


## Poly Lasso

In [17]:
# Re-run the same Lasso instance on degree-2 polynomial expansions of each
# feature subset. This overwrites `scores` from the previous run.
poly = PolynomialFeatures(degree = 2)
scores = include_features(default_lasso, extreme_emission_df, features_lists, poly = poly)

X_train shape = (2127, 780), X_test shape = (709, 780)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 28), X_test shape = (709, 28)
X_train shape = (2127, 15), X_test shape = (709, 15)
X_train shape = (2127, 15), X_test shape = (709, 15)
X_train shape = (2127, 21), X_test shape = (709, 21)
X_train shape = (2127, 36), X_test shape = (709, 36)
X_train shape = (2127, 36), X_test shape = (709, 36)
X_train shape = (2127, 45), X_test shape = (709, 45)
X_train shape = (2127, 55), X_test shape = (709, 55)
X_train shape = (2127, 66), X_test shape = (709, 66)
X_train shape = (2127, 66), X_test shape = (709, 66)
X_train shape = (2127, 78), X_test shape = (709, 78)


In [18]:
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR^2 -0.026847470733660093
YEAR TAT -0.18626605152844058
TIT TAT 0.7375283860559776
TIT NOX -0.10230221993299159
mse:4.388625618504613

Coefficients:
mse:9.627945979322952

Coefficients:
TIT^2 0.03728616713054376
TIT TAT 1.0382927334083103
mse:4.5730306055330034

Coefficients:
NOX^2 0.31163480610462013
mse:6.5469035998138905

Coefficients:
AT YEAR -0.16279777449472166
AP YEAR 0.03759697090641557
mse:8.699439119030103

Coefficients:
NOX^2 0.2946269317858661
NOX YEAR 0.025248438823837285
YEAR^2 -0.0225499467752126
mse:6.518411447847205

Coefficients:
TIT TAT 0.8839452437930339
TIT NOX -0.13295542666068444
mse:4.452410071268351

Coefficients:
TIT TAT 0.8479879307809693
TAT YEAR -0.21753105236628636
YEAR^2 -0.02800689483071817
mse:4.504914347792313

Coefficients:
TIT TAT 0.7373271903582104
TIT NOX -0.10237300970241832
TAT YEAR -0.18634238975624395
YEAR^2 -0.026849580952277692
mse:4.388592465868164

Coefficients:
TIT^2 0.03728616713054376
TIT TAT 1.0382927334083103
mse:4.57

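For the default Lasso, almost all the coefficients are shrunk to 0; in the polynomial fits above, the terms that survive are mainly those involving TIT (e.g. `TIT TAT`, `TIT NOX`, `TIT^2`).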

## LinearRegression

In [19]:
# Ordinary least squares on the raw (non-expanded) feature subsets.
# NOTE(review): the very large, nearly identical coefficients printed for the
# country columns in the next output suggest those columns are collinear with
# the intercept - confirm before interpreting them.
linear = LinearRegression()
scores = include_features(linear, extreme_emission_df, features_lists, poly = None)

X_train shape = (2127, 38), X_test shape = (709, 38)
X_train shape = (2127, 3), X_test shape = (709, 3)
X_train shape = (2127, 6), X_test shape = (709, 6)
X_train shape = (2127, 4), X_test shape = (709, 4)
X_train shape = (2127, 4), X_test shape = (709, 4)
X_train shape = (2127, 5), X_test shape = (709, 5)
X_train shape = (2127, 7), X_test shape = (709, 7)
X_train shape = (2127, 7), X_test shape = (709, 7)
X_train shape = (2127, 8), X_test shape = (709, 8)
X_train shape = (2127, 9), X_test shape = (709, 9)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 11), X_test shape = (709, 11)


In [20]:
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.8158566119607115
AT 0.3622958262561502
AP -0.257716053988304
AH 0.3975982464784425
AFDP -0.3360198188964725
GTEP 0.20992183813486362
TIT 2.45796834811494
TAT -4.200309856708997
TEY -8.44029087255813
CDP 6.029366889114896
NOX 0.4146385542953108
Austria -1482772187038.806
Belgium -1482772187038.7373
Bulgaria -1482772187039.0352
Croatia -1482772187038.586
Czech Republic -1482772187038.7332
Denmark -1482772187038.883
Estonia -1482772187038.9158
Finland -1482772187038.9116
France -1482772187038.9092
Germany -1482772187039.0544
Greece -1482772187039.198
Hungary -1482772187038.3967
Ireland -1482772187038.7292
Italy -1482772187038.8628
Latvia -1482772187038.9143
Lithuania -1482772187037.507
Luxembourg -1482772187038.6912
Malta -1482772187038.472
Netherlands -1482772187038.9841
Poland -1482772187038.347
Portugal -1482772187038.6892
Republic of Cyprus -1482772187038.7925
Romania -1482772187038.6604
Slovakia -1482772187038.2761
Slovenia -1482772187038.827
Spain -14827721870

## Poly LinearRegression

In [21]:
# Ordinary least squares on degree-2 polynomial expansions of each subset.
# The expansion contains a constant column, listed as feature `1` in the
# coefficient output of the next cell.
poly = PolynomialFeatures(degree = 2)
scores = include_features(linear, extreme_emission_df, features_lists, poly = poly)

X_train shape = (2127, 780), X_test shape = (709, 780)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 28), X_test shape = (709, 28)
X_train shape = (2127, 15), X_test shape = (709, 15)
X_train shape = (2127, 15), X_test shape = (709, 15)
X_train shape = (2127, 21), X_test shape = (709, 21)
X_train shape = (2127, 36), X_test shape = (709, 36)
X_train shape = (2127, 36), X_test shape = (709, 36)
X_train shape = (2127, 45), X_test shape = (709, 45)
X_train shape = (2127, 55), X_test shape = (709, 55)
X_train shape = (2127, 66), X_test shape = (709, 66)
X_train shape = (2127, 66), X_test shape = (709, 66)
X_train shape = (2127, 78), X_test shape = (709, 78)


In [22]:
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
1 1.3051630056556262e-13
YEAR 0.665067330117155
AT 4.771951668323789
AP -0.45350615048118287
AH 0.760323930061986
AFDP 2.1578228073093126
GTEP -4.337529107160005
TIT -18.08031828427709
TAT 3.2258718610353485
TEY 20.988121539778774
CDP 3.7331700952908013
NOX 2.479207404282389
Austria 1.2098615544112241
Belgium -0.3864731034697497
Bulgaria -0.10054660532188131
Croatia 0.1796027628811545
Czech Republic 0.00020135725523284798
Denmark -0.4508385218783272
Estonia -0.5878034799226791
Finland -1.1613894374915146
France -0.5652795053999653
Germany -0.7095171937193253
Greece -1.0938575796539407
Hungary 0.8150752210728863
Ireland 1.2821465044154992
Italy -0.8522656264718922
Latvia -1.9426266738850853
Lithuania 1.536396710519008
Luxembourg 0.615958023377322
Malta 1.4369631996995396
Netherlands -1.355354339485511
Poland 1.0963089130838015
Portugal -0.7617350129239211
Republic of Cyprus 0.5550622388225812
Romania 0.8830368276106442
Slovakia -0.9210723253633332
Slovenia 0.6521555599879

## Ridge

In [23]:
# Ridge regression with its default constructor arguments on the raw
# (non-expanded) feature subsets.
ridge = Ridge()
scores = include_features(ridge, extreme_emission_df, features_lists, poly = None)

X_train shape = (2127, 38), X_test shape = (709, 38)
X_train shape = (2127, 3), X_test shape = (709, 3)
X_train shape = (2127, 6), X_test shape = (709, 6)
X_train shape = (2127, 4), X_test shape = (709, 4)
X_train shape = (2127, 4), X_test shape = (709, 4)
X_train shape = (2127, 5), X_test shape = (709, 5)
X_train shape = (2127, 7), X_test shape = (709, 7)
X_train shape = (2127, 7), X_test shape = (709, 7)
X_train shape = (2127, 8), X_test shape = (709, 8)
X_train shape = (2127, 9), X_test shape = (709, 9)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 11), X_test shape = (709, 11)


In [24]:
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.7942624607281286
AT 0.6243265844224603
AP -0.22749227194576188
AH 0.40362577387992393
AFDP -0.34070434130783805
GTEP 0.12622213438980992
TIT 2.019603495223742
TAT -4.104213966516841
TEY -6.637581328701809
CDP 4.763460728481356
NOX 0.45128584994421267
Austria -0.11372796502718185
Belgium -0.03137826543350403
Bulgaria -0.31998405911395444
Croatia 0.10029169139944778
Czech Republic -0.0353003563893431
Denmark -0.18458536128892797
Estonia -0.19085580718951917
Finland -0.2012287203179501
France -0.22462128022838837
Germany -0.34547680445689477
Greece -0.49062968936450835
Hungary 0.32419303407382044
Ireland -0.011846308728227494
Italy -0.1542024754510533
Latvia -0.19954810849374363
Lithuania 1.1942895708089365
Luxembourg 0.008689753150006335
Malta 0.22202622201669833
Netherlands -0.2746138211922456
Poland 0.33258728496457335
Portugal 0.0231038218154958
Republic of Cyprus -0.06900119121933306
Romania 0.04997779884817707
Slovakia 0.42956465815926387
Slovenia -0.068386481

## Poly Ridge

In [25]:
# Re-run the same Ridge instance on degree-2 polynomial expansions of each
# feature subset. This overwrites `scores` from the previous run.
poly = PolynomialFeatures(degree = 2)
scores = include_features(ridge, extreme_emission_df, features_lists, poly = poly)

X_train shape = (2127, 780), X_test shape = (709, 780)
X_train shape = (2127, 10), X_test shape = (709, 10)
X_train shape = (2127, 28), X_test shape = (709, 28)
X_train shape = (2127, 15), X_test shape = (709, 15)
X_train shape = (2127, 15), X_test shape = (709, 15)
X_train shape = (2127, 21), X_test shape = (709, 21)
X_train shape = (2127, 36), X_test shape = (709, 36)
X_train shape = (2127, 36), X_test shape = (709, 36)
X_train shape = (2127, 45), X_test shape = (709, 45)
X_train shape = (2127, 55), X_test shape = (709, 55)
X_train shape = (2127, 66), X_test shape = (709, 66)
X_train shape = (2127, 66), X_test shape = (709, 66)
X_train shape = (2127, 78), X_test shape = (709, 78)


In [26]:
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -1.0722592328716647
AT 1.8189916803092436
AP -0.6737012863448671
AH 0.4842067169359292
AFDP 1.6369979358596185
GTEP -0.6869837747355576
TIT -2.508912635644453
TAT -3.243764144394449
TEY 2.5106310956299693
CDP 0.9386155245161422
NOX 2.3459181075639437
Austria 0.5021113043486047
Belgium -0.3328189001782709
Bulgaria -0.04249121247846774
Croatia 0.3363118111931509
Czech Republic 0.25529544089813727
Denmark -0.7805085392541397
Estonia -0.506511273406308
Finland -1.0795906195097957
France -0.2572189411925627
Germany -0.5645897828473329
Greece -0.3501797461847744
Hungary 1.1410303154937447
Ireland 1.4257664991502652
Italy -0.5368935817950569
Latvia -1.429180529417636
Lithuania 1.5679891835202755
Luxembourg 0.5494104904987479
Malta 0.9350584775615121
Netherlands -1.064655502234877
Poland 0.6821798931336949
Portugal -0.6438354524355095
Republic of Cyprus -0.12944275799813645
Romania 0.713018259012252
Slovakia -0.3200377597458784
Slovenia 0.23917642437712103
Spain -0.32354868