# Configuration 

In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

# Default size (width, height) applied to every matplotlib figure in this notebook.
# The inline backend is already enabled by the `%matplotlib inline` earlier in this cell.
plt.rcParams["figure.figsize"] = (10, 6)

# Utils

In [115]:
def include_features(model, df, features_lists, poly = None, target_feature = 'CO',
                     test_size = 0.25, random_state = 42):
    """Fit ``model`` on several feature subsets of ``df`` and score each one.

    For every entry of ``features_lists`` the data is split into train/test
    parts, optionally expanded with ``poly``, and ``model`` is fitted on the
    train part and evaluated with the mean squared error on the test part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and, after fitting, ``coef_``
        (plus ``feature_names_in_`` when ``poly`` is None).
    df : pandas.DataFrame
        Table holding the predictor columns and the ``target_feature`` column.
    features_lists : iterable
        Each item is either a list of column names or the string ``'all'``,
        which selects every column except the target.
    poly : transformer or None
        Optional object exposing ``fit``, ``transform`` and
        ``get_feature_names_out``; fitted on the train part only.
    target_feature : str
        Name of the column to predict.
    test_size, random_state
        Forwarded to ``train_test_split``; the defaults reproduce the
        previously hard-coded split.

    Returns
    -------
    dict
        Maps a tuple of feature names to ``(y_pred, coef, mse)``.

    Notes
    -----
    The same ``model`` (and ``poly``) instance is refitted for every subset,
    so after the call it holds the fit of the last subset only.
    """
    # The predictor/target separation does not depend on the subset, so do it once.
    X = df.drop(columns = target_feature)
    y = df[target_feature]

    scores = {}

    for features in features_lists:
        # Guard with isinstance: comparing an array-like to a string is element-wise.
        if isinstance(features, str) and features == 'all':
            features = X.columns

        X_train, X_test, y_train, y_test = train_test_split(
            X[features], y, test_size = test_size, random_state = random_state)

        if poly is not None:
            # Fit the expansion on the train part only, then apply it to both parts.
            poly.fit(X_train)
            X_train, X_test = poly.transform(X_train), poly.transform(X_test)

        print(f'X_train shape = {X_train.shape}, X_test shape = {X_test.shape}')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if poly is None:
            names = tuple(model.feature_names_in_)
        else:
            names = tuple(poly.get_feature_names_out())

        scores[names] = (y_pred, model.coef_, mse(y_test, y_pred))

    return scores
        

In [97]:
def print_features_coef(features, coef, only_non_van):
    """Print a 'Coefficients:' header followed by one ``name value`` line per feature.

    Parameters
    ----------
    features : iterable
        Feature names, paired positionally with ``coef``.
    coef : iterable
        Coefficient values; pairing stops at the shorter of the two iterables.
    only_non_van : bool
        When truthy, coefficients equal to 0 (vanished) are skipped;
        when falsy, every pair is printed.
    """
    print('Coefficients:')
    for name, value in zip(features, coef):
        # Use truthiness instead of `== True` / `== False`, so a non-bool flag
        # can no longer fall through both branches and print nothing.
        if only_non_van and value == 0:
            continue
        print(name, value)

In [116]:
def print_scores(scores_dict, only_mse = False, print_coef = False):
    """Print a short report for every entry of ``scores_dict``.

    ``scores_dict`` maps a tuple of column names to a 3-tuple
    ``(predictions, coefficients, mse_score)``. Each entry is preceded by a
    blank line. With ``print_coef`` the non-zero coefficients are listed
    first; with ``only_mse`` only the error is shown, otherwise the column
    names are shown together with the error.
    """
    for feature_names, entry in scores_dict.items():
        _predictions, coefficients, error = entry

        print()
        if print_coef:
            print_features_coef(feature_names, coefficients, only_non_van = True)

        report = f'mse:{error}' if only_mse else f'Columns: \n {feature_names}, \n MSE: {error}'
        print(report)

# Importing Dataset

In [7]:
# Load the pre-processed dataset; the first CSV column becomes the row index.
# NOTE(review): the file name suggests the features are already standardized — confirm.
std_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)
std_df.shape

(36733, 39)

In [9]:
# Development split: rows up to and including index label 24487
# (.loc slicing is label-based and end-inclusive).
# NOTE(review): relies on the index being the ordered labels 0..N-1 — confirm.
dev_df = std_df.loc[:24487]
dev_df.shape

(24488, 39)

In [10]:
# Evaluation split: rows from index label 24488 onward (label-based .loc slice),
# i.e. everything that follows the rows with labels up to 24487.
eval_df = std_df.loc[24488:]
eval_df.shape

(12245, 39)

In [19]:
# Show the available columns: the 'CO' target plus every candidate predictor.
dev_df.columns

Index(['YEAR', 'AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP',
       'NOX', 'CO', 'Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden'],
      dtype='object')

In [63]:
# Column-name groups used to build the feature subsets tested below.
# Every name listed here also appears in dev_df.columns; the target 'CO' is excluded.
# NOTE(review): the group names suggest ambient conditions vs. process measurements — confirm.
ambiental_pred = ['AT', 'AP', 'AH']
process_pred = ['AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP']
# Columns named after countries.
location_pred = ['Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden']
# Single-column groups, kept as lists so they can be concatenated with the others.
nox = ['NOX']
year = ['YEAR']

In [65]:
# Quick check: `+` on the group lists concatenates them into one flat list of names.
ambiental_pred+process_pred+nox

['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'NOX']

# Complete dataset test

In [120]:
# Feature subsets to evaluate. 'all' selects every column except the target;
# the other entries combine the ambient, process, NOX and YEAR groups.
# location_pred is not listed on its own, so the country columns are only used through 'all'.
features_lists = [
    'all',
    ambiental_pred,
    process_pred,
    ambiental_pred+nox,
    ambiental_pred+year,
    ambiental_pred+nox+year,
    process_pred+nox,
    process_pred+year,
    process_pred+nox+year,
    ambiental_pred+process_pred,
    ambiental_pred+process_pred+nox,
    ambiental_pred+process_pred+year,
    ambiental_pred+process_pred+nox+year,
]

## Lasso

In [121]:
# Baseline: Lasso with its default hyperparameters, fitted once per feature subset.
# `scores` maps each tuple of feature names to (predictions, coefficients, mse).
default_lasso = Lasso()
scores = include_features(default_lasso, dev_df, features_lists)

X_train shape = (18366, 38), X_test shape = (6122, 38)
X_train shape = (18366, 3), X_test shape = (6122, 3)
X_train shape = (18366, 6), X_test shape = (6122, 6)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 5), X_test shape = (6122, 5)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 8), X_test shape = (6122, 8)
X_train shape = (18366, 9), X_test shape = (6122, 9)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 11), X_test shape = (6122, 11)


In [122]:
# Per subset: print the non-zero coefficients followed by the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
mse:5.179609728446949

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
mse:5.179609728446949

Coefficients:
mse:5.179609728446949

Coefficients:
mse:5.179609728446949

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054

Coefficients:
TIT -0.5760119278636129
mse:3.5978836067143054


With the default Lasso settings, all the coefficients are shrunk to 0 except the one for TIT; for the feature subsets that do not include TIT, every coefficient is 0.

## Poly

In [123]:
# Same Lasso instance, now on a degree-2 polynomial expansion of each subset
# (e.g. the 3-column subset becomes 10 columns in the shapes printed below).
# Note: this overwrites `scores` from the plain Lasso run.
poly = PolynomialFeatures(degree = 2)
scores = include_features(default_lasso, dev_df, features_lists, poly = poly)

X_train shape = (18366, 780), X_test shape = (6122, 780)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 28), X_test shape = (6122, 28)
X_train shape = (18366, 15), X_test shape = (6122, 15)
X_train shape = (18366, 15), X_test shape = (6122, 15)
X_train shape = (18366, 21), X_test shape = (6122, 21)
X_train shape = (18366, 36), X_test shape = (6122, 36)
X_train shape = (18366, 36), X_test shape = (6122, 36)
X_train shape = (18366, 45), X_test shape = (6122, 45)
X_train shape = (18366, 55), X_test shape = (6122, 55)
X_train shape = (18366, 66), X_test shape = (6122, 66)
X_train shape = (18366, 66), X_test shape = (6122, 66)
X_train shape = (18366, 78), X_test shape = (6122, 78)


In [124]:
# Per subset: print the non-zero coefficients of the expanded features and the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR^2 -0.01810505343297444
YEAR TIT -0.29749713342734685
YEAR NOX 0.035004405965254885
TIT^2 0.18433905696919742
NOX^2 0.02839130476537028
mse:2.5187631219084934

Coefficients:
mse:5.179609728446949

Coefficients:
TIT -0.10106135969478913
TIT^2 0.5529454687067905
mse:2.9648709169375977

Coefficients:
NOX^2 0.3220124088685398
mse:4.007002217097596

Coefficients:
AH YEAR 0.001374962043172018
YEAR^2 -0.03268206445263081
mse:5.008478731204563

Coefficients:
NOX^2 0.30350796648652095
NOX YEAR 0.015451625893011306
YEAR^2 -0.03141866077553242
mse:3.864590972219884

Coefficients:
TIT -0.054653592382052474
TIT^2 0.4583725462503874
NOX^2 0.1571272427849665
mse:2.863577494603666

Coefficients:
TIT^2 0.21677500410903502
TIT YEAR -0.30505515173748476
YEAR^2 -0.014793536961593572
mse:2.5886541748878686

Coefficients:
TIT^2 0.18430158655356127
TIT YEAR -0.2975136301932112
NOX^2 0.02838899188004799
NOX YEAR 0.03500793570356147
YEAR^2 -0.018104370614654062
mse:2.518817178796338

Coeffic

# LinearRegression

In [125]:
# Unregularized baseline: ordinary least squares on the same feature subsets, no polynomial expansion.
# NOTE(review): in the output below the country columns all get coefficients near -7.18e11,
# which suggests they are perfectly collinear (presumably one-hot columns summing to a constant);
# consider dropping one of them or using a regularized model — confirm.
linear = LinearRegression()
scores = include_features(linear, dev_df, features_lists, poly = None)

X_train shape = (18366, 38), X_test shape = (6122, 38)
X_train shape = (18366, 3), X_test shape = (6122, 3)
X_train shape = (18366, 6), X_test shape = (6122, 6)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 4), X_test shape = (6122, 4)
X_train shape = (18366, 5), X_test shape = (6122, 5)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 7), X_test shape = (6122, 7)
X_train shape = (18366, 8), X_test shape = (6122, 8)
X_train shape = (18366, 9), X_test shape = (6122, 9)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 10), X_test shape = (6122, 10)
X_train shape = (18366, 11), X_test shape = (6122, 11)


In [126]:
# Per subset: print the non-zero coefficients followed by the test MSE.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR -0.5471514014572199
AT 0.340646677350231
AP 0.050801317006002086
AH 0.1620495962450336
AFDP 0.013786075391155936
GTEP -0.10523974799543677
TIT 0.06647338808978134
TAT -1.4423019437560445
TEY -2.107323135520473
CDP 0.03513882414716826
NOX 0.7731363801321904
Austria -718172304639.6368
Belgium -718172304639.5671
Bulgaria -718172304639.5255
Croatia -718172304639.612
Czech Republic -718172304639.5641
Denmark -718172304639.6005
Estonia -718172304639.6407
Finland -718172304639.644
France -718172304639.5696
Germany -718172304639.6019
Greece -718172304639.6326
Hungary -718172304639.5051
Ireland -718172304639.5187
Italy -718172304639.6119
Latvia -718172304639.6218
Lithuania -718172304639.3953
Luxembourg -718172304639.5763
Malta -718172304639.5612
Netherlands -718172304639.569
Poland -718172304639.569
Portugal -718172304639.6519
Republic of Cyprus -718172304639.6097
Romania -718172304639.5398
Slovakia -718172304639.6279
Slovenia -718172304639.6257
Spain -718172304639.5203
Swed