# Configuration 

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, validation_curve
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures

# Default figure size for every plot in this notebook.
# (`%matplotlib inline` is already enabled earlier in this cell, so it is not repeated here.)
plt.rcParams["figure.figsize"] = (10, 6)

# Utils

In [105]:
def include_features(model, df, features_lists, poly = None, target_feature = 'CO'):
    """Fit ``model`` on several feature subsets of ``df`` and collect test-set results.

    For every entry of ``features_lists`` the data is split 80/20
    (``random_state=42``, so every subset sees the same rows), optionally
    expanded with ``poly``, and ``model`` is re-fitted on the training part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``coef_`` (and
        ``feature_names_in_`` when ``poly`` is None). It is re-fitted in place
        for every subset.
    df : pandas.DataFrame
        Data containing the predictors and the ``target_feature`` column.
    features_lists : iterable
        Each item is either a list of column names or the string ``'all'``,
        which selects every column except the target.
    poly : transformer, optional
        Object exposing ``fit``, ``transform`` and ``get_feature_names_out``.
        It is fitted on the training split only and applied to both splits.
    target_feature : str, default 'CO'
        Name of the column to predict.

    Returns
    -------
    dict
        Maps the tuple of feature names seen by the model to
        ``(y_pred, model.coef_, test_mse)``.
    """
    # Predictors and target do not depend on the subset: build them once.
    X = df.drop(columns = target_feature)
    y = df[target_feature]

    scores = {}

    for features in features_lists:
        # The isinstance guard keeps array-like subsets (e.g. a pandas Index)
        # from triggering an ambiguous element-wise comparison with 'all'.
        if isinstance(features, str) and features == 'all':
            features = X.columns

        X_train, X_test, y_train, y_test = train_test_split(
            X[features], y, test_size = 0.2, random_state = 42
        )

        if poly is not None:
            # Fit the expansion on the training split only, then apply it to both.
            poly.fit(X_train)
            X_train, X_test = poly.transform(X_train), poly.transform(X_test)

        print(f'X_train shape = {X_train.shape}, X_test shape = {X_test.shape}')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # The key is the tuple of feature names the model was actually trained on.
        if poly is None:
            feature_names = tuple(model.feature_names_in_)
        else:
            feature_names = tuple(poly.get_feature_names_out())

        scores[feature_names] = (y_pred, model.coef_, mse(y_test, y_pred))

    return scores
        

In [97]:
def print_features_coef(features, coef, only_non_van):
    """Print a ``Coefficients:`` header followed by one ``feature coefficient`` line per pair.

    Parameters
    ----------
    features : iterable
        Feature names, paired positionally with ``coef``.
    coef : iterable
        Coefficient values; pairing stops at the shorter of the two iterables.
    only_non_van : bool
        When truthy, coefficients equal to 0 ("vanished") are skipped;
        when falsy, every pair is printed.
    """
    print('Coefficients:')
    for f, c in zip(features, coef):
        # Truthiness is used instead of `== True` / `== False`, so a non-bool
        # flag can no longer fall through both branches and print nothing.
        if not only_non_van or c != 0:
            print(f, c)

In [100]:
def print_scores(scores_dict, only_mse = False, print_coef = False):
    '''Print a short report for every entry of ``scores_dict``.

    scores_dict => (columns): (predictions, coefficients, mse_score)

    Each entry starts with a blank line. When ``print_coef`` is set, the
    non-zero coefficients are listed through ``print_features_coef``. The
    final line is either the MSE alone (``only_mse``) or the column names
    followed by the MSE.
    '''
    for columns, entry in scores_dict.items():
        # Predictions are carried in the tuple but not used for the report.
        _, coef, mse_score = entry

        print()
        if print_coef:
            print_features_coef(columns, coef, only_non_van = True)

        report = f'mse:{mse_score}' if only_mse else f'Columns: \n {columns}, \n MSE: {mse_score}'
        print(report)

# Importing Dataset

In [7]:
# Load the pre-processed dataset; the first CSV column becomes the DataFrame index.
# NOTE(review): "std" presumably means the features are already standardized — confirm.
std_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)
std_df.shape

(36733, 39)

In [9]:
# Development subset. `.loc` slices by index label and includes the end label,
# so this keeps every row up to and including label 24487.
# NOTE(review): assumes the index is a sorted, 0-based integer range — confirm.
dev_df = std_df.loc[:24487]
dev_df.shape

(24488, 39)

In [10]:
# Evaluation subset: every row from index label 24488 onwards (label-based `.loc` slice),
# i.e. the rows not taken by `dev_df`.
eval_df = std_df.loc[24488:]
eval_df.shape

(12245, 39)

In [19]:
# List the available columns (the predictors plus the 'CO' target).
dev_df.columns

Index(['YEAR', 'AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP',
       'NOX', 'CO', 'Austria', 'Belgium', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
       'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal',
       'Republic of Cyprus', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
       'Sweden'],
      dtype='object')

In [63]:
# Predictor groups used to build the feature subsets compared below.

# Ambient variables.
ambiental_pred = ['AT', 'AP', 'AH']

# Process variables.
process_pred = ['AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP']

# Country columns.
# NOTE(review): presumably one-hot location indicators — confirm.
location_pred = [
    'Austria',
    'Belgium',
    'Bulgaria',
    'Croatia',
    'Czech Republic',
    'Denmark',
    'Estonia',
    'Finland',
    'France',
    'Germany',
    'Greece',
    'Hungary',
    'Ireland',
    'Italy',
    'Latvia',
    'Lithuania',
    'Luxembourg',
    'Malta',
    'Netherlands',
    'Poland',
    'Portugal',
    'Republic of Cyprus',
    'Romania',
    'Slovakia',
    'Slovenia',
    'Spain',
    'Sweden',
]

# Single-column groups, kept as lists so they can be concatenated with the others.
nox = ['NOX']
year = ['YEAR']

In [65]:
# Quick look at the combined ambient + process + NOX feature list.
ambiental_pred+process_pred+nox

['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'NOX']

# Complete dataset test

## Lasso

In [101]:
# Lasso with scikit-learn's default settings.
default_lasso = Lasso()
# Feature subsets to compare; 'all' is the sentinel that include_features
# expands to every column except the target.
features_lists = [
    'all',
    ambiental_pred,
    process_pred,
    ambiental_pred+process_pred,
    ambiental_pred+process_pred+nox,
    ambiental_pred+process_pred+year,
    ambiental_pred+process_pred+nox+year,
]

# One fit per subset, each evaluated on a held-out 20% split of dev_df.
scores = include_features(default_lasso, dev_df, features_lists)

X_train shape = (19590, 38), X_test shape = (4898, 38)
X_train shape = (19590, 3), X_test shape = (4898, 3)
X_train shape = (19590, 6), X_test shape = (4898, 6)
X_train shape = (19590, 9), X_test shape = (4898, 9)
X_train shape = (19590, 10), X_test shape = (4898, 10)
X_train shape = (19590, 10), X_test shape = (4898, 10)
X_train shape = (19590, 11), X_test shape = (4898, 11)


In [102]:
# Show the non-zero coefficients and the test MSE of each feature subset.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
TIT -0.5817890227673992
mse:3.6964834721654096

Coefficients:
mse:5.287133724390688

Coefficients:
TIT -0.5817890227673992
mse:3.6964834721654096

Coefficients:
TIT -0.5817890227673992
mse:3.6964834721654096

Coefficients:
TIT -0.5817890227673992
mse:3.6964834721654096

Coefficients:
TIT -0.5817890227673992
mse:3.6964834721654096

Coefficients:
TIT -0.5817890227673992
mse:3.6964834721654096


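With the default Lasso settings, all the coefficients are shrunk to 0 except the one for TIT. The ambient-only subset, which does not contain TIT, ends up with no non-zero coefficient at all, hence its higher MSE.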

## Poly

In [107]:
# Same experiment, with each feature subset expanded to degree-2 polynomial terms first.
# NOTE: this rebinds `scores`, replacing the results of the plain Lasso run above.
poly = PolynomialFeatures(degree = 2)
scores = include_features(default_lasso, dev_df, features_lists, poly = poly)

X_train shape = (19590, 38), X_test shape = (4898, 38)
X_train shape = (19590, 3), X_test shape = (4898, 3)
X_train shape = (19590, 6), X_test shape = (4898, 6)
X_train shape = (19590, 9), X_test shape = (4898, 9)
X_train shape = (19590, 10), X_test shape = (4898, 10)
X_train shape = (19590, 10), X_test shape = (4898, 10)
X_train shape = (19590, 11), X_test shape = (4898, 11)


In [110]:
# Non-zero coefficients and test MSE for the polynomial-feature runs.
print_scores(scores, only_mse = True, print_coef=True)


Coefficients:
YEAR^2 -0.018433927473257974
YEAR TIT -0.29322329378258755
YEAR NOX 0.033833047739014514
TIT^2 0.20505013519737908
NOX^2 0.025942954074382175
mse:2.6060428817968186

Coefficients:
mse:5.287133724390688

Coefficients:
TIT -0.09079942562190141
TIT^2 0.5668460030212084
mse:3.1245465180817775

Coefficients:
TIT -0.09079942562190141
TIT^2 0.5668460030212084
mse:3.1245465180817775

Coefficients:
TIT -0.04609466746098859
TIT^2 0.4754278780310179
NOX^2 0.1527709996554858
mse:3.007143806239917

Coefficients:
TIT^2 0.23539336899610913
TIT YEAR -0.3002704196545768
YEAR^2 -0.015240360874495974
mse:2.6799289962676514

Coefficients:
TIT^2 0.2050131541761838
TIT YEAR -0.2932392584904093
NOX^2 0.025940841620099558
NOX YEAR 0.033836692273835366
YEAR^2 -0.018433407118876793
mse:2.6060886120488354
