# Configuration

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn preprocessing and model-selection utilities
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# scikit-learn models and metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

%matplotlib inline

# Default size for every matplotlib figure created in this notebook,
# given as (width, height).
plt.rcParams["figure.figsize"] = (10, 6)

### Dataset preparation

In [3]:
# Load the processed dataset CSV into a DataFrame; every later cell derives
# its data from this frame.
std_df = pd.read_csv(filepath_or_buffer="processed_dataset/std_dataset.csv")

In [4]:
# Development subset: label-based slice, so the row labelled 24487 is included.
# All columns are kept.
dev_df = std_df.loc[:24487]

In [5]:
# Summarise the development frame: column names, non-null counts and dtypes.
dev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24488 entries, 0 to 24487
Data columns (total 40 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  24488 non-null  int64  
 1   YEAR                24488 non-null  int64  
 2   AT                  24488 non-null  float64
 3   AP                  24488 non-null  float64
 4   AH                  24488 non-null  float64
 5   AFDP                24488 non-null  float64
 6   GTEP                24488 non-null  float64
 7   TIT                 24488 non-null  float64
 8   TAT                 24488 non-null  float64
 9   TEY                 24488 non-null  float64
 10  CDP                 24488 non-null  float64
 11  NOX                 24488 non-null  float64
 12  CO                  24488 non-null  float64
 13  Austria             24488 non-null  int64  
 14  Belgium             24488 non-null  int64  
 15  Bulgaria            24488 non-null  int64  
 16  Croa

In [14]:
# Evaluation subset: every row from label 24488 onwards, with the target
# column 'CO' removed.
eval_df = (
    std_df
    .loc[24488:]
    .drop(columns=["CO"])
)

In [15]:
# Summarise the evaluation frame: column names, non-null counts and dtypes.
eval_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12245 entries, 24488 to 36732
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  12245 non-null  int64  
 1   YEAR                12245 non-null  int64  
 2   AT                  12245 non-null  float64
 3   AP                  12245 non-null  float64
 4   AH                  12245 non-null  float64
 5   AFDP                12245 non-null  float64
 6   GTEP                12245 non-null  float64
 7   TIT                 12245 non-null  float64
 8   TAT                 12245 non-null  float64
 9   TEY                 12245 non-null  float64
 10  CDP                 12245 non-null  float64
 11  NOX                 12245 non-null  float64
 12  Austria             12245 non-null  int64  
 13  Belgium             12245 non-null  int64  
 14  Bulgaria            12245 non-null  int64  
 15  Croatia             12245 non-null  int64  
 16  

In [23]:
# Predictors: every development column except the row identifier and the target.
X_df = dev_df.drop(columns=["ID", "CO"])
# Target: 'CO' kept as a single-column DataFrame so that .values is 2-D.
Y_df = dev_df["CO"].to_frame()

# Models with all predictors

In [41]:
# Split predictors and target in ONE call so that every X row stays paired
# with its own Y row by construction, rather than relying on two independent
# calls happening to share the same seed and input length.
# 80/20 shuffled split, seeded for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_df.values,
    Y_df.values,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

### Linear Regression

In [47]:
# Fit ordinary least squares on the training split. fit() returns the
# estimator itself, so construction and fitting are chained.
linear = LinearRegression().fit(X_train, Y_train)
# Predictions for the held-out test split.
Y_linear = linear.predict(X_test)

In [48]:
# Mean squared error of the linear model on the held-out test split.
mean_squared_error(y_true=Y_test, y_pred=Y_linear)

1.8493793939580176

### Ridge Regression

In [50]:
# Fit ridge regression (default hyperparameters) on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
ridge = Ridge().fit(X_train, Y_train)
# Predictions for the held-out test split.
Y_ridge = ridge.predict(X_test)

In [51]:
# Mean squared error of the ridge model on the held-out test split.
mean_squared_error(y_true=Y_test, y_pred=Y_ridge)

1.8492947228280479

# Models without NOX

In [58]:
# Same 80/20 seeded split as before, but with the 'NOX' predictor removed.
# Predictors and target are split in ONE call so that every X row stays paired
# with its own Y row by construction, rather than relying on two independent
# calls happening to share the same seed and input length.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_df.drop(columns=["NOX"]).values,
    Y_df.values,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

### Linear Regression

In [61]:
# Fit ordinary least squares on the current training split. fit() returns the
# estimator itself, so construction and fitting are chained.
linear = LinearRegression().fit(X_train, Y_train)
# Predictions for the held-out test split.
Y_linear = linear.predict(X_test)

In [62]:
# Mean squared error of the linear model on the held-out test split.
mean_squared_error(y_true=Y_test, y_pred=Y_linear)

2.1931232619856864

### Ridge Regression

In [63]:
# Fit ridge regression (default hyperparameters) on the current training split.
# fit() returns the estimator itself, so construction and fitting are chained.
ridge = Ridge().fit(X_train, Y_train)
# Predictions for the held-out test split.
Y_ridge = ridge.predict(X_test)

In [64]:
# Mean squared error of the ridge model on the held-out test split.
mean_squared_error(y_true=Y_test, y_pred=Y_ridge)

2.193121482839718

# Models with only ambient and process predictors


In [70]:
# Same 80/20 seeded split as before, restricted to the contiguous column
# range 'AT'..'CDP' (label slice, both ends included).
# Predictors and target are split in ONE call so that every X row stays paired
# with its own Y row by construction, rather than relying on two independent
# calls happening to share the same seed and input length.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_df.loc[:, "AT":"CDP"].values,
    Y_df.values,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

### Linear Regression

In [71]:
# Fit ordinary least squares on the current training split. fit() returns the
# estimator itself, so construction and fitting are chained.
linear = LinearRegression().fit(X_train, Y_train)
# Predictions for the held-out test split.
Y_linear = linear.predict(X_test)

In [72]:
# Mean squared error of the linear model on the held-out test split.
mean_squared_error(y_true=Y_test, y_pred=Y_linear)

2.3475072700813

### Ridge Regression

In [73]:
# Fit ridge regression (default hyperparameters) on the current training split.
# fit() returns the estimator itself, so construction and fitting are chained.
ridge = Ridge().fit(X_train, Y_train)
# Predictions for the held-out test split.
Y_ridge = ridge.predict(X_test)

In [74]:
# Mean squared error of the ridge model on the held-out test split.
mean_squared_error(y_true=Y_test, y_pred=Y_ridge)

2.3474641994792536