**Library Import**

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn.linear_model import ElasticNetCV, SGDRegressor, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics

**Creation of the data models**<br/>
1) model1: estimate the concentration of any gas <br/>
2) model2: estimate the concentration but gas feature is supplied <br/>
3) model3: estimate the concentration but model is create for each particular gas <br/>

In [3]:
# read in the data 
path = r"C:\Users\BrechtDewilde\Documents\Github\BDProject\data\data.csv"
data = pd.read_csv(path, index_col = 0)

# Create a particular x, y dataset for each task
data1 = data.iloc[:, 2:]
data2 = pd.get_dummies(data.iloc[:, 1:])

# for task 3 we need to create a model for each gas
ethanol = data.loc[data['gas'] == "Ethanol"].iloc[:, 2:]
ethylene = data.loc[data['gas'] == "Ethylene"].iloc[:, 2:]
ammonia = data.loc[data['gas'] == "Ammonia"].iloc[:, 2:]
acetaldehyde = data.loc[data['gas'] == "Acetaldehyde"].iloc[:, 2:]
acetone = data.loc[data['gas'] == "Acetone"].iloc[:, 2:]
toluene = data.loc[data['gas'] == "Toluene"].iloc[:, 2:]
data3 = [ethanol, ethylene, ammonia, acetaldehyde, acetone, toluene]

**Train-Test set split**

In [4]:
data1x_train, data1x_test, data1y_train, data1y_test = train_test_split(data1.iloc[:,1:], data1["concentration"], random_state=0)
data2x_train, data2x_test, data2y_train, data2y_test = train_test_split(data2.iloc[:,1:], data1["concentration"], random_state=0)

ethanol = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
ethylene = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
ammonia = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
acetaldehyde = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []}
acetone = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
toluene = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 

data3_splitted = [ethanol, ethylene, ammonia, acetaldehyde, acetone, toluene]
for index, df in enumerate(data3):
    data3_splitted[index]["xtrain"], data3_splitted[index]["xtest"], data3_splitted[index]["ytrain"], data3_splitted[index]["ytest"] = train_test_split(df.iloc[:,1:], df["concentration"], random_state=0)

**Initialization of the different prediction models**<br/>
1) Elastic Net Regression <br/>
2) kNeighborsRegression<br/>
3) SGDRegression <br/>
4) AdaBoost <br/>
5) bayesianRidge <br/>

In [5]:
en = ElasticNetCV(cv=5, max_iter=10000, random_state=0, normalize = True)
knn = KNeighborsRegressor()
sgdr = SGDRegressor()
dt = DecisionTreeRegressor(max_depth = 1)
ada = AdaBoostRegressor(dt)
br = BayesianRidge()

**Model fitting and Model evaluation** <br/>
metrics: MSE, MAE and R2 

Elastic net

In [6]:
# Initialization of the metric lists
en_mse = []
en_mae = []
en_r = []

# ElasticNet 
# Data 1 
en.fit(data1x_train, data1y_train)
en_mse.append(metrics.mean_squared_error(data1y_test, en.predict(data1x_test)))
en_mae.append(metrics.mean_absolute_error(data1y_test, en.predict(data1x_test)))
en_r.append(metrics.r2_score(data1y_test, en.predict(data1x_test)))

# Data 2 
en.fit(data2x_train, data2y_train)
en_mse.append(metrics.mean_squared_error(data2y_test, en.predict(data2x_test)))
en_mae.append(metrics.mean_absolute_error(data2y_test, en.predict(data2x_test)))
en_r.append(metrics.r2_score(data2y_test, en.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    en.fit(df["xtrain"], df["ytrain"])
    en_mse.append(metrics.mean_squared_error(df["ytest"], en.predict(df["xtest"])))
    en_mae.append(metrics.mean_absolute_error(df["ytest"], en.predict(df["xtest"])))
    en_r.append(metrics.r2_score(df["ytest"], en.predict(df["xtest"])))

Nearest Neighbors Regression (Sklearn normalizes)

In [8]:
knn_mse = []
knn_mae = []
knn_r = []
params = {"n_neighbors": np.arange(1,5), "weights": ["uniform", "distance"]}
grid = GridSearchCV(estimator=knn, param_grid=params,  scoring = "neg_mean_squared_error", cv = 5, iid = False)

# Nearest
# Data 1
grid.fit(data1x_train, data1y_train)
knn_mse.append(metrics.mean_squared_error(data1y_test, grid.predict(data1x_test)))
knn_mae.append(metrics.mean_absolute_error(data1y_test, grid.predict(data1x_test)))
knn_r.append(metrics.r2_score(data1y_test, grid.predict(data1x_test)))

# Data 2
grid.fit(data2x_train, data2y_train)
knn_mse.append(metrics.mean_squared_error(data2y_test, grid.predict(data2x_test)))
knn_mae.append(metrics.mean_absolute_error(data2y_test, grid.predict(data2x_test)))
knn_r.append(metrics.r2_score(data2y_test, grid.predict(data2x_test)))


# Data 3
for df in data3_splitted:
    grid.fit(df["xtrain"], df["ytrain"])
    knn_mse.append(metrics.mean_squared_error(df["ytest"], grid.predict(df["xtest"])))
    knn_mae.append(metrics.mean_absolute_error(df["ytest"], grid.predict(df["xtest"])))
    knn_r.append(metrics.r2_score(df["ytest"], grid.predict(df["xtest"])))

SGDRegressor

In [9]:
# List initizalization
sgd_mse = []
sgd_mae = []
sgd_r = []

# Grid initialization
params = {"loss": ["squared_loss", "huber", "epsilon_insensitive"]}
grid = GridSearchCV(estimator=sgdr, param_grid=params,  scoring = "neg_mean_squared_error", cv = 5, iid = False)

# Data 1
grid.fit(data1x_train, data1y_train)
sgd_mse.append(metrics.mean_squared_error(data1y_test, grid.predict(data1x_test)))
sgd_mae.append(metrics.mean_absolute_error(data1y_test, grid.predict(data1x_test)))
sgd_r.append(metrics.r2_score(data1y_test, grid.predict(data1x_test)))

# Data 2
grid.fit(data2x_train, data2y_train)
sgd_mse.append(metrics.mean_squared_error(data2y_test, grid.predict(data2x_test)))
sgd_mae.append(metrics.mean_absolute_error(data2y_test, grid.predict(data2x_test)))
sgd_r.append(metrics.r2_score(data2y_test, grid.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    grid.fit(df["xtrain"], df["ytrain"])
    sgd_mse.append(metrics.mean_squared_error(df["ytest"], grid.predict(df["xtest"])))
    sgd_mae.append(metrics.mean_absolute_error(df["ytest"], grid.predict(df["xtest"])))
    sgd_r.append(metrics.r2_score(df["ytest"], grid.predict(df["xtest"])))



ADABoost

In [10]:
# Metric list initialization
ada_mse = []
ada_mae = []
ada_r = []

# Grid initialization
param_dist = {'n_estimators': [50, 100], 'learning_rate' : [0.01,0.05,0.1,0.3,1], 'loss' : ['linear', 'square', 'exponential']}
grid = RandomizedSearchCV(ada, param_distributions = param_dist, cv=3, n_iter = 5, n_jobs=-1, scoring = "neg_mean_squared_error")

# Data 1
grid.fit(data1x_train, data1y_train)
ada_mse.append(metrics.mean_squared_error(data1y_test, grid.predict(data1x_test)))
ada_mae.append(metrics.mean_absolute_error(data1y_test, grid.predict(data1x_test)))
ada_r.append(metrics.r2_score(data1y_test, grid.predict(data1x_test)))

# Data 2
grid.fit(data2x_train, data2y_train)
ada_mse.append(metrics.mean_squared_error(data2y_test, grid.predict(data2x_test)))
ada_mae.append(metrics.mean_absolute_error(data2y_test, grid.predict(data2x_test)))
ada_r.append(metrics.r2_score(data2y_test, grid.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    grid.fit(df["xtrain"], df["ytrain"])
    ada_mse.append(metrics.mean_squared_error(df["ytest"], grid.predict(df["xtest"])))
    ada_mae.append(metrics.mean_absolute_error(df["ytest"], grid.predict(df["xtest"])))
    ada_r.append(metrics.r2_score(df["ytest"], grid.predict(df["xtest"])))

BayesianRdige

In [11]:
# List initizalization
br_mse = []
br_mae = []
br_r = []

# data 1
br.fit(data1x_train, data1y_train)
br_mse.append(metrics.mean_squared_error(data1y_test, br.predict(data1x_test)))
br_mae.append(metrics.mean_absolute_error(data1y_test, br.predict(data1x_test)))
br_r.append(metrics.r2_score(data1y_test, br.predict(data1x_test)))

# Data 2
br.fit(data2x_train, data2y_train)
br_mse.append(metrics.mean_squared_error(data2y_test, br.predict(data2x_test)))
br_mae.append(metrics.mean_absolute_error(data2y_test, br.predict(data2x_test)))
br_r.append(metrics.r2_score(data2y_test, br.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    br.fit(df["xtrain"], df["ytrain"])
    br_mse.append(metrics.mean_squared_error(df["ytest"], br.predict(df["xtest"])))
    br_mae.append(metrics.mean_absolute_error(df["ytest"], br.predict(df["xtest"])))
    br_r.append(metrics.r2_score(df["ytest"], br.predict(df["xtest"])))

**Metric tables**

In [12]:
row_names = ["Task 1", "Task 2", "Ethanol", "Ethylene", "Ammonia", "Acetaldehyde", "Acetone", "Toluene"]
mse_metrics = [en_mse, knn_mse, sgd_mse, ada_mse, br_mse]
mae_metrics = [en_mae, knn_mae, sgd_mae, ada_mae, br_mae]
r2_metrics = [en_r, knn_r, sgd_r, ada_r, br_r]

Mean-Squared-Error

In [13]:
mse_table = pd.DataFrame({"index": row_names, "Elastic Net": en_mse, "NN": knn_mse, "SGDR": sgd_mse, "AdaBoost": ada_mse, "br": br_mse})
mse_table.set_index("index").round()

Unnamed: 0_level_0,Elastic Net,NN,SGDR,AdaBoost,br
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Task 1,29489.0,1033.0,282198100000.0,12327.0,16457.0
Task 2,29395.0,1033.0,101499200000.0,11036.0,2575.0
Ethanol,6733.0,169.0,156007500000.0,2552.0,1363.0
Ethylene,1390.0,257.0,25395260000.0,2119.0,4683.0
Ammonia,8291.0,2464.0,424326400.0,15374.0,3630.0
Acetaldehyde,1247.0,173.0,159108900000.0,2044.0,178.0
Acetone,3605.0,453.0,2508835000000.0,8705.0,341.0
Toluene,395.0,269.0,2725266000.0,612.0,317.0


Mean-absolute-error

In [14]:
mae_table = pd.DataFrame({"index": row_names, "Elastic Net": en_mae, "NN": knn_mae, "SGDR": sgd_mae, "AdaBoost": ada_mae, "br": br_mae})
mae_table.set_index("index").round()

Unnamed: 0_level_0,Elastic Net,NN,SGDR,AdaBoost,br
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Task 1,108.0,8.0,327666.0,84.0,29.0
Task 2,108.0,8.0,239878.0,84.0,23.0
Ethanol,69.0,2.0,351803.0,39.0,14.0
Ethylene,29.0,6.0,137492.0,38.0,13.0
Ammonia,57.0,15.0,10185.0,94.0,27.0
Acetaldehyde,28.0,5.0,322563.0,38.0,9.0
Acetone,40.0,6.0,1200736.0,78.0,11.0
Toluene,10.0,4.0,37277.0,17.0,7.0


R-squared

In [15]:
r_table = pd.DataFrame({"index": row_names, "Elastic Net": en_r, "NN": knn_r, "SGDR": sgd_r, "AdaBoost": ada_r, "br": br_r})
r_table.set_index("index").round(2)

Unnamed: 0_level_0,Elastic Net,NN,SGDR,AdaBoost,br
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Task 1,0.04,0.97,-9233381.29,0.6,0.46
Task 2,0.04,0.97,-3321003.08,0.64,0.92
Ethanol,0.02,0.98,-22598177.96,0.63,0.8
Ethylene,0.79,0.96,-3758489.03,0.69,0.31
Ammonia,0.88,0.97,-5888.52,0.79,0.95
Acetaldehyde,0.77,0.97,-29017353.6,0.63,0.97
Acetone,0.92,0.99,-55527222.48,0.81,0.99
Toluene,0.63,0.75,-2523389.96,0.43,0.71
