**Library import**

In [31]:
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn.linear_model import ElasticNetCV, SGDRegressor, BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

**Data import**

In [3]:
# read in the data 
path = r"C:\Users\BrechtDewilde\Documents\Github\BDProject\data\data.csv"
data = pd.read_csv(path, index_col = 0)

# Create a particular x, y dataset for each task
data1 = data.iloc[:, 2:]
data2 = pd.get_dummies(data.iloc[:, 1:])

# for task 3 we need to create a model for each gas
ethanol = data.loc[data['gas'] == "Ethanol"].iloc[:, 2:]
ethylene = data.loc[data['gas'] == "Ethylene"].iloc[:, 2:]
ammonia = data.loc[data['gas'] == "Ammonia"].iloc[:, 2:]
acetaldehyde = data.loc[data['gas'] == "Acetaldehyde"].iloc[:, 2:]
acetone = data.loc[data['gas'] == "Acetone"].iloc[:, 2:]
toluene = data.loc[data['gas'] == "Toluene"].iloc[:, 2:]
data3 = [ethanol, ethylene, ammonia, acetaldehyde, acetone, toluene]

**Train-test split**

In [32]:
data1x_train, data1x_test, data1y_train, data1y_test = train_test_split(data1.iloc[:,1:], data1["concentration"], random_state=0)
data2x_train, data2x_test, data2y_train, data2y_test = train_test_split(data2.iloc[:,1:], data1["concentration"], random_state=0)

ethanol = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
ethylene = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
ammonia = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
acetaldehyde = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []}
acetone = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
toluene = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 

data3_splitted = [ethanol, ethylene, ammonia, acetaldehyde, acetone, toluene]
for index, df in enumerate(data3):
    data3_splitted[index]["xtrain"], data3_splitted[index]["xtest"], data3_splitted[index]["ytrain"], data3_splitted[index]["ytest"] = train_test_split(df.iloc[:,1:], df["concentration"], random_state=0)

## Dimension Reduction 
<br/> First run the train-test split again, then run the cell of the wanted dimension reduction and finally run the code and metric cell

**Feature Importance with random forest**, the feature importance will be done only on the training set.

In [25]:
# Creation of the random forest object
rf = RandomForestRegressor(100, max_depth = 10, random_state=0, n_jobs=-1)

# Create selector object that only pick the most important features  
sfm = SelectFromModel(rf, threshold=0.15)

# The randomForest will be created, the feature importances calculated and the most important features kept in memory
sfm.fit(data1x_train, data1y_train)

# The original data is transformed such that only the important features remain
data1x_train = sfm.transform(data1x_train)
data1x_test = sfm.transform(data1x_test)

# Second dataset
sfm.fit(data2x_train, data2y_train)
data2x_train = sfm.transform(data2x_train)
data2x_test = sfm.transform(data2x_test)

# Individual datasets
for index,df in enumerate(data3_splitted):
    sfm.fit(df["xtrain"], df["ytrain"])
    data3_splitted[index]["xtrain"] = sfm.transform(df["xtrain"])
    data3_splitted[index]["xtest"] = sfm.transform(df["xtest"])

**PCA** <br/> Important to standardize before applying PCA

In [23]:
## Performing standardization before applying PCA
scaler = StandardScaler()
pca = PCA(0.95)

# first dataset
scaler.fit(data1x_train)
data1x_train = scaler.transform(data1x_train)
pca.fit(data1x_train)
data1x_train = pca.transform(data1x_train)
data1x_test = pca.transform(data1x_test)

# second dataset
scaler.fit(data2x_train)
data2x_train = scaler.transform(data2x_train)
pca.fit(data2x_train)
data2x_train = pca.transform(data2x_train)
data2x_test = pca.transform(data2x_test)

# Individual datasets
for index,df in enumerate(data3_splitted):
    scaler.fit(df["xtrain"])
    df["xtrain"] = scaler.transform(df["xtrain"])
    pca.fit(df["xtrain"])
    data3_splitted[index]["xtrain"] = pca.transform(df["xtrain"])
    data3_splitted[index]["xtest"] = pca.transform(df["xtest"])

**Recursive feature selection with linear model**

In [57]:
# creation of the lr and selector model
lr = LinearRegression(normalize=True)
selector = RFECV(estimator = lr, scoring = "neg_mean_squared_error", cv = 5)

# Data 1 
data1x_train = selector.fit_transform(data1x_train, data1y_train)
data1x_test = selector.transform(data1x_test)

# Data 2
data1x_train = selector.fit_transform(data2x_train, data2y_train)
data1x_test = selector.transform(data2x_test)

# Data 3
for index,df in enumerate(data3_splitted):
    data3_splitted[index]["xtrain"] = selector.fit_transform(df["xtrain"], df["ytrain"])
    data3_splitted[index]["xtest"] = selector.transform(df["xtest"])

**Initialization of the different prediction models**<br/>
1) Elastic Net Regression <br/>
2) kNeighborsRegression<br/>
3) SGDRegression <br/>
4) AdaBoost <br/>
5) bayesianRidge <br/>

In [60]:
en = ElasticNetCV(cv=5, max_iter=10000, random_state=0)
knn = KNeighborsRegressor()
sgdr = SGDRegressor()
dt = DecisionTreeRegressor(max_depth = 1)
ada = AdaBoostRegressor(dt)
br = BayesianRidge()

# Initialization of the metric lists
en_mse = []
en_mae = []
en_r = []

# ElasticNet 
# Data 1 
en.fit(data1x_train, data1y_train)
en_mse.append(metrics.mean_squared_error(data1y_test, en.predict(data1x_test)))
en_mae.append(metrics.mean_absolute_error(data1y_test, en.predict(data1x_test)))
en_r.append(metrics.r2_score(data1y_test, en.predict(data1x_test)))

# Data 2 
en.fit(data2x_train, data2y_train)
en_mse.append(metrics.mean_squared_error(data2y_test, en.predict(data2x_test)))
en_mae.append(metrics.mean_absolute_error(data2y_test, en.predict(data2x_test)))
en_r.append(metrics.r2_score(data2y_test, en.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    en.fit(df["xtrain"], df["ytrain"])
    en_mse.append(metrics.mean_squared_error(df["ytest"], en.predict(df["xtest"])))
    en_mae.append(metrics.mean_absolute_error(df["ytest"], en.predict(df["xtest"])))
    en_r.append(metrics.r2_score(df["ytest"], en.predict(df["xtest"])))

knn_mse = []
knn_mae = []
knn_r = []
params = {"n_neighbors": np.arange(1,5), "weights": ["uniform", "distance"]}
grid = GridSearchCV(estimator=knn, param_grid=params,  scoring = "neg_mean_squared_error", cv = 5, iid = False)

# Nearest
# Data 1
grid.fit(data1x_train, data1y_train)
knn_mse.append(metrics.mean_squared_error(data1y_test, grid.predict(data1x_test)))
knn_mae.append(metrics.mean_absolute_error(data1y_test, grid.predict(data1x_test)))
knn_r.append(metrics.r2_score(data1y_test, grid.predict(data1x_test)))

# Data 2
grid.fit(data2x_train, data2y_train)
knn_mse.append(metrics.mean_squared_error(data2y_test, grid.predict(data2x_test)))
knn_mae.append(metrics.mean_absolute_error(data2y_test, grid.predict(data2x_test)))
knn_r.append(metrics.r2_score(data2y_test, grid.predict(data2x_test)))


# Data 3
for df in data3_splitted:
    grid.fit(df["xtrain"], df["ytrain"])
    knn_mse.append(metrics.mean_squared_error(df["ytest"], grid.predict(df["xtest"])))
    knn_mae.append(metrics.mean_absolute_error(df["ytest"], grid.predict(df["xtest"])))
    knn_r.append(metrics.r2_score(df["ytest"], grid.predict(df["xtest"])))

# List initizalization
sgd_mse = []
sgd_mae = []
sgd_r = []

# Grid initialization
params = {"loss": ["squared_loss", "huber", "epsilon_insensitive"]}
grid = GridSearchCV(estimator=sgdr, param_grid=params,  scoring = "neg_mean_squared_error", cv = 5, iid = False)

# Data 1
grid.fit(data1x_train, data1y_train)
sgd_mse.append(metrics.mean_squared_error(data1y_test, grid.predict(data1x_test)))
sgd_mae.append(metrics.mean_absolute_error(data1y_test, grid.predict(data1x_test)))
sgd_r.append(metrics.r2_score(data1y_test, grid.predict(data1x_test)))

# Data 2
grid.fit(data2x_train, data2y_train)
sgd_mse.append(metrics.mean_squared_error(data2y_test, grid.predict(data2x_test)))
sgd_mae.append(metrics.mean_absolute_error(data2y_test, grid.predict(data2x_test)))
sgd_r.append(metrics.r2_score(data2y_test, grid.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    grid.fit(df["xtrain"], df["ytrain"])
    sgd_mse.append(metrics.mean_squared_error(df["ytest"], grid.predict(df["xtest"])))
    sgd_mae.append(metrics.mean_absolute_error(df["ytest"], grid.predict(df["xtest"])))
    sgd_r.append(metrics.r2_score(df["ytest"], grid.predict(df["xtest"])))

# Metric list initialization
ada_mse = []
ada_mae = []
ada_r = []

# Grid initialization
param_dist = {'n_estimators': [50, 100], 'learning_rate' : [0.01,0.05,0.1,0.3,1], 'loss' : ['linear', 'square', 'exponential']}
grid = RandomizedSearchCV(ada, param_distributions = param_dist, cv=3, n_iter = 5, n_jobs=-1, scoring = "neg_mean_squared_error")

# Data 1
grid.fit(data1x_train, data1y_train)
ada_mse.append(metrics.mean_squared_error(data1y_test, grid.predict(data1x_test)))
ada_mae.append(metrics.mean_absolute_error(data1y_test, grid.predict(data1x_test)))
ada_r.append(metrics.r2_score(data1y_test, grid.predict(data1x_test)))

# Data 2
grid.fit(data2x_train, data2y_train)
ada_mse.append(metrics.mean_squared_error(data2y_test, grid.predict(data2x_test)))
ada_mae.append(metrics.mean_absolute_error(data2y_test, grid.predict(data2x_test)))
ada_r.append(metrics.r2_score(data2y_test, grid.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    grid.fit(df["xtrain"], df["ytrain"])
    ada_mse.append(metrics.mean_squared_error(df["ytest"], grid.predict(df["xtest"])))
    ada_mae.append(metrics.mean_absolute_error(df["ytest"], grid.predict(df["xtest"])))
    ada_r.append(metrics.r2_score(df["ytest"], grid.predict(df["xtest"])))

# List initizalization
br_mse = []
br_mae = []
br_r = []

# data 1
br.fit(data1x_train, data1y_train)
br_mse.append(metrics.mean_squared_error(data1y_test, br.predict(data1x_test)))
br_mae.append(metrics.mean_absolute_error(data1y_test, br.predict(data1x_test)))
br_r.append(metrics.r2_score(data1y_test, br.predict(data1x_test)))

# Data 2
br.fit(data2x_train, data2y_train)
br_mse.append(metrics.mean_squared_error(data2y_test, br.predict(data2x_test)))
br_mae.append(metrics.mean_absolute_error(data2y_test, br.predict(data2x_test)))
br_r.append(metrics.r2_score(data2y_test, br.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    br.fit(df["xtrain"], df["ytrain"])
    br_mse.append(metrics.mean_squared_error(df["ytest"], br.predict(df["xtest"])))
    br_mae.append(metrics.mean_absolute_error(df["ytest"], br.predict(df["xtest"])))
    br_r.append(metrics.r2_score(df["ytest"], br.predict(df["xtest"])))



**Metric tables**

In [61]:
row_names = ["Task 1", "Task 2", "Ethanol", "Ethylene", "Ammonia", "Acetaldehyde", "Acetone", "Toluene"]
mse_metrics = [en_mse, knn_mse, sgd_mse, ada_mse, br_mse]
mae_metrics = [en_mae, knn_mae, sgd_mae, ada_mae, br_mae]
r2_metrics = [en_r, knn_r, sgd_r, ada_r, br_r]

In [62]:
mse_table = pd.DataFrame({"index": row_names, "Elastic Net": en_mse, "NN": knn_mse, "SGDR": sgd_mse, "AdaBoost": ada_mse, "br": br_mse})
mse_table.set_index("index").round()

Unnamed: 0_level_0,Elastic Net,NN,SGDR,AdaBoost,br
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Task 1,7475.0,855.0,4470.0,12408.0,3870.0
Task 2,6029.0,1033.0,16713580000.0,11546.0,2575.0
Ethanol,2439.0,83.0,1671.0,3248.0,1300.0
Ethylene,909.0,268.0,748.0,2092.0,7209.0
Ammonia,15366.0,4490.0,21116.0,15587.0,2877.0
Acetaldehyde,857.0,173.0,692482500000.0,2062.0,178.0
Acetone,1538.0,241.0,1165.0,8961.0,375.0
Toluene,391.0,317.0,330.0,660.0,1151.0


In [63]:
mae_table = pd.DataFrame({"index": row_names, "Elastic Net": en_mae, "NN": knn_mae, "SGDR": sgd_mae, "AdaBoost": ada_mae, "br": br_mae})
mae_table.set_index("index").round()

Unnamed: 0_level_0,Elastic Net,NN,SGDR,AdaBoost,br
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Task 1,59.0,7.0,38.0,88.0,36.0
Task 2,50.0,8.0,82973.0,85.0,23.0
Ethanol,36.0,2.0,24.0,46.0,23.0
Ethylene,20.0,5.0,18.0,39.0,14.0
Ammonia,93.0,28.0,82.0,94.0,27.0
Acetaldehyde,22.0,5.0,698077.0,38.0,9.0
Acetone,24.0,4.0,20.0,80.0,12.0
Toluene,11.0,4.0,8.0,18.0,7.0


In [64]:
r_table = pd.DataFrame({"index": row_names, "Elastic Net": en_r, "NN": knn_r, "SGDR": sgd_r, "AdaBoost": ada_r, "br": br_r})
r_table.set_index("index").round(2)

Unnamed: 0_level_0,Elastic Net,NN,SGDR,AdaBoost,br
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Task 1,0.76,0.97,0.85,0.59,0.87
Task 2,0.8,0.97,-546858.9,0.62,0.92
Ethanol,0.65,0.99,0.76,0.53,0.81
Ethylene,0.87,0.96,0.89,0.69,-0.07
Ammonia,0.79,0.94,0.71,0.78,0.96
Acetaldehyde,0.84,0.97,-126290900.0,0.62,0.97
Acetone,0.97,0.99,0.97,0.8,0.99
Toluene,0.64,0.71,0.69,0.39,-0.07
