**Library Import**

In [25]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

**Data Import**<br/>
1) model1: estimate the concentration of any gas <br/>
2) model2: estimate the concentration but gas feature is supplied <br/>
3) model3: estimate the concentration but model is create for each particular gas <br/>

In [5]:
# read in the data 
path = r"C:\Users\BrechtDewilde\Documents\Github\BDProject\data\data.csv"
data = pd.read_csv(path, index_col = 0)

# Create a particular x, y dataset for each task
data1 = data.iloc[:, 2:]
data2 = pd.get_dummies(data.iloc[:, 1:])

# for task 3 we need to create a model for each gas
ethanol = data.loc[data['gas'] == "Ethanol"].iloc[:, 2:]
ethylene = data.loc[data['gas'] == "Ethylene"].iloc[:, 2:]
ammonia = data.loc[data['gas'] == "Ammonia"].iloc[:, 2:]
acetaldehyde = data.loc[data['gas'] == "Acetaldehyde"].iloc[:, 2:]
acetone = data.loc[data['gas'] == "Acetone"].iloc[:, 2:]
toluene = data.loc[data['gas'] == "Toluene"].iloc[:, 2:]
data3 = [ethanol, ethylene, ammonia, acetaldehyde, acetone, toluene]

**Train-Test set split**

In [6]:
data1x_train, data1x_test, data1y_train, data1y_test = train_test_split(data1.iloc[:,1:], data1["concentration"], random_state=0)
data2x_train, data2x_test, data2y_train, data2y_test = train_test_split(data2.iloc[:,1:], data1["concentration"], random_state=0)

ethanol = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
ethylene = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
ammonia = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
acetaldehyde = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []}
acetone = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 
toluene = {"xtrain": [], "xtest": [], "ytrain": [], "ytest": []} 

data3_splitted = [ethanol, ethylene, ammonia, acetaldehyde, acetone, toluene]
for index, df in enumerate(data3):
    data3_splitted[index]["xtrain"], data3_splitted[index]["xtest"], data3_splitted[index]["ytrain"], data3_splitted[index]["ytest"] = train_test_split(df.iloc[:,1:], df["concentration"], random_state=0)

**Initialization of the different prediction models**<br/>
1) Elastic Net Regression <br/>
2) Support Vector Regression <br/>
3) Nearest Neighbour Regression<br/>
4) Boosting

In [3]:
en = ElasticNetCV(cv=5, max_iter=10000, random_state=0)
knn = KNeighborsRegressor()

**Model fitting and Model evaluation** <br/>
metrics: MSE, MAE and R2 

Elastic net

In [33]:
# Initialization of the metric lists
en_mse = []
en_mae = []
en_r = []

# ElasticNet 
# Data 1 
en.fit(data1x_train, data1y_train)
en_mse.append(metrics.mean_squared_error(data1y_test, en.predict(data1x_test)))
en_mae.append(metrics.mean_absolute_error(data1y_test, en.predict(data1x_test)))
en_ar.append(metrics.r2_score(data1y_test, en.predict(data1x_test)))

# Data 2 
en.fit(data2x_train, data2y_train)
en_mse.append(metrics.mean_squared_error(data2y_test, en.predict(data2x_test)))
en_mae.append(metrics.mean_absolute_error(data2y_test, en.predict(data2x_test)))
en_ar.append(metrics.r2_score(data2y_test, en.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    en.fit(df["xtrain"], df["ytrain"])
    en_mse.append(metrics.mean_squared_error(df["ytest"], en.predict(df["xtest"])))
    en_mae.append(metrics.mean_absolute_error(df["ytest"], en.predict(df["xtest"])))
    en_ar.append(metrics.r2_score(df["ytest"], en.predict(df["xtest"])))

Nearest Neighbors Regression

In [23]:
knn_mse = []
params = {"n_neighbors": np.arange(1,5), "weights": ["uniform", "distance"]}
grid = GridSearchCV(estimator=knn, param_grid=params,  scoring = "neg_mean_squared_error", cv = 5, iid = False)

# Nearest
# Data 1
grid.fit(data1x_train, data1y_train)
knn_mse.append(metrics.mean_squared_error(data1y_test, grid.predict(data1x_test)))

# Data 2
grid.fit(data2x_train, data2y_train)
knn_mse.append(metrics.mean_squared_error(data2y_test, grid.predict(data2x_test)))

# Data 3
for df in data3_splitted:
    grid.fit(df["xtrain"], df["ytrain"])
    knn_mse.append(metrics.mean_squared_error(df["ytest"], grid.predict(df["xtest"])))

In [None]:
mse_table = pd.DataFrame({"Elastic Net": en_mse, "SVM": [], "NN": [], "Booster": []})
# mse_table.set_index(["Data1", "Data2", "Ethanol", "Ethylene", "Ammonia", "Acetaldehyde", "Acetone", "Toluene"])

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [26]:
svr = SVR()
svr.fit(data1x_train, data1y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)