# Build Models
1. Load the 20 feature columns and the 4 label columns (2 for the thumb, 2 for the index finger) from features.csv
2. Train and compare different regression models on them

## 1. Initialization

In [22]:
# Dataset step used for training; it only selects which processed folder is read.
step = 2
# Folder holding the processed data for that step (Windows-style relative path).
processed_folder = rf".\ProcessedData\Dataset_Step={step}"
# Destination for trained models and location of the feature table, both inside it.
model_folder = rf"{processed_folder}\models"
features_path = rf"{processed_folder}\features.csv"

In [52]:
import pandas as pd
import os
import numpy as np
from enum import Enum
from sklearn.model_selection import train_test_split
import joblib 
import pickle
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, \
BaggingRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, \
VotingRegressor
import lightgbm as lgb
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from IPython.display import clear_output
from copy import deepcopy

## 2. Prepare data

In [3]:
class FingerType(Enum):
    """Finger whose pair of label columns is used as the regression target."""
    # Selects label columns 20-21 of the feature table in the data-loading cells.
    Thumb = 0
    # Selects label columns 22-23 of the feature table in the data-loading cells.
    Index = 1

# Finger modelled by the rest of the notebook; change it here to switch targets.
finger_type = FingerType.Thumb

In [4]:
# Read the feature table: the first 20 columns are the inputs, and each finger
# owns two consecutive label columns after them.
features_pd = pd.read_csv(features_path)
X = features_pd.iloc[:, :20].to_numpy()

# First label column for each finger; the target is that column and the next one.
label_start = {FingerType.Thumb: 20, FingerType.Index: 22}[finger_type]
Y = features_pd.iloc[:, label_start:label_start + 2].to_numpy()

print("Original dataset shape")
print("X:", X.shape, " Y:", Y.shape)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
for split_title, split_X, split_Y in (
    ("Train dataset shape", X_train, Y_train),
    ("Test dataset shape", X_test, Y_test),
):
    print(split_title)
    print("X:", split_X.shape, " Y:", split_Y.shape)

Original dataset shape
X: (465470, 20)  Y: (465470, 2)
Train dataset shape
X: (325829, 20)  Y: (325829, 2)
Test dataset shape
X: (139641, 20)  Y: (139641, 2)


## 3. Prepare the regression model pipeline
### Use multi output regressor

In [31]:
def generate_joint_model(single_model):
    """Fit one multi-output regressor on the training split, score it and save it.

    ``single_model`` is wrapped in a ``MultiOutputRegressor`` and fitted on the
    notebook-level ``X_train`` / ``Y_train``. The values returned by
    ``model.score`` on the train and test splits are printed as percentages,
    and the fitted wrapper is written with joblib into ``model_folder``.

    Parameters
    ----------
    single_model : estimator
        Regressor to wrap; its class name becomes part of the file name.

    Returns
    -------
    model : MultiOutputRegressor
        The fitted wrapper.
    model_path : str
        Path of the saved file, named ``<test score>_<estimator class>.joblib``
        with the score rounded to 3 decimals and ``.`` replaced by ``_``.
    """
    model = MultiOutputRegressor(single_model)
    model.fit(X_train, Y_train)

    score_train = model.score(X_train, Y_train)
    score = model.score(X_test, Y_test)

    # Replace whatever was printed while fitting with the final scores.
    clear_output(wait=True)
    print('Score of train', round(score_train * 100, 1), "%")
    print('Score of test', round(score * 100, 1), "%")

    # joblib.dump does not create missing directories, so a fresh checkout
    # would otherwise fail only after the (expensive) fit has finished.
    os.makedirs(model_folder, exist_ok=True)
    file_name = "{}_{}.joblib".format(
        str(round(score, 3)).replace('.', '_'),
        type(single_model).__name__,
    )
    model_path = os.path.join(model_folder, file_name)
    joblib.dump(model, model_path)
    print("Save model file", model_path)

    return model, model_path

### Use single output regressor

In [65]:
def generate_single_models(single_model):
    """Fit one independent copy of ``single_model`` per target column and save each.

    For every label in ``['x', 'y']`` a deep copy of ``single_model`` is fitted
    on the notebook-level ``X_train`` against the matching column of
    ``Y_train``, scored with ``model.score`` on the train and test splits, and
    written with joblib into ``model_folder``. The scores and file paths are
    printed once all models are trained.

    Parameters
    ----------
    single_model : estimator
        Template regressor; it is deep-copied, so the argument itself is not fitted.

    Returns
    -------
    models : list
        Fitted models, in the order ``x``, ``y``.
    file_names : list of str
        Paths of the saved files, named
        ``<test score>_<estimator class>_<label>.joblib`` with the score rounded
        to 3 decimals and ``.`` replaced by ``_``.
    """
    file_label = ['x', 'y']
    estimator_name = type(single_model).__name__

    # joblib.dump does not create missing directories; make sure the target exists
    # before spending time on fitting.
    os.makedirs(model_folder, exist_ok=True)

    models = []
    scores = []
    file_names = []
    for i, label in enumerate(file_label):
        model = deepcopy(single_model)
        model.fit(X_train, Y_train[:, i])
        models.append(model)

        score_train = model.score(X_train, Y_train[:, i])
        score_test = model.score(X_test, Y_test[:, i])
        scores.append({'train': score_train, 'test': score_test})

        file_name = "{}_{}_{}.joblib".format(
            str(round(score_test, 3)).replace('.', '_'),
            estimator_name,
            label,
        )
        # os.path.join inserts a single separator (the previous raw string
        # r'\\' put two backslashes into every saved path).
        model_path = os.path.join(model_folder, file_name)
        joblib.dump(model, model_path)
        file_names.append(model_path)

    # Replace whatever was printed while fitting with the final summary.
    clear_output(wait=True)
    for label, single_score, saved_path in zip(file_label, scores, file_names):
        print(label)
        print('Score of train', round(single_score['train'] * 100, 1), "%")
        print('Score of test',  round(single_score['test']  * 100, 1), "%")
        print("Save model file", saved_path)

    return models, file_names

## 4. Use specific model

In [56]:
# Estimator handed to generate_joint_model / generate_single_models.
# Exactly one `single_model = ...` assignment should be active; the commented
# lines are alternatives that were tried.
# NOTE(review): lines marked `**` presumably flag candidates of interest - confirm.

# single_model = SVR(kernel='linear', C=1.0, epsilon=0.2, max_iter=10000)
# single_model = SVR(kernel='poly', C=1.0, epsilon=0.2, max_iter=10000)
# single_model = SVR(kernel='sigmoid', C=1.0, epsilon=0.2, tol=0.1)
# NOTE(review): LogisticRegression is a classifier, not a regressor - confirm it belongs here.
# single_model = LogisticRegression(random_state=0)

# random_state is fixed so the scores (and the score-derived file names of the
# saved models) are reproducible across runs.
single_model = RandomForestRegressor(n_estimators=10, max_depth=None, 
                                     min_samples_split=15, min_samples_leaf=15,
                                     random_state=42,
                                     verbose=3)
# single_model = RandomForestRegressor(n_estimators=60, max_depth=None, criterion='mae',
#                                      min_samples_split=15, min_samples_leaf=15, verbose=1)


# single_model = LinearRegression()

# single_model = AdaBoostRegressor(random_state=0, n_estimators=100, loss='square')
# single_model = BaggingRegressor(base_estimator=SVR(), n_estimators=10, random_state=0)

# ** single_model = GradientBoostingRegressor(n_estimators=10000)
# ** single_model = HistGradientBoostingRegressor(max_iter=10000)

# r1 = RandomForestRegressor(max_depth=None, random_state=None)
# r2 = GradientBoostingRegressor(n_estimators=10000)
# r3 = HistGradientBoostingRegressor(max_iter=10000)
# ** single_model = VotingRegressor([('RF', r1), ('GB', r2), ('HGB', r3)])

# LR1 = LinearRegression()
# LR2 = LinearRegression()
# LR3 = LinearRegression()
# single_model = VotingRegressor([('LR1', LR1), ('LR2', LR2), ('LR3', LR3)])

# ** single_model = lgb.LGBMRegressor(boosting_type='dart', num_leaves=10, n_estimators=100)

# single_model = MLPRegressor()

# kernel = DotProduct() + WhiteKernel()
# single_model = GaussianProcessRegressor(kernel=kernel, random_state=0)

In [35]:
# Train a single multi-output model for both target columns; this also writes it to model_folder.
model, model_path = generate_joint_model(single_model)

Score of train 92.4 %
Score of test 90.6 %
Save model file .\ProcessedData\Dataset_Step=2\models\0_906_RandomForestRegressor.joblib


In [66]:
# Train one independent model per target column (x, y); each is saved to its own file.
models, model_paths = generate_single_models(single_model)

x
Score of train 90.9 %
Score of test 88.9 %
Save model file .\ProcessedData\Dataset_Step=2\models\\0_889_RandomForestRegressor_x.joblib
y
Score of train 94.0 %
Score of test 92.4 %
Save model file .\ProcessedData\Dataset_Step=2\models\\0_924_RandomForestRegressor_y.joblib


## 5. Load model from disk and test on another dataset

In [36]:
# Switch to another dataset step for evaluation. Rebinding `step` here does not
# change processed_folder / model_folder, which were built from the earlier value.
step = 3
features_pd = pd.read_csv(rf".\ProcessedData\Dataset_Step={step}\features.csv")

# Same column layout as the training data: 20 feature columns, then two label
# columns per finger.
X_a = features_pd.iloc[:, :20].to_numpy()
label_start = {FingerType.Thumb: 20, FingerType.Index: 22}[finger_type]
Y_a = features_pd.iloc[:, label_start:label_start + 2].to_numpy()

print("Original dataset shape")
print("X:", X_a.shape, " Y:", Y_a.shape)

Original dataset shape
X: (54268, 20)  Y: (54268, 2)


In [None]:
# Load a previously saved model from disk, replacing the in-memory `model` / `model_path`.
# NOTE(review): this path points at the Dataset_Step=3 models folder and a 0_89 score,
# while the joint model trained above was saved under Dataset_Step=2 - confirm which
# file is intended before running this cell.
# joblib.load unpickles the file, so only load model files from a trusted source.
model_path = r'./ProcessedData/Dataset_Step=3/models/0_89_RandomForestRegressor.joblib'
model = joblib.load(model_path)
# model.get_params()

In [45]:
# Score the current `model` on the second dataset (X_a, Y_a) and report it as a percentage.
score = model.score(X_a, Y_a)
print('Score on another dataset', round(score * 100, 1), "%")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Score on another dataset 73.1 %


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


## 6. Convert to other format 