In [1]:
%matplotlib inline
import pandas as pd
import librosa as lib
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio
import scipy

In [2]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import TimeSeriesSplit, train_test_split, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

In [3]:
from dask import dataframe as dd
import joblib
from dask.distributed import Client, LocalCluster
cluster = LocalCluster()
client = Client(cluster)
cluster.scheduler, cluster.workers

Perhaps you already have a cluster running?
Hosting the HTTP server on port 53738 instead


(<Scheduler: "tcp://127.0.0.1:53741" processes: 3 cores: 6>,
 {0: <Nanny: tcp://127.0.0.1:53763, threads: 2>,
  1: <Nanny: tcp://127.0.0.1:53760, threads: 2>,
  2: <Nanny: tcp://127.0.0.1:53766, threads: 2>})

In [4]:
model_data = pd.read_pickle('DFs/model_data.pkl')

In [5]:
def cross_val_time(model, algo,  X, y, split=5):
    count=1
    tscv = TimeSeriesSplit(n_splits=split)
    scoretemp = {algo+'_MAE':[], algo+'_MSE':[], algo+'_MPE':[]}
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(train_index[0], train_index[-1], test_index[0], test_index[-1])
        with joblib.parallel_backend('dask'):
            model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        #scoretemp[algo + '_R2'].append(r2_score(y_test, y_pred))
        scoretemp[algo + '_MAE'].append(mean_absolute_error(y_test, y_pred))
        scoretemp[algo + '_MSE'].append(mean_squared_error(y_test, y_pred))
        scoretemp[algo + '_MPE'].append(mean_absolute_percentage_error(y_test, y_pred))
        for i in scoretemp.keys():
            print(i + ': '+ str(scoretemp[i][count-1]))
        count+=1
    scoretemp = pd.DataFrame(scoretemp)
    return scoretemp

In [6]:
def Xy_prep(df, target, a=1, b=75):
    df_1 = df.loc[df['PID'].between(a, b)]
    drop_cols = ['PID', 'SID', 'target_5', 'target_10', 'target_20', 'X', 'Y','Z', 'segment']
    df_1i = df_1.set_index('millisecond')
    participants = b-a+1
    cut_rows = int(target[-2:])*participants*10
    X = df_1i.drop(columns=drop_cols).to_numpy()[:len(df)-cut_rows]
    y = df_1i[target].to_numpy()[:len(df)-cut_rows]
    cols = df_1i.drop(columns=drop_cols).columns
    participants = b-a+1
    
    return X, y, cols, participants, cut_rows

In [14]:
rf = RandomForestRegressor()
params = {'criterion':['mse', 'mae'], 'max_depth':[1, 3, 5, 7], 'max_leaf_nodes':[2, 4, 6, 10], 
          'min_samples_split':[.1, .2, .3, .4, .5, .6, .7, .8, .9], 'min_samples_leaf':[.1, .2, .3, .4, .5]}
rsearch = RandomizedSearchCV(estimator=rf, param_distributions = params, 
                             scoring=['neg_mean_absolute_error', 'neg_mean_squared_error'], refit='neg_mean_absolute_error')
X, y, cols, participants, cut_rows = Xy_prep(model_data, 'target_10', 7, 7)
with joblib.parallel_backend('dask'):
    rsearch.fit(X, y)
rsearch.cv_results_

{'mean_fit_time': array([33.98706074, 32.36010032, 55.66196556, 58.29931355, 47.97609158,
        19.84963942, 60.72575469, 81.12411828, 93.38701329, 38.67571592]),
 'std_fit_time': array([ 3.11682946,  1.66133965, 10.6563258 , 14.97238165, 23.61278939,
        19.85241766, 14.94537091,  2.34506947, 12.63848305, 29.47614101]),
 'mean_score_time': array([0.1566596 , 0.20663223, 0.06368294, 0.06877284, 0.09560366,
        0.13050227, 0.23616714, 0.09220009, 0.07615347, 0.07101202]),
 'std_score_time': array([0.13659719, 0.11364672, 0.03217886, 0.03684385, 0.0780982 ,
        0.10313131, 0.15325235, 0.04665735, 0.05219025, 0.04376756]),
 'param_min_samples_split': masked_array(data=[0.6, 0.3, 0.3, 0.4, 0.7, 0.9, 0.1, 0.4, 0.6, 0.3],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[0.3, 0.5, 0.1, 0.2, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
