In [1]:
%matplotlib inline
import pandas as pd
import librosa as lib
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio
import scipy

In [2]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import TimeSeriesSplit, train_test_split, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

In [3]:
# Import project-specific helper functions; these can be found in the model_prep folder of the repo.
from model_prep import Xy_prep_all, Xy_prep_silence, Xy_prep_music, cross_val_time

In [4]:
from dask import dataframe as dd
import joblib
from dask.distributed import Client, LocalCluster
# Start a local Dask cluster and attach a client to it; later cells run
# their joblib work on it through joblib.parallel_backend('dask').
cluster = LocalCluster()
client = Client(cluster)
# Show the scheduler and the worker processes as the cell output.
cluster.scheduler, cluster.workers

Perhaps you already have a cluster running?
Hosting the HTTP server on port 52265 instead


(<Scheduler: "tcp://127.0.0.1:52268" processes: 3 cores: 6>,
 {0: <Nanny: tcp://127.0.0.1:52290, threads: 2>,
  1: <Nanny: tcp://127.0.0.1:52293, threads: 2>,
  2: <Nanny: tcp://127.0.0.1:52287, threads: 2>})

In [5]:
# Load the pickled modelling data from a path relative to the working directory.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
model_data = pd.read_pickle('DFs/model_data.pkl')

# RandomForest Randomized Search - Silence

In [7]:
# Randomized hyper-parameter search for a RandomForest on the output of
# Xy_prep_silence with the 'target_20' target. Every candidate is scored on
# MAE and MAPE; the final estimator is refit using MAPE.
# NOTE(review): the trailing 20, 25 presumably bound the participant IDs -
# confirm against model_prep.
X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_silence(
    model_data, 'target_20', 20, 25
)

rf = RandomForestRegressor()
# 'mse' / 'mae' are the criterion names used by scikit-learn releases before 1.0.
params = {
    'criterion': ['mse', 'mae'],
    'max_depth': [1, 3, 5, 7],
    'max_leaf_nodes': [2, 4, 6, 10],
    'min_samples_split': [.1, .2, .3, .4, .5, .6, .7, .8, .9],
    'min_samples_leaf': [.1, .2, .3, .4, .5],
}
rsearch = RandomizedSearchCV(
    estimator=rf,
    param_distributions=params,
    n_iter=20,
    scoring=['neg_mean_absolute_error', 'neg_mean_absolute_percentage_error'],
    refit='neg_mean_absolute_percentage_error',
)

# Run the search's parallel work on the Dask joblib backend.
with joblib.parallel_backend('dask'):
    rsearch.fit(X_train, y_train)
rsearch.cv_results_

KeyboardInterrupt: 

In [None]:
# Best cross-validated score, the parameters that produced it, and the
# refit model's score on the held-out test split.
(
    rsearch.best_score_,
    rsearch.best_params_,
    rsearch.score(X_test, y_test),
)

In [None]:
# Compare held-out predictions with the true targets, both smoothed with a
# 200-row rolling mean so the two curves are readable.
y_pred = rsearch.predict(X_test)
plot_pred = pd.DataFrame(y_pred, columns=['pred'])
plot_pred.insert(0, value=y_test, column='true')
sns.lineplot(data=plot_pred.rolling(200).mean())
plt.title('RandomForest Search Predictions vs Actual - 20s - Silence')
# Wide-form lineplot: the row index is drawn on x and the column values on y,
# so the movement total labels the y axis and the time label goes on x
# (the two labels were previously swapped).
plt.xlabel('milliseconds (relative to section of music)')
plt.ylabel('total mm of movement in 20s')
plt.show()

# RandomForest Randomized Search - Music

In [None]:
# Randomized hyper-parameter search for a RandomForest on the output of
# Xy_prep_music with the 'target_20' target. Every candidate is scored on
# MAE and MAPE; the final estimator is refit using MAPE.
# NOTE(review): the trailing 20, 25 presumably bound the participant IDs -
# confirm against model_prep.
X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_music(
    model_data, 'target_20', 20, 25
)

rf = RandomForestRegressor()
# 'mse' / 'mae' are the criterion names used by scikit-learn releases before 1.0.
params = {
    'criterion': ['mse', 'mae'],
    'max_depth': [1, 3, 5, 7],
    'max_leaf_nodes': [2, 4, 6, 10],
    'min_samples_split': [.1, .2, .3, .4, .5, .6, .7, .8, .9],
    'min_samples_leaf': [.1, .2, .3, .4, .5],
}
rsearch = RandomizedSearchCV(
    estimator=rf,
    param_distributions=params,
    n_iter=25,
    scoring=['neg_mean_absolute_error', 'neg_mean_absolute_percentage_error'],
    refit='neg_mean_absolute_percentage_error',
)

# Run the search's parallel work on the Dask joblib backend.
with joblib.parallel_backend('dask'):
    rsearch.fit(X_train, y_train)
rsearch.cv_results_

In [None]:
# Best cross-validated score, the parameters that produced it, and the
# refit model's score on the held-out test split.
(
    rsearch.best_score_,
    rsearch.best_params_,
    rsearch.score(X_test, y_test),
)

In [None]:
# Compare held-out predictions with the true targets, both smoothed with a
# 200-row rolling mean so the two curves are readable.
y_pred = rsearch.predict(X_test)
plot_pred = pd.DataFrame(y_pred, columns=['pred'])
plot_pred.insert(0, value=y_test, column='true')
sns.lineplot(data=plot_pred.rolling(200).mean())
plt.title('RandomForest Search Predictions vs Actual - 20s - Music')
# Wide-form lineplot: the row index is drawn on x and the column values on y,
# so the movement total labels the y axis and the time label goes on x
# (the two labels were previously swapped).
plt.xlabel('milliseconds (relative to section of music)')
plt.ylabel('total mm of movement in 20s')
plt.show()

# ElasticNet Randomized Search - Silence

In [None]:
# Randomized search over ElasticNet regularisation strength (alpha) and
# L1/L2 mix (l1_ratio) on the output of Xy_prep_silence. Candidates are
# scored on MAE and MAPE; the final estimator is refit using MAPE.
elastic = ElasticNet()
params_elastic = {
    'alpha': [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
    'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9],
}
rsearch_elastic = RandomizedSearchCV(
    estimator=elastic,
    param_distributions=params_elastic,
    n_iter=20,
    scoring=['neg_mean_absolute_error', 'neg_mean_absolute_percentage_error'],
    refit='neg_mean_absolute_percentage_error',
)
# NOTE(review): this search uses 'target_10' while the matching plot title
# says "20s" and the other cells use 'target_20' - confirm which is intended.
X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_silence(model_data, 'target_10', 20, 25)

# The scaler was never instantiated, so create it here. It is fitted on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Run the search's parallel work on the Dask joblib backend.
with joblib.parallel_backend('dask'):
    rsearch_elastic.fit(X_train, y_train)
rsearch_elastic.cv_results_

In [None]:
# Best cross-validated score, the parameters that produced it, and the
# refit model's score on the held-out test split.
(
    rsearch_elastic.best_score_,
    rsearch_elastic.best_params_,
    rsearch_elastic.score(X_test, y_test),
)

In [None]:
# Compare held-out predictions with the true targets, both smoothed with a
# 200-row rolling mean so the two curves are readable.
y_pred = rsearch_elastic.predict(X_test)
plot_pred = pd.DataFrame(y_pred, columns=['pred'])
plot_pred.insert(0, value=y_test, column='true')
sns.lineplot(data=plot_pred.rolling(200).mean())
# NOTE(review): the title and y label say "20s" but the ElasticNet search cell
# requests 'target_10' - confirm which horizon is plotted.
plt.title('ElasticNet Search Predictions vs Actual - 20s - Silence')
# Wide-form lineplot: the row index is drawn on x and the column values on y,
# so the movement total labels the y axis and the time label goes on x
# (the two labels were previously swapped).
plt.xlabel('milliseconds (relative to section of music)')
plt.ylabel('total mm of movement in 20s')
plt.show()

# ElasticNet Randomized Search - Music

In [None]:
# Randomized search over ElasticNet regularisation strength (alpha) and
# L1/L2 mix (l1_ratio) on the output of Xy_prep_music. Candidates are
# scored on MAE and MAPE; the final estimator is refit using MAPE.
elastic = ElasticNet()
params_elastic = {
    'alpha': [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
    'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9],
}
rsearch_elastic = RandomizedSearchCV(
    estimator=elastic,
    param_distributions=params_elastic,
    n_iter=20,
    scoring=['neg_mean_absolute_error', 'neg_mean_absolute_percentage_error'],
    refit='neg_mean_absolute_percentage_error',
)
# NOTE(review): this search uses 'target_10' while the matching plot title
# says "20s" and the other cells use 'target_20' - confirm which is intended.
X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_music(model_data, 'target_10', 20, 25)

# The scaler was never instantiated, so create it here. It is fitted on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Run the search's parallel work on the Dask joblib backend.
with joblib.parallel_backend('dask'):
    rsearch_elastic.fit(X_train, y_train)
rsearch_elastic.cv_results_

In [None]:
# Best cross-validated score, the parameters that produced it, and the
# refit model's score on the held-out test split.
(
    rsearch_elastic.best_score_,
    rsearch_elastic.best_params_,
    rsearch_elastic.score(X_test, y_test),
)

In [None]:
# Compare held-out predictions with the true targets, both smoothed with a
# 200-row rolling mean so the two curves are readable.
y_pred = rsearch_elastic.predict(X_test)
plot_pred = pd.DataFrame(y_pred, columns=['pred'])
plot_pred.insert(0, value=y_test, column='true')
sns.lineplot(data=plot_pred.rolling(200).mean())
# NOTE(review): the title and y label say "20s" but the ElasticNet search cell
# requests 'target_10' - confirm which horizon is plotted.
plt.title('ElasticNet Search Predictions vs Actual - 20s - Music')
# Wide-form lineplot: the row index is drawn on x and the column values on y,
# so the movement total labels the y axis and the time label goes on x
# (the two labels were previously swapped).
plt.xlabel('milliseconds (relative to section of music)')
plt.ylabel('total mm of movement in 20s')
plt.show()

# ElasticNet Best Params - Silence - Individuals

In [None]:
# Fit the tuned ElasticNet once per participant ID (20-29) and collect the
# per-participant test metrics plus the predicted / actual series.
elastic = ElasticNet(alpha=.8, l1_ratio=.5, max_iter=2500, selection='random')
scores = {'PID':[], 'test_score':[], 'mae':[], 'mape':[]}
preds=pd.DataFrame()
for i in range(20, 30):
    # Passing i as both trailing arguments presumably restricts the data to
    # participant i - confirm against model_prep.
    X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_silence(model_data, 'target_20', i, i)
    # The scaler was never instantiated, so create a fresh one per participant.
    # It is fitted on that participant's training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    with joblib.parallel_backend('dask'):
        elastic.fit(X_train, y_train)
    y_pred = elastic.predict(X_test)
    scores['PID'].append(i)
    scores['mae'].append(mean_absolute_error(y_test, y_pred))
    # NOTE(review): column assignment assumes every participant's test split
    # has the same number of rows - verify.
    preds[str(i) + 'pred'] = y_pred
    preds[str(i) + 'actual'] = y_test
    # ElasticNet.score returns R^2 on the test split.
    scores['test_score'].append(elastic.score(X_test, y_test))
    scores['mape'].append(mean_absolute_percentage_error(y_test, y_pred))
    

In [None]:
# Tabulate the per-participant metrics collected in `scores` (one row per PID).
pd.DataFrame(scores)

In [None]:
# One panel per participant (PIDs 20-29): predicted vs actual values,
# smoothed with a 50-row rolling mean.
fig, axs = plt.subplots(5, 2, figsize=(20, 20))
for i, ax in enumerate(axs.flatten(), start=20):
    smoothed = preds[[f'{i}pred', f'{i}actual']].rolling(50).mean()
    sns.lineplot(data=smoothed, ax=ax)
    ax.set_title(f'PID{i}')
    ax.set_ylabel('millimeters')
    ax.set_xlabel('milliseconds')
plt.tight_layout()
plt.show()

# RandomForest Best Params - Silence - Individuals

In [None]:
# Fit the tuned RandomForest once per participant ID (20-29) and collect the
# per-participant test metrics plus the predicted / actual series.
rf = RandomForestRegressor(
    criterion='mse',
    max_depth=3,
    max_leaf_nodes=10,
    min_samples_leaf=0.3,
    min_samples_split=0.6,
)
scores = {'PID': [], 'test_score': [], 'mae': [], 'mape': []}
preds = pd.DataFrame()
for i in range(20, 30):
    # Passing i as both trailing arguments presumably restricts the data to
    # participant i - confirm against model_prep.
    X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_silence(
        model_data, 'target_20', i, i
    )
    with joblib.parallel_backend('dask'):
        rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Keep the raw series for the per-participant plots.
    preds[f'{i}pred'] = y_pred
    preds[f'{i}actual'] = y_test

    # Record the metrics; rf.score returns R^2 on the test split.
    scores['PID'].append(i)
    scores['test_score'].append(rf.score(X_test, y_test))
    scores['mae'].append(mean_absolute_error(y_test, y_pred))
    scores['mape'].append(mean_absolute_percentage_error(y_test, y_pred))
    

In [None]:
# Tabulate the per-participant metrics collected in `scores` (one row per PID).
pd.DataFrame(scores)

In [None]:
# Display the predicted / actual columns gathered for each participant.
preds

In [None]:
# One panel per participant (PIDs 20-29): predicted vs actual values,
# smoothed with a 10-row rolling mean.
fig, axs = plt.subplots(5, 2, figsize=(20, 20))
for i, ax in enumerate(axs.flatten(), start=20):
    smoothed = preds[[f'{i}pred', f'{i}actual']].rolling(10).mean()
    sns.lineplot(data=smoothed, ax=ax)
    ax.set_title(f'PID{i}')
    ax.set_ylabel('millimeters')
    ax.set_xlabel('milliseconds')
plt.tight_layout()
plt.show()

# ElasticNet Best Params - Silence - All Participants

In [None]:
elastic = ElasticNet(alpha=.8, l1_ratio=.5, max_iter=2500, selection='random')
X_train, X_test, y_train, y_test, participants, cols, cut_rows = Xy_prep_silence(model_data, 'target_20', 50)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
with joblib.parallel_backend('dask'):
    elastic.fit(X_train, y_train)
y_pred = elastic.predict(X_test)
elastic.score(X_test, y_test), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred)

In [None]:
# Two-column frame for plotting: 'Actual' first, then 'Prediction'.
elastic_pred = pd.DataFrame(data=y_pred, columns=['Prediction'])
elastic_pred.insert(loc=0, column='Actual', value=y_test)
elastic_pred

In [None]:
# Actual vs predicted values, smoothed with a 1000-row rolling mean.
plt.figure(figsize=(16, 6))
sns.lineplot(data=elastic_pred.rolling(1000).mean())
plt.title('ElasticNet Predictions vs Actual - 20s - Silence All')
# The closing parenthesis was missing from this label.
plt.xlabel('milliseconds (relative to section)')
plt.ylabel('total mm of movement in 20s')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing
# and only echoed an empty list as the cell output.
plt.show()

# RandomForest Best Params - Silence - All Participants

In [None]:
# Fit the tuned RandomForest on the output of Xy_prep_silence and report
# R^2, MAE and MAPE on the held-out test split.
X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_silence(
    model_data, 'target_20', 1, 75
)
rf = RandomForestRegressor(
    criterion='mse',
    max_depth=5,
    max_leaf_nodes=10,
    min_samples_leaf=0.1,
    min_samples_split=0.2,
)
with joblib.parallel_backend('dask'):
    rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
(
    rf.score(X_test, y_test),
    mean_absolute_error(y_test, y_pred),
    mean_absolute_percentage_error(y_test, y_pred),
)

In [None]:
# Two-column frame for plotting: 'Actual' first, then 'Prediction'.
rf_pred = pd.DataFrame(data=y_pred, columns=['Prediction'])
rf_pred.insert(loc=0, column='Actual', value=y_test)
rf_pred

In [None]:
# Actual vs predicted values, smoothed with a 1000-row rolling mean.
plt.figure(figsize=(16, 6))
sns.lineplot(data=rf_pred.rolling(1000).mean())
plt.title('RandomForest Predictions vs Actual - 20s - Silence All')
# The closing parenthesis was missing from this label.
plt.xlabel('milliseconds (relative to section)')
plt.ylabel('total mm of movement in 20s')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing
# and only echoed an empty list as the cell output.
plt.show()

# ElasticNet Best Params - Music - All Participants

In [None]:
# Fit the tuned ElasticNet on the output of Xy_prep_music and report
# R^2, MAE and MAPE on the held-out test split.
elastic = ElasticNet(alpha=.7, l1_ratio=.1, max_iter=2500, selection='random')
X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_music(model_data, 'target_20', 1, 75)
# The scaler was never instantiated, so create it here. It is fitted on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
with joblib.parallel_backend('dask'):
    elastic.fit(X_train, y_train)
y_pred = elastic.predict(X_test)
elastic.score(X_test, y_test), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred)

In [None]:
# Two-column frame for plotting: 'Actual' first, then 'Prediction'.
elastic_pred = pd.DataFrame(data=y_pred, columns=['Prediction'])
elastic_pred.insert(loc=0, column='Actual', value=y_test)
elastic_pred

In [None]:
# Actual vs predicted values, smoothed with a 1000-row rolling mean.
plt.figure(figsize=(16, 6))
sns.lineplot(data=elastic_pred.rolling(1000).mean())
plt.title('ElasticNet Predictions vs Actual - 20s - Music All')
# The closing parenthesis was missing from this label.
plt.xlabel('milliseconds (relative to section)')
plt.ylabel('total mm of movement in 20s')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing
# and only echoed an empty list as the cell output.
plt.show()

# RandomForest Best Params - Music - All Participants

In [None]:
# Fit the tuned RandomForest on the output of Xy_prep_music and report
# R^2, MAE and MAPE on the held-out test split.
X_train, X_test, y_train, y_test, cols, participants, cut_rows = Xy_prep_music(
    model_data, 'target_20', 1, 75
)
rf = RandomForestRegressor(
    criterion='mse',
    max_depth=3,
    max_leaf_nodes=6,
    min_samples_leaf=0.1,
    min_samples_split=0.2,
)
with joblib.parallel_backend('dask'):
    rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
(
    rf.score(X_test, y_test),
    mean_absolute_error(y_test, y_pred),
    mean_absolute_percentage_error(y_test, y_pred),
)

In [None]:
# Two-column frame for plotting: 'Actual' first, then 'Prediction'.
rf_pred = pd.DataFrame(data=y_pred, columns=['Prediction'])
rf_pred.insert(loc=0, column='Actual', value=y_test)
rf_pred

In [None]:
# Actual vs predicted values, smoothed with a 1000-row rolling mean.
plt.figure(figsize=(16, 6))
sns.lineplot(data=rf_pred.rolling(1000).mean())
plt.title('RandomForest Predictions vs Actual - 20s - Music All')
# The closing parenthesis was missing from this label.
plt.xlabel('milliseconds (relative to section)')
plt.ylabel('total mm of movement in 20s')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing
# and only echoed an empty list as the cell output.
plt.show()