In [1]:
%matplotlib inline
import pandas as pd
import librosa as lib
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio
import scipy
import pickle

In [2]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import TimeSeriesSplit, train_test_split, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

In [3]:
from dask import dataframe as dd
import joblib
from dask.distributed import Client, LocalCluster
# Start a local Dask cluster and attach a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Last expression of the cell: display the scheduler and the worker handles.
cluster.scheduler, cluster.workers

(<Scheduler: "tcp://127.0.0.1:56999" processes: 3 cores: 6>,
 {0: <Nanny: tcp://127.0.0.1:57016, threads: 2>,
  1: <Nanny: tcp://127.0.0.1:57022, threads: 2>,
  2: <Nanny: tcp://127.0.0.1:57019, threads: 2>})

In [4]:
# Load the prepared modelling data from a local pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
model_data = pd.read_pickle('DFs/model_data.pkl')

In [5]:
# pca = PCA()
# scaler = StandardScaler()

# scaled = pd.DataFrame(scaler.fit_transform(model_data.drop(columns=['segment', 'SID', 'PID'])), 
#                       columns=model_data.drop(columns=['segment', 'SID', 'PID']).columns)
# with joblib.parallel_backend('dask'):
#     pca.fit(scaled)

In [6]:
# ex = pca.explained_variance_ratio_

In [7]:
# exdf = pd.DataFrame(ex.reshape(1, -1), columns=range(len(ex)))
# exdf.T.rename(columns={0:'variance_explained'}).head(15)

In [8]:
%%writefile model_prep\all_prep.py 

def Xy_prep_all(df, target, train_a=1, train_par=list(range(0,50)), test_par=list(range(50,75))):
    """Split ``df`` into train/test feature and target sets by participant.

    For each participant group the rows whose ``'PID'`` is in the group are
    selected, only the second half of those rows (by position) is kept, and
    ``'millisecond'`` becomes the index.

    NOTE(review): the original comments call the kept half "musical
    sections"; this presumably relies on the row order of ``df`` -- confirm
    against the code that builds ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'PID'``, ``'millisecond'``, ``target`` and every column
        named in ``drop_cols`` below.
    target : str
        Name of the target column.
    train_a : int, optional
        Unused; kept so existing callers keep working.
    train_par, test_par : list-like
        ``'PID'`` values that make up the training and test groups.

    Returns
    -------
    tuple
        ``(train_X, test_X, train_y, test_y, train_par, test_par)`` where
        ``train_y`` is the ``target`` column of the training rows (a Series),
        ``test_y`` is a DataFrame holding ``target`` and ``'PID'`` for the test
        rows, and ``train_par`` / ``test_par`` are returned as passed in.

    Notes
    -----
    ``drop_cols`` is fixed: if ``target`` is not ``'target_1_sec'`` the target
    column itself is not removed from the feature frames.
    """
    # Columns that must never reach the feature matrices
    drop_cols = ['PID', 'SID', 'target_1_sec', 'X', 'Y','Z', 'segment']

    def _second_half_for(participants):
        # Rows of the requested participants, second half only, indexed by millisecond
        subset = df.loc[df['PID'].isin(participants)]
        # Halve using the subset's OWN length (the test split previously used
        # the length of the already-halved training frame)
        subset = subset.iloc[len(subset) // 2:, :]
        return subset.set_index('millisecond')

    # Training participants
    df_train = _second_half_for(train_par)
    train_X = df_train.drop(columns=drop_cols)
    train_y = df_train[target]

    # Test participants; PID is kept next to the target in test_y
    df_test = _second_half_for(test_par)
    test_X = df_test.drop(columns=drop_cols)
    test_y = df_test.loc[:, [target, 'PID']]

    return train_X, test_X, train_y, test_y, train_par, test_par

Overwriting model_prep\all_prep.py


In [9]:
from model_prep import Xy_prep_all

We can run an out-of-the-box ElasticNet on the PCA-transformed `model_data` to see how it performs.

In [10]:
%%writefile model_prep/gfn.py

#From https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/
#model%20interpretation/columnTransformer_feature_names.html

def get_feature_name(column_transformer):
    """Get feature names from all transformers.

    Walks the transformers of a fitted ``ColumnTransformer`` (or the steps of
    a ``Pipeline``) and collects the output feature names each one produces.
    Nested pipelines are handled by recursion.

    NOTE(review): this relies on the private ``_iter``, ``_df_columns`` and
    ``_n_features`` members and on transformers exposing
    ``get_feature_names`` -- presumably tied to an older scikit-learn
    release; confirm against the installed version.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        Fitted object whose transformers are inspected.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    #check_is_fitted(column_transformer)
    import warnings

    import numpy as np
    # Import the class directly: a bare ``import sklearn`` does not guarantee
    # that the ``sklearn.pipeline`` submodule has been loaded.
    from sklearn.pipeline import Pipeline

    # Turn lookup into a function for better handling with pipeline later.
    # name/trans/column are passed explicitly instead of being read from the
    # enclosing loop through a closure.
    def _names_for(name, trans, column):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                return column_transformer._df_columns[column]
            # No DataFrame column names recorded: fall back to positional names
            indices = np.arange(column_transformer._n_features)
            return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # >>> Change: Return input column names if no method available
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]

    ### Start of processing
    feature_names = []

    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if type(trans) == Pipeline:
            # Recursive call on pipeline (must use this function's own name)
            _names = get_feature_name(trans)
            # if pipeline has no transformer that returns names
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(_names_for(name, trans, column))

    return feature_names


Overwriting model_prep/gfn.py


In [11]:
from model_prep import get_feature_name

In [12]:
# Randomly assign distinct participants to the training and the test group.
# A fixed seed keeps the split identical across kernel restarts.
SPLIT_SEED = 42
N_TRAIN_PARTICIPANTS = 10  # original note: "max 53"
N_TEST_PARTICIPANTS = 5    # original note: "max 20"

pars = list(model_data['PID'].unique())
split_rng = np.random.default_rng(SPLIT_SEED)
# A single draw without replacement guarantees the two groups never overlap.
picked = split_rng.choice(len(pars),
                          size=N_TRAIN_PARTICIPANTS + N_TEST_PARTICIPANTS,
                          replace=False)
participants = {
    'train': [pars[i] for i in picked[:N_TRAIN_PARTICIPANTS]],
    'test': [pars[i] for i in picked[N_TRAIN_PARTICIPANTS:]],
}
assert not set(participants['train']) & set(participants['test'])
participants

{'train': [26, 54, 5, 1, 16, 6, 18, 17, 44, 19], 'test': [40, 12, 28, 39, 37]}

In [13]:
# Build the train/test feature and target sets for the sampled participants
train_X, test_X, train_y, test_y, train_par, test_par = Xy_prep_all(
    model_data,
    'target_1_sec',
    train_par=participants['train'],
    test_par=participants['test'],
)

In [14]:
#Create column lists to use as inputs for the StandardScaler and OneHotEncoder Pipeline
num_cols = list(train_X.columns[-10:])
num_cols.extend(['Height', 'Age', 'Listen', 'Produce', 'Dance', 'Exercise'])
cat_cols = ['Tiresome', 'age_bin', 'height_bin']

In [15]:
#Set up instances of StanardSCaler, OneHotEncoder and ColumnTransformer to process model_data
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore')
CT = ColumnTransformer(remainder='passthrough', transformers=[('scaler', scaler, num_cols),
                                              ('ohe', ohe, cat_cols)], verbose=True, sparse_threshold=0)

In [16]:
# Fit the ColumnTransformer on the training features only
train_Xct = CT.fit_transform(train_X)

#transform test data based on training fit
test_Xct = CT.transform(test_X)

#get column names from ColumnTransformer once; both frames share them
cols = get_feature_name(CT)
train_Xf = pd.DataFrame(train_Xct, columns=cols)
test_Xf = pd.DataFrame(test_Xct, columns=cols)

[ColumnTransformer] ........ (1 of 3) Processing scaler, total=   0.0s
[ColumnTransformer] ........... (2 of 3) Processing ohe, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s




In [17]:
# Persist the prepared train/test sets, one pickle file per object
for frame, path in (
    (train_Xf, r'DFs/train_test/train_X.pkl'),
    (train_y, r'DFs/train_test/train_y.pkl'),
    (test_Xf, r'DFs/train_test/test_X.pkl'),
    (test_y, r'DFs/train_test/test_y.pkl'),
):
    frame.to_pickle(path)

In [18]:
# Save the participant split; the context manager closes the file even if
# pickle.dump raises.
with open(r'DFs/train_test/participants.pkl','wb') as f:
    pickle.dump(participants,f)