## Import Libraries

In [16]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from subprocess import check_output
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.impute import SimpleImputer 
from sklearn.impute import MissingIndicator
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold, GroupKFold, GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import FeatureUnion
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import reciprocal, uniform

#warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('X_train.csv')
test = pd.read_csv('X_test.csv')
y = pd.read_csv('y_train.csv')

In [3]:
train = train.set_index('series_id').join(y.set_index('series_id'))
trainb = train.loc[train['group_id'].isin([2,7,13,23,37,49])]

In [14]:
train.head()

Unnamed: 0_level_0,row_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,group_id,surface
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0_0,0,-0.75853,-0.63435,-0.10488,-0.10597,0.10765,0.017561,0.000767,-0.74857,2.103,-9.7532,13,fine_concrete
0,0_1,1,-0.75853,-0.63434,-0.1049,-0.106,0.067851,0.029939,0.003385,0.33995,1.5064,-9.4128,13,fine_concrete
0,0_2,2,-0.75853,-0.63435,-0.10492,-0.10597,0.007275,0.028934,-0.005978,-0.26429,1.5922,-8.7267,13,fine_concrete
0,0_3,3,-0.75852,-0.63436,-0.10495,-0.10597,-0.013053,0.019448,-0.008974,0.42684,1.0993,-10.096,13,fine_concrete
0,0_4,4,-0.75852,-0.63435,-0.10495,-0.10596,0.005135,0.007652,0.005245,-0.50969,1.4689,-10.441,13,fine_concrete


In [15]:
train.groupby(['group_id']).agg('count')

Unnamed: 0_level_0,row_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,surface
group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296
1,4864,4864,4864,4864,4864,4864,4864,4864,4864,4864,4864,4864,4864
2,2304,2304,2304,2304,2304,2304,2304,2304,2304,2304,2304,2304,2304
3,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296
4,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296
5,1152,1152,1152,1152,1152,1152,1152,1152,1152,1152,1152,1152,1152
6,6144,6144,6144,6144,6144,6144,6144,6144,6144,6144,6144,6144,6144
7,7168,7168,7168,7168,7168,7168,7168,7168,7168,7168,7168,7168,7168
8,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296
9,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296,7296


In [4]:
## Step 1: feature engineering: A) convert to euler B) polynomial features
quats = ['orientation_X','orientation_Y','orientation_Z','orientation_W']
feat_engineering = ColumnTransformer(
    transformers = [
        ('add_features', PolynomialFeatures(degree=2,interaction_only=True,include_bias=False), quats )
    ]
)

## Step 2: standardize? I need to think carefully about my cross-validation scheme
try_pipe = Pipeline([
    ('add_features', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('standardize', StandardScaler())
])

In [5]:
## Step3: LabelEncode
encoded_response=pd.DataFrame()
instance_LabelEncoder = LabelEncoder()
encoded_response['surface'] = instance_LabelEncoder.fit_transform(train['surface'])

In [8]:
## Step4: split train into train and validation set
X_train, X_val, y_train, y_val = train_test_split(train[['orientation_X','orientation_Y','orientation_Z','orientation_W', \
                                                        'angular_velocity_X','angular_velocity_Y','angular_velocity_Z', \
                                                        'linear_acceleration_X','linear_acceleration_Y','linear_acceleration_Z']],
                                                  encoded_response['surface'], 
                                                  test_size = 0.2)

## Random Forest (second try)

In [26]:
class_pipe = Pipeline([
    ('pipeline', try_pipe),
    ('classif', RandomForestClassifier())
])

parameters = [
                  {
                      'classif__max_depth': [8,10,12],
                      'classif__n_estimators': [5, 10, 15],
                      'classif__min_samples_leaf': [2,4]
                  }
             ]

# Define addtional variables in hopes of increasing readability
X_data = train[['orientation_X','orientation_Y','orientation_Z','orientation_W','angular_velocity_X','angular_velocity_Y','angular_velocity_Z','linear_acceleration_X','linear_acceleration_Y','linear_acceleration_Z']]
y_data = encoded_response['surface']
grp_data = train['group_id']
X = X_data.values
y = y_data.values

# Instantiate groupshufflesplit
gss = GroupShuffleSplit(n_splits = 9, test_size = 0.2, random_state=42)

model_list={}
# Loop through the training/testing indicies for cross validation
for i, (train_indices, val_indices) in enumerate(gss.split(X = X_data, y=y_data, groups=grp_data)):
    
    # Define training and test sets
    X_train, X_valid = X[train_indices], X[val_indices]
    y_train, y_valid = y[train_indices], y[val_indices]
       
    #print('Train Size: %s | Test: %s' % (train_indices.shape, val_indices.shape))
    
    # Define name for dictionary
    name=str('GroupFold') + str(i)
    
    # Perform cross-validation within a loop
    GrdSrch = GridSearchCV(class_pipe, parameters, verbose=2, cv = 3, scoring='accuracy')
    classif = GrdSrch.fit(X_train, y_train)
    score = classif.score(X_valid, y_valid)
    print("{} score: {}".format(name, score))
    
    model_list[name] = classif
    

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=  10.4s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.9s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.9s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.7s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.9s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  19.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  19.7s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  28.0s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  12.0s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  21.8s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  21.6s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  21.2s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  31.7s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  31.7

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 19.1min finished


GroupFold0 score: 0.35858784634760704
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.9s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.3s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.8s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.8s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  17.3s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.8s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.8s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  24.6s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  12.2s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  21.7s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  21.5s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  23.1s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  31.5s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  30.6

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 18.8min finished


GroupFold1 score: 0.27231725374064836
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.5s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.9s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.7s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.8s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  17.1s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.6s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.5s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  22.9s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  11.3s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.1s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.0s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.1s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  29.7s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.4

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 17.5min finished


GroupFold2 score: 0.2285
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.1s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.4s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.1s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.5s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.5s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.6s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  22.2s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  11.3s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.8s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.6s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.7s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.3s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.0

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 17.2min finished


GroupFold3 score: 0.4268846099088838
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.5s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.8s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.3s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.3s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.2s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.9s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.1s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  22.8s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  11.5s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.7s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.4s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.4s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  29.5s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  29.0

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 17.8min finished


GroupFold4 score: 0.3377338047445255
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.4s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.7s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.2s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.1s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.6s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.7s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  22.3s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  11.2s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.4s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.0s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.8s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.5s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.4

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 17.4min finished


GroupFold5 score: 0.26864098837209305
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.9s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.3s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=  10.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.9s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  17.2s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.6s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  17.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  24.4s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  12.2s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  21.3s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  21.5s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  21.4s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  30.8s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  30.6

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 18.8min finished


GroupFold6 score: 0.23997707709580837
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.3s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.6s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.2s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.7s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.7s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.7s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  22.3s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  11.5s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  20.1s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.7s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.9s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.6s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.3

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 17.3min finished


GroupFold7 score: 0.36794777684563756
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.2s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.5s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, total=   9.1s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  16.3s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.6s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, total=  15.6s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=15, total=  22.0s
[CV] classi

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, total=  11.2s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.5s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.8s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, total=  19.9s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.6s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=15, total=  28.5

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 17.3min finished


GroupFold8 score: 0.16123744419642858


In [47]:
model_list['GroupFold8'].get_params()


{'cv': 3,
 'error_score': 'raise-deprecating',
 'estimator__memory': None,
 'estimator__steps': [('pipeline', Pipeline(memory=None,
        steps=[('add_features', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)), ('standardize', StandardScaler(copy=True, with_mean=True, with_std=True))])),
  ('classif',
   RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
               oob_score=False, random_state=None, verbose=0,
               warm_start=False))],
 'estimator__pipeline': Pipeline(memory=None,
      steps=[('add_features', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)), ('standardize', StandardScaler(copy=True, with_mean=True, with_std=T

## Random Forest (first try)

In [10]:
## Step5: Train Classifier(s) 
names = [
         "Random Forest Classifier"
        ]

classifier = [
    RandomForestClassifier()
]

parameters = [
                  {
                      'classif__max_depth': [8,10,12],
                      'classif__n_estimators': [5, 10, 15],
                      'classif__min_samples_leaf': [2,4],
                      'classif__n_jobs': [4]
                  }
             ]

model_list = {}
for name, classifier, params in zip(names, classifier, parameters):
    class_pipe = Pipeline([
        ('pipeline', try_pipe),
        ('classif', classifier),
    ])
    gs_classif = GridSearchCV(class_pipe, params, verbose=2, cv = 5, scoring='accuracy')
    classif = gs_classif.fit(X_train, y_train)
    score = classif.score(X_val, y_val)
    print("{} score: {}".format(name, score))
    model_list[name] = classif

## To include later?
# for train_index, test_index in group_kfold.split(X, y, groups):
#    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=59)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4, total=  11.5s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.5s remaining:    0.0s


[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4, total=   9.6s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4, total=   9.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4, total=   9.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=5, classif__n_jobs=4, total=   9.0s
[CV] classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, classif__n_jobs=4 
[CV]  classif__max_depth=8, classif__min_samples_leaf=2, classif__n_estimators=10, classif__n_jobs=4, total=  12.6s
[CV] classif__max_depth

[CV]  classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=10, classif__n_jobs=4, total=  14.2s
[CV] classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=15, classif__n_jobs=4 
[CV]  classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=15, classif__n_jobs=4, total=  18.2s
[CV] classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=15, classif__n_jobs=4 
[CV]  classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=15, classif__n_jobs=4, total=  18.6s
[CV] classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=15, classif__n_jobs=4 
[CV]  classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=15, classif__n_jobs=4, total=  17.9s
[CV] classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=15, classif__n_jobs=4 
[CV]  classif__max_depth=10, classif__min_samples_leaf=2, classif__n_estimators=15, classif__n_jobs=4, total=  17.9s
[CV] cl

[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, classif__n_jobs=4, total=  11.4s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, classif__n_jobs=4 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, classif__n_jobs=4, total=  11.2s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, classif__n_jobs=4 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=5, classif__n_jobs=4, total=  11.4s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, classif__n_jobs=4 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, classif__n_jobs=4, total=  15.7s
[CV] classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, classif__n_jobs=4 
[CV]  classif__max_depth=12, classif__min_samples_leaf=4, classif__n_estimators=10, classif__n_jobs=4, total=  16.6s
[CV] classif

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 24.6min finished


Random Forest Classifier score: 0.9905778379265092


In [11]:
model_list['Random Forest Classifier'].best_params_

{'classif__max_depth': 12,
 'classif__min_samples_leaf': 2,
 'classif__n_estimators': 15,
 'classif__n_jobs': 4}

## Fit Final

## Predict & Submit

In [13]:
## Generate Predictions
y_hat = gs_classif.best_estimator_.predict(test[['orientation_X','orientation_Y','orientation_Z','orientation_W', \
                                                        'angular_velocity_X','angular_velocity_Y','angular_velocity_Z', \
                                                        'linear_acceleration_X','linear_acceleration_Y','linear_acceleration_Z']])

## Transform back to labels
test['surface'] = instance_LabelEncoder.inverse_transform(y_hat)

ValueError: X shape does not match training shape

In [None]:
## join to submission file
submission = pd.read_csv(os.path.join(PATH,'sample_submission.csv'))
answers = test.groupby('series_id').first()[['surface']]
submission['surface'] = answers['surface']

## save as csv
submission.to_csv('submission_29Mar2019b_djs.csv', index=False)