In [1]:
import gc
import os
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from subprocess import check_output
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.impute import SimpleImputer 
from sklearn.impute import MissingIndicator
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, GroupShuffleSplit
#from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import FeatureUnion
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import reciprocal, uniform

#warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('X_train.csv')
test = pd.read_csv('X_test.csv')
y = pd.read_csv('y_train.csv')

In [3]:
train = train.set_index('series_id').join(y.set_index('series_id'))
trainb = train.loc[train['group_id'].isin([2,7,13,23,37,49])]

In [7]:
## Step 1: feature engineering: A) convert to euler B) polynomial features
quats = ['orientation_X','orientation_Y','orientation_Z','orientation_W']
feat_engineering = ColumnTransformer(
    transformers = [
        ('add_features', PolynomialFeatures(degree=2,interaction_only=True,include_bias=False), quats )
    ]
)

## Step 2: standardize? I need to think carefully about my cross-validation scheme
try_pipe = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('add_features', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('standardize', StandardScaler())
])

In [5]:
## Step3: LabelEncode
encoded_response=pd.DataFrame()
instance_LabelEncoder = LabelEncoder()
encoded_response['surface'] = instance_LabelEncoder.fit_transform(trainb['surface'])

### See these references
https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html#sphx-glr-auto-examples-model-selection-plot-nested-cross-validation-iris-py

https://scikit-learn.org/stable/modules/grid_search.html#grid-search

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupShuffleSplit.html#sklearn.model_selection.GroupShuffleSplit

https://www.kaggle.com/prathamsolanki/can-xgboost-help-robots

In [None]:
class_pipe = Pipeline([
    ('pipeline', try_pipe),
    ('classif', RandomForestClassifier())
])

parameters = [
                  {
                      'classif__max_depth': [8,10,12],
                      'classif__n_estimators': [5, 10, 15],
                      'classif__min_samples_leaf': [2,4]
                  }
             ]

# Define addtional variables in hopes of increasing readability
X_data = train[['orientation_X','orientation_Y','orientation_Z','orientation_W','angular_velocity_X','angular_velocity_Y','angular_velocity_Z','linear_acceleration_X','linear_acceleration_Y','linear_acceleration_Z']]
y_data = encoded_response['surface']
grp_data = train['group_id']
X = X_data.values
y = y_data.values

# Instantiate groupshufflesplit
gss = GroupShuffleSplit(n_splits = 9, test_size = 0.2, random_state=42)

model_list={}
score=[]
# Loop through the training/testing indicies for cross validation
for i, (train_indices, val_indices) in enumerate(gss.split(X = X_data, y=y_data, groups=grp_data)):
    
    # Define training and test sets
    X_train, X_valid = X[train_indices], X[val_indices]
    y_train, y_valid = y[train_indices], y[val_indices]
       
    #print('Train Size: %s | Test: %s' % (train_indices.shape, val_indices.shape))
    
    # Define name for dictionary
    name=str('GroupFold') + str(i)
    
    # Perform cross-validation within a loop
    GrdSrch = GridSearchCV(class_pipe, parameters, verbose=2, cv = 3, scoring='accuracy')
    classif = GrdSrch.fit(X_train, y_train)
    score[i] = classif.score(X_valid, y_valid)
    print("{} score: {}".format(name, score))
    
    model_list[name] = classif
    

## Run Me ver 2

In [8]:
# Define addtional variables in hopes of increasing readability
X_data = trainb[['orientation_X','orientation_Y','orientation_Z','orientation_W','angular_velocity_X','angular_velocity_Y','angular_velocity_Z','linear_acceleration_X','linear_acceleration_Y','linear_acceleration_Z']]
y_data = encoded_response['surface']
grp_data = trainb['group_id']

## I think this will work

names = [
         "Random Forest Classifier"
        ]

classifier = [
    RandomForestClassifier()
]

parameters = [
                  {
                      'classif__max_depth': [12],
                      'classif__n_estimators': [5, 15],
                      'classif__min_samples_leaf': [4]
                  }
             ]


model_list = {}
for name, classifier, params in zip(names, classifier, parameters):
    
    ## Append classifier to pipe
    class_pipe = Pipeline([
        ('pipeline', try_pipe),
        ('classif', classifier)
    ])
    
    ## Attach StratifiedKFold or whatever to CV
    GrdSrch = GridSearchCV(class_pipe, params, verbose=2, scoring='accuracy')
    gs_classif = GrdSrch.fit(X_data, y_data)
    model_list[name] = gs_classif
    
    ## Use cross_val_score
    score = cross_val_score(gs_classif, X=X_data, y=y_data, cv=GroupShuffleSplit().split(X=X_data, y=y_data, groups=grp_data))
    print("{} score: {}".format(name, score))

TypeError: All intermediate steps of the chain should not be Pipelines

In [None]:
model_list['Random Forest Classifier'].best_params_

In [None]:
## Generate Predictions
y_hat = gs_classif.best_estimator_.predict(test[['orientation_X','orientation_Y','orientation_Z','orientation_W', \
                                                        'angular_velocity_X','angular_velocity_Y','angular_velocity_Z', \
                                                        'linear_acceleration_X','linear_acceleration_Y','linear_acceleration_Z']])

## Transform back to labels
test['surface'] = instance_LabelEncoder.inverse_transform(y_hat)

In [None]:
## join to submission file
submission = pd.read_csv('sample_submission.csv')
answers = test.groupby('series_id').first()[['surface']]
submission['surface'] = answers['surface']

## save as csv
submission.to_csv('submission_3Apr2019_djs.csv', index=False)

In [None]:
#>>> y_hat = pipeline.predict(X_test)
#>>> print(classification_report(y_test, y_hat))