In [1]:
import gc
import os
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from subprocess import check_output
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.impute import SimpleImputer 
from sklearn.impute import MissingIndicator
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, GroupShuffleSplit
#from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import FeatureUnion
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import reciprocal, uniform

#warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('X_train.csv')
test = pd.read_csv('X_test.csv')
y = pd.read_csv('y_train.csv')

In [89]:
train = train.set_index('series_id').join(y.set_index('series_id'))
trainb = train.loc[train['group_id'].isin([2,7,13,23,37,49])]

## Feature Engineering

In [3]:
class FE(BaseEstimator, TransformerMixin): 
    def __init__(self, columns = None):
        self.col = None
        self.df = pd.DataFrame()
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        self.df = X.copy()
        self.df['totl_anglr_vel'] = (X['angular_velocity_X']**2 + X['angular_velocity_Y']**2 + X['angular_velocity_Z']**2)**0.5
        self.df['totl_linr_acc'] = (X['linear_acceleration_X']**2 + X['linear_acceleration_Y']**2 + X['linear_acceleration_Z'])**0.5
        self.df['totl_xyz'] = (X['orientation_X']**2 + X['orientation_Y']**2 + X['orientation_Z'])**0.5
        self.df['acc_vs_vel'] = X['totl_linr_acc'] / X['totl_anglr_vel']
        
        
        for self.col in X.columns:
            if self.col in ['row_id','series_id','measurement_number']:
                continue
            self.df[self.col + '_mean'] = X.groupby(['series_id'])[self.col].mean()
            self.df[self.col + '_median'] = X.groupby(['series_id'])[self.col].median()
            self.df[self.col + '_max'] = X.groupby(['series_id'])[self.col].max()
            self.df[self.col + '_min'] = X.groupby(['series_id'])[self.col].min()
            self.df[self.col + '_std'] = X.groupby(['series_id'])[self.col].std()
            self.df[self.col + '_range'] = self.df[self.col + '_max'] - self.df[self.col + '_min']
            self.df[self.col + '_maxtoMin'] = self.df[self.col + '_max'] / self.df[self.col + '_min']
            self.df[self.col + '_mean_abs_chg'] = X.groupby(['series_id'])[self.col].apply(lambda x: np.mean(np.abs(np.diff(x))))
            self.df[self.col + '_abs_max'] = X.groupby(['series_id'])[self.col].apply(lambda x: np.max(np.abs(x)))
            self.df[self.col + '_abs_min'] = X.groupby(['series_id'])[self.col].apply(lambda x: np.min(np.abs(x)))
            self.df[self.col + '_abs_avg'] = (self.df[self.col + '_abs_min'] + self.df[self.col + '_abs_max'])/2
            
        return self.df
    
    def fit_transform(self, X, y=None):
        return self.fit(X,y).transform(X)


In [4]:
def quaternion_to_euler(x, y, z, w):

    t0 = +2.0 * (w * x + y * z)
    t1 = +1.0 - 2.0 * (x * x + y * y)
    X = math.atan2(t0, t1)

    t2 = +2.0 * (w * y - z * x)
    t2 = +1.0 if t2 > +1.0 else t2
    t2 = -1.0 if t2 < -1.0 else t2
    Y = math.asin(t2)

    t3 = +2.0 * (w * z + x * y)
    t4 = +1.0 - 2.0 * (y * y + z * z)
    Z = math.atan2(t3, t4)
    
    return X, Y, Z

def FE(data):
    df = pd.DataFrame()
    
    data['norm_quat'] = (data['orientation_X']**2 + data['orientation_Y']**2 + data['orientation_Z']**2 + data['orientation_W']**2)
    data['mod_quat'] = (data['norm_quat'])**0.5
    
    data['norm_X'] = data['orientation_X'] / data['mod_quat']
    data['norm_Y'] = data['orientation_Y'] / data['mod_quat']
    data['norm_Z'] = data['orientation_Z'] / data['mod_quat']
    data['norm_W'] = data['orientation_W'] / data['mod_quat']
    
    data['total_angular_velocity'] = (data['angular_velocity_X'] ** 2 + data['angular_velocity_Y'] ** 2 +
                             data['angular_velocity_Z'] ** 2) ** 0.5
    data['total_linear_acceleration'] = (data['linear_acceleration_X'] ** 2 + data['linear_acceleration_Y'] ** 2 +
                             data['linear_acceleration_Z'] ** 2) ** 0.5
    data['total_orientation'] = (data['orientation_X'] ** 2 + data['orientation_Y'] ** 2 +
                             data['orientation_Z'] ** 2) ** 0.5
    
    data['acc_vs_vel'] = data['total_linear_acceleration'] / data['total_angular_velocity']
    
    x, y, z, w = data['orientation_X'].tolist(), data['orientation_Y'].tolist(), data['orientation_Z'].tolist(), data['orientation_W'].tolist()
    nx, ny, nz = [], [], []
    
    for i in range(len(x)):
        xx, yy, zz = quaternion_to_euler(x[i], y[i], z[i], w[i])
        nx.append(xx)
        ny.append(yy)
        nz.append(zz)
    
    data['euler_x'] = nx
    data['euler_y'] = ny
    data['euler_z'] = nz
    
    data['total_angle'] = (data['euler_x'] ** 2 + data['euler_y'] ** 2 + data['euler_z'] ** 2) ** 0.5
    data['angle_vs_acc'] = data['total_angle'] / data['total_linear_acceleration']
    data['angle_vs_vel'] = data['total_angle'] / data['total_angular_velocity']
    
    def f1(x):
        return np.mean(np.diff(np.abs(np.diff(x))))
    
    def f2(x):
        return np.mean(np.abs(np.diff(x)))
    
    # Deriving more feature, since we are reducing rows now, we should know min, max, mean values
    for col in data.columns:
        if col in ['row_id', 'series_id', 'measurement_number']:
            continue
            
        df[col + '_mean'] = data.groupby(['series_id'])[col].mean()
        df[col + '_max'] = data.groupby(['series_id'])[col].max()
        df[col + '_min'] = data.groupby(['series_id'])[col].min()
        df[col + '_std'] = data.groupby(['series_id'])[col].std()
        df[col + '_maxtoMin'] = df[col + '_max'] / df[col + '_min']
        
        df[col + '_abs_min'] = data.groupby(['series_id'])[col].apply(lambda x: np.min(np.abs(x)))
        df[col + '_abs_max'] = data.groupby(['series_id'])[col].apply(lambda x: np.max(np.abs(x)))
#         df[col + '_abs_std'] = data.groupby(['series_id'])[col].apply(lambda x: np.std(np.abs(x)))
        df[col + '_abs_avg'] = (df[col + '_abs_min'] + df[col + '_abs_max'])/2
        
        # Change. 1st order.
        df[col + '_mean_abs_change'] = data.groupby('series_id')[col].apply(f2)
        
        # Change of Change. 2nd order.
        df[col + '_mean_change_of_abs_change'] = data.groupby('series_id')[col].apply(f1)
        
    return df

In [95]:
train.dtypes

row_id                        object
measurement_number             int64
orientation_X                float64
orientation_Y                float64
orientation_Z                float64
orientation_W                float64
angular_velocity_X           float64
angular_velocity_Y           float64
angular_velocity_Z           float64
linear_acceleration_X        float64
linear_acceleration_Y        float64
linear_acceleration_Z        float64
group_id                       int64
surface                       object
norm_quat                    float64
mod_quat                     float64
norm_X                       float64
norm_Y                       float64
norm_Z                       float64
norm_W                       float64
total_angular_velocity       float64
total_linear_acceleration    float64
total_orientation            float64
acc_vs_vel                   float64
euler_x                      float64
euler_y                      float64
euler_z                      float64
t

In [5]:
train_fe = FE(train)
test_fe = FE(test)

KeyError: 'series_id'

## Pipeline

In [85]:
class_pipe = Pipeline([
    ('smote', SMOTE(random_state=42, sampling_strategy='minority')),
    ('feature_engineering', FE()), 
    #('add_features', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('impute', SimpleImputer(strategy='constant', fill_value=0))
   # ('standardize', StandardScaler()),
   # ('GrdSrch', GridSearchCV(RandomForestClassifier(), param_grid=parameters, scoring='accuracy'))
])

In [86]:
train_prepared = class_pipe.fit_transform(train_fe)
test_prepared = class_pipe.transform(test_fe)

DataError: No numeric types to aggregate

In [83]:
trainb

Unnamed: 0_level_0,row_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,group_id,surface,totl_anglr_vel,totl_linr_acc
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0_0,0,-0.75853,-0.63435,-0.104880,-0.10597,0.107650,0.017561,0.000767,-0.748570,2.103000,-9.7532,13,fine_concrete,0.109076,
0,0_1,1,-0.75853,-0.63434,-0.104900,-0.10600,0.067851,0.029939,0.003385,0.339950,1.506400,-9.4128,13,fine_concrete,0.074240,
0,0_2,2,-0.75853,-0.63435,-0.104920,-0.10597,0.007275,0.028934,-0.005978,-0.264290,1.592200,-8.7267,13,fine_concrete,0.030428,
0,0_3,3,-0.75852,-0.63436,-0.104950,-0.10597,-0.013053,0.019448,-0.008974,0.426840,1.099300,-10.0960,13,fine_concrete,0.025082,
0,0_4,4,-0.75852,-0.63435,-0.104950,-0.10596,0.005135,0.007652,0.005245,-0.509690,1.468900,-10.4410,13,fine_concrete,0.010603,
0,0_5,5,-0.75853,-0.63439,-0.104830,-0.10580,0.059664,0.013043,-0.013231,-0.447450,0.992810,-10.4020,13,fine_concrete,0.062490,
0,0_6,6,-0.75853,-0.63441,-0.104810,-0.10569,0.082140,0.044356,-0.002696,-0.141630,0.734970,-9.4296,13,fine_concrete,0.093390,
0,0_7,7,-0.75852,-0.63444,-0.104800,-0.10561,0.056218,0.038162,-0.022931,-0.121600,0.075417,-8.6088,13,fine_concrete,0.071712,
0,0_8,8,-0.75851,-0.63445,-0.104850,-0.10559,-0.012846,0.039004,-0.007831,1.600000,0.816110,-7.6426,13,fine_concrete,0.041805,
0,0_9,9,-0.75851,-0.63443,-0.104890,-0.10567,-0.090082,0.027299,-0.009970,0.474960,0.909600,-8.8120,13,fine_concrete,0.094654,


In [66]:
train_prepared

Unnamed: 0_level_0,row_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,group_id,surface,totl_anglr_vel,totl_linr_acc,totl_xyz,acc_vs_vel
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0_0,0,-0.75853,-0.63435,-0.104880,-0.10597,0.107650,0.017561,0.000767,-0.748570,2.103000,-9.7532,13,fine_concrete,0.109076,,0.934285,
0,0_1,1,-0.75853,-0.63434,-0.104900,-0.10600,0.067851,0.029939,0.003385,0.339950,1.506400,-9.4128,13,fine_concrete,0.074240,,0.934267,
0,0_2,2,-0.75853,-0.63435,-0.104920,-0.10597,0.007275,0.028934,-0.005978,-0.264290,1.592200,-8.7267,13,fine_concrete,0.030428,,0.934263,
0,0_3,3,-0.75852,-0.63436,-0.104950,-0.10597,-0.013053,0.019448,-0.008974,0.426840,1.099300,-10.0960,13,fine_concrete,0.025082,,0.934246,
0,0_4,4,-0.75852,-0.63435,-0.104950,-0.10596,0.005135,0.007652,0.005245,-0.509690,1.468900,-10.4410,13,fine_concrete,0.010603,,0.934239,
0,0_5,5,-0.75853,-0.63439,-0.104830,-0.10580,0.059664,0.013043,-0.013231,-0.447450,0.992810,-10.4020,13,fine_concrete,0.062490,,0.934339,
0,0_6,6,-0.75853,-0.63441,-0.104810,-0.10569,0.082140,0.044356,-0.002696,-0.141630,0.734970,-9.4296,13,fine_concrete,0.093390,,0.934363,
0,0_7,7,-0.75852,-0.63444,-0.104800,-0.10561,0.056218,0.038162,-0.022931,-0.121600,0.075417,-8.6088,13,fine_concrete,0.071712,,0.934380,
0,0_8,8,-0.75851,-0.63445,-0.104850,-0.10559,-0.012846,0.039004,-0.007831,1.600000,0.816110,-7.6426,13,fine_concrete,0.041805,,0.934352,
0,0_9,9,-0.75851,-0.63443,-0.104890,-0.10567,-0.090082,0.027299,-0.009970,0.474960,0.909600,-8.8120,13,fine_concrete,0.094654,,0.934317,


## Training

In [None]:
def k_folds(X, y, X_test, k):
    folds = StratifiedKFold(n_splits = k, shuffle=True, random_state=2019)
    y_test = np.zeros((X_test.shape[0], 9))
    y_oof = np.zeros((X.shape[0]))
    score = 0
    for i, (train_idx, val_idx) in  enumerate(folds.split(X, y)):
        clf =  RandomForestClassifier(n_estimators = 500, n_jobs = -1)
        clf.fit(X_train.iloc[train_idx], y[train_idx])
        y_oof[val_idx] = clf.predict(X.iloc[val_idx])
        y_test += clf.predict_proba(X_test) / folds.n_splits
        score += clf.score(X.iloc[val_idx], y[val_idx])
        print('Fold: {} score: {}'.format(i,clf.score(X.iloc[val_idx], y[val_idx])))
    print('Avg Accuracy', score / folds.n_splits) 
        
    return y_oof, y_test 