In [1]:
import gc
import random

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.signal import resample
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.feature_extraction import MinimalFCParameters,settings,ComprehensiveFCParameters,EfficientFCParameters
from tsfresh.feature_selection import select_features
from tsfresh.utilities.dataframe_functions import impute

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [2]:
# Read the train and test CSV files (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('sensor_train.csv', 'sensor_test.csv')
)



In [4]:
# Downcast both frames (train first, then test) to shrink their memory footprint.
# NOTE(review): reduce_mem_usage is defined in a cell that appears further down
# this notebook, so a top-to-bottom "Restart & Run All" hits a NameError here.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

Memory usage of dataframe is 29.21 MB
Memory usage after optimization is: 6.90 MB
Decreased by 76.4%
Memory usage of dataframe is 26.26 MB
Memory usage after optimization is: 6.56 MB
Decreased by 75.0%


In [3]:
# Euclidean norm of each (x, y, z) accelerometer triple, added as a new column.
# 'mod' is built from acc_x/acc_y/acc_z and 'modg' from acc_xg/acc_yg/acc_zg.
magnitude_inputs = (
    ('mod', ('acc_x', 'acc_y', 'acc_z')),
    ('modg', ('acc_xg', 'acc_yg', 'acc_zg')),
)
for frame in (train, test):
    for target, (col_x, col_y, col_z) in magnitude_inputs:
        frame[target] = (frame[col_x] ** 2 + frame[col_y] ** 2 + frame[col_z] ** 2) ** .5

In [37]:
# Separate the label column from the training features.
train_y = train['behavior_id']
train_X = train.drop(columns=['behavior_id'])

# Shift the test fragment ids by 10000 before stacking test under train
# (presumably so they cannot collide with train ids - confirm against the data).
# NOTE: this rewrites test['fragment_id'], so re-running the cell shifts the ids again.
test['fragment_id'] = test['fragment_id'] + 10000
X = pd.concat([train_X, test])

In [None]:
# Gather the tsfresh options first so the extraction call itself stays short:
# rows are grouped by fragment_id, ordered by time_point, the result is passed
# through tsfresh's `impute`, and the EfficientFCParameters feature set is used.
extraction_kwargs = dict(
    column_id="fragment_id",
    column_sort="time_point",
    impute_function=impute,
    default_fc_parameters=EfficientFCParameters(),
)

# NOTE(review): only train_X is passed here; the combined train+test frame `X`
# is not used by this call - confirm that is intended.
X_extracted_features = extract_features(train_X, **extraction_kwargs)




Feature Extraction:   0%|          | 0/20 [00:00<?, ?it/s]

Feature Extraction:   5%|▌         | 1/20 [02:32<47:34, 150.23s/it]

In [3]:
def _narrowest_dtype(c_min, c_max, candidates, limits_of):
    """Return the first dtype in `candidates` whose range holds [c_min, c_max], else None."""
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the dtype's limit still fits.
        # Any comparison involving NaN is False, so an all-NaN column yields None.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of `df` in place to reduce its memory usage.

    Each column is handled according to its NumPy dtype kind:

    * signed / unsigned integers -> narrowest int / uint type holding min..max
    * floats -> float16 (only when `use_float16`) or float32 when the values fit
    * object -> 'category' when fewer than half of the values are distinct
    * anything else (bool, datetime, category, extension dtypes) is left as is

    A column is only ever narrowed, never widened, and columns that were already
    converted are skipped, so calling the function twice on the same frame is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow float16 as a target. float16 keeps only about three significant
        decimal digits; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, after downcasting.

    Prints the memory usage before and after, and the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # 64-bit types are omitted: they could never be narrower than the source column.
    signed_ints = (np.int8, np.int16, np.int32)
    unsigned_ints = (np.uint8, np.uint16, np.uint32)
    floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        column = df[col]
        col_type = column.dtype

        # Categorical / extension dtypes are not NumPy dtypes; empty columns have
        # no min/max to inspect. Neither can be downcast meaningfully.
        if not isinstance(col_type, np.dtype) or len(column) == 0:
            continue

        if col_type.kind == 'O':
            rate = len(column.unique()) / len(column)
            if rate < 0.5:
                df[col] = column.astype('category')
            continue

        if col_type.kind == 'i':
            candidates, limits_of = signed_ints, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_ints, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits_of = floats, np.finfo
        else:
            # bool, datetime, timedelta, complex, ...: leave untouched.
            continue

        target = _narrowest_dtype(column.min(), column.max(), candidates, limits_of)
        # Only convert when it actually saves space (never upcast, skip no-ops).
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = column.astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a frame that reports zero bytes.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


In [None]:
# One panel per behavior_id; each panel overlays acc_x over time_point for a
# random sample of that behavior's fragments.
n_behaviors = 19
frags_per_behavior = 20

fig, axes = plt.subplots(n_behaviors, 1, figsize=(25, 100), squeeze=False)
for behavior_id in range(n_behaviors):
    ax = axes[behavior_id, 0]
    ax.set_title('behavior_id {}'.format(behavior_id))

    frag_ids = list(train.loc[train['behavior_id'] == behavior_id, 'fragment_id'].unique())
    # random.sample raises ValueError when asked for more items than exist, so
    # cap the sample size for behaviors with fewer fragments than requested.
    # No seed is set in this cell, so the chosen fragments differ between runs.
    frag = random.sample(frag_ids, min(frags_per_behavior, len(frag_ids)))

    for frag_id in frag:
        sns.lineplot(x='time_point', y='acc_x',
                     data=train[train['fragment_id'] == frag_id], ax=ax)

In [33]:

# Every column except the fragment id and the label is used as a feature.
used_feat = [f for f in train.columns if f not in ('fragment_id', 'behavior_id')]
print(len(used_feat))
print(used_feat)

train_x = train[used_feat]
train_y = train['behavior_id']
test_x = test[used_feat]

scores = []
imp = pd.DataFrame()
imp['feat'] = used_feat

params = {
    'learning_rate': 0.02,
    'metric': 'multi_error',
    'objective': 'multiclass',
    'num_class': 19,
    'feature_fraction': 0.80,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'n_jobs': 4,
    'seed': 2020,
    'max_depth': 7,
    'num_leaves': 31,
    'lambda_l1': 0.7,
    'lambda_l2': 0.7,
}

# Size the probability buffers from the model config so they cannot drift apart.
num_class = params['num_class']
oof_train = np.zeros((len(train_x), num_class))
preds = np.zeros((len(test_x), num_class))
folds = 5
seeds = [44]#, 2020, 527, 1527]

# NOTE(review): the split is done on individual rows and fragment_id is not a
# feature, so rows of one fragment can land in both the training and the
# validation part of a fold. Consider a group-aware split keyed on fragment_id
# if the validation error should reflect unseen fragments.
for seed in seeds:
    kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        x_trn, y_trn, x_val, y_val = train_x.iloc[trn_idx], train_y.iloc[trn_idx], train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round=5000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=50,
                          verbose_eval=20)
        # Out-of-fold and test probabilities are averaged over seeds (and folds).
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        scores.append(model.best_score['valid_1']['multi_error'])

        # Average the per-fold importances over seeds as well; plain assignment
        # would let each later seed overwrite the columns written by earlier ones.
        for importance_type in ('gain', 'split'):
            imp_col = importance_type + str(fold + 1)
            contribution = model.feature_importance(importance_type=importance_type) / len(seeds)
            imp[imp_col] = (imp[imp_col] + contribution) if imp_col in imp.columns else contribution

        # Drop the fold's large objects before the next fold allocates its own.
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()

7
['time_point', 'acc_x', 'acc_y', 'acc_z', 'acc_xg', 'acc_yg', 'acc_zg']
Training until validation scores don't improve for 50 rounds
[20]	training's multi_error: 0.59481	valid_1's multi_error: 0.596988
[40]	training's multi_error: 0.573284	valid_1's multi_error: 0.575759
[60]	training's multi_error: 0.558875	valid_1's multi_error: 0.561665
[80]	training's multi_error: 0.549136	valid_1's multi_error: 0.551592
[100]	training's multi_error: 0.540094	valid_1's multi_error: 0.54287
[120]	training's multi_error: 0.531216	valid_1's multi_error: 0.534888
[140]	training's multi_error: 0.523784	valid_1's multi_error: 0.527871
[160]	training's multi_error: 0.519688	valid_1's multi_error: 0.523815
[180]	training's multi_error: 0.514257	valid_1's multi_error: 0.518902
[200]	training's multi_error: 0.510616	valid_1's multi_error: 0.515281
[220]	training's multi_error: 0.507181	valid_1's multi_error: 0.511672
[240]	training's multi_error: 0.50334	valid_1's multi_error: 0.508111
[260]	training's mul

KeyboardInterrupt: 

In [3]:
# Load the feature table from a CSV on disk (presumably the saved output of the
# extract_features cell - the save step is not visible here; confirm).
features_csv = 'X_extracted_features.csv'
X_extracted_features = pd.read_csv(features_csv)

In [7]:
# Preview the first five rows of the loaded feature table.
X_extracted_features.head(n=5)

Unnamed: 0,"acc_x__agg_linear_trend__attr_""intercept""__chunk_len_1000__f_agg_""max""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_1000__f_agg_""mean""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_1000__f_agg_""min""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_1000__f_agg_""var""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_5000__f_agg_""max""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_5000__f_agg_""mean""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_5000__f_agg_""min""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_5000__f_agg_""var""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_500__f_agg_""max""","acc_x__agg_linear_trend__attr_""intercept""__chunk_len_500__f_agg_""mean""",...,acc_zg__ratio_beyond_r_sigma__r_3,acc_zg__ratio_beyond_r_sigma__r_5,acc_zg__ratio_beyond_r_sigma__r_6,acc_zg__ratio_beyond_r_sigma__r_7,acc_zg__skewness,acc_zg__spkt_welch_density__coeff_2,acc_zg__spkt_welch_density__coeff_5,acc_zg__spkt_welch_density__coeff_8,acc_zg__standard_deviation,acc_zg__variance
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017544,0.0,0.0,0.0,-0.74665,0.140756,0.164204,0.149466,0.335753,0.11273
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017857,0.0,0.0,0.0,0.917398,0.019443,0.054591,0.005658,0.239359,0.057293
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017544,0.0,0.0,0.0,-0.103604,0.145285,0.257611,0.263671,0.42937,0.184358
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.472631,0.070492,0.195622,0.195563,0.410587,0.168582
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018182,0.0,0.0,0.0,-0.615809,0.364417,1.525267,2.157596,0.746554,0.557342
