In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import os
import gc
import pickle

from IPython.core.debugger import set_trace

from tqdm import tqdm
from sklearn import preprocessing

import lightgbm
import xgboost
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.linear_model import LogisticRegression


import catboost
import random
random.seed(20)

# Install tsflex and seglearn
!pip install tsflex --no-index --find-links=file:///kaggle/input/tsflex
!pip install seglearn --no-index --find-links=file:///kaggle/input/segalearn


from seglearn.feature_functions import base_features, emg_features

from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
from tsflex.features.integrations import seglearn_feature_dict_wrapper

Looking in links: file:///kaggle/input/tsflex
Processing /kaggle/input/tsflex/tsflex-0.3.0-py3-none-any.whl
Installing collected packages: tsflex
Successfully installed tsflex-0.3.0
[0mLooking in links: file:///kaggle/input/segalearn
Processing /kaggle/input/segalearn/seglearn-1.2.5-py3-none-any.whl
Installing collected packages: seglearn
Successfully installed seglearn-1.2.5
[0m

In [2]:
def fix_invalid_events(df):
    """Zero the event labels on rows that are not valid, in-task samples.

    Every row where ``Valid`` is False or ``Task`` is False gets 0 written
    into ``StartHesitation``, ``Turn`` and ``Walking``.

    The frame is modified in place and also returned.
    """
    unreliable_rows = (df["Valid"] == False) | (df["Task"] == False)
    for event_column in ("StartHesitation", "Turn", "Walking"):
        df.loc[unreliable_rows, event_column] = 0

    return df
def min_max_feature(df, feature):
    """Add ``precent_prograss_<feature>``: ``feature`` min-max scaled over the frame.

    The new column is ``(x - min) / (max - min)`` computed over the whole of
    ``df[feature]``.  A constant column therefore yields NaN (0 / 0).

    The frame is modified in place and also returned.
    """
    values = df[feature]
    lowest = values.min()
    value_range = values.max() - lowest
    df[f"precent_prograss_{feature}"] = (values - lowest) / value_range
    return df

In [3]:
def reduce_memory_usage(df):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    Columns whose dtype name is ``datetime64[ns]``, ``category`` or ``object``
    are left alone.  Columns whose dtype name starts with ``int`` try
    int8 -> int16 -> int32 -> int64; every other column tries
    float16 -> float32.  A candidate is accepted only when the column's min and
    max lie strictly inside the candidate's representable range; if none fits,
    the column keeps its dtype.

    Memory usage before and after is printed.  The frame is modified in place
    and also returned.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    untouched_kinds = ('datetime64[ns]', 'category', 'object')
    integer_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32)

    for name in df.columns:
        kind = df[name].dtype.name
        if kind in untouched_kinds:
            continue

        lowest = df[name].min()
        highest = df[name].max()

        # Anything that is not a signed "int*" dtype is treated as floating point.
        if str(kind)[:3] == 'int':
            ladder, limits_of = integer_ladder, np.iinfo
        else:
            ladder, limits_of = float_ladder, np.finfo

        for candidate in ladder:
            limits = limits_of(candidate)
            if lowest > limits.min and highest < limits.max:
                df[name] = df[name].astype(candidate)
                break

    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: ",mem_usg," MB")

    return df

In [4]:
# Submission template: its `Id` column is what the per-model predictions are
# merged onto later (see combine_preds).
ss = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/sample_submission.csv")

In [5]:
def FE_tsflex(df, is_td, isTest=False ):
    """Append tsflex/seglearn window features for the three accelerometer axes.

    Parameters
    ----------
    df : DataFrame with ``Time``, ``AccV``, ``AccML`` and ``AccAP`` columns.
        The window sizes below are expressed in units of ``df``'s index.
    is_td : accepted for signature parity with ``FE``; not used by this function.
    isTest : when False and a ``Valid`` column is present, event labels on
        invalid / out-of-task rows are zeroed first (``fix_invalid_events``).

    Returns
    -------
    DataFrame: ``df`` left-joined (on index) with the window features, then
    forward-filled so rows between window stamps carry the latest computed
    values.
    """
    if not isTest and "Valid" in df.columns:
        df = fix_invalid_events(df)

    for col in ["Time", "AccV", "AccML", "AccAP"]:
        df = min_max_feature(df, col)

    signal_names = ['AccV', 'AccML', 'AccAP']
    window_sizes = [5_000, 10_000]

    basic_feats = MultipleFeatureDescriptors(
        functions=seglearn_feature_dict_wrapper(base_features()),
        series_names=list(signal_names),
        windows=list(window_sizes),
        strides=list(window_sizes),
    )

    emg_functions = emg_features()
    del emg_functions['simple square integral'] # is same as abs_energy (which is in base_features)

    emg_feats = MultipleFeatureDescriptors(
        functions=seglearn_feature_dict_wrapper(emg_functions),
        series_names=list(signal_names),
        windows=list(window_sizes),
        strides=list(window_sizes),
    )

    fc = FeatureCollection([basic_feats, emg_feats])
    df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin")

    # .ffill() replaces the deprecated fillna(method="ffill"); same result.
    df = df.merge(df_feats, how="left", left_index=True, right_index=True).ffill()

    return df

In [6]:
def describe_max_min(df, ranges_maxs):
    new_cols = []
    for col in ["AccV", "AccML", "AccAP"]:
        past_1 = df[col].shift(1).fillna(method="bfill")
        future_1 = df[col].shift(-1).fillna(method="ffill")

        is_max = np.where((df[col] > past_1) & (df[col] > future_1), True, False)
        is_min = np.where((df[col] < past_1) & (df[col] < future_1), True, False)

        last_max_temp = df[col].where(is_max).ffill().fillna(0)
        last_min_temp = df[col].where(is_min).ffill().fillna(0)

        maxs_df = pd.DataFrame(df[col].where(is_max).dropna())
        maxs_df['Time'] = df["Time"].where(is_max).dropna().astype("int")

        cols_to_stat = []
        for lags_max in ranges_maxs:
            lags = list(range(1,lags_max+1))
            for lag in lags:
                if f"{col}_max_{lag}_ago" not in cols_to_stat:
                    cols_to_stat.append(f"{col}_max_{lag}_ago")
                    maxs_df[f"{col}_max_{lag}_ago"] = maxs_df[col].shift(lag).fillna(method="bfill")

            df[f"{col}_min_past_{lags_max}_maxs"] = maxs_df[cols_to_stat].min(axis=1)
            df[f"{col}_max_past_{lags_max}_maxs"] = maxs_df[cols_to_stat].max(axis=1)
            df[f"{col}_mean_past_{lags_max}_maxs"] = maxs_df[cols_to_stat].mean(axis=1)
            df[f"{col}_std_past_{lags_max}_maxs"] = maxs_df[cols_to_stat].std(axis=1)

            new_cols.extend([f"{col}_min_past_{lags_max}_maxs", 
                            f"{col}_max_past_{lags_max}_maxs",
                            f"{col}_mean_past_{lags_max}_maxs",
                            f"{col}_std_past_{lags_max}_maxs"])


        mins_df = pd.DataFrame(df[col].where(is_min).dropna())
        mins_df['Time'] = df["Time"].where(is_min).dropna().astype("int")

        cols_to_stat = []
        for lags_max in ranges_maxs:
            lags = list(range(1,lags_max+1))
            for lag in lags:
                if f"{col}_min_{lag}_ago" not in cols_to_stat:
                    cols_to_stat.append(f"{col}_min_{lag}_ago")
                    mins_df[f"{col}_min_{lag}_ago"] = mins_df[col].shift(lag).fillna(method="bfill")

            df[f"{col}_min_past_{lags_max}_mins"] = mins_df[cols_to_stat].min(axis=1)
            df[f"{col}_max_past_{lags_max}_mins"] = mins_df[cols_to_stat].max(axis=1)
            df[f"{col}_mean_past_{lags_max}_mins"] = mins_df[cols_to_stat].mean(axis=1)
            df[f"{col}_std_past_{lags_max}_mins"] = mins_df[cols_to_stat].std(axis=1)
    
            new_cols.extend([f"{col}_min_past_{lags_max}_mins", 
                f"{col}_max_past_{lags_max}_mins",
                f"{col}_mean_past_{lags_max}_mins",
                f"{col}_std_past_{lags_max}_mins"])
    for col in new_cols:
        df[col] = df[col].fillna(method="ffill").fillna(method="bfill")
    del mins_df, maxs_df
    gc.collect()
    
    return df

In [7]:
def lag_feature(df,col_to_lag, lag, second,lag_scale="Time" ):
    """Add past and future lagged copies of ``col_to_lag``.

    With ``lag_scale == "Time"`` two float32 columns are created:

    * ``<col>_<lag>_Time_ago``      - value ``lag * second`` rows earlier
    * ``<col>_<lag>_Time_from_now`` - value ``lag * second`` rows later

    Rows that have no such neighbour take the nearest available value
    (back-fill for the past column, forward-fill for the future column); if the
    shift exceeds the frame length the column is all 0.

    Any other ``lag_scale`` leaves ``df`` untouched.

    Parameters
    ----------
    df : DataFrame, modified in place.
    col_to_lag : name of the source column.
    lag : lag size, in units of ``second`` rows.
    second : number of rows per lag unit.

    Returns
    -------
    The same frame.
    """
    if lag_scale != "Time":
        return df

    offset = lag * second
    past_name = f"{col_to_lag}_{lag}_{lag_scale}_ago"
    future_name = f"{col_to_lag}_{lag}_{lag_scale}_from_now"

    # .bfill()/.ffill() replace the deprecated fillna(method=...).
    # fillna(0) only has an effect when the whole shifted column is NaN.
    df[past_name] = df[col_to_lag].shift(offset).bfill().fillna(0).astype("float32")
    df[future_name] = df[col_to_lag].shift(-offset).ffill().fillna(0).astype("float32")

    return df

def min_max_feature(df, feature):
    """Add ``precent_prograss_<feature>``: ``feature`` min-max scaled over the frame.

    NOTE(review): an identical definition already exists earlier in this
    notebook (cell In [2]); this later one is the binding in effect afterwards.

    The new column is ``(x - min) / (max - min)`` over the whole of
    ``df[feature]``; a constant column yields NaN.  The frame is modified in
    place and also returned.
    """
    column = df[feature]
    floor = column.min()
    spread = column.max() - floor
    df[f"precent_prograss_{feature}"] = (column - floor) / spread
    return df

def fix_invalid_events(df):
    """Zero the event labels on rows flagged as not valid or not in a task.

    NOTE(review): an identical definition already exists earlier in this
    notebook (cell In [2]); this later one is the binding in effect afterwards.

    Rows where ``Valid`` is False or ``Task`` is False get 0 in
    ``StartHesitation``, ``Turn`` and ``Walking``.  The frame is modified in
    place and also returned.
    """
    rows_to_clear = (df["Valid"] == False) | (df["Task"] == False)
    for label in ("StartHesitation", "Turn", "Walking"):
        df.loc[rows_to_clear, label] = 0

    return df

def fences_features(df,col, margin, width, stat, second):
    """Add "fence" features: a moving statistic as it was / will be ``margin`` units away.

    Stats in fences - for example, between 2 seconds ago and 4 seconds ago,
    what was the mean? std? max? min?

    Reads the existing column ``moving_<stat>_<col>_<width>`` and writes two
    float32 columns:

    * ``fence_<col>_m<margin>_w<width>_past``   - that column shifted forward by
      ``margin * second`` rows (leading gap back-filled)
    * ``fence_<col>_m<margin>_w<width>_future`` - shifted backward by the same
      amount (trailing gap forward-filled)

    Remaining NaN become 0; +/-inf are then replaced by the next finite value
    (back-fill), or by -1 when none follows.

    Parameters
    ----------
    df : DataFrame, modified in place; must already hold the moving-stat column.
    col, width, stat : select the source column name.
    margin : shift size, in units of ``second`` rows.
    second : number of rows per margin unit.

    Returns
    -------
    The same frame.
    """
    stat_feature = f"moving_{stat}_{col}_{width}"
    offset = margin * second
    past_name = f"fence_{col}_m{margin}_w{width}_past"
    future_name = f"fence_{col}_m{margin}_w{width}_future"

    def _clean(values):
        # Identical post-processing for both directions.
        return (
            values.fillna(0)
            .replace([np.inf, -np.inf], np.nan)
            .bfill()
            .fillna(-1)
            .astype("float32")
        )

    # .bfill()/.ffill() replace the deprecated fillna(method=...).
    df[past_name] = _clean(df[stat_feature].shift(offset).bfill())
    df[future_name] = _clean(df[stat_feature].shift(-offset).ffill())

    return df

def whole_file_stats(df):
    """Broadcast whole-file and half-file summary statistics onto every row.

    For each of ``AccV``, ``AccML``, ``AccAP`` the mean, std, min and max are
    computed over the whole frame, over its first half and over its second half
    (split at ``len(df) // 2``), and stored as constant columns named
    ``graph_<col>_<stat>``, ``graph_first_half_<col>_<stat>`` and
    ``graph_second_half_<col>_<stat>``.

    The frame is modified in place and also returned.
    """
    stat_names = ("mean", "std", "min", "max")

    for col in ["AccV", "AccML", "AccAP"]:
        midpoint = df.shape[0] // 2
        signal = df[col]
        segments = (
            (f"graph_{col}", signal),
            (f"graph_first_half_{col}", signal.iloc[:midpoint]),
            (f"graph_second_half_{col}", signal.iloc[midpoint:]),
        )
        for prefix, values in segments:
            for stat in stat_names:
                df[f"{prefix}_{stat}"] = getattr(values, stat)()

    return df
        
def FE(df, is_td, isTest=False ):
    """Build the hand-crafted per-file features.

    In order:

    1. extrema statistics (``describe_max_min``) and whole/half-file statistics
       (``whole_file_stats``);
    2. unless ``isTest``, zero event labels on invalid rows when a ``Valid``
       column exists;
    3. min-max scaled copies of ``Time``, ``AccV`` and ``AccAP``;
    4. for ``AccV`` and ``AccAP``: percent change, its rolling std over 5 and
       10 "seconds", past/future fence features of that rolling std for every
       margin, and expanding std / mean with their min-max scaled copies.

    Parameters
    ----------
    df : DataFrame for one recording.
    is_td : True for tdcsfog files (128 rows per second), False for defog
        (100 rows per second).
    isTest : skip the label clean-up step when True.

    Returns
    -------
    The frame with all feature columns added.
    """
    df = describe_max_min(df, [10,20, 30, 40])
    df = whole_file_stats(df)

    # tdcsfog (128 timesteps per second), defog (100 timesteps per second).
    second = 128 if is_td else 100

    if not isTest and "Valid" in df.columns:
        df = fix_invalid_events(df)

    for col in ["Time", "AccV", "AccAP"]:
        df = min_max_feature(df, col)

    margins = [5,10, 15, 25, 35, 45, 55, 65, 75, 85, 95, 105]
    widths = [5, 10]

    for col in ["AccV", "AccAP"]:
        new_col = col + "_pct"
        df[new_col] = df[col].pct_change()
        df[new_col] = df[new_col].replace([np.inf, -np.inf], np.nan).bfill()
        df[new_col] = df[new_col].fillna(0).astype("float32") # 0 to 0 returns null

        for margin in margins:
            for w in widths:
                moving_col = f"moving_std_{new_col}_{w}"
                if margin == margins[0]:
                    # The rolling std depends only on (new_col, w), never on the
                    # margin, so build it on the first pass only. Creating it
                    # here, not before the loop, keeps the column order.
                    # fillna(-1) covers files shorter than the window.
                    df[moving_col] = (
                        df[new_col].rolling(window=second * w).std().bfill().fillna(-1)
                    )
                df = fences_features(df,new_col, margin=margin, width=w, stat="std", second=second)

        # Downcast only after every fence has been derived from the float64 values.
        for w in widths:
            moving_col = f"moving_std_{new_col}_{w}"
            df[moving_col] = df[moving_col].astype("float32")

        df[f"moving_std_all_{col}"] = df[col].expanding().std().bfill()
        df = min_max_feature(df, f"moving_std_all_{col}")

        df[f"moving_mean_all_{col}"] = df[col].expanding().mean().bfill()
        df = min_max_feature(df, f"moving_mean_all_{col}")

    return df

In [8]:
# Per-recording metadata for the tdcsfog files, turned into Id -> value lookups
# that make_df uses to tag every row of a recording.
tdcsfog_metadata = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv")
# Remember: Visit is only relevant for defog.
# Encode Medication as 1 for "on", 0 for anything else.
tdcsfog_metadata["Medication"] = np.where(tdcsfog_metadata["Medication"] == "on", 1, 0)
tdcsfog_subject_dict =  dict(zip(tdcsfog_metadata["Id"], tdcsfog_metadata["Subject"]))
tdcsfog_medication_dict = dict(zip(tdcsfog_metadata["Id"], tdcsfog_metadata["Medication"]))
tdcsfog_Id_Visit = dict(zip(tdcsfog_metadata["Id"], tdcsfog_metadata["Visit"]))
tdcsfog_Id_Test  =dict(zip(tdcsfog_metadata["Id"], tdcsfog_metadata["Test"]))

In [9]:
# Per-recording metadata for the defog files plus the subject table.
defog_metadata = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv")
subjects = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/subjects.csv")
# Missing Visit values are treated as visit 1; keep one row per (Subject, Visit).
subjects["Visit"] = subjects["Visit"].fillna(1)
subjects = subjects.drop_duplicates(subset=["Subject", "Visit"])

# Encode Medication as 1 for "on", 0 for anything else.
defog_metadata["Medication"] = np.where(defog_metadata["Medication"] == "on", 1, 0)
defog_subject_dict = dict(zip(defog_metadata["Id"], defog_metadata["Subject"]))
defog_medication_dict = dict(zip(defog_metadata["Id"], defog_metadata["Medication"]))
defog_Id_Visit = dict(zip(defog_metadata["Id"], defog_metadata["Visit"]))
# The defog metadata read here supplies no Test value; every Id maps to 0.0.
defog_Id_Test  =dict(zip(defog_metadata["Id"], np.zeros(defog_metadata.shape[0])))

In [10]:
# Missing UPDRS-III scores are replaced by 0.
subjects["UPDRSIII_On"] = subjects["UPDRSIII_On"].fillna(0)
subjects["UPDRSIII_Off"] = subjects["UPDRSIII_Off"].fillna(0)

In [11]:
def flat_outliers(df, cols=('AccV','AccML','AccV')):
    """Clip each listed column to its own 1st-99th percentile range.

    Parameters
    ----------
    df : DataFrame, modified in place.
    cols : columns to clip, processed in order.

        NOTE(review): the default repeats ``AccV`` and omits ``AccAP``, so by
        default ``AccAP`` is never clipped and ``AccV`` is clipped twice.  This
        looks like a typo for ``('AccV', 'AccML', 'AccAP')``.  The default is
        left as it was because the saved models loaded later in this notebook
        may have been fitted on features built this way; pass ``cols``
        explicitly once that is confirmed.

    Returns
    -------
    The same frame.
    """
    for col in cols:
        max_value = df[col].quantile(q=0.99)
        min_value = df[col].quantile(q=0.01)
        df[col] = np.where(df[col] > max_value, max_value, df[col])
        df[col] = np.where(df[col] < min_value, min_value, df[col])
    return df

In [12]:
def create_target(df):
    """Collapse the three event indicator columns into one ``target`` class id.

    Encoding: 0 = StartHesitation, 1 = Turn, 2 = Walking, 3 = none of them.
    When several indicators are 1 on the same row the later one in that list
    wins (Walking over Turn over StartHesitation).

    ``target`` is added to ``df`` in place; the returned frame is a copy
    without the three indicator columns.
    """
    event_codes = ((0, "StartHesitation"), (1, "Turn"), (2, "Walking"))

    labels = np.full(len(df), 3, dtype=np.int64)
    for code, event in event_codes:
        # Later events overwrite earlier ones, which fixes the priority order.
        labels[(df[event] == 1).to_numpy()] = code
    df["target"] = labels

    return df.drop(columns=[event for _, event in event_codes])

In [13]:
def make_df(base,isTest=False, black_list = []):
    """Load every recording under ``base``, build its features and stack the results.

    Parameters
    ----------
    base : directory of CSV recordings.  A path containing ``"tdcsfog"`` is
        handled as tdcsfog data, anything else as defog.
    isTest : when True no ``target`` column is built, and the feature builders
        skip their label clean-up step.
    black_list : column names to drop from the output.  Never mutated here.

    Returns
    -------
    DataFrame with one row per input row, a fresh RangeIndex, and dtypes taken
    from the memory-reduced first file.

    Notes
    -----
    Relies on the notebook-level metadata lookups (``tdcsfog_*`` / ``defog_*``).
    """
    is_td = "tdcsfog" in base
    if is_td:
        subject_by_id = tdcsfog_subject_dict
        medication_by_id = tdcsfog_medication_dict
        visit_by_id = tdcsfog_Id_Visit
        test_by_id = tdcsfog_Id_Test
    else:
        subject_by_id = defog_subject_dict
        medication_by_id = defog_medication_dict
        visit_by_id = defog_Id_Visit
        test_by_id = defog_Id_Test

    frames = []
    rows_collected = 0
    train_cols = None
    new_dtypes_dict = None

    for train_path in tqdm(os.listdir(base)):
        df = pd.read_csv(os.path.join(base, train_path))

        df = flat_outliers(df)
        df = FE(df, is_td, isTest)

        # Index by Time but keep Time as a (last) column.  Taking the values
        # from the new index avoids the label-aligned re-assignment, which only
        # gave the right answer when Time was exactly 0..n-1.
        df = df.set_index("Time")
        df["Time"] = df.index.to_numpy()

        df = FE_tsflex(df, is_td, isTest)

        if not isTest:
            df = create_target(df)

        file_id = train_path.split(".")[0]
        df["file"] = file_id
        df["id"] = df["file"].astype("str") + "_" + df["Time"].astype("str")

        df["Subject"] = subject_by_id[file_id]
        df["Medication"] = medication_by_id[file_id]
        df["Visit"] = visit_by_id[file_id]
        df["Test_level"] = test_by_id[file_id]

        if rows_collected == 0:
            # The first non-empty file fixes the output columns and dtypes.
            cur_black_list = [c for c in black_list if c in df.columns]
            coverted_first = reduce_memory_usage(df.drop(cur_black_list, axis = 1))
            new_dtypes_dict = coverted_first.dtypes.to_dict()
            train_cols = [c for c in df.columns if c not in cur_black_list]

        frames.append(df[train_cols].astype(new_dtypes_dict))
        rows_collected += len(df)
        del df
        gc.collect()

    # One concat instead of DataFrame.append per file: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    train = pd.concat(frames)[train_cols] if frames else pd.DataFrame()

    train = reduce_memory_usage(train)
    train = train.reset_index(drop=True)

    # Casting later files to the first file's (possibly float16) dtypes can
    # overflow to +/-inf; such columns get inf -> NaN -> 0.
    for col in train.columns:
        if train[col].dtype != "object":
            if train[col].max() > 99999 or train[col].min()<-9999:
                train[col]=train[col].replace([np.inf,-np.inf], np.nan).fillna(0)
    return train

In [14]:
def get_white_list(models):
    """Collect every feature name used by any of the given fitted models.

    ``models`` maps a model name to a sequence whose first item is the fitted
    estimator.  The model name decides which attribute holds the feature names:

    * contains ``"xgboost"``           -> ``get_booster().feature_names``
    * contains ``"lgbm"``              -> ``feature_name_``
    * contains ``"rf"`` / ``"adaboost"`` -> ``feature_names_in_``
    * anything else                    -> ``feature_names_``

    Returns a set of names; when at least one model is given it also contains
    ``"Subject"``, ``"Visit"`` and ``"id"``.
    """
    always_kept = ["Subject", "Visit", "id"]
    collected = set()

    for model_name in models:
        estimator = models[model_name][0]

        if "xgboost" in model_name:
            names = estimator.get_booster().feature_names
        elif "lgbm" in model_name:
            names = estimator.feature_name_
        elif "rf" in model_name or "adaboost" in model_name:
            names = estimator.feature_names_in_
        else:
            names = estimator.feature_names_

        collected = collected | set(names) | set(always_kept)

    return collected

In [15]:
def _load_pickle(path):
    """Unpickle ``path``, closing the file handle afterwards.

    NOTE: unpickling can execute arbitrary code - only use on trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Models trained on the feature set that includes the tsflex window features.
catboost_tsflex_model_dict = _load_pickle("/kaggle/input/make-final-model-full-catboost-v9-tsflex/model_dict_catboost_full.pkl")
xgboost_tsflex_model_dict = _load_pickle("/kaggle/input/make-final-model-full-xgboost-v9-tsflex/model_dict_xgboost_full.pkl")
lgbm_tsflex_model_dict = _load_pickle("/kaggle/input/make-final-model-full-lgbm-v9-tsflex/model_dict_lgbm_full.pkl")
rf_tsflex_model_dict = _load_pickle("/kaggle/input/make-final-model-full-rf-v9-tsflex/model_dict_rf_full.pkl")
adaboost_tsflex_model_dict = _load_pickle("/kaggle/input/make-final-model-full-adaboost-v9-tsflex/model_dict_adaboost_full.pkl")

# The non-tsflex counterparts.
catboost_model_dict = _load_pickle("/kaggle/input/make-final-model-full-catboost-v9/model_dict_catboost_full.pkl")
xgboost_model_dict = _load_pickle("/kaggle/input/make-final-model-full-xgboost-v9/model_dict_xgboost_full.pkl")
lgbm_model_dict = _load_pickle("/kaggle/input/make-final-model-full-lgbm-v9/model_dict_lgbm_full.pkl")
rf_model_dict = _load_pickle("/kaggle/input/make-final-model-full-rf-v9/model_dict_rf_full.pkl")
adaboost_model_dict = _load_pickle("/kaggle/input/make-final-model-full-adaboost-v9/model_dict_adaboost_full.pkl")

white_list = _load_pickle("/kaggle/input/black-list-v9/white_list.pkl")

In [16]:
# Columns to drop from the engineered frames.
# NOTE: unpickling can execute arbitrary code - only use on trusted files.
with open("/kaggle/input/black-list-v9/black_list.pkl", "rb") as black_list_file:
    black_list = pickle.load(black_list_file)

td_test = make_df("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog",black_list= black_list, isTest=True)
de_test = make_df("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog", black_list = black_list,isTest=True)


# Submission rows that belong to each test set (matched on the row id).
ss_de = ss.loc[ss.Id.isin(de_test.id)].reset_index(drop=True)
ss_td = ss.loc[ss.Id.isin(td_test.id)].reset_index(drop=True)

  


Memory usage of dataframe is 8.70 MB
Memory usage became:  2.8541927337646484  MB


100%|██████████| 1/1 [00:02<00:00,  2.60s/it]


Memory usage of dataframe is 2.73 MB
Memory usage became:  2.728178024291992  MB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage of dataframe is 523.85 MB
Memory usage became:  170.85753631591797  MB


100%|██████████| 1/1 [00:35<00:00, 35.65s/it]


Memory usage of dataframe is 162.79 MB
Memory usage became:  162.7949981689453  MB


In [17]:
# Attach subject-level columns to both test frames.  The shapes are printed
# before and after each merge so an unexpected change in row count is visible.
# NOTE(review): `subjects` is de-duplicated on (Subject, Visit) only, so a
# subject with several visits would multiply tdcsfog rows in the Subject-only
# merge below - confirm the row counts stay equal.
print(td_test.shape)
# merging only on Subject because Visit is relevant for defog only
td_test = td_test.merge(subjects.drop("Visit",axis = 1),
                                    on=["Subject"], how="left")
print(td_test.shape)

print(de_test.shape)
de_test = de_test.merge(subjects, on=["Subject","Visit"], how="left")
print(de_test.shape)

# Encode Sex as 1 for "M", 0 for anything else (including missing).
td_test['Sex'] = np.where(td_test['Sex'] == "M", 1, 0)
de_test['Sex'] = np.where(de_test['Sex'] == "M", 1, 0)

# Downcast again: the merge added new, full-width columns.
td_test = reduce_memory_usage(td_test)
de_test = reduce_memory_usage(de_test)

(4682, 290)
(4682, 296)
(281688, 290)
(281688, 296)
Memory usage of dataframe is 2.94 MB
Memory usage became:  2.7683639526367188  MB
Memory usage of dataframe is 175.69 MB
Memory usage became:  165.2127456665039  MB


In [18]:
def fix_y(y):
    """Shift 1-based class labels to 0-based: (1, 2, 3) -> (0, 1, 2)."""
    zero_based = y - 1
    return zero_based

def predict(test, train_cols, model, model_name="dummy", scaler = None):
    """Score ``test`` with ``model`` and return per-row event probabilities.

    Parameters
    ----------
    test : DataFrame holding the feature columns and an ``id`` column.
    train_cols : feature columns passed to ``model.predict_proba``.
    model : fitted estimator exposing ``predict_proba``.
    model_name, scaler : accepted but not used.

    Returns
    -------
    DataFrame indexed like ``test`` with columns ``Id``, ``StartHesitation``,
    ``Turn`` and ``Walking``; the three event columns are output columns
    0, 1 and 2 of ``predict_proba``.
    """
    probabilities = model.predict_proba(test[train_cols])

    preds_df = pd.DataFrame({"Id": test["id"]})
    for position, event in enumerate(("StartHesitation", "Turn", "Walking")):
        preds_df[event] = [row[position] for row in probabilities]

    preds_df.index = test.index
    return preds_df

def combine_preds(preds_td, preds_de, ss):
    """Lay tdcsfog and defog predictions onto the submission template.

    ``ss`` supplies the ``Id`` rows; its own event columns are discarded.  Both
    prediction frames are left-joined on ``Id``, and for each event the tdcsfog
    value is used when present, otherwise the defog value.  Ids found in
    neither frame end up NaN.

    Returns a frame with columns ``Id``, ``StartHesitation``, ``Turn``,
    ``Walking``.
    """
    event_columns = ["StartHesitation", "Turn", "Walking"]

    merged = (
        ss.drop(columns=event_columns)
        .merge(preds_td, on="Id", how="left")
        .merge(preds_de, on="Id", how="left", suffixes=("_td", "_de"))
    )

    for event in event_columns:
        from_td = merged[f"{event}_td"]
        merged[event] = from_td.where(from_td.notna(), merged[f"{event}_de"])

    return merged[["Id"] + event_columns]

In [19]:
def _predict_td_and_de(model_dict):
    """Score both test frames with one saved model; returns (tdcsfog, defog) predictions."""
    features, model = model_dict["features"], model_dict["model"]
    return (
        predict(td_test, train_cols=features, model=model),
        predict(de_test, train_cols=features, model=model),
    )


# catboost
preds_td_catboost, preds_de_catboost = _predict_td_and_de(catboost_model_dict)
final_ss_catboost = combine_preds(preds_td_catboost, preds_de_catboost, ss.copy())

#xgb
preds_td_xgboost, preds_de_xgboost = _predict_td_and_de(xgboost_model_dict)
final_ss_xgboost = combine_preds(preds_td_xgboost, preds_de_xgboost, ss.copy())

#lgbm
preds_td_lgbm, preds_de_lgbm = _predict_td_and_de(lgbm_model_dict)
final_ss_lgbm = combine_preds(preds_td_lgbm, preds_de_lgbm, ss.copy())

#rf
preds_td_rf, preds_de_rf = _predict_td_and_de(rf_model_dict)
final_ss_rf = combine_preds(preds_td_rf, preds_de_rf, ss.copy())

#adaboost
preds_td_adaboost, preds_de_adaboost = _predict_td_and_de(adaboost_model_dict)
final_ss_adaboost = combine_preds(preds_td_adaboost, preds_de_adaboost, ss.copy())

In [20]:
def _predict_td_and_de(model_dict):
    """Score both test frames with one saved model; returns (tdcsfog, defog) predictions."""
    features, model = model_dict["features"], model_dict["model"]
    return (
        predict(td_test, train_cols=features, model=model),
        predict(de_test, train_cols=features, model=model),
    )


# catboost
preds_td_catboost_tsflex, preds_de_catboost_tsflex = _predict_td_and_de(catboost_tsflex_model_dict)
final_ss_tsflex_catboost = combine_preds(preds_td_catboost_tsflex, preds_de_catboost_tsflex, ss.copy())

#xgb
preds_td_xgboost_tsflex, preds_de_xgboost_tsflex = _predict_td_and_de(xgboost_tsflex_model_dict)
final_ss_tsflex_xgboost = combine_preds(preds_td_xgboost_tsflex, preds_de_xgboost_tsflex, ss.copy())

#lgbm
preds_td_lgbm_tsflex, preds_de_lgbm_tsflex = _predict_td_and_de(lgbm_tsflex_model_dict)
final_ss_tsflex_lgbm = combine_preds(preds_td_lgbm_tsflex, preds_de_lgbm_tsflex, ss.copy())

#rf
preds_td_rf_tsflex, preds_de_rf_tsflex = _predict_td_and_de(rf_tsflex_model_dict)
final_ss_tsflex_rf = combine_preds(preds_td_rf_tsflex, preds_de_rf_tsflex, ss.copy())

#adaboost
preds_td_adaboost_tsflex, preds_de_adaboost_tsflex = _predict_td_and_de(adaboost_tsflex_model_dict)
final_ss_tsflex_adaboost = combine_preds(preds_td_adaboost_tsflex, preds_de_adaboost_tsflex, ss.copy())

In [21]:
# Every model's submission-shaped prediction frame, keyed by a name that is
# also used to look up its blend weight below.
finals = {"final_ss_catboost": final_ss_catboost, "final_ss_xgboost": final_ss_xgboost,
         "final_ss_lgbm": final_ss_lgbm,"final_ss_rf":final_ss_rf, 'final_ss_adaboost':final_ss_adaboost,
        "final_ss_tsflex_catboost":final_ss_tsflex_catboost, "final_ss_tsflex_xgboost":final_ss_tsflex_xgboost,
          "final_ss_tsflex_lgbm":final_ss_tsflex_lgbm, "final_ss_tsflex_rf":final_ss_tsflex_rf,
             "final_ss_tsflex_adaboost":final_ss_tsflex_adaboost }
# Per-model blend weights (from the bagging lab); keys must match `finals`.
weights = {'final_ss_rf': 0.3, 'final_ss_lgbm': 2,
           'final_ss_xgboost': 3, 'final_ss_catboost': 3, 'final_ss_adaboost': 0.0654058370958252,
           'final_ss_tsflex_rf': 0.3, 'final_ss_tsflex_lgbm': 2, 
           'final_ss_tsflex_xgboost': 3.5, 'final_ss_tsflex_catboost': 3, 
           'final_ss_tsflex_adaboost': 0.7}


In [22]:
# Blend: weighted sum of every model's event scores, divided by the number of
# models.
# NOTE(review): the divisor is the model count, not the sum of the weights, so
# the result is a rescaled weighted sum rather than a weighted mean - confirm
# this is intended.
event_markers = (("StartHesitation", "StartH"), ("Turn", "Turn"), ("Walking", "Walking"))

total_models = 0
to_sub = pd.DataFrame()
for position, final_name in enumerate(finals):
    final = finals[final_name]
    weight = weights[final_name]

    for event, marker in event_markers:
        matching = [name for name in final.columns if marker in name]
        if event == "StartHesitation":
            # Each frame contributes as many models as it has StartHesitation columns.
            total_models += len(matching)

        contribution = final[matching].sum(axis=1) * weight
        if position == 0:
            to_sub[event] = contribution
        else:
            to_sub[event] += contribution

to_sub = to_sub / total_models
to_sub.insert(0, "Id",final_ss_xgboost["Id"])

In [23]:
# Write the blended predictions as the submission file (no index column).
to_sub.to_csv("submission.csv", index=False)