In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import os

import pickle

from tqdm import tqdm

import lightgbm
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier 

import catboost
import gc
import random
random.seed(2233)

# for ens X
from sklearn.metrics import average_precision_score 
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import StratifiedShuffleSplit

import matplotlib.pyplot as plt

In [2]:
# Name of the estimator to train; passed to get_model()/fit_save() further down.
MODEL = "rf"
# Columns selected from every pickled split.
all_cols = ['target', 'Subject', 'AccAP__mean_crossings__w=5000', 'AccAP__slope_sign_changes__w=5000', 'AccV__slope_sign_changes__w=5000', 'AccAP__std__w=5000', 'AccAP__kurt__w=5000', 'precent_prograss_AccAP', 'AccV__maximum__w=5000', 'precent_prograss_Time', 'AccML__slope_sign_changes__w=5000', 'AccAP__var__w=5000', 'id', 'AccV__var__w=5000', 'Visit', 'AccV__std__w=5000', 'AccML__std__w=5000', 'AccV__mean_crossings__w=5000', 'AccML__mean_crossings__w=5000', 'AccML__maximum__w=5000', 'AccML__kurt__w=5000', 'precent_prograss_AccV', 'AccAP__maximum__w=5000', 'AccV__kurt__w=5000', 'AccV__abs_energy__w=10000', 'AccAP__abs_sum__w=10000', 'AccML__abs_sum__w=10000', 'AccV__abs_sum__w=10000']
# Keyword arguments forwarded to the estimator constructor.
parms = {'n_estimators': 32, 'max_depth': 8, 'min_samples_split': 85, 'min_samples_leaf': 65, 'max_features': 0.20883261560511004}

# .assign() returns a new frame, so adding the flag columns cannot trigger
# SettingWithCopyWarning on the column-subset selection.
# Only the td_* frames receive Valid/Task; after concatenation the de_* rows
# hold NaN in those two columns.
de_train = pd.read_pickle("/kaggle/input/make-data-tsflex-v9-de/train_de.pkl")[all_cols]
td_train = pd.read_pickle("/kaggle/input/make-data-v9-tsflex-td/train_td.pkl")[all_cols].assign(Valid=1, Task=1)

de_valid = pd.read_pickle("/kaggle/input/make-data-tsflex-v9-de/valid_de.pkl")[all_cols]
td_valid = pd.read_pickle("/kaggle/input/make-data-v9-tsflex-td/valid_td.pkl")[all_cols].assign(Valid=1, Task=1)

# DataFrame.append was removed in pandas 2.0. pd.concat(..., ignore_index=True)
# is the equivalent of chaining append(...) and reset_index(drop=True).
df_for_cv = pd.concat([de_train, td_train], ignore_index=True)
df_for_sub = pd.concat([df_for_cv, de_valid, td_valid], ignore_index=True)

# folds_train_indexes_cv = pickle.load(open("/kaggle/input/v9-make-folds/folds_train_indexs_cv.pkl","rb"))
# folds_train_indexes_full = pickle.load(open("/kaggle/input/v9-make-folds/folds_train_indexs_full.pkl","rb"))

# folds_valid_indexes_cv = pickle.load(open("/kaggle/input/v9-make-folds/folds_valid_indexs_cv.pkl","rb"))
# folds_valid_indexes_full = pickle.load(open("/kaggle/input/v9-make-folds/folds_valid_indexs_full.pkl","rb"))

In [3]:
# # angry work-around
# def angry_workaround(df, indexs):
#     for fold in range(len(indexs)):
#         df.loc[indexs[fold][0], "target"] = 0    
#         df.loc[indexs[fold][1], "target"] = 1 
#         df.loc[indexs[fold][2], "target"] = 2    
#         df.loc[indexs[fold][3], "target"] = 3
#     return df


# df_for_cv = angry_workaround(df_for_cv, folds_train_indexes_cv)
# df_for_sub = angry_workaround(df_for_sub, folds_train_indexes_full)

In [4]:
# folds_train_indexs_full = pickle.load(open("/kaggle/input/v9-make-folds/folds_train_indexs_full.pkl", "rb"))
# train_ind = set([])
# for ele in folds_train_indexs_full:
#     for ind in ele:
#         train_ind.add(ind)
# relevant_ind = list(train_ind)

# df_for_sub = df_for_sub.loc[relevant_ind]

In [5]:
def get_model(parms= None, model_name="lgbm"):
    """Build an unfitted classifier selected by name.

    Parameters
    ----------
    parms : dict or None
        Keyword arguments forwarded to the classifier constructor. When
        ``None`` the classifier is built with its library defaults.
    model_name : str
        One of ``"lgbm"``, ``"rf"``, ``"xgboost"``, ``"catboost"`` or
        ``"adaboost"``.

    Returns
    -------
    The freshly constructed (not yet fitted) classifier instance.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. Previously this
        case fell through every branch and failed with an UnboundLocalError
        on ``return model``.
    """
    # Each entry is wrapped in a lambda so a library attribute is only looked
    # up for the model that was actually requested.
    factories = {
        "lgbm": lambda: lightgbm.LGBMClassifier,
        "rf": lambda: RandomForestClassifier,
        "xgboost": lambda: xgboost.XGBClassifier,
        "catboost": lambda: catboost.CatBoostClassifier,
        "adaboost": lambda: AdaBoostClassifier,
    }

    if model_name not in factories:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(factories)}"
        )

    model_cls = factories[model_name]()
    if parms is None:
        return model_cls()
    return model_cls(**parms)

In [6]:
def fix_y(y):
    """Shift labels down by one, mapping classes (1, 2, 3) to (0, 1, 2)."""
    label_offset = 1
    return y - label_offset

def fit_save(train_X,train_y, train_cols, cv_or_full,parms= None, model_name="lgbm"):
    """Fit a classifier on the feature columns and pickle it to disk.

    Parameters
    ----------
    train_X : DataFrame-like
        Training data; it is indexed with the list of feature columns.
    train_y : Series-like
        Target labels; must expose ``.unique()``.
    train_cols : iterable of str
        Candidate column names. Bookkeeping columns ("Time", "Valid", "Task",
        "Subject", "file", "id", "target") are removed before fitting.
    cv_or_full : str
        Tag embedded in the output file name.
    parms : dict or None
        Constructor keyword arguments forwarded to ``get_model``.
    model_name : str
        Model identifier forwarded to ``get_model`` and embedded in the
        output file name.

    Side effects
    ------------
    Writes ``model_dict_{model_name}_{cv_or_full}.pkl`` in the current working
    directory. The pickled dict has the keys "model_name", "model", "parms"
    and "features". Returns ``None``.
    """
    non_feature_cols = ("Time", "Valid", "Task", "Subject", "file", "id", "target")
    feature_cols = [col for col in train_cols if col not in non_feature_cols]

    model = get_model(parms, model_name)

    # When the labels are exactly {1, 2, 3} (the original author's "no SH"
    # case) they are shifted to {0, 1, 2} before fitting.
    # NOTE(review): presumably because some backends expect 0-based labels; confirm.
    if set(train_y.unique()) == {1, 2, 3}:
        train_y = fix_y(train_y)

    gc.collect()
    model.fit(train_X[feature_cols], train_y)
    gc.collect()

    model_dict = {
        "model_name": model_name,
        "model": model,
        "parms": parms,
        "features": feature_cols,
    }
    # The context manager guarantees the file is flushed and closed even if
    # pickling raises; the bare open() used before leaked the handle.
    with open(f"model_dict_{model_name}_{cv_or_full}.pkl", "wb") as out_file:
        pickle.dump(model_dict, out_file)

In [7]:
#print(f"fit_save_{MODEL}_for_CV")
#fit_save(df_for_cv,df_for_cv["target"], train_cols=all_cols, parms= parms, model_name=MODEL, cv_or_full = "cv")

# print(f"Make_ENS_Xy_{MODEL}_for_CV")
# preds, gt = pipeline(df_for_cv, all_cols, folds=5, parms=parms, model_name=MODEL, cv_or_full = "cv")

In [8]:
# Fit on the full frame (df_for_sub) and persist the model with the "full" tag.
print(f"fit_save_{MODEL}_for_submission")
fit_save(
    train_X=df_for_sub,
    train_y=df_for_sub["target"],
    train_cols=all_cols,
    cv_or_full="full",
    parms=parms,
    model_name=MODEL,
)

# print(f"Make_ENS_Xy_{MODEL}_for_submission")
# preds, gt = pipeline(df_for_sub, all_cols, folds_total=folds_total, folds_start=folds_start,folds_end=folds_end,
#                      train_folds_list=folds_train_indexes_full , valid_folds_list= folds_valid_indexes_full, parms=parms, model_name=MODEL, cv_or_full = "full")


fit_save_rf_for_submission
