In [1]:
import gc
import numpy as np
import pandas as pd

In [2]:
def aggregation(input_df, group_key, group_values, agg_methods):
    new_df = []
    for agg_method in agg_methods:
        for col in group_values:
            if callable(agg_method):
                agg_method_name = agg_method.__name__
            else:
                agg_method_name = agg_method
            new_col = f"agg_{agg_method_name}_{col}_grpby_{group_key}"
            df_agg = (input_df[[col] + [group_key]].groupby(group_key)[[col]].agg(agg_method))
            df_agg.columns = [new_col]
            new_df.append(df_agg)
            
    _df = pd.concat(new_df, axis=1).reset_index()
    output_df = pd.merge(input_df[[group_key]], _df, on=group_key, how="left")
    return output_df.drop(group_key, axis=1)

def get_raw_features(input_df):
    cols = [
        "time_step",
        "u_in",
    ]
    output_df = input_df[cols].copy()
    return output_df

def get_ohe_features(input_df):
    cols = ["R", "C"]
    encoder = ce.OneHotEncoder()
    output_df = encoder.fit_transform(input_df[cols].astype(str))
    return output_df

def get_fold_value(input_df):
    return input_df[["fold"]]

def get_target_value(input_df):
    return input_df[['pressure']]

def get_cumlative_grpby_breath_id_features(input_df):    
    input_df["area"] = input_df["time_step"] * input_df["u_in"]
    group_key = "breath_id"
    group_values = ["u_in", "area"]
    
    output_df = pd.DataFrame()
    for group_val in group_values:
        col_name = f"agg_cumsum_{group_val}_grpby_{group_key}"
        output_df[col_name] = input_df.groupby(group_key)[group_val].cumsum()
    
    # tubotubo feats
    output_df["divede_cumsum_u_in_by_time_step"] = np.log1p(output_df["agg_cumsum_u_in_grpby_breath_id"] /
                                                    (input_df["time_step"] + 1e-2))
    
    return output_df.fillna(0)

def get_mask_feature(input_df):
    output_df = pd.DataFrame()
    output_df["mask"] = input_df["u_out"] == 0
    return output_df

def _get_agg_col_name(group_key, group_values, agg_methods):
    out_cols = []
    for group_val in group_values:
        for agg_method in agg_methods:
            out_cols.append(f"agg_{agg_method}_{group_val}_grpby_{group_key}")
    return out_cols

def get_shift_grpby_breath_id_features(input_df):
    shift_times = [1, 2, 3, 4]
    group_key = "breath_id"
    group_values = ["u_in"]
    
    output_df = pd.DataFrame()
    for t in shift_times:
        _df = input_df.groupby(group_key)[group_values].shift(t)
        _df.columns = [f'shift={t}_{col}_grpby_{group_key}' for col in group_values]
        output_df = pd.concat([output_df, _df], axis=1)
    return output_df.fillna(0)

def get_diff_grpby_breath_id_features(input_df):
    diff_times = [1, 2, 3, 4]
    group_key = "breath_id"
    group_values = ["time_step", "u_in"]
    
    output_df = pd.DataFrame()
    output_df["breath_id"] = input_df["breath_id"].copy()

    for t in diff_times:
        _df = input_df.groupby(group_key)[group_values].diff(t)
        _df.columns = [f'diff={t}_{col}_grpby_{group_key}' for col in group_values]
        output_df = pd.concat([output_df, _df], axis=1)

    # 1st derivative
    for n in [1]:
        col = f'slope={n}_time_step_u_in'
        val = (output_df[f'diff={n}_u_in_grpby_breath_id'] /
               (output_df[f'diff={n}_time_step_grpby_breath_id'] + 1e-8))
        output_df[col] = val
               
    return output_df.fillna(0).drop("breath_id", axis=1)

def get_accel_grpby_breath_id_feature(input_df):
    grby_u_in = input_df.groupby('breath_id')['u_in']
    grby_time_step = input_df.groupby('breath_id')['time_step']
    
    # 2nd derivative
    output_df = pd.DataFrame()
    output_df["accel"] = ((grby_u_in.shift(0) - 2 * grby_u_in.shift(1) + grby_u_in.shift(2)) /
                 (grby_time_step.diff(1) * grby_time_step.diff(1).shift(1)))

    # clip accel
    p001 = output_df["accel"].quantile(0.01)
    p099 = output_df["accel"].quantile(0.99)
    output_df["accel"] = output_df["accel"].clip(p001, p099)
    
    return output_df.fillna(0)

def get_features_df(train, test):
    whole_df = pd.concat([train, test]).reset_index(drop=True)
    whole_df["u_in"] = np.log1p(whole_df["u_in"])  
    output_df = pd.DataFrame()
    
    funcs = [
        get_target_value,
        get_mask_feature,
        get_fold_value,
        get_raw_features,
        get_ohe_features,
        get_cumlative_grpby_breath_id_features,
        get_shift_grpby_breath_id_features,
        get_diff_grpby_breath_id_features,
        get_accel_grpby_breath_id_feature,
    ]

    for func in funcs:
        print(func.__name__)
        _df = func(whole_df)
        if func.__name__ not in [
                                 "get_mask_feature",
                                 "get_target_value",
                                 "get_fold_value" 
                                 ]:

            scaler = RobustScaler()
            _df[whole_df["u_out"]==0] = scaler.fit_transform(_df[whole_df["u_out"]==0])
            _df[whole_df["u_out"]==1] = scaler.transform(_df[whole_df["u_out"]==1])
            _df = reduce_mem_usage(_df)
            
        output_df = pd.concat([output_df, _df], axis=1)
    
    output_df["breath_id"] = whole_df["breath_id"].copy()
    train_feats_df = output_df.iloc[:len(train)]
    test_feats_df = output_df.iloc[len(train):].reset_index(drop=True)
    
    return train_feats_df, test_feats_df

In [3]:
data_train_df = pd.read_csv('./Database/train.csv')
data_test_df = pd.read_csv('./Database/test.csv')

In [4]:
train_feats_df, test_feats_df = get_features_df(data_train_df, data_test_df)

get_target_value
get_mask_feature
get_fold_value


KeyError: "None of [Index(['fold'], dtype='object')] are in the [columns]"