In [1]:
import numpy as np
import pandas as pd
import gc

In [2]:
# Load the row-level training table from the feather file.
train = pd.read_feather('./train_data.ftr')

# Columns handled as categorical; they get their own set of aggregations.
catg = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Everything that is not categorical and not one of customer_ID / target / S_2
# is treated as a continuous feature. (The key was previously misspelled
# "customed_ID", which let the identifier column slip into this list.)
non_feature_cols = set(catg) | {"customer_ID", "target", "S_2"}
cont_features = sorted(f for f in train.columns if f not in non_feature_cols)

# Feature Engineering
Common steps to follow:
- ...

Here are the strategies to try for each model:
- Aggregate features for every customer
- Use rows as they are


In [3]:
def aggregate_features(data, categorical_cols, numerical_cols):
    """Collapse row-level data into one row of summary features per customer.

    Rows are grouped by the "customer_ID" column. Every numerical column
    yields five statistics (mean, std, min, max, last) and every categorical
    column yields three (count, last, nunique). Output columns are named
    "<column>_<statistic>", e.g. "B_1_mean".

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "customer_ID" column and every column listed in
        ``categorical_cols`` / ``numerical_cols``.
    categorical_cols : list of str
        Columns to aggregate with count/last/nunique. "customer_ID" and
        "S_2" are dropped from the list if present. May be empty.
    numerical_cols : list of str
        Columns to aggregate with mean/std/min/max/last. "customer_ID" and
        "S_2" are dropped from the list if present. May be empty.

    Returns
    -------
    pandas.DataFrame
        Indexed by customer_ID, numerical aggregates first, categorical
        aggregates after. If both column lists are empty the frame has one
        row per customer and no columns.

    Notes
    -----
    'last' picks the last non-null value of each group in the order the rows
    appear in ``data``.
    NOTE(review): presumably rows are expected to be in chronological order
    within each customer (e.g. sorted by S_2) -- confirm upstream, the
    function does not sort.
    """
    excluded = {"customer_ID", "S_2"}
    categorical_cols = [c for c in categorical_cols if c not in excluded]
    numerical_cols = [c for c in numerical_cols if c not in excluded]

    grouped = data.groupby("customer_ID")
    parts = []

    # Guard each half: selecting an empty column list and calling .agg with a
    # list of functions fails inside pandas, so skip the half entirely.
    if numerical_cols:
        num_agg = grouped[numerical_cols].agg(['mean', 'std', 'min', 'max', 'last'])
        # Flatten the (column, statistic) MultiIndex into "column_statistic".
        num_agg.columns = ['_'.join(x) for x in num_agg.columns]
        parts.append(num_agg)

    if categorical_cols:
        catg_agg = grouped[categorical_cols].agg(['count', 'last', 'nunique'])
        catg_agg.columns = ['_'.join(x) for x in catg_agg.columns]
        parts.append(catg_agg)

    if not parts:
        # Nothing to aggregate: still return one (empty) row per customer so
        # callers get the same index they would get otherwise.
        return pd.DataFrame(index=grouped.size().index)

    # The intermediate frames are plain locals and are released when the
    # function returns, so no explicit del / gc.collect() is needed here.
    return pd.concat(parts, axis=1)

In [4]:
# Build the per-customer feature table from the raw rows.
agg_df = aggregate_features(
    data=train,
    categorical_cols=catg,
    numerical_cols=cont_features,
)

# Persist the aggregated table (customer_ID index included) for later use.
agg_df.to_csv("aggregated_data.csv")

# Last expression of the cell: render the frame as the cell output.
agg_df

Unnamed: 0_level_0,B_1_mean,B_1_std,B_1_min,B_1_max,B_1_last,B_10_mean,B_10_std,B_10_min,B_10_max,B_10_last,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.012009,0.006546,0.001930,0.021652,0.009384,0.270264,0.181835,0.096191,0.741699,0.326172,...,1,13,O,1,0,,0,13,6.0,1
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.025650,0.027750,0.006710,0.109619,0.034698,0.298828,0.003044,0.293945,0.302734,0.297119,...,1,13,O,1,0,,0,13,6.0,1
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.004387,0.002786,0.001472,0.009995,0.004284,0.273682,0.052867,0.162109,0.302734,0.296387,...,1,13,R,1,0,,0,13,6.0,1
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.059875,0.080538,0.005909,0.280029,0.012566,0.306641,0.079525,0.192993,0.431885,0.411621,...,1,13,O,1,0,,0,13,3.0,3
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.005939,0.002475,0.000776,0.009804,0.007679,0.100342,0.074579,0.044739,0.260742,0.125244,...,1,13,O,1,13,1.0,1,13,6.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffff41c8a52833b56430603969b9ca48d208e7c192c6a4081a6acc28cf4f8af7,0.029175,0.014286,0.006084,0.051941,0.028519,0.591309,0.373031,0.366211,1.417969,0.436035,...,1,13,U,1,13,1.0,1,13,6.0,1
ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fdd3e5b57cfcbee30286,0.368408,0.051249,0.292480,0.452148,0.292480,0.042603,0.016051,0.021011,0.083557,0.021011,...,1,13,R,1,0,,0,13,6.0,1
ffff9984b999fccb2b6127635ed0736dda94e544e67e026eee4d20f680639ff6,0.043030,0.040382,0.013000,0.162476,0.020569,0.268555,0.069366,0.092590,0.302734,0.302734,...,1,13,U,1,0,,0,13,5.0,1
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461,0.018158,0.009637,0.000281,0.030563,0.015839,0.039734,0.041169,0.006924,0.154907,0.035461,...,1,13,U,2,0,,0,13,3.0,2
