In [3]:
import numpy as np
import pandas as pd
import gc
from sklearn.model_selection import train_test_split

In [5]:
train = pd.read_feather('./train_data.ftr')
cat_features = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
cont_features = sorted([f for f in train.columns if f not in cat_features + ["customer_ID", 'target', 'S_2']])
labels = train["target"]
train = train.drop(columns=["target"])

In [6]:
def detect_null_columns(df, threshold=0.7):
    null_counts = pd.DataFrame(df.isna().sum(), columns=["null_count"])
    drop_null_cols = null_counts[null_counts["null_count"] > (len(df) * threshold)].index
    return drop_null_cols

columns_to_drop = detect_null_columns(train, threshold=0.8)
columns_to_drop

Index(['D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88',
       'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132',
       'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142'],
      dtype='object')

In [7]:
def drop_null_columns(df, columns_to_drop, catg_cols, num_cols):
    temp = df.drop(columns=columns_to_drop)
    temp = temp.reset_index(drop=True)
    for col in columns_to_drop:
        if col in catg_cols:
            catg_cols.remove(col)
        elif col in num_cols:
            num_cols.remove(col)
    return temp, catg_cols, num_cols

train_drpd, cat_features_drpd, cont_features_drpd = drop_null_columns(train, columns_to_drop, cat_features, cont_features)
train_drpd

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_130,D_131,D_133,R_28,D_139,D_140,D_141,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,...,0.002052,0.005970,0.004345,0.001534,0.002426,0.003706,0.003819,0.000569,0.000610,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936523,0.005775,0.004925,1.000977,0.006153,0.126709,0.000798,0.002714,...,0.001034,0.004837,0.007496,0.004932,0.003956,0.003166,0.005032,0.009575,0.005493,0.009216
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.954102,0.091492,0.021652,1.009766,0.006817,0.123962,0.007599,0.009422,...,0.005680,0.005497,0.009224,0.009125,0.003269,0.007328,0.000427,0.003429,0.006985,0.002604
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960449,0.002455,0.013687,1.002930,0.001372,0.117188,0.000685,0.005531,...,0.007107,0.008263,0.007206,0.002409,0.006119,0.004517,0.003201,0.008423,0.006527,0.009598
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947266,0.002483,0.015190,1.000977,0.007607,0.117310,0.004654,0.009308,...,0.009682,0.004848,0.006313,0.004463,0.003672,0.004944,0.008888,0.001670,0.008125,0.009827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5531446,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-11-05,0.979492,0.416016,0.020813,0.828125,0.003487,0.090759,0.005341,0.025146,...,0.008896,0.004509,0.000776,0.007069,0.006836,0.003679,0.000457,0.000906,0.001497,0.002775
5531447,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-12-23,0.984863,0.296631,0.007210,0.812500,0.005905,0.079895,0.002243,0.023697,...,0.005093,0.003407,0.001741,0.002056,0.003309,0.007095,0.007858,0.002777,0.008224,0.008858
5531448,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-01-06,0.982910,0.444092,0.013153,0.815430,0.003456,0.100525,0.002111,0.012344,...,0.009148,0.002947,0.006062,0.005077,0.009956,0.009995,0.001088,0.005692,0.006775,0.005566
5531449,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-02-06,0.969727,0.442627,0.009857,1.003906,0.005116,0.101807,0.009933,0.008575,...,0.009323,0.008514,0.003811,0.008682,0.005543,0.006565,0.009880,0.008125,0.001168,0.003983


In [17]:
def build_features(data, categorical_cols, numerical_cols):
    
    categorical_cols = [c for c in categorical_cols if c not in ["customer_ID", "S_2"]]
    numerical_cols = [c for c in numerical_cols if c not in ["customer_ID", "S_2"]]

  
    num_agg = data.groupby("customer_ID")[numerical_cols].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
    num_agg.columns = ['_'.join(x) for x in num_agg.columns]
    num_agg.reset_index(inplace=True)

    catg_agg = data.groupby('customer_ID')[categorical_cols].agg(['count', 'last', 'nunique'])
    catg_agg.columns = ['_'.join(x) for x in catg_agg.columns]
    catg_agg.reset_index(inplace=True)

    num_agg.drop(columns=["customer_ID"], inplace=True)
    
    for col in num_agg.columns:
        if 'last' in col:
            num_agg[col + "_lag_sub"] = num_agg[col] - num_agg[col.replace("last", "first")]
            num_agg[col + "_lag_div"] = num_agg[col] / num_agg[col.replace("last", "first")]

    temp = pd.concat([num_agg, catg_agg], axis=1)

    new_num_cols = num_agg.columns
    new_catg_cols = catg_agg.columns

    del num_agg
    del catg_agg
    gc.collect()

    return temp, list(new_num_cols), list(new_catg_cols)



In [19]:
agg_train, agg_num_cols, agg_catg_cols = build_features(train, cat_features_drpd, cont_features_drpd)
print("Aggregated train set: ", agg_train.shape)
agg_train

Aggregated train set:  (8, 1266)


  num_agg[col + "_lag_sub"] = num_agg[col] - num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_div"] = num_agg[col] / num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_sub"] = num_agg[col] - num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_div"] = num_agg[col] / num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_sub"] = num_agg[col] - num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_div"] = num_agg[col] / num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_sub"] = num_agg[col] - num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_div"] = num_agg[col] / num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_sub"] = num_agg[col] - num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_div"] = num_agg[col] / num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_sub"] = num_agg[col] - num_agg[col.replace("last", "first")]
  num_agg[col + "_lag_div"] = num_agg[col] / num_agg[col.replace("last", "first")]
  nu

Unnamed: 0,B_1_first,B_1_mean,B_1_std,B_1_min,B_1_max,B_1_last,B_10_first,B_10_mean,B_10_std,B_10_min,...,D_126_nunique,D_63_count,D_63_last,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_68_count,D_68_last,D_68_nunique
0,0.008728,0.012009,0.006546,0.00193,0.021652,0.009384,0.096191,0.270264,0.181835,0.096191,...,1,13,CR,1,13,O,1,13,6.0,1
1,0.025787,0.02565,0.02775,0.00671,0.109619,0.034698,0.302734,0.298828,0.003044,0.293945,...,1,13,CO,1,13,O,1,13,6.0,1
2,0.001472,0.004387,0.002786,0.001472,0.009995,0.004284,0.222168,0.273682,0.052867,0.162109,...,1,13,CO,1,13,R,1,13,6.0,1
3,0.070312,0.059875,0.080538,0.005909,0.280029,0.012566,0.297852,0.306641,0.079525,0.192993,...,1,13,CO,1,13,O,1,13,3.0,3
4,0.003433,0.005939,0.002475,0.000776,0.009804,0.007679,0.058777,0.100342,0.074579,0.044739,...,1,13,CO,1,13,O,1,13,6.0,1
5,0.035156,0.027115,0.014723,0.007851,0.051575,0.007851,0.302002,0.289062,0.020943,0.241577,...,1,13,CO,1,13,R,1,13,6.0,1
6,0.044281,0.070801,0.031472,0.009415,0.11261,0.009415,0.051208,0.073181,0.069608,0.020004,...,1,13,CO,1,13,R,2,11,3.0,2
7,0.04068,0.058197,0.031164,0.029419,0.102051,0.102051,0.286865,0.2771,0.014998,0.249268,...,1,9,CO,1,9,R,1,9,5.0,1


In [20]:
for col in agg_train.columns:
    if(col == "customer_ID"):
        print(col)

customer_ID


In [82]:
half_floats = agg_train.select_dtypes(include="float16")
agg_train[half_floats.columns] = half_floats.astype("float32")

In [83]:
agg_train.to_parquet("processed_data/agg_data_preprocess_2.gzip", compression="gzip")