In [45]:
import numpy as np
import pandas as pd
import gc
from sklearn.model_selection import train_test_split

In [46]:
# Load the row-level training data from the Feather file.
train = pd.read_feather('./train_data.ftr')
# Columns handled as categorical features in the rest of the notebook.
catg = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# Every remaining column is a continuous feature, except the customer key
# ("customer_ID", the column the aggregation groups by), the label and S_2.
cont_features = sorted([f for f in train.columns if f not in catg + ["customer_ID", 'target', 'S_2']])

In [47]:
# Keep the label column aside before it is removed from the feature frame.
labels = train["target"]
# drop() returns a new frame, so `train` is rebound to a frame without the label.
train = train.drop(columns=["target"])

# Feature Engineering
Common steps to follow:
- ...

Here are the strategies to try for each model:
- Aggregate features for every customer
- Use rows as they are


In [48]:
# Hold out 20% of the rows as a test set; shuffle=False disables shuffling before the split.
# NOTE(review): the split is made on rows, while later cells group rows by customer_ID.
# A customer whose rows span the cut would be aggregated into both sets — confirm this is acceptable.
# NOTE(review): y_train / y_test stay row-level; the aggregated frames are per customer — confirm how labels are aligned.
X_train, X_test, y_train, y_test = train_test_split(train,labels, test_size=0.20, shuffle=False)
print("Train set: ", X_train.shape, y_train.shape)
print("Test set: ", X_test.shape,y_test.shape)

Train set:  (4425160, 190) (4425160,)
Test set:  (1106291, 190) (1106291,)


In [49]:
def aggregate_features(data, categorical_cols, numerical_cols):
    """Collapse row-level data into one row of summary statistics per customer.

    Rows are grouped by the "customer_ID" column. Numerical columns are
    summarised with mean, std, min, max and last; categorical columns with
    count, last and nunique. Each output column is named
    "<source column>_<statistic>".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a "customer_ID" column plus the feature columns.
    categorical_cols : list of str
        Names of the categorical feature columns.
    numerical_cols : list of str
        Names of the numerical feature columns.
        "customer_ID" and "S_2" are ignored if present in either list.

    Returns
    -------
    tuple
        (aggregated frame indexed by customer_ID,
         list of aggregated numerical column names,
         list of aggregated categorical column names)
    """
    excluded = ("customer_ID", "S_2")
    cat_feats = [name for name in categorical_cols if name not in excluded]
    num_feats = [name for name in numerical_cols if name not in excluded]

    per_customer = data.groupby("customer_ID")

    # Numerical statistics; flatten the (column, statistic) MultiIndex into "column_statistic".
    numeric_part = per_customer[num_feats].agg(['mean', 'std', 'min', 'max', 'last'])
    numeric_part.columns = [f"{feature}_{stat}" for feature, stat in numeric_part.columns]
    numeric_names = list(numeric_part.columns)

    # Categorical statistics, flattened the same way.
    categorical_part = per_customer[cat_feats].agg(['count', 'last', 'nunique'])
    categorical_part.columns = [f"{feature}_{stat}" for feature, stat in categorical_part.columns]
    categorical_names = list(categorical_part.columns)

    combined = pd.concat([numeric_part, categorical_part], axis=1)

    # Release the intermediate frames before returning.
    del numeric_part, categorical_part
    gc.collect()

    return combined, numeric_names, categorical_names

In [50]:
# Aggregate each split separately into one row per customer_ID.
# The aggregated column-name lists are taken from the training call only.
agg_train, agg_num_cols, agg_catg_cols = aggregate_features(X_train, catg, cont_features)
# The column-name lists returned for the test split are discarded.
agg_test, _ , _ = aggregate_features(X_test, catg, cont_features)
print("Aggregated train set: ", agg_train.shape)
print("Aggregated test set: ", agg_test.shape)


Aggregated train set:  (367145, 916)
Aggregated test set:  (91769, 916)


In [51]:
def detect_null_columns(df, threshold=0.7):
    """Return the labels of columns whose missing-value count is too high.

    A column is selected when its number of NaN entries is strictly greater
    than ``len(df) * threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 0.7
        Fraction of the row count that the missing-value count must exceed.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in the frame's column order.
    """
    missing_per_column = df.isna().sum()
    row_limit = len(df) * threshold
    too_sparse = missing_per_column > row_limit
    return missing_per_column[too_sparse].index

# Select the sparse columns from the aggregated training set only;
# the same selection is reused for the test set further down.
columns_to_drop = detect_null_columns(agg_train, threshold=0.7)

In [52]:
def drop_null_columns(df, columns_to_drop, catg_cols, num_cols):
    """Drop the given columns from a frame and from its column-name lists.

    The frame's index is moved into a regular column via ``reset_index()``.
    The input lists are not modified; filtered copies are returned, so the
    function can be called repeatedly with the same arguments.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to drop columns from.
    columns_to_drop : iterable of str
        Column labels to remove; every label must exist in ``df``.
    catg_cols : list of str
        Names of the categorical columns of ``df``.
    num_cols : list of str
        Names of the numerical columns of ``df``.

    Returns
    -------
    tuple
        (frame without the dropped columns and with the index reset,
         remaining categorical column names,
         remaining numerical column names)

    Raises
    ------
    KeyError
        Raised by ``DataFrame.drop`` if a label is not a column of ``df``.
    """
    dropped = set(columns_to_drop)
    temp = df.drop(columns=columns_to_drop).reset_index()
    # Filter against the num_cols parameter itself, not a notebook global,
    # and build new lists so the caller's lists stay untouched.
    remaining_catg = [col for col in catg_cols if col not in dropped]
    remaining_num = [col for col in num_cols if col not in dropped]
    return temp, remaining_catg, remaining_num

# Drop the sparse columns from the training set and refresh both column-name lists.
agg_train, agg_catg_cols, agg_num_cols = drop_null_columns(agg_train, columns_to_drop, agg_catg_cols, agg_num_cols)
# Apply the same train-derived selection to the test set; the returned lists are not needed again.
agg_test, _, _ = drop_null_columns(agg_test, columns_to_drop, agg_catg_cols, agg_num_cols )

In [53]:
# Missing-value count for each column of dtype "category" in the aggregated training set.
print(agg_train.select_dtypes("category").isna().sum())
# Number of rows that would remain if every row containing any NaN were dropped.
print("rows left after dropping: {}".format(len(agg_train.dropna())))

B_30_last       27
B_38_last       27
D_114_last    3787
D_116_last    3787
D_117_last    3787
D_120_last    3787
D_126_last       0
D_63_last        0
D_64_last        0
D_68_last     4209
dtype: int64
rows left after dropping: 8868


Categorical variables that contain NaN values interfere with categorical encoding. Dropping the rows that contain NaN values is not an option, because it would discard almost the entire dataset (only 8,868 of 367,145 rows would remain). Therefore, missing-data imputation must be applied.

In [54]:
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer

# Imputer for the aggregated numerical columns, configured with the 'median' method.
median_imputer = MeanMedianImputer(
                   imputation_method='median')

# Fit on the training set only, then apply the fitted imputer to both sets.
median_imputer.fit(agg_train[agg_num_cols])
agg_train[agg_num_cols] = median_imputer.transform(agg_train[agg_num_cols])
agg_test[agg_num_cols] = median_imputer.transform(agg_test[agg_num_cols])

In [55]:
# Imputer for the aggregated categorical columns, configured with the "frequent" method.
# NOTE(review): agg_catg_cols also holds the "_count" and "_nunique" aggregates, not only "_last" — confirm they should be imputed this way.
categorical_imputer = CategoricalImputer(imputation_method="frequent", ignore_format=True)

# Fit on the training set only, then apply the fitted imputer to both sets.
categorical_imputer.fit(agg_train[agg_catg_cols])
agg_train[agg_catg_cols] = categorical_imputer.transform(agg_train[agg_catg_cols])
agg_test[agg_catg_cols] = categorical_imputer.transform(agg_test[agg_catg_cols])

In [56]:
from feature_engine.encoding import OneHotEncoder

# One-hot encode the aggregated categorical columns; the encoder is fitted on the training set only.
# NOTE(review): agg_catg_cols includes the "_count" and "_nunique" aggregates, so they are encoded as well — confirm this is intended.
encoder = OneHotEncoder(ignore_format=True)
encoder.fit(agg_train[agg_catg_cols])
# The same fitted encoder transforms both sets.
encoded_train_df = encoder.transform(agg_train[agg_catg_cols])
encoded_test_df = encoder.transform(agg_test[agg_catg_cols])


  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{cat

In [57]:
# Replace the raw categorical columns with their encoded counterparts.
# The frames are joined column-wise (axis=1), which aligns rows on the index.
# No intermediate global is kept, so no extra copy of the large frames stays alive in the kernel.
agg_train_to_save = pd.concat([agg_train.drop(columns=agg_catg_cols), encoded_train_df], axis=1)
agg_test_to_save = pd.concat([agg_test.drop(columns=agg_catg_cols), encoded_test_df], axis=1)

0          0
1          0
2          0
3          0
4          0
          ..
5531446    0
5531447    0
5531448    0
5531449    0
5531450    0
Name: target, Length: 5531451, dtype: int64

In [61]:
# Persist the final aggregated frames as Feather files in the current working directory.
# NOTE(review): the frames written here are built without the label column — confirm labels are stored elsewhere.
agg_train_to_save.to_feather("aggregated_train_data.ftr")
agg_test_to_save.to_feather("aggregated_test_data.ftr")