In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import glob 
import os
import matplotlib.pyplot as plt 
import matplotlib
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import gc
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from scipy.stats import linregress 
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest, chi2
from itertools import compress
from sklearn.decomposition import PCA
import random

In [2]:
path = Path('/home/jovyan/workspace/amex-challenge/archive')

In [3]:
train_data = pd.read_feather(path / f'data/processed_train_data_v3.ftr')
train_labels = pd.read_feather(path / f'data/train_labels.ftr')
train_data = train_data.set_index("customer_ID")
train_labels = train_labels.set_index("customer_ID")
joined = train_data.join(train_labels)
y = joined.target
X = joined.drop("target", axis=1)

In [4]:
_ = gc.collect()

In [6]:
non_numeric_cols = set(X.columns).difference(set(X.select_dtypes(include=(np.number)).columns))
print("Non-numerical columns:", non_numeric_cols)

cols_with_null = set(X.columns[X.isna().any()].tolist())
print("Columns with null values:", cols_with_null)

Non-numerical columns: set()
Columns with null values: set()


# Feature Selection

## Correlation Filtering

In [9]:
kbest = SelectKBest(chi2, k=80).fit(X, y).get_support()
selected_features = list(compress(X.columns, kbest))

In [10]:
selected_features

['P_2',
 'D_39',
 'B_2',
 'R_1',
 'D_41',
 'B_3',
 'D_44',
 'B_4',
 'D_45',
 'R_2',
 'D_48',
 'B_8',
 'D_51',
 'B_11',
 'S_6',
 'R_4',
 'S_8',
 'D_55',
 'D_58',
 'B_16',
 'B_18',
 'B_19',
 'B_20',
 'R_6',
 'S_13',
 'B_22',
 'D_72',
 'B_23',
 'P_4',
 'D_74',
 'D_75',
 'D_78',
 'R_10',
 'D_81',
 'R_15',
 'R_19',
 'B_32',
 'S_20',
 'R_21',
 'B_33',
 'D_92',
 'R_24',
 'D_112',
 'D_127',
 'D_128',
 'D_129',
 'D_130',
 'D_131',
 'D_133',
 'D_139',
 'D_140',
 'D_141',
 'D_143',
 'S_7_null',
 'D_56_null',
 'D_46_null',
 'B_17_null',
 'D_48_null',
 'D_53_null',
 'D_77_null',
 'S_3_null',
 'D_62_null',
 'D_43_null',
 'D_61_null',
 'B_38_2.0',
 'B_38_3.0',
 'B_38_1.0',
 'B_38_5.0',
 'B_38_7.0',
 'B_38_4.0',
 'B_38_6.0',
 'D_120_0.0',
 'D_120_1.0',
 'D_64_O',
 'D_64_U',
 'D_114_1.0',
 'D_114_0.0',
 'B_30_0.0',
 'B_30_1.0',
 'D_68_6.0']

## Dimensionality Reduction

In [19]:
pca = PCA(n_components=80)
X_shortened = pca.fit_transform(X_filtered, y)

## Recursive Feature Elimination

### XGB

In [None]:
est = GradientBoostingClassifier(verbose=True)
selector = RFE(est, n_features_to_select=30)
selector = selector.fit(X_shortened, y)
features = np.array(features)
important = features[selector.support_]
print(important)

      Iter       Train Loss   Remaining Time 
         1           1.0601           13.43m
         2           0.9957           13.29m
         3           0.9436           13.14m
         4           0.9005           12.99m
         5           0.8646           12.84m
         6           0.8340           12.72m
         7           0.8075           12.57m
         8           0.7851           12.43m
         9           0.7660           12.36m
        10           0.7487           12.21m
        20           0.6521           11.01m
        30           0.6127            9.57m
        40           0.5909            8.17m
        50           0.5781            6.82m
        60           0.5682            5.47m
        70           0.5619            4.10m
        80           0.5568            2.73m
        90           0.5527            1.37m
       100           0.5496            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.0601           13.03m
        

In [39]:
important

{'B_3',
 'B_38_7.0',
 'B_41_null',
 'B_42_null',
 'B_5',
 'D_113_null',
 'D_116_1.0',
 'D_131',
 'D_137_null',
 'D_144',
 'D_41_null',
 'D_44_null',
 'D_47',
 'D_51',
 'D_59_null',
 'D_63_CL',
 'D_63_XL',
 'D_76_null',
 'D_77_null',
 'D_84_null',
 'D_87_null',
 'D_91_null',
 'R_20',
 'R_23',
 'R_24',
 'S_11',
 'S_13',
 'S_9_null'}

# Output Data

In [18]:
version = 1
output = co_data.reset_index()
output.to_feather(path / f"data/processed_{dataset}_data_v{version}.ftr")

In [19]:
output.shape

(458913, 346)

In [None]:
{'B_3',
 'B_38_7.0',
 'B_41_null',
 'B_42_null',
 'B_5',
 'D_113_null',
 'D_116_1.0',
 'D_131',
 'D_137_null',
 'D_144',
 'D_41_null',
 'D_44_null',
 'D_47',
 'D_51',
 'D_59_null',
 'D_63_CL',
 'D_63_XL',
 'D_76_null',
 'D_77_null',
 'D_84_null',
 'D_87_null',
 'D_91_null',
 'R_20',
 'R_23',
 'R_24',
 'S_11',
 'S_13',
 'S_9_null'}