In [1]:
# Move the working directory two levels up so the relative paths used in later
# cells ('event/event.db', 'event/user-repeat/feats.sql', 'models/...') resolve.
cd ../..

/Users/adobles/projects/snap/relbench-user-study


## Feature Ideas

- User features:
    - `locale`
    - `age`
    - `gender`
    - `days_on_app`
    - `location`
    - `timezone`
    - `num_friends`
- For each of the monthly windows from 1 to 5 months ago, calculate the following features (the resulting columns are prefixed `past_<k>_`, e.g. `past_1_num_invited`):
    - `num_invited` (as per the `event_attendees` table)
    - `num_yes`
    - `num_no`
    - `num_maybe`
    - `avg_event_start_hour`
    - `modal_event_dow`
    - `num_invites` (as per the `event_interest` table)
    - `num_interested`
    - `num_not_interested`
    - `num_invited_and_interested`
    - `num_invited_and_not_interested`

In [2]:
import duckdb
from torch_frame.utils import infer_df_stype

import utils

# Open the DuckDB database file; the path is relative to the working directory.
conn = duckdb.connect('event/event.db')
# Load the `sql` IPython extension that provides the %sql / %%sql magics.
%load_ext sql
# Hand the already-open connection to the SQL magic under the alias "duckdb".
%sql conn --alias duckdb
# SqlMagic display options: presumably `displaycon=False` stops the connection
# from being echoed with each query and `autopandas=True` returns query results
# as pandas DataFrames — confirm against the installed SQL-magic version.
%config SqlMagic.displaycon=False
%config SqlMagic.autopandas=True

In [3]:
# Read the Jinja-templated SQL for the user-repeat features once up front.
with open('event/user-repeat/feats.sql', 'r') as sql_file:
    template = sql_file.read()

# Render and execute the template once per split (train, val and test).
# takes 1 - 5 mins
for split in ('train', 'val', 'test'):
    print(f'Creating {split} table')
    rendered_sql = utils.render_jinja_sql(template, {'set': split, 'subsample': 0})
    conn.sql(rendered_sql)
    print(f'{split} table created')

Creating train table
train table created
Creating val table
val table created
Creating test table
test table created


In [6]:
# Run the project's validation helper on the 'user_repeat' feature tables; the
# output below reports label and feature table sizes for train, val and test.
utils.validate_feature_tables('user_repeat', conn)

Validating train
train labels size: 3,842 x 3
train feats size: 3,842 x 65

Validating val
val labels size: 268 x 3
val feats size: 268 x 65

Validating test
test labels size: 246 x 2
test feats size: 246 x 64

✅ All tables are valid!


In [7]:
%%sql train_df <<
select * from user_repeat_train_feats;

In [8]:
# Let torch_frame infer a semantic type (stype) for every column of the train features.
infer_df_stype(train_df)

{'user': <stype.numerical: 'numerical'>,
 'timestamp': <stype.timestamp: 'timestamp'>,
 'target': <stype.categorical: 'categorical'>,
 'locale': <stype.text_embedded: 'text_embedded'>,
 'age': <stype.numerical: 'numerical'>,
 'gender': <stype.categorical: 'categorical'>,
 'days_on_app': <stype.numerical: 'numerical'>,
 'location': <stype.text_embedded: 'text_embedded'>,
 'timezone': <stype.numerical: 'numerical'>,
 'num_friends': <stype.numerical: 'numerical'>,
 'past_1_num_invited': <stype.numerical: 'numerical'>,
 'past_1_num_yes': <stype.numerical: 'numerical'>,
 'past_1_num_no': <stype.numerical: 'numerical'>,
 'past_1_num_maybe': <stype.numerical: 'numerical'>,
 'past_1_avg_event_start_hour': <stype.numerical: 'numerical'>,
 'past_1_modal_event_dow': <stype.categorical: 'categorical'>,
 'past_1_num_invites': <stype.numerical: 'numerical'>,
 'past_1_num_interested': <stype.numerical: 'numerical'>,
 'past_1_num_not_interested': <stype.numerical: 'numerical'>,
 'past_1_num_invited_an

In [9]:
# Per-feature summary against the 'target' column; the output below lists label
# correlation, label mutual information and NaN percentage for each feature.
# NOTE(review): `target` is inferred as categorical above, yet classification=False
# is passed here — confirm this is the intended mode for this task.
utils.feature_summary_df(train_df, 'target', classification=False)

  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0,Label Corr.,Label MI,NaN %
user,0.061,0.12,0.0%
past_1_num_no,0.066,0.027,0.0%
past_1_num_yes,0.169,0.026,0.0%
past_2_num_yes,0.204,0.026,43.0%
past_1_num_maybe,0.185,0.02,0.0%
past_1_num_invited,-0.06,0.018,0.0%
past_2_num_maybe,0.2,0.016,43.0%
past_4_num_invited_and_interested,-0.367,0.013,98.8%
age,-0.046,0.012,4.1%
past_3_num_invited,-0.04,0.012,69.4%


## Feature Importances

In [None]:
import numpy as np
import shap
from torch_frame import TaskType, stype
from torch_frame.data import Dataset
from torch_frame.gbdt import LightGBM

from inferred_stypes import task_to_stypes
from train_gbdt import TASK_PARAMS

# Task identifier used as the lookup key into TASK_PARAMS and task_to_stypes,
# and as part of the saved model's file name in the cells below.
TASK = 'rel-event-user-repeat'

# Parameters registered for this task; 'target_col' and 'task_type' are read below.
task_params = TASK_PARAMS[TASK]

In [None]:
%%sql val_df <<
-- Load the full validation feature table into `val_df`.
select * from user_repeat_val_feats;

In [None]:
# Start from the stypes registered for this task; work on a copy so the shared
# mapping imported from `inferred_stypes` is not mutated.
col_to_stype = task_to_stypes[TASK].copy()

# Exclude these columns from the TensorFrame if they are present. `pop(..., None)`
# is used instead of `del` so the cell does not raise KeyError when the mapping
# has no such column (neither name appears in this notebook's feature list).
for excluded_col in ('title', 'last_review_summary'):
    col_to_stype.pop(excluded_col, None)

# Materialize the validation features into a TensorFrame for the GBDT model.
val_tf = Dataset(
    val_df,
    col_to_stype=col_to_stype,
    target_col=task_params['target_col'],
).materialize().tensor_frame

In [None]:
# Rebuild the LightGBM wrapper for this task type and restore the saved model.
gbdt = LightGBM(task_type=task_params['task_type'])
model_path = f'models/{TASK}_lgbm.json'
gbdt.load(model_path)

# Predict on the validation TensorFrame and convert the result to a NumPy array.
val_pred_tensor = gbdt.predict(tf_test=val_tf)
pred = val_pred_tensor.numpy()

In [None]:
shap.initjs()
explainer = shap.TreeExplainer(gbdt.model)

# Explain at most 10,000 rows, drawn without replacement and with a fixed seed.
# `np.random.randint(0, len(val_tf), size=10_000)` sampled WITH replacement, so on a
# validation table smaller than 10,000 rows the same rows were explained many times
# over, and the sample changed on every run.
n_explain = min(10_000, len(val_tf))
sample = np.random.default_rng(0).choice(len(val_tf), size=n_explain, replace=False)

val_arr, _, _ = gbdt._to_lightgbm_input(val_tf[sample])
# NOTE(review): `pred[sample]` is passed as the second positional argument of
# `shap_values` — confirm that passing predictions there is intended.
shap_values = explainer.shap_values(val_arr, pred[sample])

# TODO verify: this assumes the columns of `val_arr` are the categorical features
# followed by the numerical ones, with no other stype contributing columns.
feat_names = (
    val_tf.col_names_dict.get(stype.categorical, [])
    + val_tf.col_names_dict.get(stype.numerical, [])
)

shap.summary_plot(shap_values, val_arr, plot_type='violin', max_display=30, feature_names=feat_names)