In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from typing import NamedTuple

In [2]:
from typing import NamedTuple
from typing import Dict

class InteractionsBatch(NamedTuple):
    user: torch.Tensor
    adgroup_id: torch.Tensor
    user_features: Dict[str, torch.Tensor]
    ad_features: Dict[str, torch.Tensor]

In [3]:
class UserBatch(NamedTuple):
    user: torch.Tensor

In [29]:
UserBatch._fields

('user',)

In [32]:
int_data.data[list(UserBatch._fields)]

Unnamed: 0,user
0,220005
1,531363
2,893880
3,769846
4,891293
...,...
18124330,779468
18124331,30395
18124332,250273
18124333,1026536


In [2]:
class RawInteractionsDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.raw_sample = pd.read_csv("../data/raw_sample.csv").drop(columns=["pid", "nonclk"])

        self.data = self._dedup_interactions()
        self.train_df, self.test_df = self._train_test_split(self.data)

    def _dedup_interactions(self):
        sorted_sample = self.raw_sample.sort_values(by=["user", "adgroup_id", "time_stamp"])
        timestamp_diff = sorted_sample["time_stamp"].diff().fillna(-1)
        user_diff = sorted_sample["user"].diff().fillna(-1)
        adgroup_diff = sorted_sample["adgroup_id"].diff().fillna(-1)
        deduped = sorted_sample[~((adgroup_diff == 0) & (user_diff == 0) & (timestamp_diff < 15 * 60))]
        return deduped
    
    def _train_test_split(self, df):
        clicks = df[df["clk"] == 1]
        clk_per_user = clicks.groupby(by="user")["clk"].count()
        max_clk_per_user = clk_per_user.max()

        click_cnt = clicks.sort_values(by=["user", "time_stamp"]).groupby("user")["clk"].rolling(max_clk_per_user, min_periods=1).sum()
        clicks = clicks.reset_index().drop(columns="index")
        clicks["clk_cnt"] = click_cnt.reset_index()["clk"]
        clicks = clicks.merge(clk_per_user.reset_index().rename({"clk":"clk_per_user"}, axis="columns"), on="user")
        split_timestamp = clicks[(clicks["clk_cnt"] == clicks["clk_per_user"]) & (clicks["clk_per_user"] > 1)][["user", "time_stamp"]]

        to_split = df.merge(split_timestamp.rename({"time_stamp": "split_timestamp"}, axis="columns"), on="user", how="left")
        test_filter = (to_split["clk"] == 1) & (to_split["split_timestamp"] <= to_split["time_stamp"])
        train_filter = (to_split["split_timestamp"] > to_split["time_stamp"]) | (to_split["split_timestamp"].isnull())
        train_df, test_df = to_split[train_filter], to_split[test_filter]

        return train_df, test_df
    
    def get_train_test_split(self):
        return self.train_df, self.test_df

    def __getitem__(self, index):
        return self.data.iloc[index]

In [24]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from typing import NamedTuple


class UserBatch(NamedTuple):
    user: torch.Tensor


class AdBatch(NamedTuple):
    adgroup_id: torch.Tensor


class InteractionsBatch(NamedTuple):
    user_feats: UserBatch
    ad_feats: AdBatch
    timestamp: torch.Tensor


class RawInteractionsDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.raw_sample = pd.read_csv("../data/raw_sample.csv").drop(columns=["pid", "nonclk"])

        self.data = self._dedup_interactions()
        self.train_df, self.test_df = self._train_test_split(self.data)

    def _dedup_interactions(self):
        sorted_sample = self.raw_sample.sort_values(by=["user", "adgroup_id", "time_stamp"])
        timestamp_diff = sorted_sample["time_stamp"].diff().fillna(-1)
        user_diff = sorted_sample["user"].diff().fillna(-1)
        adgroup_diff = sorted_sample["adgroup_id"].diff().fillna(-1)
        deduped = sorted_sample[~((adgroup_diff == 0) & (user_diff == 0) & (timestamp_diff < 15 * 60))]
        return deduped
    
    def _train_test_split(self, df):
        clicks = df[df["clk"] == 1]
        clk_per_user = clicks.groupby(by="user")["clk"].count()
        max_clk_per_user = clk_per_user.max()

        click_cnt = clicks.sort_values(by=["user", "time_stamp"]).groupby("user")["clk"].rolling(max_clk_per_user, min_periods=1).sum()
        clicks = clicks.reset_index().drop(columns="index")
        clicks["clk_cnt"] = click_cnt.reset_index()["clk"]
        clicks = clicks.merge(clk_per_user.reset_index().rename({"clk":"clk_per_user"}, axis="columns"), on="user")
        split_timestamp = clicks[(clicks["clk_cnt"] == clicks["clk_per_user"]) & (clicks["clk_per_user"] > 1)][["user", "time_stamp"]]

        to_split = df.merge(split_timestamp.rename({"time_stamp": "split_timestamp"}, axis="columns"), on="user", how="left")
        test_filter = (to_split["clk"] == 1) & (to_split["split_timestamp"] <= to_split["time_stamp"])
        train_filter = (to_split["split_timestamp"] > to_split["time_stamp"]) | (to_split["split_timestamp"].isnull())
        to_split = to_split.drop(columns="split_timestamp")
        train_df, test_df = to_split[train_filter], to_split[test_filter]

        return train_df, test_df
    
    def get_train_test_split(self):
        return self.train_df, self.test_df

    def __getitem__(self, index):
        return self.data.iloc[index]


class InteractionsDataset(Dataset):
    def __init__(self, raw_interactions_dataset: RawInteractionsDataset, shuffle: bool = True, is_train: bool = True):
        idx = 0 if is_train else 1
        self.user_profile = pd.read_csv("../data/user_profile.csv").rename({"userid": "user"}, axis="columns")
        self.ad_feature = pd.read_csv("../data/ad_feature.csv")
        self.data = raw_interactions_dataset.get_train_test_split()[idx].merge(
            self.ad_feature, on="adgroup_id", how="left"
        ).merge(
            self.user_profile, on="user", how="left"
        )

        if shuffle:
            self.data = self.data.iloc[np.random.permutation(np.arange(len(self.data)))].reset_index().drop(columns="index")
    
    def __getitem__(self, index) -> InteractionsBatch:
        data = self.data.iloc[index]

        user_feats = data[list(UserBatch._fields)]
        ad_feats = data[list(AdBatch._fields)]
        user_batch = UserBatch(*torch.tensor(user_feats.to_numpy()).split(1, dim=-1))
        ad_batch = AdBatch(*torch.tensor(ad_feats.to_numpy()).split(1, dim=-1))
        timestamp = torch.tensor(data["time_stamp"].to_numpy()).unsqueeze(1)
        return InteractionsBatch(
            user_feats=user_batch,
            ad_feats=ad_batch,
            timestamp=timestamp
        )
        

In [5]:
dataset = RawInteractionsDataset()

In [25]:
int_data = InteractionsDataset(dataset)

In [26]:
int_data[100:164]

InteractionsBatch(user_feats=UserBatch(user=tensor([[1063721],
        [ 891376],
        [ 116000],
        [  95692],
        [ 663530],
        [ 414577],
        [ 849294],
        [ 297379],
        [ 987904],
        [  95482],
        [ 731787],
        [1018198],
        [ 569670],
        [ 640614],
        [ 531185],
        [ 568420],
        [1074173],
        [ 457428],
        [ 581843],
        [ 720866],
        [ 425056],
        [ 553027],
        [ 136433],
        [ 684792],
        [  64201],
        [ 566882],
        [  48031],
        [ 233582],
        [1090546],
        [ 581359],
        [ 939522],
        [1127051],
        [ 156141],
        [ 571999],
        [ 964291],
        [ 522449],
        [ 674045],
        [ 465636],
        [ 845319],
        [ 322077],
        [ 157418],
        [ 235198],
        [ 245996],
        [ 419821],
        [1120079],
        [ 758872],
        [ 131665],
        [ 830154],
        [ 250025],
        [ 869454],
      

In [23]:
torch.tensor(int_data.iloc[100:164]["time_stamp"].to_numpy()).unsqueeze(1)

tensor([[1494316823],
        [1494337654],
        [1494517982],
        [1494068846],
        [1494424186],
        [1494034552],
        [1494479407],
        [1494474558],
        [1494291682],
        [1494064753],
        [1494435243],
        [1494514789],
        [1494634679],
        [1494151079],
        [1494479708],
        [1494000845],
        [1494180006],
        [1494652792],
        [1494397257],
        [1494079556],
        [1494551915],
        [1494227534],
        [1494577812],
        [1494669327],
        [1494022497],
        [1494484534],
        [1494596812],
        [1494555723],
        [1494169330],
        [1494388620],
        [1494397252],
        [1494159059],
        [1494647452],
        [1494053066],
        [1494064220],
        [1494058707],
        [1494249910],
        [1494142599],
        [1494412171],
        [1494312029],
        [1494158183],
        [1494659663],
        [1494114541],
        [1494180370],
        [1494112009],
        [1

In [18]:
int_data[100:164].user_feats.user.shape

torch.Size([64, 1])

In [7]:
data = int_data.data.iloc[100:164]

In [10]:
user_feats = data[list(UserBatch._fields)]
ad_feats = data[list(AdBatch._fields)]
user_batch = UserBatch(torch.tensor(user_feats.to_numpy()).split(1, dim=-1))
ad_batch = AdBatch(torch.tensor(ad_feats.to_numpy()).split(1, dim=-1))
timestamp = torch.tensor(data["time_stamp"].to_numpy()).split(1, dim=-1)

In [11]:
InteractionsBatch(user_feats=user_batch,ad_feats=ad_batch,timestamp=timestamp)

InteractionsBatch(user_feats=UserBatch(user=(tensor([[ 373061],
        [1024958],
        [1134285],
        [ 270872],
        [  79621],
        [ 780973],
        [ 174988],
        [ 213399],
        [  80318],
        [1027189],
        [ 472530],
        [ 672993],
        [ 835791],
        [ 968170],
        [ 430473],
        [ 186097],
        [   6926],
        [ 601444],
        [ 282165],
        [ 795952],
        [ 943078],
        [ 738660],
        [  93819],
        [ 347931],
        [ 593509],
        [ 358427],
        [ 542330],
        [1014452],
        [ 544610],
        [ 565460],
        [ 208392],
        [ 314078],
        [ 226643],
        [  44501],
        [ 964858],
        [ 339780],
        [ 189956],
        [ 648531],
        [  20468],
        [1074884],
        [1039968],
        [ 103713],
        [ 527357],
        [ 356960],
        [ 974617],
        [ 381272],
        [ 351396],
        [ 556822],
        [ 524961],
        [ 196968],
     

In [3]:
torch.tensor(int_data.data.iloc[100:164]["time_stamp"].to_numpy()).split(1, dim=-1)

NameError: name 'int_data' is not defined

In [42]:

AdBatch(torch.tensor(int_data.data.iloc[100:164][list(AdBatch._fields)].to_numpy()).split(1, dim=-1))

AdBatch(adgroup_id=(tensor([[344981],
        [480621],
        [741610],
        [622823],
        [577029],
        [303285],
        [459546],
        [ 80733],
        [775843],
        [547634],
        [717743],
        [630496],
        [ 97194],
        [ 49140],
        [699786],
        [708363],
        [761975],
        [294784],
        [444105],
        [569209],
        [208108],
        [594094],
        [584459],
        [390962],
        [187427],
        [393539],
        [656656],
        [787072],
        [227859],
        [502928],
        [ 91009],
        [561101],
        [126616],
        [809668],
        [630371],
        [416063],
        [762325],
        [170924],
        [479928],
        [531937],
        [393348],
        [607743],
        [600840],
        [476518],
        [612092],
        [767827],
        [514411],
        [697893],
        [608045],
        [ 72640],
        [759279],
        [371900],
        [323711],
        [412331],
        

In [154]:
int_data[0]

AttributeError: 'Series' object has no attribute 'columns'

In [131]:
int_data.data.iloc[40:60]

Unnamed: 0,user,time_stamp,adgroup_id,clk,split_timestamp,cate_id,campaign_id,customer,brand,price,cms_segid,cms_group_id,final_gender_code,age_level,pvalue_level,shopping_level,occupation,new_user_class_level
40,326971,1494305823,558473,0,1494502000.0,4291,398013,52614,349200.0,168.0,0.0,9.0,1.0,3.0,1.0,3.0,0.0,
41,202355,1494504469,366197,0,,4289,87015,29870,223490.0,2868.0,19.0,3.0,2.0,3.0,2.0,3.0,0.0,2.0
42,839266,1494076986,497245,0,,6261,47462,80022,448549.0,328.0,0.0,2.0,2.0,2.0,1.0,3.0,0.0,
43,782829,1494081215,448520,0,,6172,348591,249048,,29.8,0.0,7.0,1.0,1.0,,1.0,0.0,
44,421561,1494116470,741254,0,,6261,28384,110451,419304.0,198.0,46.0,5.0,2.0,5.0,3.0,3.0,0.0,2.0
45,267138,1494125883,692619,0,,6261,65045,115606,319827.0,1338.0,19.0,3.0,2.0,3.0,2.0,3.0,0.0,2.0
46,267748,1494199435,472522,0,1494294000.0,6426,260896,186336,51258.0,399.0,30.0,4.0,2.0,4.0,1.0,3.0,0.0,2.0
47,947601,1494044355,404818,0,1494483000.0,4289,404795,59619,169275.0,790.0,0.0,1.0,2.0,1.0,,1.0,0.0,
48,813815,1494261052,276522,0,1494345000.0,1133,275633,82220,201925.0,88.0,7.0,2.0,2.0,2.0,2.0,3.0,0.0,2.0
49,874590,1494498257,794673,0,,1665,296449,242104,,256.0,34.0,4.0,2.0,4.0,2.0,3.0,0.0,3.0


In [133]:
list(int_data.data.columns)

['user',
 'time_stamp',
 'adgroup_id',
 'clk',
 'split_timestamp',
 'cate_id',
 'campaign_id',
 'customer',
 'brand',
 'price',
 'cms_segid',
 'cms_group_id',
 'final_gender_code',
 'age_level',
 'pvalue_level',
 'shopping_level',
 'occupation',
 'new_user_class_level ']

In [43]:
len(pd.unique(int_data.data["campaign_id"]))

392407

In [144]:
int_data.data.iloc[40:60].to_numpy()

array([[3.26971000e+05, 1.49430582e+09, 5.58473000e+05, 0.00000000e+00,
        1.49450164e+09, 4.29100000e+03, 3.98013000e+05, 5.26140000e+04,
        3.49200000e+05, 1.68000000e+02, 0.00000000e+00, 9.00000000e+00,
        1.00000000e+00, 3.00000000e+00, 1.00000000e+00, 3.00000000e+00,
        0.00000000e+00,            nan],
       [2.02355000e+05, 1.49450447e+09, 3.66197000e+05, 0.00000000e+00,
                   nan, 4.28900000e+03, 8.70150000e+04, 2.98700000e+04,
        2.23490000e+05, 2.86800000e+03, 1.90000000e+01, 3.00000000e+00,
        2.00000000e+00, 3.00000000e+00, 2.00000000e+00, 3.00000000e+00,
        0.00000000e+00, 2.00000000e+00],
       [8.39266000e+05, 1.49407699e+09, 4.97245000e+05, 0.00000000e+00,
                   nan, 6.26100000e+03, 4.74620000e+04, 8.00220000e+04,
        4.48549000e+05, 3.28000000e+02, 0.00000000e+00, 2.00000000e+00,
        2.00000000e+00, 2.00000000e+00, 1.00000000e+00, 3.00000000e+00,
        0.00000000e+00,            nan],
       [7.828

In [148]:
torch.tensor(int_data.data.iloc[40:60].to_numpy()).split(1, dim=1)

(tensor([[ 326971.],
         [ 202355.],
         [ 839266.],
         [ 782829.],
         [ 421561.],
         [ 267138.],
         [ 267748.],
         [ 947601.],
         [ 813815.],
         [ 874590.],
         [ 867672.],
         [ 159111.],
         [ 611284.],
         [ 405165.],
         [ 736674.],
         [ 143309.],
         [ 940827.],
         [ 877643.],
         [1024393.],
         [ 279311.]], dtype=torch.float64),
 tensor([[1.4943e+09],
         [1.4945e+09],
         [1.4941e+09],
         [1.4941e+09],
         [1.4941e+09],
         [1.4941e+09],
         [1.4942e+09],
         [1.4940e+09],
         [1.4943e+09],
         [1.4945e+09],
         [1.4942e+09],
         [1.4942e+09],
         [1.4946e+09],
         [1.4942e+09],
         [1.4943e+09],
         [1.4946e+09],
         [1.4945e+09],
         [1.4943e+09],
         [1.4944e+09],
         [1.4947e+09]], dtype=torch.float64),
 tensor([[558473.],
         [366197.],
         [497245.],
         [4485