In [1]:
import sys
import os
import numpy as np 
import pandas as pd
from tempfile import TemporaryDirectory
import tensorflow as tf

import dkn_utils as util

In [2]:
#!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /h/bjimenez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Modify data folder path in lines 311-313
#!python3 data_preprocess.py

Process data for training
Parse behaviors
Parse ./data_small/train/behaviors.tsv
Please modify `num_users` in `src/config.py` into 1 + 50000
Balancing data: 156965it [00:07, 21018.86it/s]
Parse news
Parse ./data_small/train/news.tsv
Pandas Apply: 100%|#####################| 51282/51282 [00:47<00:00, 1070.43it/s]
Please modify `num_categories` in `src/config.py` into 1 + 274
Please modify `num_words` in `src/config.py` into 1 + 70972
Please modify `num_entities` in `src/config.py` into 1 + 12957
Generate word embedding
Rate of word missed in pretrained embedding: 0.2332
Transform entity embeddings

Process data for validation
Parse news
Parse ./data_small/val/news.tsv
Pandas Apply: 100%|#####################| 42416/42416 [00:38<00:00, 1090.58it/s]

Process data for test
Parse news
Parse ./data_small/test/news.tsv
Pandas Apply: 100%|#####################| 42416/42416 [00:39<00:00, 1087.26it/s]


In [4]:
# Resolve the data folder relative to the parent of the current working directory.
cwd = os.getcwd()

# small data
data_path = '{}/DKN/data_small/'.format(os.path.dirname(cwd))
# all data
#data_path = os.path.dirname(cwd)+'/data/'

# Raw news / behaviours files of the training split.
train_news_file, train_behaviors_file = (
    os.path.join(data_path, 'train', file_name)
    for file_name in ('news.tsv', 'behaviors.tsv')
)

# Raw news / behaviours files of the validation split.
valid_news_file, valid_behaviors_file = (
    os.path.join(data_path, 'val', file_name)
    for file_name in ('news.tsv', 'behaviors.tsv')
)





In [8]:
# Peek at the already-parsed behaviours file (presumably written by data_preprocess.py).
# Dedicated names are used so that train_behaviors_file and raw_behaviour, which the
# cells further down use for the raw behaviors.tsv data, are not overwritten here.
parsed_behaviors_file = os.path.join(data_path, 'train', 'behaviors_parsed.tsv')
parsed_behaviour = pd.read_csv(parsed_behaviors_file, sep='\t')
parsed_behaviour.head()

Unnamed: 0,user,clicked_news,candidate_news,clicked
0,2,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N17059 N58114 N33677,1 0 0
1,3,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N23814 N10960 N12330,1 0 0
2,4,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N49685 N33632 N35729,1 0 0
3,5,N10078 N56514 N14904 N33740,N8400 N36446 N61497,1 0 0
4,6,N39074 N14343 N32607 N32320 N22007 N442 N19001...,N21119 N2869 N53696,1 0 0


In [9]:
# Peek at the already-parsed news file (presumably written by data_preprocess.py).
# This is news data, so it gets its own names instead of being stored in
# train_behaviors_file / raw_behaviour, which the cells further down use for the
# raw behaviors.tsv data.
parsed_news_file = os.path.join(data_path, 'train', 'news_parsed.tsv')
parsed_news = pd.read_csv(parsed_news_file, sep='\t')
parsed_news.head()

Unnamed: 0,id,category,subcategory,title,abstract,title_entities,abstract_entities
0,N55528,1,2,"[1, 2, 3, 4, 5, 6, 7, 5, 8, 6, 9, 10, 11, 0, 0...","[12, 1, 13, 5, 14, 5, 8, 15, 16, 1, 17, 18, 19...","[0, 0, 2, 2, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,N19639,3,4,"[23, 24, 25, 26, 27, 28, 0, 0, 0, 0, 0, 0, 0, ...","[29, 30, 31, 25, 32, 33, 34, 35, 8, 36, 34, 37...","[0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,N61837,5,6,"[1, 41, 42, 43, 44, 45, 46, 47, 1, 48, 42, 49,...","[51, 52, 53, 54, 55, 56, 57, 42, 58, 59, 60, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,N53526,3,7,"[79, 66, 67, 80, 81, 22, 82, 44, 83, 84, 85, 8...","[79, 89, 90, 79, 66, 56, 91, 5, 8, 92, 67, 80,...","[0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, ..."
4,N38324,3,8,"[83, 64, 99, 100, 42, 101, 102, 5, 103, 64, 56...","[105, 106, 31, 5, 107, 108, 44, 56, 109, 40, 1...","[0, 0, 0, 0, 0, 6, 6, 0, 0, 0, 0, 7, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Reformat data from raw files

The behaviors.tsv file contains the impression logs and users' news click histories. It has 5 columns divided by the tab symbol:

    Impression ID. The ID of an impression.
    User ID. The anonymous ID of a user.
    Time. The impression time with format "MM/DD/YYYY HH:MM:SS AM/PM".
    History. The news click history (ID list of clicked news) of this user before this impression. The clicked news articles are ordered by time.
    Impressions. List of news displayed in this impression and the user's click behaviour on each of them (1 for click and 0 for non-click). The order of the news within an impression has been shuffled.


In [3]:
# Rebuild the path to the raw behaviours log inside this cell, so the read never
# depends on whatever train_behaviors_file was last assigned to by other cells.
train_behaviors_file = os.path.join(data_path, 'train', 'behaviors.tsv')

# The raw file has no header row, so the five column names are supplied explicitly.
raw_behaviour = pd.read_csv(train_behaviors_file,
                            sep="\t",
                            names=["impressionId","userId","timestamp","click_history","impressions"])

print(f"The dataset consist of {len(raw_behaviour):,.0f} number of interactions.")
raw_behaviour.head()

The dataset consist of 156,965 number of interactions.


Unnamed: 0,impressionId,userId,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [4]:
## Indexize users
unique_userIds = raw_behaviour['userId'].unique()
# Users are numbered from 1 upwards; index 0 is kept free as the UNK (unknown user) index.
ind2user = dict(enumerate(unique_userIds, start=1))
# Reverse lookup: original user id -> integer index.
user2ind = {user_id: position for position, user_id in ind2user.items()}
print(f"We have {len(user2ind):,.0f} unique users in the dataset")

# Create a new column with userIdx (ids missing from the lookup fall back to 0):
raw_behaviour['userIdx'] = raw_behaviour['userId'].map(lambda user_id: user2ind.get(user_id, 0))

We have 50,000 unique users in the dataset


In [5]:
# Show the first rows to check the userIdx column added to raw_behaviour.
raw_behaviour.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,userIdx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5


We also need to get the content information of each article. We will use the news.tsv file to index the items.

In [6]:
# Load the raw news table; column names are supplied here because `names=` is given
# without a header argument, so the first line of the file is read as data.
news = pd.read_csv(
    train_news_file,
    sep="\t",
    names=[
        "itemId",
        "category",
        "subcategory",
        "title",
        "abstract",
        "url",
        "title_entities",
        "abstract_entities",
    ],
)

# Build index of items: items are numbered from 1 in file order, leaving 0 unused here.
ind2item = dict(enumerate(news['itemId'].values, start=1))
# Reverse lookup: original item id -> integer index.
item2ind = {item_id: position for position, item_id in ind2item.items()}

news.head()

Unnamed: 0,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


Now we need to process the click history and the impressions. We need both to convert the item-ID strings into integer indices and to decode the impressions into clicks and non-clicks.

In [7]:
# Indexize click history field
def process_click_history(s):
    """Convert a space-separated string of item ids into a list of item indices.

    `s` is passed through `str()` before splitting, so a non-string value is
    looked up by its string form (presumably this covers missing histories
    read as NaN -- TODO confirm). Every token is looked up in the module-level
    `item2ind` dictionary; tokens that are not found map to 0.
    """
    indices = []
    for token in str(s).split(" "):
        indices.append(item2ind.get(token, 0))
    return indices
        
# Add the indexed click history as a new column and preview the result.
raw_behaviour['click_history_idx'] = raw_behaviour['click_history'].map(process_click_history)
raw_behaviour.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,userIdx,click_history_idx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[6893, 10050, 15556, 21467, 26358, 4946, 14071..."
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[25816, 2334, 8524, 12087, 13463, 14202, 12733..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[5477, 4207, 11684, 7704, 8124, 23394, 22970, ..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[13827, 19085, 28506, 7024, 22910, 16667, 1559..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[23643, 4853, 27686, 31189]"


In [8]:
# Split each impression into its non-clicked items and its clicked item:
def process_impression(s):
    """Decode one impression string into non-clicked item ids and the clicked item id.

    Parameters
    ----------
    s : str
        Whitespace-separated tokens of the form "<itemId>-<label>", where the
        label is '1' for a click and '0' for a non-click.

    Returns
    -------
    (noclicks, click) : (list of str, str or None)
        `noclicks` holds every item id labelled '0', in order of appearance.
        `click` is the item id labelled '1'; when several items are labelled '1'
        only the last one is kept, and when none is, `click` is None.

    Raises
    ------
    IndexError
        If a token does not contain a "-" separator.
    """
    noclicks = []
    # Initialised up front so an impression without any click returns None
    # instead of failing on an unbound local variable.
    click = None
    # split() without arguments ignores repeated/leading/trailing whitespace, so
    # an empty string or stray spaces do not produce empty tokens.
    for token in s.split():
        parts = token.split("-")
        item_id, label = parts[0], parts[1]
        if label == '0':
            noclicks.append(item_id)
        elif label == '1':
            click = item_id
    return noclicks, click


In [9]:
# Decode every impression once; each element is a (noclicks, click) pair.
_impression_pairs = raw_behaviour['impressions'].map(process_impression)
# Indexize both parts (ids missing from item2ind become 0) and store them as new columns:
raw_behaviour['noclicks'] = _impression_pairs.map(
    lambda pair: [item2ind.get(item_id, 0) for item_id in pair[0]]
)
raw_behaviour['click'] = _impression_pairs.map(lambda pair: item2ind.get(pair[1], 0))

raw_behaviour.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,userIdx,click_history_idx,noclicks,click
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[6893, 10050, 15556, 21467, 26358, 4946, 14071...",[50689],33900
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[25816, 2334, 8524, 12087, 13463, 14202, 12733...","[37405, 41306, 34907, 35307, 44370, 37210, 439...",32187
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[5477, 4207, 11684, 7704, 8124, 23394, 22970, ...","[39528, 33356, 38720, 43459, 794, 38061, 39830...",5767
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[13827, 19085, 28506, 7024, 22910, 16667, 1559...","[50689, 50106, 50022]",50715
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[23643, 4853, 27686, 31189]","[2006, 33272, 39220, 37210, 45683, 50113, 3663...",31475


In [10]:
# convert timestamp value to hours since epoch
# The format is stated explicitly ("MM/DD/YYYY HH:MM:SS AM/PM") rather than inferred,
# and the elapsed time is divided by a one-hour Timedelta so the result does not
# depend on the internal resolution (ns, us, ...) of the parsed datetimes.
raw_behaviour['epochhrs'] = (
    (
        pd.to_datetime(raw_behaviour['timestamp'], format='%m/%d/%Y %I:%M:%S %p')
        - pd.Timestamp('1970-01-01')
    )
    / pd.Timedelta(hours=1)
).round()

raw_behaviour.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,userIdx,click_history_idx,noclicks,click,epochhrs
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[6893, 10050, 15556, 21467, 26358, 4946, 14071...",[50689],33900,437073.0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[25816, 2334, 8524, 12087, 13463, 14202, 12733...","[37405, 41306, 34907, 35307, 44370, 37210, 439...",32187,437106.0
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[5477, 4207, 11684, 7704, 8124, 23394, 22970, ...","[39528, 33356, 38720, 43459, 794, 38061, 39830...",5767,437143.0
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[13827, 19085, 28506, 7024, 22910, 16667, 1559...","[50689, 50106, 50022]",50715,437069.0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[23643, 4853, 27686, 31189]","[2006, 33272, 39220, 37210, 45683, 50113, 3663...",31475,437104.0


In [11]:
# Inspect every column of a single interaction (the row with index label 1).
raw_behaviour.loc[1,:]

impressionId                                                         2
userId                                                          U91836
timestamp                                        11/12/2019 6:11:30 PM
click_history        N31739 N6072 N63045 N23979 N35656 N43353 N8129...
impressions          N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
userIdx                                                              2
click_history_idx    [25816, 2334, 8524, 12087, 13463, 14202, 12733...
noclicks             [37405, 41306, 34907, 35307, 44370, 37210, 439...
click                                                            32187
epochhrs                                                      437106.0
Name: 1, dtype: object

In this preprocessing we have processed behaviour data, article data and user data. Most importantly, we have indexized users and items in the behaviour dataframe, as pytorch requires integer indices instead of strings for user and item IDs.

    Dataframe behaviour contains all interactions: epochhrs (a timestamp), clicks, no clicks and historical clicks per user for each interaction
    Dictionary ind2item: mapping the item indices given in behaviour to the real item Id given in the dataset.
    Dictionary ind2user: mapping the user indices given in behaviour to the real user Id given in the dataset.
    Dataframe news: content data on items. We will not use this in the first iteration.

The main component is behaviour, and for collaborative filtering purposes this is all we need. However, if we want to utilize content data on the news items some preprocessing on the news dataframe must be used.

In [12]:
## Select the columns that we now want to use for further analysis
behaviour = raw_behaviour[
    [
        'epochhrs',
        'userIdx',
        'click_history_idx',
        'noclicks',
        'click',
    ]
]
behaviour.head()

Unnamed: 0,epochhrs,userIdx,click_history_idx,noclicks,click
0,437073.0,1,"[6893, 10050, 15556, 21467, 26358, 4946, 14071...",[50689],33900
1,437106.0,2,"[25816, 2334, 8524, 12087, 13463, 14202, 12733...","[37405, 41306, 34907, 35307, 44370, 37210, 439...",32187
2,437143.0,3,"[5477, 4207, 11684, 7704, 8124, 23394, 22970, ...","[39528, 33356, 38720, 43459, 794, 38061, 39830...",5767
3,437069.0,4,"[13827, 19085, 28506, 7024, 22910, 16667, 1559...","[50689, 50106, 50022]",50715
4,437104.0,5,"[23643, 4853, 27686, 31189]","[2006, 33272, 39220, 37210, 45683, 50113, 3663...",31475


In [None]:
# Path of dkn.yaml inside data_path; passed to util.prepare_hparams as its first argument.
yaml_file = os.path.join(data_path, r'dkn.yaml')

In [None]:
# Settings forwarded to util.prepare_hparams under the same keyword names
# (epochs, history_size, batch_size).
epochs = 10
history_size = 50
batch_size = 100

In [None]:
# Build the hyper-parameter object from dkn.yaml plus the overrides given as keywords.
# NOTE(review): news_feature_file, user_history_file, wordEmb_file, entityEmb_file and
# contextEmb_file are not assigned in any cell visible here -- define them before running
# this cell (or confirm they are set elsewhere), otherwise it raises NameError.
hparams = util.prepare_hparams(yaml_file,
                              news_feature_file = news_feature_file,
                              user_history_file = user_history_file,
                              wordEmb_file=wordEmb_file,
                              entityEmb_file=entityEmb_file,
                              contextEmb_file=contextEmb_file,
                              epochs=epochs,
                              history_size=history_size,
                              batch_size=batch_size)

In [None]:
# NOTE(review): these imports sit in the last cell; consider moving them to the import
# cell at the top of the notebook.
from recommenders.models.deeprec.deeprec_utils import prepare_hparams, check_nn_config, check_type, load_yaml, flat_config

# Debugging aid: fetch the source code of flat_config and print it for inspection.
import inspect
lines = inspect.getsource(flat_config)
print(lines)