In [30]:
import json
import gzip
import os
import pickle
from pathlib import Path
from smart_open import open
from transformers import BertTokenizer, BertModel


import string
import random
import pandas as pd

import matplotlib.pyplot as plt
from collections import OrderedDict, defaultdict
from tqdm import tqdm

import nltk

In [31]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/henning/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
!pip install smart_open



In [2]:
def data_stream_generator(data_dir):
    files = [file for file in Path(data_dir).iterdir() if
             file.is_file() and file.name[0] not in '_.']
    for file in files:
        with open(file) as rf:
            yield from (json.loads(line) for line in rf.read().splitlines() if line)

In [3]:
data_dir = "../datasets/dpg/"

In [4]:
os.listdir(data_dir)

['item_ids_10k.pkl',
 'users5k_on_items100k.pkl',
 'users_5k.pkl',
 'items',
 'item_ids_100k.pkl',
 'data_neg_sample.pkl',
 'item_ids.pkl',
 'users_on_item_subset.pkl',
 'users',
 'users_50k.pkl']

In [46]:
# set specifications for subset
config = {}
config["n_items"] = 1e5
config["n_users"] = 5e3
config["snippet_len"] = 50
config["name"] = "i100k_u50k_s50"

save_path = data_dir + config["name"] + "/"

config["save_path"] = save_path


In [47]:
try:
    os.mkdir(config["save_path"])
except:
    pass    

In [45]:
def get_text_snippet(text, len_snippet):
    naive_tokens = str(text).split(" ")[:len_snippet]
    #TODO: remove stop words            
    return " ".join(naive_tokens) 

def item_id_generator(data_dir):
    i_ids = set()
    for item in tqdm(data_stream_generator(data_dir + "items")):
        i_ids.add(item['short_id'])
        
    return i_ids

In [6]:
i_ids = item_id_generator(data_dir)

2698562it [03:15, 13806.39it/s]


In [7]:
len(i_ids)

2698562

In [12]:
with open(data_dir + "item_ids.pkl", 'wb') as fout:
    pickle.dump(i_ids, fout)

In [13]:
#i_ids = list(i_ids)

In [14]:
with open(data_dir + "item_ids_100k.pkl", 'wb') as fout:
    pickle.dump(list(i_ids)[:100000], fout)

In [17]:
%%time
item_dict = OrderedDict()
#len_snippet = 20
snippet_len = config["snippet_len"]

keys_to_exclude = ["short_id", "url"]

for i, item in enumerate(data_stream_generator(data_dir + "items")):
    #data[i] = item
    item_dict[item['short_id']] = {key:val for (key,val) in item.items() if key not in keys_to_exclude}
    
    if snippet_len != None:      
        #print(item['text'])
        item_dict[item['short_id']]['snippet'] = get_text_snippet(item['text'], snippet_len)
    
    if i == config["n_items"]-1:
        break
    elif i == 0:
        print(item.keys())
        #print(item_dict[item['short_id']]['snippet'])   


dict_keys(['text', 'pub_date', 'author', 'url', 'short_id'])
CPU times: user 5.96 s, sys: 204 ms, total: 6.16 s
Wall time: 6.16 s


In [18]:
len(item_dict.keys())

100000

In [20]:
item_dict[next(iter(item_dict))]["text"]

'Nieuw: patatje met ... snert Naast het vertrouwde patatje \'met\' en oorlog serveert Bram\'s Gourmet Frites sinds vrijdag een oer-Hollandse variant: \'Winterse Snert Stoof\'. Het broertje van Bram Ladage is geïnspireerd door de traditionele \'oma\'s erwtensoep\'. Dat gerecht is in een nieuw jasje gestoken. In de snert zitten onder meer stukken rookworst en spekjes. Om het af te maken, komt er knapperige groentechips op de bak met patat. Het frietje past volgens Rocco Ladage van BRAM\'S goed bij het winterweer. "Want wat is nu meer nostalgisch dan opwarmen met erwtensoep op een ijskoude winterdag?” De seizoenspatat zal de komende maanden op de kaart staan. Eerder kwam de Rotterdamse patatboer al met een topping van wild stoofvlees uit de Hoeksche Waard en patatje roti stoof. Elke ochtend up-to-date met het laatste nieuws uit Rotterdam en omstreken? Schrijf je hier gratis in!'

In [49]:
with open(config["save_path"] + "news_data.pkl", 'wb') as fout:
    pickle.dump(item_dict, fout)

In [21]:
%%time
# params
min_n_arts = 5
remove_unk_arts = True
replace_with_unk = False
unk_art_id = 1
USER_CHUNK = config["n_users"]

#valid_item_ids = item_dict.keys()
i_ids_subset = list(i_ids)[:100000]
valid_item_ids = set(i_ids_subset)
#valid_item_ids = i_ids

# init
user_dict = OrderedDict()
unk_arts = defaultdict(int)
removed_users = 0

for i, item in tqdm(enumerate(data_stream_generator(data_dir + "users"))):
    #quick preliminary eval
    if len(item['articles_read']) >= min_n_arts:    
        # remove unknown articles from reading history
        if remove_unk_arts:
            history = []
            #check if article ID is present in our set of valid IDs
            for entry in item['articles_read']:
                art_id = entry[1]

                if art_id not in valid_item_ids:
                    unk_arts[art_id] += 1

                    if replace_with_unk:
                        entry[1] = unk_art_id
                        history.append(entry)
                else:
                    history.append(entry)    
            item['articles_read'] = history

        #evaluate length reading history
        if len(item['articles_read']) < min_n_arts or len(item['articles_read']) == 0:
            removed_users+=1   
            if removed_users % 1e3 == 0:
                print(removed_users)
        else:
            #add valid user that fulfills min read condition
            user_dict[item['user_id']] = {key:val for (key,val) in item.items() if key != 'user_id'}    
            user_dict[item['user_id']]['n_articles_read'] = len(user_dict[item['user_id']]['articles_read'])

    #break condition
    if len(user_dict.keys()) == USER_CHUNK or i == USER_CHUNK * 10:
        break
    if i == 0:
        print(item.keys())
    if i % 1e2 == 0:
        print("iteration {}".format(i))

1903it [00:00, 1408.72it/s]

dict_keys(['user_id', 'articles_read', 'opened_pushes', 'articles_pushed'])
iteration 0
iteration 100
iteration 200
iteration 300
iteration 400
iteration 500
iteration 600
iteration 700
iteration 800
iteration 900
iteration 1000
iteration 1100
1000
iteration 1200
iteration 1300
iteration 1400
iteration 1500
iteration 1600
iteration 1700
iteration 1800
iteration 1900
iteration 2000
iteration 2100
iteration 2200
2000
iteration 2300
iteration 2400
iteration 2500
iteration 2600
iteration 2700
iteration 2800
iteration 2900
iteration 3000


5710it [00:00, 3330.92it/s]

iteration 3100
iteration 3200
iteration 3300
iteration 3400
3000
iteration 3500
iteration 3600
iteration 3700
iteration 3800
iteration 3900
iteration 4000
iteration 4100
iteration 4200
iteration 4300
iteration 4400
iteration 4500
4000
iteration 4600
iteration 4700
iteration 4800
iteration 4900
iteration 5000
iteration 5100
iteration 5200
iteration 5300
iteration 5400
iteration 5500
iteration 5600
5000
iteration 5700
iteration 5800
iteration 5900
iteration 6000
iteration 6100
iteration 6200
iteration 6300
iteration 6400
iteration 6500
iteration 6600
iteration 6700
iteration 6800
6000
iteration 6900
iteration 7000
iteration 7100


8627it [00:00, 5016.83it/s]

iteration 7200
iteration 7300
iteration 7400
iteration 7500
iteration 7600
iteration 7700
iteration 7800
iteration 7900
7000
iteration 8000
iteration 8100
iteration 8200
iteration 8300
iteration 8400
iteration 8500
iteration 8600
iteration 8700
iteration 8800
iteration 8900
iteration 9000
iteration 9100
8000
iteration 9200


10033it [00:00, 6216.16it/s]

iteration 9300
iteration 9400
iteration 9500
iteration 9600
iteration 9700
iteration 9800
iteration 9900
iteration 10000
iteration 10100
iteration 10200


12804it [00:01, 5601.01it/s]

9000
iteration 10300
iteration 10400
iteration 10500
iteration 10600
iteration 10700
iteration 10800
iteration 10900
iteration 11000
iteration 11100
iteration 11200
iteration 11300
10000
iteration 11400
iteration 11500
iteration 11600
iteration 11700
iteration 11800
iteration 11900
iteration 12000
iteration 12100
iteration 12200
iteration 12300
iteration 12400
iteration 12500
11000
iteration 12600
iteration 12700
iteration 12800
iteration 12900
iteration 13000
iteration 13100
iteration 13200
iteration 13300
iteration 13400
iteration 13500
iteration 13600

15463it [00:01, 7637.87it/s]


12000
iteration 13700
iteration 13800
iteration 13900
iteration 14000
iteration 14100
iteration 14200
iteration 14300
iteration 14400
iteration 14500
iteration 14600
iteration 14700
iteration 14800
13000
iteration 14900
iteration 15000
iteration 15100
iteration 15200
iteration 15300
iteration 15400
iteration 15500
iteration 15600
iteration 15700
iteration 15800
iteration 15900
14000
iteration 16000
iteration 16100
iteration 16200
iteration 16300
iteration 16400
iteration 16500

18324it [00:01, 9327.11it/s]


iteration 16600
iteration 16700
iteration 16800
iteration 16900
iteration 17000
iteration 17100
15000
iteration 17200
iteration 17300
iteration 17400
iteration 17500
iteration 17600
iteration 17700
iteration 17800
iteration 17900
iteration 18000
iteration 18100
iteration 18200
16000
iteration 18300
iteration 18400
iteration 18500
iteration 18600
iteration 18700
iteration 18800
iteration 18900
iteration 19000
iteration 19100
iteration 19200
iteration 19300
iteration 19400
17000
iteration 19500
iteration 19600

21481it [00:02, 10247.50it/s]


iteration 19700
iteration 19800
iteration 19900
iteration 20000
iteration 20100
iteration 20200
iteration 20300
iteration 20400
iteration 20500
18000
iteration 20600
iteration 20700
iteration 20800
iteration 20900
iteration 21000
iteration 21100
iteration 21200
iteration 21300
iteration 21400
iteration 21500
iteration 21600
iteration 21700
19000
iteration 21800
iteration 21900


23889it [00:02, 9847.97it/s] 

iteration 22000
iteration 22100
iteration 22200
iteration 22300
iteration 22400
iteration 22500
iteration 22600
iteration 22700
iteration 22800
20000
iteration 22900
iteration 23000
iteration 23100
iteration 23200
iteration 23300
iteration 23400
iteration 23500
iteration 23600
iteration 23700
iteration 23800
iteration 23900
21000
iteration 24000
iteration 24100
iteration 24200
iteration 24300
iteration 24400


26672it [00:02, 10943.53it/s]

iteration 24500
iteration 24600
iteration 24700
iteration 24800
iteration 24900
iteration 25000
iteration 25100
22000
iteration 25200
iteration 25300
iteration 25400
iteration 25500
iteration 25600
iteration 25700
iteration 25800
iteration 25900
iteration 26000
iteration 26100
iteration 26200
23000
iteration 26300
iteration 26400
iteration 26500
iteration 26600
iteration 26700
iteration 26800
iteration 26900
iteration 27000
iteration 27100


29583it [00:02, 11689.38it/s]

iteration 27200
iteration 27300
24000
iteration 27400
iteration 27500
iteration 27600
iteration 27700
iteration 27800
iteration 27900
iteration 28000
iteration 28100
iteration 28200
iteration 28300
iteration 28400
iteration 28500
25000
iteration 28600
iteration 28700
iteration 28800
iteration 28900
iteration 29000
iteration 29100
iteration 29200
iteration 29300
iteration 29400
iteration 29500
iteration 29600
26000
iteration 29700
iteration 29800
iteration 29900
iteration 30000
iteration 30100
iteration 30200
iteration 30300
iteration 30400


32340it [00:03, 11971.37it/s]

iteration 30500
iteration 30600
iteration 30700
iteration 30800
27000
iteration 30900
iteration 31000
iteration 31100
iteration 31200
iteration 31300
iteration 31400
iteration 31500
iteration 31600
iteration 31700
iteration 31800
iteration 31900
28000
iteration 32000
iteration 32100
iteration 32200
iteration 32300
iteration 32400
iteration 32500
iteration 32600
iteration 32700
iteration 32800
iteration 32900
iteration 33000
iteration 33100
29000
iteration 33200


35004it [00:03, 11916.82it/s]

iteration 33300
iteration 33400
iteration 33500
iteration 33600
iteration 33700
iteration 33800
iteration 33900
iteration 34000
iteration 34100
iteration 34200
30000
iteration 34300
iteration 34400
iteration 34500
iteration 34600
iteration 34700
iteration 34800
iteration 34900
iteration 35000
iteration 35100
iteration 35200
iteration 35300
31000
iteration 35400
iteration 35500
iteration 35600
iteration 35700
iteration 35800


37810it [00:03, 12247.81it/s]

iteration 35900
iteration 36000
iteration 36100
iteration 36200
iteration 36300
iteration 36400
iteration 36500
32000
iteration 36600
iteration 36700
iteration 36800
iteration 36900
iteration 37000
iteration 37100
iteration 37200
iteration 37300
iteration 37400
iteration 37500
iteration 37600
33000
iteration 37700
iteration 37800
iteration 37900
iteration 38000
iteration 38100
iteration 38200
iteration 38300
iteration 38400


39077it [00:03, 11455.21it/s]

iteration 38500
iteration 38600
iteration 38700
iteration 38800
34000
iteration 38900
iteration 39000
iteration 39100
iteration 39200
iteration 39300
iteration 39400
iteration 39500
iteration 39600
iteration 39700
iteration 39800
iteration 39900
35000
iteration 40000
iteration 40100
iteration 40200
iteration 40300
iteration 40400
iteration 40500
CPU times: user 3.87 s, sys: 75.9 ms, total: 3.95 s
Wall time: 3.91 s





In [22]:
removed_users

35526

In [23]:
len(user_dict.keys())

5000

In [24]:
with open(data_dir + "users5k_on_items100k.pkl", "wb") as fout:
    pickle.dump(user_dict, fout)

In [25]:
user_dict[next(iter(user_dict))]

{'articles_read': [['ad', '738d607', 1576587959],
  ['ad', 'f76be54', 1575663575],
  ['ad', '20bfd89', 1576074081],
  ['ad', '20bfd89', 1576091644],
  ['ad', 'c5b4f55', 1575378563],
  ['ad', 'c5b4f55', 1575400773],
  ['ad', 'b06aedd', 1575234851]],
 'opened_pushes': [],
 'articles_pushed': [],
 'n_articles_read': 7}

In [18]:
df_user = pd.DataFrame.from_dict(user_dict, orient='index')

In [19]:
df_user.head()

Unnamed: 0,articles_read,opened_pushes,articles_pushed,n_articles_read
002c6eaf-f12d-4536-8bfc-9313c84c7759,"[[None, a78da5d, 1576585653], [None, fd0064c, ...",[],[],5
002cc435-31cd-4d1a-ac2d-80624f453559,"[[hln, 2b3bfb8, 1575394723], [hln, d19cdc0, 15...",[],[],7
0057733f-3914-4185-ab5b-443fff7ed386,"[[ad, ba25332, 1577200927], [ad, 67ee81d, 1577...",[],[],69
006ee30d-309d-4ba4-8d5f-053ed98839e9,"[[hln, 86969e2, 1575441080], [hln, b879684, 15...",[],[],17
00adde41-fdb8-4728-8838-a7c882a70e39,"[[ad, eeacf11, 1577694799], [ad, 5e45340, 1577...",[],[],16


In [22]:
df_user[df_user['opened_pushes'].apply(lambda x: len(x)) > 0][:5]

Unnamed: 0,articles_read,opened_pushes,articles_pushed,n_articles_read
3a13757e-4b31-491f-a8ed-67cec9ef2c65,"[[ad, 4061b53, 1577791498], [ad, 47babdb, 1575...","[[1d393b49-cd7d-4bcc-8ca4-0079f35d563f, 298dac...",[[0d53887c0efb325a52427e8106e687c5ab3681677b5e...,91
5b85220d-2570-40ce-9e68-98ffa1207117,"[[ad, c2b7a5d, 1576980043], [ad, f7cf158, 1576...","[[2cc1e5a5-4801-4a2d-9000-7e0d2486d390, 046676...",[[1179b7c55edfdfc0fe8abf467bc824ea328d26c3b11e...,94
72c0c491-3067-4976-8808-1ce033ad0c0d,"[[ad, e70bb01, 1576856024], [ad, e70bb01, 1576...","[[7faeede9-ee2f-4231-91f0-2db8060dc39f, 080431...",[[fa7fe63dabb9c0c6ff5f061c212acbc5f36e09fbb9b7...,46
a357dbf1-cdce-4876-8ca5-6b4972348f2f,"[[ad, 256f435, 1575990750], [ad, 14a930c, 1575...","[[9d44da34-a72b-47ab-ad6f-cd3d05c8bb29, cd4bb9...",[[86effadd06b5fb896aeee84cd1b19e772b5fde17f6e8...,16
ebfa508c-ee8c-4aa8-9ed7-fb052a0553bb,"[[ad, 3f8da29, 1576386659], [ad, 119b6b2, 1575...","[[63e56095-8968-443a-a896-2d479c8aaca4, 196342...",[[d8BX3d2xkTE:APA91bEqCrQBhlpV6dFGArFi92QUymHL...,30


In [134]:
#plt.hist(df_user['n_articles_read'])

In [23]:
list(user_dict.keys())[:10]

['002c6eaf-f12d-4536-8bfc-9313c84c7759',
 '002cc435-31cd-4d1a-ac2d-80624f453559',
 '0057733f-3914-4185-ab5b-443fff7ed386',
 '006ee30d-309d-4ba4-8d5f-053ed98839e9',
 '00adde41-fdb8-4728-8838-a7c882a70e39',
 '00b4881f-9930-4052-85ca-23fcf883a552',
 '00c14cb1-5cd2-42c3-8dae-41b1da61e057',
 '01054836-8f9e-4400-8bf8-3a7737b567c0',
 '010618df-38b9-4d4f-8f6a-a928d5cbcfa6',
 '0160ab5c-d8f1-4584-ac95-4ac04cbed850']

In [25]:
user_dict['002c6eaf-f12d-4536-8bfc-9313c84c7759']['opened_pushes']

[]

In [9]:
bert_path = "../bert/bert-base-dutch-cased"

In [10]:
tokenizer = BertTokenizer.from_pretrained(bert_path)

In [21]:
sent = item_dict[next(iter(item_dict))]["text"]

In [22]:
tokenizer.tokenize(sent)

['nieuw',
 ':',
 'patat',
 '##je',
 'met',
 '.',
 '.',
 '.',
 'sne',
 '##r',
 '##t',
 'naast',
 'het',
 'vertrouwde',
 'patat',
 '##je',
 "'",
 'met',
 "'",
 'en',
 'oorlog',
 'serveer',
 '##t',
 'bra',
 '##m',
 "'",
 '[UNK]',
 'go',
 '##ur',
 '##met',
 'fr',
 '##ites',
 'sinds',
 'vrijdag',
 'een',
 '[UNK]',
 '-',
 'hol',
 '##landse',
 'variant',
 ':',
 "'",
 'winter',
 '##s',
 '##e',
 'sne',
 '##r',
 '##t',
 'stoof',
 "'",
 '.',
 'het',
 'broertje',
 'van',
 'bra',
 '##m',
 'la',
 '##dag',
 '##e',
 'is',
 'ge',
 '##in',
 '##spir',
 '##eerd',
 'door',
 'de',
 'traditionele',
 "'",
 'om',
 '##a',
 "'",
 '[UNK]',
 'er',
 '##w',
 '##ten',
 '##soep',
 "'",
 '.',
 'dat',
 'gerecht',
 'is',
 'in',
 'een',
 'nieuw',
 'jasje',
 'gestoken',
 '.',
 'in',
 'de',
 'sne',
 '##r',
 '##t',
 'zitten',
 'onder',
 'meer',
 'stukken',
 'rook',
 '##worst',
 'en',
 'spek',
 '##jes',
 '.',
 'om',
 'het',
 'af',
 'te',
 'maken',
 ',',
 'komt',
 'er',
 'knappe',
 '##rig',
 '##e',
 'groente',
 '##chi',
 '##p'

In [27]:
#tokenizer.vocab

In [33]:
from nltk.tokenize import word_tokenize

In [35]:
word_tokenize(sent, language='dutch')

['Nieuw',
 ':',
 'patatje',
 'met',
 '...',
 'snert',
 'Naast',
 'het',
 'vertrouwde',
 'patatje',
 "'met",
 "'",
 'en',
 'oorlog',
 'serveert',
 'Bram',
 "'s",
 'Gourmet',
 'Frites',
 'sinds',
 'vrijdag',
 'een',
 'oer-Hollandse',
 'variant',
 ':',
 "'Winterse",
 'Snert',
 'Stoof',
 "'",
 '.',
 'Het',
 'broertje',
 'van',
 'Bram',
 'Ladage',
 'is',
 'geïnspireerd',
 'door',
 'de',
 'traditionele',
 "'oma",
 "'s",
 'erwtensoep',
 "'",
 '.',
 'Dat',
 'gerecht',
 'is',
 'in',
 'een',
 'nieuw',
 'jasje',
 'gestoken',
 '.',
 'In',
 'de',
 'snert',
 'zitten',
 'onder',
 'meer',
 'stukken',
 'rookworst',
 'en',
 'spekjes',
 '.',
 'Om',
 'het',
 'af',
 'te',
 'maken',
 ',',
 'komt',
 'er',
 'knapperige',
 'groentechips',
 'op',
 'de',
 'bak',
 'met',
 'patat',
 '.',
 'Het',
 'frietje',
 'past',
 'volgens',
 'Rocco',
 'Ladage',
 'van',
 'BRAM',
 "'S",
 'goed',
 'bij',
 'het',
 'winterweer',
 '.',
 '``',
 'Want',
 'wat',
 'is',
 'nu',
 'meer',
 'nostalgisch',
 'dan',
 'opwarmen',
 'met',
 'erwt

In [15]:
%%time
bert_model = BertModel.from_pretrained(bert_path)

CPU times: user 1.87 s, sys: 629 ms, total: 2.49 s
Wall time: 8.02 s


In [16]:
del(bert_model)