# Preprocessing data from scratch.

**Follow procedures below step by step.**

DownLoad the training set and validation set from [URL](https://msnews.github.io/).

Please unzip the two downloaded files.

The files are shown as below.

In [2]:
!ls -R

[1m[36mMINDlarge_dev[m[m   [1m[36mMINDlarge_train[m[m pre.ipynb

./MINDlarge_dev:
__placeholder__        entity_embedding.vec   relation_embedding.vec
behaviors.tsv          news.tsv

./MINDlarge_train:
__placeholder__        entity_embedding.vec   relation_embedding.vec
behaviors.tsv          news.tsv


In [6]:
import pandas as pd

dev_news = './MINDlarge_dev/news.tsv'
train_news = './MINDlarge_train/news.tsv'

news1 = pd.read_csv(dev_news, sep='\t', 
    names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'title_entities', 'abstract_entity'])
news2 = pd.read_csv(train_news, sep='\t',
    names=['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'title_entities', 'abstract_entity'])

news_all = pd.concat([news1,news2])

In [5]:
print(news_all.shape)
news_all.drop_duplicates(subset=['NewsID'],keep='last',inplace=True) #remove duplicate news
print(news_all.shape)
# There are 130,380 news in MIND which is reported in table 2 of our paper. 
# Only the 104,151 news in train and dev sets was used in our experiments.

(173550, 8)
(104151, 8)


In [2]:
news_all.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,title_entities,abstract_entity
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
3,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
4,N75236,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."


We need to filter data in columns of the category and subcategory manually.

Otherwise, pre-trained language models cannot recognize unseen tokens, e.g., "lifestyle" and "lifestyleroyals".

For example, change "lifestyle" into "life style".

We use vscode to find and replace all occurances of these untokinezed tokens.

After filtering, please run codes above again.

Here shows clean data.

In [3]:

news_all.head(2)

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,title_entities,abstract_entity
0,N88753,life style,life style royals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N23144,health,weight loss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."


One search query for each news article was created by concatenating the texts of its category, subcategory and the entities in the metadata. 

For a few number of articles where the entities are missing (both title entities and abstract entities are empty sets), “NLTK”  was used to extract entities from the titles.

In [5]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
            if type(i) == Tree:
                    current_chunk.append(" ".join([token for token, pos in i.leaves()]))
            if current_chunk:
                    named_entity = " ".join(current_chunk)
                    if named_entity not in continuous_chunk:
                            continuous_chunk.append(named_entity)
                            current_chunk = []
            else:
                    continue
    return continuous_chunk


In [6]:
from tqdm import tqdm

news_entity = news_all.copy()
news_entity['total_entities'] = ''
news = news_all.to_numpy()


for idx in tqdm(range(len(news)), position=0, leave=True):
    if news[idx][6] == '[]' and news[idx][7]=='[]':
        if type(news[idx][4])==float:
            '''news without abstract'''
            tit_abs = news[idx][3]
        else:
            tit_abs = news[idx][3]+' '+news[idx][4]
     
        entity = get_continuous_chunks(tit_abs)
        
        total_entities = ' '.join(entity)
    
        news_entity.at[idx,'total_entities'] = total_entities
    else:
        total_entities = []
        if news[idx][6] != '[]' and type(news[idx][6])!=float:
            try:
                total_entities.append([x['Label'] for x in eval(news[idx][6])])
            except:
                print(idx)
        if news[idx][7] != '[]' and type(news[idx][7])!=float:
            try:
                total_entities.append([x['Label'] for x in eval(news[idx][7])])
            except:
                print(idx)
        total_entities = [item for sub_list in total_entities for item in sub_list]
        total_entities = ' '.join(str(e) for e in total_entities)
        news_entity.at[idx,'total_entities'] = total_entities


100%|██████████| 104151/104151 [10:16<00:00, 168.85it/s]


In [8]:
news_entity.head(3)

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,title_entities,abstract_entity,total_entities
0,N88753,life style,life style royals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],"Prince Philip, Duke of Edinburgh Charles, Prin..."
1,N23144,health,weight loss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",Adipose tissue Adipose tissue
2,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[],Drug Enforcement Administration


In [13]:
# store intermdediate results
news_entity = news_entity.dropna(subset=['NewsID'])
news_entity.to_csv('news_entity_.tsv',sep='\t',index=False)

In [26]:
'''gather texts into groups(query and item)'''
from tqdm import tqdm
news = news_entity.copy()
news['query'] = ''  # query text
news['embedding']='' # item text 
news = news.drop(['URL','Category','SubCategory','Title','Abstract','total_entities','title_entities','abstract_entity'],axis=1)
# news = news.to_numpy()
news_entity = news_entity.fillna('')

for idx in tqdm(range(len(news))):
    news.loc[idx,'query'] = news_entity.loc[idx,'Category']+' '+news_entity.loc[idx,'SubCategory']+' '+news_entity.loc[idx,'total_entities']
    news.loc[idx,'embedding'] = news_entity.loc[idx,'Title']+' '+news_entity.loc[idx,'Abstract']


100%|██████████| 104151/104151 [00:17<00:00, 6042.59it/s]


In [27]:
print(news.shape)
news.head(3)

(104151, 3)


Unnamed: 0,NewsID,query,embedding
0,N88753,"life style life style royals Prince Philip, Du...","The Brands Queen Elizabeth, Prince Charles, an..."
1,N23144,health weight loss Adipose tissue Adipose tissue,50 Worst Habits For Belly Fat These seemingly ...
2,N86255,health medical Drug Enforcement Administration,Dispose of unwanted prescription drugs during ...


In [37]:
news.to_csv('news.tsv', sep='\t', index=False)

Now we use the line number as index for each news.

In [28]:
import json
ID2idx = dict()

cnt = 1
# index starts from 1
# 0 for padding
for idx,line in news.iterrows():
    if ID2idx.__contains__(line[0]) == False:
        ID2idx[line[0]] = cnt
        cnt += 1

json_str = json.dumps(ID2idx)
with open('ID2idx.json', 'w') as json_file:
    json_file.write(json_str)

Then we process the user-item interaction data.

In [30]:
folder_name = 'MINDlarge_train'
behaviors = pd.read_csv(folder_name+"/behaviors.tsv",sep='\t')
behaviors = behaviors.dropna(axis=0) #drop users without history
behaviors = behaviors.to_numpy()


import json
ID2idx = open('ID2idx.json','r')
ID2idx = json.load(ID2idx)

padding_length = 50 # truncate user browsing history at 50

fout = open('./train.csv','w')

for line in tqdm(behaviors):
    user = line[3].split(' ')
    user = [ID2idx[x] for x in user]
    if len(user)>padding_length:
        user = user[:padding_length]
    else:
        for i in range(padding_length-len(user)):
            user.append(0)
    iteract_items = line[4].split(' ')
    iteract_items = [item.split('-') for item in iteract_items]
    for iteract in iteract_items:
        # fout.write(str(user))
        for item in user:
            fout.write(str(item)+',')
        # fout.write(',')
        fout.write(str(ID2idx[iteract[0]]))
        fout.write(',')
        fout.write(str(iteract[1]))
        fout.write('\n')

100%|██████████|2186682/2186682 [32:26<00:00, 1103.29it/s]


In [33]:
#show training dataset
!head -n 15 train.csv

16340,49602,39636,18665,23724,28793,872,59097,35523,35775,10698,40145,50154,36619,54478,54645,41267,39983,29231,38994,57195,71239,65272,71796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,86996,0
16340,49602,39636,18665,23724,28793,872,59097,35523,35775,10698,40145,50154,36619,54478,54645,41267,39983,29231,38994,57195,71239,65272,71796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,92955,0
16340,49602,39636,18665,23724,28793,872,59097,35523,35775,10698,40145,50154,36619,54478,54645,41267,39983,29231,38994,57195,71239,65272,71796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,66393,0
16340,49602,39636,18665,23724,28793,872,59097,35523,35775,10698,40145,50154,36619,54478,54645,41267,39983,29231,38994,57195,71239,65272,71796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,71655,0
16340,49602,39636,18665,23724,28793,872,59097,35523,35775,10698,40145,50154,36619,54478,54645,41267,39983,29231,38994,57195,71239,65272,71796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

In [35]:
folder_name = 'MINDlarge_dev'
behaviors = pd.read_csv(folder_name+"/behaviors.tsv",sep='\t')
behaviors = behaviors.dropna(axis=0) #drop users without history
behaviors = behaviors.to_numpy()


import json
ID2idx = open('ID2idx.json','r')
ID2idx = json.load(ID2idx)

padding_length = 50 # truncate user browsing history at 50

fout = open('./dev.csv','w')
# fout.write('user,item,ctr\n')
for line in tqdm(behaviors):
    user = line[3].split(' ')
    user = [ID2idx[x] for x in user]
    if len(user)>padding_length:
        user = user[:padding_length]
    else:
        for i in range(padding_length-len(user)):
            user.append(0)
    iteract_items = line[4].split(' ')
    iteract_items = [item.split('-') for item in iteract_items]
    for iteract in iteract_items:
        # fout.write(str(user))
        for item in user:
            fout.write(str(item)+',')
        # fout.write(',')
        fout.write(str(ID2idx[iteract[0]]))
        fout.write(',')
        fout.write(str(iteract[1]))
        fout.write('\n')

100%|██████████| 365200/365200 [04:24<00:01, 1081.76it/s]


In [36]:
#show test dataset
!head -n 15 dev.csv

62326,56055,50011,56063,14115,18785,39544,19156,27416,52335,8319,14915,940,872,18691,23304,45940,49597,56185,13217,21563,58333,35183,9354,30137,44644,47782,55564,12334,34692,53057,49667,38773,5266,27909,53101,46648,27154,41911,37117,3043,45472,47277,29900,56863,46850,56989,34069,30705,45279,70258,0
62326,56055,50011,56063,14115,18785,39544,19156,27416,52335,8319,14915,940,872,18691,23304,45940,49597,56185,13217,21563,58333,35183,9354,30137,44644,47782,55564,12334,34692,53057,49667,38773,5266,27909,53101,46648,27154,41911,37117,3043,45472,47277,29900,56863,46850,56989,34069,30705,45279,71795,0
62326,56055,50011,56063,14115,18785,39544,19156,27416,52335,8319,14915,940,872,18691,23304,45940,49597,56185,13217,21563,58333,35183,9354,30137,44644,47782,55564,12334,34692,53057,49667,38773,5266,27909,53101,46648,27154,41911,37117,3043,45472,47277,29900,56863,46850,56989,34069,30705,45279,70216,0
62326,56055,50011,56063,14115,18785,39544,19156,27416,52335,8319,14915,940,872,18691,23304,45940,495

Lastly, we embed news and queries into vectors using a pre-trained language model.

We use the pre-trained model provided by huggingface transformer.

The next two code blocks cost several hours to run. We suggest you to use GPU to accelerate the process.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from tqdm import tqdm
import math
import torchsnooper

model = AutoModel.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased',do_lower_case=True)

model = model.to('cuda:0')

import pandas as pd
news = pd.read_csv('news.tsv',sep='\t')
embedding = news['embedding'].to_numpy() #generate item embedding
 
import gc
embedding_vec = torch.zeros((embedding.shape[0],768))
batch_size = 8
for i in tqdm(range(math.ceil(embedding.shape[0] / batch_size))):
    batch_input = embedding[i*batch_size:(i+1)*batch_size].tolist()
    tokens = tokenizer(batch_input,padding="max_length",truncation=True,return_tensors="pt",max_length=100)
    tokens = tokens.to('cuda:0')
    features = model(**tokens)
    features = features.last_hidden_state[:,0]
    embedding_vec[i*batch_size+1:(i+1)*batch_size+1] = features
    del features
    gc.collect()

embedding_vec = embedding_vec.numpy()

import numpy as np
embedding_vec = np.vstack([np.array([0.0]*768), embedding_vec]) #padding
np.save('embedding_vec.npy', embedding_vec)



In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from tqdm import tqdm
import math
import torchsnooper

model = AutoModel.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased',do_lower_case=True)

model = model.to('cuda:0')

import pandas as pd
news = pd.read_csv('news.tsv',sep='\t')
embedding = news['query'].to_numpy() #generate query embedding
 
import gc
embedding_vec = torch.zeros((embedding.shape[0],768))
batch_size = 8
for i in tqdm(range(math.ceil(embedding.shape[0] / batch_size))):
    batch_input = embedding[i*batch_size:(i+1)*batch_size].tolist()
    tokens = tokenizer(batch_input,padding="max_length",truncation=True,return_tensors="pt",max_length=100)
    tokens = tokens.to('cuda:0')
    features = model(**tokens)
    features = features.last_hidden_state[:,0]
    embedding_vec[i*batch_size+1:(i+1)*batch_size+1] = features
    del features
    gc.collect()

embedding_vec = embedding_vec.numpy()

import numpy as np
embedding_vec = np.vstack([np.array([0.0]*768), embedding_vec]) #padding
np.save('query_vec.npy', embedding_vec) 


On the MIND dataset, 𝑁 was set to 1 since only one query was created for each item (news article). $N$ is the number of queries used as IVs in Equation 2 in Section 4.2 of the paper.

In our experiments, since the query embedding matrix is fixed, we can calculate pseudoinverse of corresponding query embedding matrix offline to accelerate training.


In [None]:
import numpy as np
from tqdm import tqdm
import math
import ast
import torch

query = np.load("query_vec.npy",allow_pickle=True)
# query_vec.npy is Zt
query = torch.Tensor(query)


Zt = query.unsqueeze(-1)#np.zeros((query_index.shape[0], 64, top_k))
Zt_pinv = np.zeros((query.shape[0],  1, 768))

      
for i in tqdm(range(Zt.shape[0])):
    Zt_pinv[i] = np.linalg.pinv(Zt[i])

np.save("./Zt_pinv.npy", Zt_pinv)