In [2]:
import pandas as pd

In [3]:
news_train = 'data/MINDsmall_train/news.tsv'
behavior_train = 'data/MINDsmall_train/behaviors.tsv'
entity_train = 'data/MINDsmall_train/entity_embedding.vec'
relation_train = 'data/MINDsmall_train/relation_embedding.vec'
#-------------------------------------------
news_test = 'data/MINDsmall_dev/news.tsv'
behavior_test = 'data/MINDsmall_dev/behaviors.tsv'
entity_test = 'data/MINDsmall_dev/entity_embedding.vec'
relation_test = 'data/MINDsmall_dev/relation_embedding.vec' 

def load_df(path):
    if 'news' in path:
        columns = ['News ID',
                "Category",
                "SubCategory",
                "Title",
                "Abstract",
                "URL",
                "Title Entities",
                "Abstract Entities"]
    
    elif 'behavior' in path:
        columns = ['Impression ID',
                "User ID",
                "Time",
                "History",
                "Impressions"]
    else:
        return pd.read_csv(path, sep='\t', header=None)
    
    df = pd.read_csv(path, sep='\t', header=None, names=columns)
    return df

news_train, news_test, behavior_train, behavior_test = map(load_df, [news_train, news_test, behavior_train, behavior_test])
entity_train, relation_train, entity_test, relation_test = map(load_df, [entity_train, relation_train, entity_test, relation_test])
print('MIND-small:')
print(f"{'Dataset':<15} {'Train shape':<20} {'Test shape'}")
print(f"{'-'*50}")
print(f"{'news':<15} {str(news_train.shape):<20} {news_test.shape}")
print(f"{'behavior':<15} {str(behavior_train.shape):<20} {behavior_test.shape}")
print(f"{'entity':<15} {str(entity_train.shape):<20} {entity_test.shape}")
print(f"{'relation':<15} {str(relation_train.shape):<20} {relation_test.shape}")

MIND-small:
Dataset         Train shape          Test shape
--------------------------------------------------
news            (51282, 8)           (42416, 8)
behavior        (156965, 5)          (73152, 5)
entity          (26904, 102)         (22893, 102)
relation        (1091, 102)          (1091, 102)


In [4]:
#prepare history data
behavior_train["History"] = behavior_train["History"].apply(lambda x: x.split(" ") if isinstance(x, str) else []) #make it iterable
train = behavior_train.drop(["Impression ID", "Time", "Impressions"], axis=1)
train = train.explode("History")
train['label'] = 1
train.rename(columns={ "User ID": "user_id:token" ,"History": "item_id:token", "label": "label:float"}, inplace=True)
train['user_id:token'] = train['user_id:token'].str[1:]
train['item_id:token'] = train['item_id:token'].str[1:]
train

Unnamed: 0,user_id:token,item_id:token,label:float
0,13740,55189,1
0,13740,42782,1
0,13740,34694,1
0,13740,45794,1
0,13740,18445,1
...,...,...,...
156963,44625,43083,1
156963,44625,9288,1
156963,44625,37863,1
156964,64800,22997,1


In [5]:
#same with impressions
behavior_train["Impressions"] = behavior_train["Impressions"].apply(lambda x: x.split(" ") if isinstance(x, str) else [])#make it iterable
imp = behavior_train.drop(["Impression ID", "Time", "History"], axis=1)
imp = imp.explode("Impressions")
imp['label'] = imp['Impressions'].apply(lambda x: int(x.split("-")[-1]))
#delete last two character of impressions
imp['Impressions'] = imp['Impressions'].apply(lambda x: x[:-2])
imp.rename(columns={"User ID": "user_id:token" ,"Impressions": "item_id:token", "label": "label:float"}, inplace=True)
imp['user_id:token'] = imp['user_id:token'].str[1:]
imp['item_id:token'] = imp['item_id:token'].str[1:]
imp

Unnamed: 0,user_id:token,item_id:token,label:float
0,13740,55689,1
0,13740,35729,0
1,91836,20678,0
1,91836,39317,0
1,91836,58114,0
...,...,...,...
156963,44625,39317,0
156964,64800,61233,0
156964,64800,33828,1
156964,64800,19661,0


In [6]:
inter_train = pd.concat([train, imp], axis=0)

In [13]:
def five_core_filter(df):
    while True:
        before_shape = df.shape
        #users with at least 5 tracks
        counts = df['user_id:token'].value_counts()
        df = df[df['user_id:token'].isin(counts[counts >= 5].index)]
        #tracks with at least 5 users
        counts = df['item_id:token'].value_counts()
        df = df[df['item_id:token'].isin(counts[counts >= 5].index)]
        
        after_shape = df.shape
        #stop condition
        if before_shape == after_shape:
            break
    return df

In [14]:
print('Before filter:', inter_train.shape, "unique users:", inter_train['user_id:token'].nunique(), "unique items:", inter_train['item_id:token'].nunique())
inter_train = five_core_filter(inter_train)
print('After filter:', inter_train.shape, "unique users:", inter_train['user_id:token'].nunique(), "unique items:", inter_train['item_id:token'].nunique())

Before filter: (10954321, 3) unique users: 50000 unique items: 51282
After filter: (10918173, 3) unique users: 49897 unique items: 36827


In [15]:
#save
inter_train.to_csv('mind_train.inter', index=False, sep='\t')

––––––––––––––––––––––––––––––––––––––––––––––––––—

In [7]:
#prepare history data
behavior_test["History"] = behavior_test["History"].apply(lambda x: x.split(" ") if isinstance(x, str) else []) #make it iterable
test = behavior_test.drop(["Impression ID", "Time", "Impressions"], axis=1)
test = test.explode("History")
test['label'] = 1
test.rename(columns={ "User ID": "user_id:token" ,"History": "item_id:token", "label": "label:float"}, inplace=True)
test['user_id:token'] = test['user_id:token'].str[1:]
test['item_id:token'] = test['item_id:token'].str[1:]
test

Unnamed: 0,user_id:token,item_id:token,label:float
0,80234,55189,1
0,80234,46039,1
0,80234,51741,1
0,80234,53234,1
0,80234,11276,1
...,...,...,...
73151,68182,59576,1
73151,68182,2735,1
73151,68182,31883,1
73151,68182,22213,1


In [8]:
#same with impressions
behavior_test["Impressions"] = behavior_test["Impressions"].apply(lambda x: x.split(" ") if isinstance(x, str) else [])#make it iterable
imp = behavior_test.drop(["Impression ID", "Time", "History"], axis=1)
imp = imp.explode("Impressions")
imp['label'] = imp['Impressions'].apply(lambda x: int(x.split("-")[-1]))
#delete last two character of impressions
imp['Impressions'] = imp['Impressions'].apply(lambda x: x[:-2])
imp.rename(columns={"User ID": "user_id:token" ,"Impressions": "item_id:token", "label": "label:float"}, inplace=True)
imp['user_id:token'] = imp['user_id:token'].str[1:]
imp['item_id:token'] = imp['item_id:token'].str[1:]
imp

Unnamed: 0,user_id:token,item_id:token,label:float
0,80234,28682,0
0,80234,48740,0
0,80234,31958,1
0,80234,34130,0
0,80234,6916,0
...,...,...,...
73151,68182,48740,0
73151,68182,55237,0
73151,68182,31958,0
73151,68182,29091,0


In [9]:
inter_test = pd.concat([test, imp], axis=0)

In [71]:
#remove from test set users and items that are not in train set
inter_test = inter_test[inter_test['user_id:token'].isin(inter_train['user_id:token'])]
inter_test = inter_test[inter_test['item_id:token'].isin(inter_train['item_id:token'])]
#print n unique users and items
inter_test['user_id:token'].nunique(), inter_test['item_id:token'].nunique()

(50000, 42416)

In [10]:
#save
inter_test.to_csv('mind_test.inter', index=False, sep='\t')

–––––––––––––––––––––––––––––––––––––––––––––––––––––––––

In [16]:
inter = pd.concat([inter_train, inter_test], axis=0)

In [18]:
#get unique users and items
print('Unique users:', inter['user_id:token'].nunique(), 'Unique items:', inter['item_id:token'].nunique())

Unique users: 93971 Unique items: 55011


In [19]:
#save
inter.to_csv('mind.inter', index=False, sep='\t')

In [22]:
#inter_test.drop('label:float', axis=1, inplace=True)
inter_test.to_csv('mind.test', index=False, sep='\t')
inter_train.drop('label:float', axis=1, inplace=True)
inter_train.to_csv('mind.train', index=False, sep='\t')