In [1]:
import pandas as pd

[Data Preprocessing](#Data_preprocessing)<br>
[Data Splitting](#Datasplitting)

# Data preprocessing
<a id='#Data_preprocessing'></a>

In [22]:
news_train = 'data/MINDsmall_train/news.tsv'
behavior_train = 'data/MINDsmall_train/behaviors.tsv'
entity_train = 'data/MINDsmall_train/entity_embedding.vec'
relation_train = 'data/MINDsmall_train/relation_embedding.vec'
#-------------------------------------------
news_test = 'data/MINDsmall_dev/news.tsv'
behavior_test = 'data/MINDsmall_dev/behaviors.tsv'
entity_test = 'data/MINDsmall_dev/entity_embedding.vec'
relation_test = 'data/MINDsmall_dev/relation_embedding.vec' 

def load_df(path):
    if 'news' in path:
        columns = ['News ID',
                "Category",
                "SubCategory",
                "Title",
                "Abstract",
                "URL",
                "Title Entities",
                "Abstract Entities"]
    
    elif 'behavior' in path:
        columns = ['Impression ID',
                "User ID",
                "Time",
                "History",
                "Impressions"]
    else:
        return pd.read_csv(path, sep='\t', header=None)
    
    df = pd.read_csv(path, sep='\t', header=None, names=columns)
    return df

news_train, news_test, behavior_train, behavior_test = map(load_df, [news_train, news_test, behavior_train, behavior_test])
entity_train, relation_train, entity_test, relation_test = map(load_df, [entity_train, relation_train, entity_test, relation_test])
print('MIND-small:')
print(f"{'Dataset':<15} {'Train shape':<20} {'Test shape'}")
print(f"{'-'*50}")
print(f"{'news':<15} {str(news_train.shape):<20} {news_test.shape}")
print(f"{'behavior':<15} {str(behavior_train.shape):<20} {behavior_test.shape}")
print(f"{'entity':<15} {str(entity_train.shape):<20} {entity_test.shape}")
print(f"{'relation':<15} {str(relation_train.shape):<20} {relation_test.shape}")
behavior_train.head(1)

MIND-small:
Dataset         Train shape          Test shape
--------------------------------------------------
news            (51282, 8)           (42416, 8)
behavior        (156965, 5)          (73152, 5)
entity          (26904, 102)         (22893, 102)
relation        (1091, 102)          (1091, 102)


Unnamed: 0,Impression ID,User ID,Time,History,Impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0


train data

In [23]:
#prepare history data
behavior_train["History"] = behavior_train["History"].apply(lambda x: x.split(" ") if isinstance(x, str) else []) #make it iterable
train = behavior_train.drop(["Impression ID", "Time", "Impressions"], axis=1)
train = train.explode("History")
train['label'] = 1
train.rename(columns={ "User ID": "user_id:token" ,"History": "item_id:token", "label": "label:float"}, inplace=True)
train['user_id:token'] = train['user_id:token'].str[1:]
train['item_id:token'] = train['item_id:token'].str[1:]
print(train.shape)
train.drop_duplicates(inplace=True)
print(train.shape)

(5110877, 3)
(915903, 3)


In [24]:
#same with impressions
behavior_train["Impressions"] = behavior_train["Impressions"].apply(lambda x: x.split(" ") if isinstance(x, str) else [])#make it iterable
imp = behavior_train.drop(["Impression ID", "Time", "History"], axis=1)
imp = imp.explode("Impressions")
imp['label'] = imp['Impressions'].apply(lambda x: int(x.split("-")[-1]))
#delete last two character of impressions
imp['Impressions'] = imp['Impressions'].apply(lambda x: x[:-2])
imp.rename(columns={"User ID": "user_id:token" ,"Impressions": "item_id:token", "label": "label:float"}, inplace=True)
imp['user_id:token'] = imp['user_id:token'].str[1:]
imp['item_id:token'] = imp['item_id:token'].str[1:]
print(imp.shape)
imp.drop_duplicates(inplace=True)
print(imp.shape)

(5843444, 3)
(5033875, 3)


In [25]:
inter_train = pd.concat([train, imp], axis=0)
print(inter_train.shape)
inter_train.drop_duplicates(inplace=True)
print(inter_train.shape)

(5949778, 3)
(5948746, 3)


In [26]:
def five_core_filter(df):
    while True:
        before_shape = df.shape
        #users with at least 5 tracks
        counts = df['user_id:token'].value_counts()
        df = df[df['user_id:token'].isin(counts[counts >= 5].index)]
        #tracks with at least 5 users
        counts = df['item_id:token'].value_counts()
        df = df[df['item_id:token'].isin(counts[counts >= 5].index)]
        
        after_shape = df.shape
        #stop condition
        if before_shape == after_shape:
            break
    return df

In [27]:
print('Before filter:', inter_train.shape, "unique users:", inter_train['user_id:token'].nunique(), "unique items:", inter_train['item_id:token'].nunique())
inter_train = five_core_filter(inter_train)
print('After filter:', inter_train.shape, "unique users:", inter_train['user_id:token'].nunique(), "unique items:", inter_train['item_id:token'].nunique())

Before filter: (5948746, 3) unique users: 50000 unique items: 51282
After filter: (5900343, 3) unique users: 49886 unique items: 24011


In [28]:
#save
inter_train.to_csv('mind_train.inter', index=False, sep='\t')
inter_train.head(3)

Unnamed: 0,user_id:token,item_id:token,label:float
0,13740,55189,1
0,13740,42782,1
0,13740,34694,1


––––––––––––––––––––––––––––––––––––––––––––––––––— <br>
test data

In [29]:
#prepare history data
behavior_test["History"] = behavior_test["History"].apply(lambda x: x.split(" ") if isinstance(x, str) else []) #make it iterable
test = behavior_test.drop(["Impression ID", "Time", "Impressions"], axis=1)
test = test.explode("History")
test['label'] = 1
test.rename(columns={ "User ID": "user_id:token" ,"History": "item_id:token", "label": "label:float"}, inplace=True)
test['user_id:token'] = test['user_id:token'].str[1:]
test['item_id:token'] = test['item_id:token'].str[1:]
print(test.shape)
test.drop_duplicates(inplace=True)
print(test.shape)

(2364728, 3)
(1260987, 3)


In [30]:
#same with impressions
behavior_test["Impressions"] = behavior_test["Impressions"].apply(lambda x: x.split(" ") if isinstance(x, str) else [])#make it iterable
imp = behavior_test.drop(["Impression ID", "Time", "History"], axis=1)
imp = imp.explode("Impressions")
imp['label'] = imp['Impressions'].apply(lambda x: int(x.split("-")[-1]))
#delete last two character of impressions
imp['Impressions'] = imp['Impressions'].apply(lambda x: x[:-2])
imp.rename(columns={"User ID": "user_id:token" ,"Impressions": "item_id:token", "label": "label:float"}, inplace=True)
imp['user_id:token'] = imp['user_id:token'].str[1:]
imp['item_id:token'] = imp['item_id:token'].str[1:]
print(imp.shape)
imp.drop_duplicates(inplace=True)
print(imp.shape)

(2740998, 3)
(2478564, 3)


In [31]:
inter_test = pd.concat([test, imp], axis=0)
print(inter_test.shape)
inter_test.drop_duplicates(inplace=True)
print(inter_test.shape)

(3739551, 3)
(3739366, 3)


In [32]:
#remove from test set users and items that are not in train set
#inter_test = inter_test[inter_test['user_id:token'].isin(inter_train['user_id:token'])]
#inter_test = inter_test[inter_test['item_id:token'].isin(inter_train['item_id:token'])]
#print n unique users and items
inter_test['user_id:token'].nunique(), inter_test['item_id:token'].nunique()

(50000, 42416)

In [33]:
#save
inter_test.to_csv('mind_test.inter', index=False, sep='\t')

–––––––––––––––––––––––––––––––––––––––––––––––––––––––––<br>
full data

In [35]:
inter = pd.concat([inter_train, inter_test], axis=0)
print(inter.shape)
inter.drop_duplicates(inplace=True)
print(inter.shape)

(9639709, 3)
(9471416, 3)


In [37]:
#get unique users and items
print('Unique users:', inter['user_id:token'].nunique(), 'Unique items:', inter['item_id:token'].nunique())
inter.head(3)

Unique users: 93960 Unique items: 51218


Unnamed: 0,user_id:token,item_id:token,label:float
0,13740,55189,1
0,13740,42782,1
0,13740,34694,1


In [12]:
#save
#print(inter.isnull().sum())
#delete rows with invalid id
#inter = inter.dropna(subset=['user_id', 'item_id'])
#print(inter.isnull().sum())
#make all columns integer
inter = inter.astype(int)
inter.to_csv('data/mind.inter', index=False, sep='\t')
inter['label'].value_counts()

label
0    7140340
1    2329669
Name: count, dtype: int64

–––––––––

In [None]:
#load df
#import pandas as pd

#inter = pd.read_csv('mind_small/mind_small.inter', sep='\t')
#inter.shape

# Data splitting <br>
<a id='#Datasplitting'></a>
for each user, their interaxtion are splitted into train, test, and validation data (<i>avoid cold start for now as it is not the focus of my research</i>)<br>
ration: 0.6, 0.2, and 0.2 for train, test, and validation

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

def five_core_filter(df, n):
    while True:
        before_shape = df.shape
        #users with at least 5 tracks
        counts = df['user_id'].value_counts()
        df = df[df['user_id'].isin(counts[counts >= n].index)]
        #tracks with at least 5 users
        counts = df['item_id'].value_counts()
        df = df[df['item_id'].isin(counts[counts >= n].index)]
        
        after_shape = df.shape
        #stop condition
        if before_shape == after_shape:
            break
    return df

In [3]:
data = pd.read_csv('data/mind.inter', sep='\t')
print(data.shape)
new_data = five_core_filter(data, 5)
print(new_data.shape)

(9470009, 3)
(9432207, 3)


In [26]:
def custom_train_test_val_split(data, train_size, test_size, val_size):
    train = pd.DataFrame()
    test = pd.DataFrame()
    val = pd.DataFrame()
    #for every user
    grouped = data.groupby('user_id')
    #split the data
    for _, group in grouped:
        #print(_)
        if len(group) < 5:  #if a user has <5 interaction
                            #I f* up the preprocessing
            train = pd.concat([train, group])
            print('ouch this should have never been printed :(')
        else:
            train_group, temp_group = train_test_split(group, train_size=train_size, random_state=42)
            if len(temp_group) > 1:
                test_group, val_group = train_test_split(temp_group, test_size=val_size/(test_size + val_size), random_state=42)
                test = pd.concat([test, test_group])
                val = pd.concat([val, val_group])
            else:
                if test_size > val_size:
                    test = pd.concat([test, temp_group])
                else:
                    val = pd.concat([val, temp_group])
                
            train = pd.concat([train, train_group])

    return train, test, val


train, test, val = custom_train_test_val_split(new_data, train_size=0.6, test_size=0.2, val_size=0.2)
print(train.shape, test.shape, val.shape)


(5622131, 3) (1886241, 3) (1923835, 3)


In [35]:
print('Train:', train['user_id'].nunique(), 'Test:', test['user_id'].nunique(), 'Val:', val['user_id'].nunique())
print('Train:', train['item_id'].nunique(), 'Test:', test['item_id'].nunique(), 'Val:', val['item_id'].nunique())
print('Label distribution:\n',  train['label'].value_counts(normalize=True), test['label'].value_counts(normalize=True), val['label'].value_counts(normalize=True))

Train: 93734 Test: 93734 Val: 93734
Train: 27941 Test: 26247 Val: 26377
Label distribution:
 label
0    0.765671
1    0.234329
Name: proportion, dtype: float64 label
0    0.744905
1    0.255095
Name: proportion, dtype: float64 label
0    0.742487
1    0.257513
Name: proportion, dtype: float64


In [36]:
#save
train.to_csv('data/train', index=False, sep='\t')
test.to_csv('data/test', index=False, sep='\t')
val.to_csv('data/val', index=False, sep='\t')

In [39]:
#check for duplicates
print('Train:', train.duplicated().sum(), 'Test:', test.duplicated().sum(), 'Val:', val.duplicated().sum())
#check if same interactions are in train and test   
print('Train:', train.isin(test).sum().sum(), 'Test:', test.isin(train).sum().sum(), 'Val:', val.isin(train).sum().sum())

Train: 0 Test: 0 Val: 0
Train: 0 Test: 0 Val: 0


In [41]:
print('Exact Row Overlap: Train-Test:', pd.merge(train, test, on=['user_id', 'item_id', 'label'], how='inner').shape[0],
      'Train-Val:', pd.merge(train, val, on=['user_id', 'item_id', 'label'], how='inner').shape[0],
      'Test-Val:', pd.merge(test, val, on=['user_id', 'item_id', 'label'], how='inner').shape[0])

Exact Row Overlap: Train-Test: 0 Train-Val: 0 Test-Val: 0
