In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

# Item embedding: TF-IDF matrix

In [None]:
# Relative locations of the two MINDsmall news tables (train split, dev split).
news_train, news_test = (
    'data/MINDsmall_train/news.tsv',
    'data/MINDsmall_dev/news.tsv',
)


def load_df(path):
    """Read a headerless, tab-separated MIND file into a DataFrame.

    The column names are chosen from the file name: a name containing
    ``'news'`` gets the eight news columns, a name containing ``'behavior'``
    gets the five behaviour columns. If the file name matches neither, the
    whole path is checked the same way (this keeps paths such as
    ``news_dump/part0.tsv`` working as before). When nothing matches, the file
    is loaded with pandas' default integer column labels.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the ``.tsv`` file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    news_columns = ['News ID',
                    "Category",
                    "SubCategory",
                    "Title",
                    "Abstract",
                    "URL",
                    "Title Entities",
                    "Abstract Entities"]
    behavior_columns = ['Impression ID',
                        "User ID",
                        "Time",
                        "History",
                        "Impressions"]

    # os.fspath lets pathlib.Path objects through; `'news' in Path(...)` would
    # raise a TypeError.
    path_str = os.fspath(path)

    # Look at the file name first so that a directory called e.g. "news_dump"
    # cannot make behaviors.tsv be read with the news schema; only then fall
    # back to the full path.
    for candidate in (os.path.basename(path_str), path_str):
        if 'news' in candidate:
            columns = news_columns
            break
        if 'behavior' in candidate:
            columns = behavior_columns
            break
    else:
        return pd.read_csv(path, sep='\t', header=None)

    return pd.read_csv(path, sep='\t', header=None, names=columns)
# Swap each path string for the table it points to, then stack both splits.
news_train, news_test = [load_df(split_path) for split_path in (news_train, news_test)]
data = pd.concat([news_train, news_test])

# Interaction log; rows whose label is 0 are discarded
# (per the original note: keep only clicked articles).
inter = pd.read_csv(
    '/Users/giulia/Desktop/tesi/mind_small15/mind_small15.inter',
    sep='\t',
    header=0,
)
inter = inter.loc[inter['label:float'] != 0]

In [None]:
# Stack the train and dev news tables into a single frame.
data = pd.concat([news_train, news_test])
# Drop the first character of every News ID (the leading "N", per the original note).
data['News ID'] = data['News ID'].str.slice(start=1)
# From here on only the identifier and the headline are needed.
col = ['News ID', 'Title']
data = data.loc[:, col]

In [None]:
# Cast both ID columns to str so they can be compared with each other.
# `data` and `inter` were both produced by indexing a larger frame, so writing
# a column into them in place triggers pandas' SettingWithCopyWarning; `assign`
# builds a fresh frame instead and leaves no ambiguity about what was modified.
data = data.assign(**{'News ID': data['News ID'].astype(str)})
inter = inter.assign(**{'item_id:token': inter['item_id:token'].astype(str)})

# Distinct identifiers on each side, used to compute their overlap.
unique_newsid = data['News ID'].unique()
unique_itemid = inter['item_id:token'].unique()

In [None]:
# IDs present both in the news tables and in the interaction log.
common = np.intersect1d(unique_newsid, unique_itemid)
print(len(common), len(unique_newsid), len(unique_itemid))

# Restrict the news table to those shared IDs.
is_shared = data['News ID'].isin(common)
data = data.loc[is_shared]
print(data.shape)

# The same article can appear in both the train and the dev file; keep one row per ID.
data = data.drop_duplicates(subset=['News ID'])
print(data.shape)

25232 65238 25232
(25232, 2)
(25232, 2)


In [None]:
# Preprocessing resources shared by preprocess_text.

# Any character that is neither a word character nor whitespace.
pattern_punctuation = re.compile(r'[^\w\s]')
# Any run of word characters that contains at least one digit.
pattern_numbers = re.compile(r'\w*\d+\w*')
# Whole words made of one to three word characters.
pattern_short_words = re.compile(r'\b\w{1,3}\b')

# NLTK English stop-word list (as a set for fast membership tests) and a Porter stemmer.
stemmer = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn a raw title into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip punctuation, strip tokens containing
    digits, strip words of one to three characters, drop English stop words,
    Porter-stem what is left. Relies on the module-level ``pattern_*`` regexes,
    ``english_stopwords`` and ``stemmer``.

    Parameters
    ----------
    text : str
        The title to clean. Any non-string value (e.g. the float NaN pandas
        uses for a missing title, or None) is treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is
        left or the input was not a string.
    """
    # A missing title would otherwise crash on `.lower()`.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = pattern_punctuation.sub('', text)  # del punctuation
    text = pattern_numbers.sub('', text)  # del tokens containing digits
    text = pattern_short_words.sub('', text)  # del words with len <= 3 (pattern is \w{1,3})
    words = [word for word in text.split() if word not in english_stopwords]  # del stopwords
    return ' '.join(stemmer.stem(word) for word in words)  # stemming

# Clean every headline element-wise and keep the result next to the raw title.
data['processed_title'] = data['Title'].map(preprocess_text)

[TfidfVectorizer documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [None]:
# TF-IDF item embedding.
# Rows are L2-normalised: per the original note, DMF uses cosine similarity,
# so L2 seemed the appropriate normalisation.
tfidf_vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.95,
    stop_words='english',
    max_features=3000,
    norm='l2',
)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['processed_title'])

# One dense column per vocabulary term, plus the item key.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df['News ID'] = data['News ID'].to_numpy()

In [None]:
# Start from a clean 0..n-1 row index (new frame rather than in-place, so the
# cell can be re-run safely).
tfidf_df = tfidf_df.reset_index(drop=True)

# Put the 'News ID' key first, followed by the feature columns in their existing order.
feature_cols = [name for name in tfidf_df.columns if name != 'News ID']
tfidf_df = tfidf_df[['News ID'] + feature_cols]

tfidf_df.to_csv('tfidf_emb.csv', index=False)

# Display last: a bare expression is only rendered when it is the final
# statement of the cell (it used to sit before to_csv and showed nothing).
tfidf_df

Unnamed: 0,News ID,aaron,abandon,abl,abort,absolut,absurd,abus,accept,access,...,youth,youtub,youv,yovanovitch,zealand,zion,zodiac,zone,zozo,zuckerberg
0,61837,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,53526,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,38324,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,2073,0.0,0.0,0.566778,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,49186,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25227,5072,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
25228,31080,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
25229,62355,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.934059,0.0,0.0,0.0
25230,63860,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


# User Embedding from Item Embedding

1. Aggregate the user's news and take the plain average:   $$ u = \frac{1}{n} \sum_{i=1}^{n} v_i $$ <br>
where $n$ is the number of items the user interacted with, and $v_i$ is the TF-IDF vector of the $i$-th item. <br><br><br>
2. Aggregate the user's news and take a label-weighted average: $$ u = \frac{\sum_{i=1}^{n} w_i v_i}{\sum_{i=1}^{n} w_i}  $$ <br>
   e.g. $w_i = 0.1$ for an item that was seen but not interacted with, and $w_i = 1$ otherwise. <br>
   <br><br>
   For example, a weight of 0.1 for an item that was seen but not interacted with (i.e. label = 0) scales down the contribution of its vector to the overall profile by 90%. <br><b>Q: Does this approach make sense with very sparse data? -> Maybe viable if the embedding were denser, e.g. word2vec or GloVe.</b>

In [3]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

# To reuse the table built in memory earlier instead of the saved CSV: tfidf = tfidf_df
tfidf = pd.read_csv('/content/drive/MyDrive/tesi/data/tfidf_emb.csv')

inter = pd.read_csv(
    '/content/drive/MyDrive/tesi/data/mind_small15.inter',
    sep='\t',
    header=0,
)
# Discard rows whose label is 0.
inter = inter.loc[inter['label:float'] != 0]

# Sanity check: as many TF-IDF rows as distinct remaining items.
assert tfidf.shape[0] == inter['item_id:token'].nunique()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:

# Feature columns = every TF-IDF column except the item key.
# tfidf_emb.csv is written with 'News ID' as its FIRST column, so the former
# `tfidf.columns[:-1]` kept the ID (summing/averaging it into every profile)
# and silently dropped the last vocabulary term.
tfidf_columns = [name for name in tfidf.columns if name != 'News ID']

# Per-user accumulators: running sum of item vectors and number of items seen.
# The sum starts as float zeros because TF-IDF weights are floats.
user_ids = inter['user_id:token'].unique()
user_tfidf_sum = pd.DataFrame(0.0, index=user_ids, columns=tfidf_columns)
user_interaction_counts = pd.Series(0, index=user_ids)

# The kernel kept dying on the one-shot computation (presumably memory), so the
# interactions are joined to their item vectors a slice at a time.
chunk_size = 10000

for start in range(0, inter.shape[0], chunk_size):
    # iloc clips the upper bound, so the final short chunk needs no special case.
    chunk = inter.iloc[start:start + chunk_size]

    # Attach each interaction's item vector (inner join on the item key).
    user_news_chunk = pd.merge(chunk, tfidf, left_on='item_id:token', right_on='News ID')

    # Add this chunk's per-user vector sums to the running totals.
    user_sum = user_news_chunk.groupby('user_id:token')[tfidf_columns].sum()
    user_tfidf_sum.loc[user_sum.index] += user_sum

    # Add this chunk's per-user interaction counts to the running totals.
    user_counts = user_news_chunk['user_id:token'].value_counts()
    user_interaction_counts.loc[user_counts.index] += user_counts

# User profile = mean of the TF-IDF vectors of the items the user interacted with.
user_embeddings = user_tfidf_sum.div(user_interaction_counts, axis=0)


In [3]:
# Make sure the user ID lives in a column called 'uid:token'.
# - A frame reloaded from a CSV that was saved with its index carries the IDs
#   in an 'Unnamed: 0' column: rename it.
# - A frame computed in this kernel carries the IDs in its index: move them
#   into a column. Without this the rename is a no-op and a later
#   set_index('uid:token') raises KeyError.
if 'Unnamed: 0' in user_embeddings.columns:
    user_embeddings = user_embeddings.rename(columns={'Unnamed: 0': 'uid:token'})
elif 'uid:token' not in user_embeddings.columns:
    user_embeddings = user_embeddings.rename_axis('uid:token').reset_index()
user_embeddings

Unnamed: 0,uid:token,aaron,abandon,abl,abort,absolut,absurd,abus,accept,access,...,youth,youtub,youv,yovanovitch,zealand,zion,zodiac,zone,zozo,zuckerberg
0,13740,0.0,0.0,0.0,0.043779,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,91836,0.0,0.0,0.0,0.000000,0.0,0.0,0.006283,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,73700,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,34670,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,8125,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86331,73518,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86332,16981,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86333,17300,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86334,57759,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Prepare the atomic file: one row per user, with the whole embedding
# serialised as a single space-separated string.
by_user = user_embeddings.set_index('uid:token')
float_seq = by_user.apply(lambda row: ' '.join(row.astype(str)), axis=1)

# Two columns only: the user key and the serialised vector.
user_embeddings = float_seq.rename('user_emb:float_seq').reset_index()
user_embeddings.to_csv('user_embeddings15.csv', index=False)


In [7]:
# Show the current contents of user_embeddings as the cell output.
user_embeddings

Unnamed: 0,uid:token,user_emb:float_seq
0,13740,0.0 0.0 0.0 0.0437792077908793 0.0 0.0 0.0 0.0...
1,91836,0.0 0.0 0.0 0.0 0.0 0.0 0.0062834390437269 0.0...
2,73700,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
3,34670,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
4,8125,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
...,...,...
86331,73518,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
86332,16981,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
86333,17300,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
86334,57759,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
