In [None]:
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from gensim.models import KeyedVectors

In [None]:
from google.colab import drive
# Mount Google Drive so that the '/content/drive/...' paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


# Item embedding: TF-IDF matrix

In [None]:
# Paths of the MIND-small news metadata files; each variable points at the
# split its name says (the two file names were previously crossed).
news_train = '/content/drive/MyDrive/tesi/mind_small/news_train.tsv'
news_test = '/content/drive/MyDrive/tesi/mind_small/news_test.tsv'


def load_df(path):
    """Read a tab-separated MIND file into a DataFrame with named columns.

    The column schema is picked from the *file name* only (not from the
    directories above it):

    * a name containing ``'news'``     -> news metadata columns
    * a name containing ``'behavior'`` -> impression-log columns
    * anything else                    -> read without column names

    Parameters
    ----------
    path : str or os.PathLike
        Location of the headerless ``.tsv`` file.

    Returns
    -------
    pandas.DataFrame
    """
    import os

    news_columns = ['News ID',
                    "Category",
                    "SubCategory",
                    "Title",
                    "Abstract",
                    "URL",
                    "Title Entities",
                    "Abstract Entities"]
    behavior_columns = ['Impression ID',
                        "User ID",
                        "Time",
                        "History",
                        "Impressions"]

    # Look at the base name only: matching on the full path would give
    # '.../news_data/behaviors.tsv' the news schema because of its folder.
    file_name = os.path.basename(os.fspath(path))
    if 'news' in file_name:
        columns = news_columns
    elif 'behavior' in file_name:
        columns = behavior_columns
    else:
        # names=None makes pandas fall back to integer column labels.
        columns = None

    return pd.read_csv(path, sep='\t', header=None, names=columns)
# Swap each path for the DataFrame loaded from it, then stack both splits.
news_train = load_df(news_train)
news_test = load_df(news_test)
data = pd.concat((news_train, news_test))
# Interaction file: tab-separated with a header row.
inter = pd.read_csv(
    '/content/drive/MyDrive/tesi/mind_small/mind_small15.inter',
    sep='\t',
    header=0,
)

In [None]:
# keep only clicked articles; .copy() so the column assignment below works on
# an independent frame instead of a slice
inter = inter[inter['label:float'] != 0].copy()
#drop everything but news ID & title (copied for the same reason, which
#removes the SettingWithCopyWarning this cell used to emit)
col = ['News ID', 'Title']
data = data[col].copy()
#make the news ID a string and remove its leading N; the pattern is anchored,
#so re-running the cell does not strip a digit from already-cleaned IDs
data['News ID'] = data['News ID'].astype(str).str.replace(r'^N', '', regex=True)
inter['item_id:token'] = inter['item_id:token'].astype(str)
unique_newsid = data['News ID'].unique()
unique_itemid = inter['item_id:token'].unique()
#check common news ID
common = np.intersect1d(unique_newsid, unique_itemid)
print(len(common), len(unique_newsid), len(unique_itemid))
#keep in data only the one in common
data = data[data['News ID'].isin(common)]
print(data.shape)
data = data.drop_duplicates(subset='News ID') #since it is made of both train and test
print(data.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['News ID'] = data['News ID'].astype(str)


25232 65238 25232
(45600, 2)
(25232, 2)


In [None]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#preprocessing
english_stopwords = set(stopwords.words('english'))  # NLTK English stop words, as a set
stemmer = PorterStemmer()

pattern_punctuation = re.compile(r'[^\w\s]')  # any character that is neither a word character nor whitespace
pattern_numbers = re.compile(r'\w*\d+\w*')  # any run of word characters containing at least one digit
pattern_short_words = re.compile(r'\b\w{1,3}\b')  # whole words of 1 to 3 characters

def preprocess_text(text):
    """Turn a title into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; delete punctuation; delete tokens that
    contain a digit; delete words of 1-3 characters; drop English stop
    words; Porter-stem what is left.
    """
    cleaned = text.lower()
    # Each pattern removes its matches outright (replaced by the empty string).
    for pattern in (pattern_punctuation, pattern_numbers, pattern_short_words):
        cleaned = pattern.sub('', cleaned)
    stems = [
        stemmer.stem(token)
        for token in cleaned.split()
        if token not in english_stopwords
    ]
    return ' '.join(stems)

# Store the cleaned, stemmed version of every title in a new column.
data['processed_title'] = data['Title'].apply(preprocess_text)

[TfidfVectorizer documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [None]:
#tfidf embedding
# min_df=1 keeps every term; norm='l2' scales each row to unit length.
# NOTE(review): the titles were already stop-word filtered and stemmed in
# preprocess_text, so stop_words='english' applies scikit-learn's own list on
# top of the stemmed tokens - confirm this second filter is wanted.
tfidf_vectorizer = TfidfVectorizer(min_df=1,  stop_words='english', norm='l2')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['processed_title'])
feature_names = tfidf_vectorizer.get_feature_names_out()
# Densify into one column per term (one row per title).
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
# .values assigns by position, so row i gets the ID of the i-th title regardless of data's index.
tfidf_df['News ID'] = data['News ID'].values

In [None]:
# Put the ID column first, followed by every term column in its current order.
term_columns = [name for name in tfidf_df.columns if name != 'News ID']
tfidf_df = tfidf_df.reset_index(drop=True)[['News ID'] + term_columns]
tfidf_df.to_csv('full_tf-idf.csv', index=False)

# User Embedding from Item Embedding

1. aggregate  news + average:   $$ u = \frac{1}{n} \sum_{i=1}^{n} v_i $$ <br>
where $n$ is the number of items the user interacted with and $v_i$ is the TF-IDF vector of the $i$-th item. <br><br><br>
2. aggregate  news + weighted average based on label: $$ u = \frac{\sum_{i=1}^{n} w_i v_i}{\sum_{i=1}^{n} w_i }  $$ <br>
   e.g. $w_i = 0.1$ for items that were shown but not clicked (i.e. label = 0) and $w_i = 1$ otherwise. <br>
   <br><br>
   A weight of 0.1 scales the contribution of a non-clicked item's vector to the overall profile down by 90%. <br><b>Q: Does this approach make sense with very sparse data? -> Maybe viable if the embedding were denser, e.g. word2vec or GloVe.</b>

In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

#tfidf = tfidf_df
tfidf = pd.read_csv('/content/drive/MyDrive/tesi/data/tfidf_emb.csv')
inter = pd.read_csv('/content/drive/MyDrive/tesi/data/mind_small15.inter', sep='\t', header=0)
inter = inter[inter['label:float'] != 0]  # keep only clicked articles
assert inter['item_id:token'].nunique() == tfidf.shape[0]
# Equal counts alone do not prove the IDs match; a clicked item without a
# TF-IDF row would be dropped silently by the inner merge further down.
missing_items = set(inter['item_id:token'].astype(str)) - set(tfidf['News ID'].astype(str))
assert not missing_items, f"{len(missing_items)} clicked items have no TF-IDF row"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Feature columns = every column except the item key. Selecting by name keeps
# this correct wherever 'News ID' sits in the file (first or last column),
# whereas tfidf.columns[:-1] is only right when it is the last one.
tfidf_columns = tfidf.columns.drop('News ID')
#initialize user vector & counter
user_ids = inter['user_id:token'].unique()
# Start the sums as floats so that adding TF-IDF values needs no dtype upcast.
user_tfidf_sum = pd.DataFrame(0.0, index=user_ids, columns=tfidf_columns)
user_interaction_counts = pd.Series(0, index=user_ids)

#the kernel kept dying on a single big merge, so accumulate batch by batch
batch_size = 10000

for start in range(0, inter.shape[0], batch_size):
    end = min(start + batch_size, inter.shape[0])
    chunk = inter.iloc[start:end]

    # Inner join: attach the TF-IDF row of each interacted item.
    user_news_chunk = pd.merge(chunk, tfidf, left_on='item_id:token', right_on='News ID')
    #sum of interacted item (emb)
    user_sum = user_news_chunk.groupby('user_id:token')[tfidf_columns].sum()
    user_tfidf_sum.loc[user_sum.index] += user_sum
    #count interactions in this chunk
    user_counts = user_news_chunk['user_id:token'].value_counts()
    user_interaction_counts.loc[user_counts.index] += user_counts

#average over seen items
user_embeddings = user_tfidf_sum.div(user_interaction_counts, axis=0)

In [None]:
# Make sure the user ID is available as a 'uid:token' column.
# - frame reloaded from CSV: the ID arrives as 'Unnamed: 0' and is renamed;
# - frame straight from the averaging step: the ID is the index, so it is
#   moved into a column (renaming 'Unnamed: 0' alone would do nothing there).
if 'Unnamed: 0' in user_embeddings.columns:
    user_embeddings = user_embeddings.rename(columns={'Unnamed: 0': 'uid:token'})
elif 'uid:token' not in user_embeddings.columns:
    user_embeddings = user_embeddings.rename_axis('uid:token').reset_index()
user_embeddings

Unnamed: 0,uid:token,aaron,abandon,abl,abort,absolut,absurd,abus,accept,access,...,youth,youtub,youv,yovanovitch,zealand,zion,zodiac,zone,zozo,zuckerberg
0,13740,0.0,0.0,0.0,0.043779,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,91836,0.0,0.0,0.0,0.000000,0.0,0.0,0.006283,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,73700,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,34670,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,8125,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86331,73518,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86332,16981,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86333,17300,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86334,57759,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#prepare and save as atomic file
# The user ID may be a 'uid:token' column or still be the index; either way
# it must be the index before the feature columns are joined into one string.
if 'uid:token' in user_embeddings.columns:
    user_embeddings = user_embeddings.set_index('uid:token')
else:
    user_embeddings = user_embeddings.rename_axis('uid:token')
# One space-separated string of floats per user.
user_embeddings['user_emb:float_seq'] = user_embeddings.apply(lambda row: ' '.join(row.astype(str)), axis=1)
user_embeddings = user_embeddings.reset_index()[['uid:token', 'user_emb:float_seq']]
# Tab-separated, like the item-embedding files written elsewhere in this notebook.
user_embeddings.to_csv('user_embeddings15.csv', sep='\t', index=False)


In [None]:
# Display the user embedding table.
user_embeddings

Unnamed: 0,uid:token,user_emb:float_seq
0,13740,0.0 0.0 0.0 0.0437792077908793 0.0 0.0 0.0 0.0...
1,91836,0.0 0.0 0.0 0.0 0.0 0.0 0.0062834390437269 0.0...
2,73700,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
3,34670,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
4,8125,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
...,...,...
86331,73518,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
86332,16981,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
86333,17300,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
86334,57759,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....


In [None]:
#same for item embedding
# NOTE(review): absolute local path, while the cells above read from the
# mounted Drive - confirm which environment this cell is meant to run in.
item_emb = pd.read_csv('/Users/giulia/Desktop/tesi/tfidf_emb.csv', sep=',')

In [None]:
# Index by news ID so that only the TF-IDF columns are joined into the string.
item_emb = item_emb.set_index('News ID')
# One space-separated string of floats per item.
item_emb['item_emb:float_seq'] = item_emb.apply(lambda row: ' '.join(row.astype(str)), axis=1)
item_emb = item_emb.reset_index()
#change name of the ID column; this has to happen after reset_index, because
#rename(columns=...) cannot see 'News ID' while it is the index
item_emb = item_emb.rename(columns={'News ID': 'iid:token'})
item_emb = item_emb[['iid:token', 'item_emb:float_seq']]


In [None]:
#item_emb.rename(columns={'News ID': 'iid:token', 'item_emb:float_seq': 'item_emb:float_seq'}, inplace=True)
# Display the item embedding table.
item_emb

Unnamed: 0,iid:token,item_emb:float_seq
0,61837,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
1,53526,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
2,38324,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
3,2073,0.0 0.0 0.5667783692028184 0.0 0.0 0.0 0.0 0.0...
4,49186,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
...,...,...
25227,5072,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
25228,31080,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
25229,62355,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....
25230,63860,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....


In [None]:
# Write the item table as a tab-separated file, without the row index.
item_emb.to_csv('item_embeddings15.csv', sep='\t', index=False)

# Pre-trained embeddings: <br>
average of [GloVe](https://nlp.stanford.edu/projects/glove/) embedding weighted by tf-idf of words in the article

In [None]:
#why they don't already include this header will forever be a mystery to me
#this needs to be run only once
def add_header_to_glove_file(input_file, output_file, dimensions=None):
    """Write ``input_file`` to ``output_file`` with a ``"<count> <dims>"`` first line.

    Parameters
    ----------
    input_file : str or os.PathLike
        UTF-8 text file with one ``token v1 v2 ...`` entry per line.
    output_file : str or os.PathLike
        Destination; may be the same path as ``input_file``.
    dimensions : int, optional
        Vector size to write in the header. When omitted it is inferred from
        the first line (number of space-separated fields minus the token).

    Raises
    ------
    ValueError
        If ``input_file`` is empty, since there is nothing to describe.

    Notes
    -----
    The file is streamed rather than loaded into memory. If the first line
    already consists of exactly two integers it is treated as an existing
    header and no second one is added, so running this twice is harmless.
    """
    import os
    import shutil

    # Pass 1: count lines and remember the first one.
    num_lines = 0
    first_line = None
    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            if first_line is None:
                first_line = line
            num_lines += 1
    if first_line is None:
        raise ValueError(f"{input_file} is empty: no vectors to describe")

    # Split on single spaces only, so an unusual whitespace-like token is not lost.
    fields = first_line.rstrip('\n').split(' ')
    has_header = len(fields) == 2 and all(field.isdigit() for field in fields)
    same_file = os.path.abspath(input_file) == os.path.abspath(output_file)
    if has_header and same_file:
        return  # already converted in place; nothing to do
    if dimensions is None:
        dimensions = len(fields) - 1

    # Pass 2: write to a sibling temp file and swap it in, so that
    # input_file == output_file never reads from a half-written file.
    tmp_path = os.fspath(output_file) + '.tmp'
    try:
        with open(input_file, 'r', encoding='utf-8') as source, \
                open(tmp_path, 'w', encoding='utf-8') as target:
            if not has_header:
                target.write(f"{num_lines} {dimensions}\n")
            shutil.copyfileobj(source, target)
        os.replace(tmp_path, output_file)
    finally:
        # Only present here if something failed before the replace.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
#input_file = 'glove.twitter.27B/glove.twitter.27B.200d.txt'
#output_file = 'glove.twitter.27B/glove.twitter.27B.200d.txt'
#add_header_to_glove_file(input_file, output_file)

In [None]:
# Load the GloVe vectors from the text (non-binary) word2vec-format file; takes about a minute.
glove_model = KeyedVectors.load_word2vec_format('glove.twitter.27B/glove.twitter.27B.200d.txt', binary=False) #1min

In [None]:
#preprocessing
# Same definitions as in the TF-IDF preprocessing cell, repeated so this section can run on its own.
english_stopwords = set(stopwords.words('english'))  # NLTK English stop words, as a set
stemmer = PorterStemmer()  # not used by glove_preprocess_text, where stemming is commented out

pattern_punctuation = re.compile(r'[^\w\s]')  # any character that is neither a word character nor whitespace
pattern_numbers = re.compile(r'\w*\d+\w*')  # any run of word characters containing at least one digit
pattern_short_words = re.compile(r'\b\w{1,3}\b')  # whole words of 1 to 3 characters

def glove_preprocess_text(text):
    """Turn a title into a list of cleaned, unstemmed tokens.

    Steps, in order: lowercase; delete punctuation; delete tokens that
    contain a digit; delete words of 1-3 characters; drop English stop
    words. Tokens are left unstemmed because GloVe uses the exact form
    of words.
    """
    cleaned = text.lower()
    # Each pattern removes its matches outright (replaced by the empty string).
    for pattern in (pattern_punctuation, pattern_numbers, pattern_short_words):
        cleaned = pattern.sub('', cleaned)
    return [token for token in cleaned.split() if token not in english_stopwords]

# NOTE(review): filtered_data is not defined in any cell shown above - confirm
# where it comes from before running this on a fresh kernel.
# Each title becomes a list of tokens (not a string) in 'processed_title'.
filtered_data['processed_title'] = filtered_data['Title'].apply(glove_preprocess_text)

In [None]:
def noop(doc):
    """Return ``doc`` unchanged; used as a pass-through tokenizer/preprocessor."""
    return doc

# The documents are already token lists, so tokenizer and preprocessor are
# pass-throughs. token_pattern=None states explicitly that no regex
# tokenisation happens and avoids scikit-learn's "token_pattern will not be
# used" warning when a custom tokenizer is supplied.
tfidf_vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.95,
    stop_words=None,
    norm='l2',
    tokenizer=noop,
    preprocessor=noop,
    token_pattern=None,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_data['processed_title'])
feature_names = tfidf_vectorizer.get_feature_names_out()



In [None]:
#GloVe + TF-IDF Embeddings
# Single-slot cache: the names object last seen and its {term: column} lookup.
_FEATURE_INDEX_CACHE = {'names': None, 'index': None}


def _feature_index(tfidf_feature_names):
    """Return a {term: column} lookup, rebuilt only when a different names object is passed."""
    if _FEATURE_INDEX_CACHE['names'] is not tfidf_feature_names:
        index = {}
        for column, term in enumerate(tfidf_feature_names):
            index.setdefault(term, column)  # first occurrence wins, like list.index
        _FEATURE_INDEX_CACHE['names'] = tfidf_feature_names
        _FEATURE_INDEX_CACHE['index'] = index
    return _FEATURE_INDEX_CACHE['index']


def get_weighted_average_glove(words, tfidf_vector, tfidf_feature_names):
    """Average the GloVe vectors of ``words``, weighted by their TF-IDF values.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document; a repeated token contributes once per occurrence.
    tfidf_vector : 2-D indexable with a single row
        TF-IDF row of the same document, read as ``tfidf_vector[0, column]``.
    tfidf_feature_names : sequence of str
        Term of each TF-IDF column. The lookup built from it is cached by
        object identity, so do not modify the sequence in place between calls.

    Returns
    -------
    numpy.ndarray
        Vector of length ``glove_model.vector_size``; all zeros when no token
        is present in both GloVe and the TF-IDF vocabulary.
    """
    # A dict lookup replaces the per-word list conversion and linear search.
    feature_index = _feature_index(tfidf_feature_names)
    word_vectors = np.zeros((glove_model.vector_size,))
    total_weight = 0
    for word in words:
        column = feature_index.get(word)
        if column is not None and word in glove_model:
            tfidf_weight = tfidf_vector[0, column]
            word_vectors += glove_model[word] * tfidf_weight #weighted sum
            total_weight += tfidf_weight
    if total_weight > 0:
        word_vectors /= total_weight #averaged
    return word_vectors

In [None]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# One weighted-average vector per title; row k of tfidf_matrix belongs to the k-th title.
title_tokens = filtered_data['processed_title']
doc_vectors = [
    get_weighted_average_glove(tokens, tfidf_matrix[row], tfidf_feature_names)
    for row, tokens in enumerate(title_tokens)
]
glove_embeddings = np.vstack(doc_vectors)
# Store each row of the stacked matrix as that title's embedding.
filtered_data['glove_embeddings'] = list(glove_embeddings)

In [None]:
#ITEM EMBEDDING as atomic file
# Keep only ID + vector and give both columns their atomic-file names.
item_emb = (
    filtered_data[['News ID', 'glove_embeddings']]
    .rename(columns={'News ID': 'iid:token', 'glove_embeddings': 'item_emb:float_seq'})
)
# Serialise every vector as a JSON list string.
item_emb['item_emb:float_seq'] = item_emb['item_emb:float_seq'].map(lambda vec: json.dumps(vec.tolist()))
item_emb.to_csv('item_embeddings15_glove.csv', sep='\t', index=False)
item_emb

Unnamed: 0,iid:token,item_emb:float_seq
2,61837,"[-0.3018888610625763, -0.01441863232279384, -0..."
3,53526,"[0.030755279004147814, -0.15293832795599122, -..."
4,38324,"[-0.14559967716686759, -0.5394764966235572, -0..."
5,2073,"[0.019193092035950033, 0.2334384066367838, -0...."
7,49186,"[-0.21076213154994625, 0.2232201614312713, 0.0..."
...,...,...
42409,42491,"[-0.36951603754792617, -0.043448884240896844, ..."
42410,13097,"[0.17308648803038967, 0.22572657748324515, -0...."
42411,63550,"[-0.08583496865807796, 0.009274802871813675, 0..."
42412,30345,"[0.24341000616550446, 0.2530199885368347, -0.5..."


In [None]:
#USER EMBEDDING
# NOTE(review): absolute local paths, while earlier sections read from the
# mounted Drive - confirm which environment this cell is meant to run in.
item_emb = pd.read_csv('/Users/giulia/Desktop/tesi/item_embeddings15_glove.csv', sep='\t')
# Each cell holds a JSON list string; decode it back into a numpy array.
item_emb['item_emb:float_seq'] = item_emb['item_emb:float_seq'].apply(lambda x: np.array(json.loads(x)))
inter = pd.read_csv('/Users/giulia/Desktop/tesi/mind_small15/mind_small15.inter', sep='\t', header=0)
# Keep only interactions whose item has an embedding.
# NOTE(review): unlike the TF-IDF section, rows with label 0 are not removed
# here - confirm that averaging over non-clicked items is intended.
inter = inter[inter['item_id:token'].isin(item_emb['iid:token'])]
# The number of distinct items kept must equal the number of embedding rows.
assert inter['item_id:token'].nunique() == item_emb.shape[0]

In [None]:
# Both join keys are cast to str so that they compare equal in the merge.
item_emb['iid:token'] = item_emb['iid:token'].astype(str)
inter['item_id:token'] = inter['item_id:token'].astype(str)
# Left join: every interaction row is kept and gains its item's embedding.
user_item_emb = inter.merge(
    item_emb,
    how='left',
    left_on='item_id:token',
    right_on='iid:token',
)


In [None]:
batch_size = 10000
# Collect the per-batch frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything accumulated so far on every
# iteration, which is quadratic in the number of batches.
batch_frames = []

for start in range(0, user_item_emb.shape[0], batch_size):
    end = min(start + batch_size, user_item_emb.shape[0])
    batch = user_item_emb.iloc[start:end]

    # Expand each embedding array into one numeric column per dimension.
    temp_df = pd.DataFrame(batch['item_emb:float_seq'].tolist(), index=batch.index)
    temp_df['user_id:token'] = batch['user_id:token']

    batch_frames.append(temp_df)

batched_embeddings = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

#average over seen items
user_embeddings = batched_embeddings.groupby('user_id:token').mean()

In [None]:
# Keep the index: after groupby('user_id:token').mean() it holds the user IDs,
# and index=False would write the vectors without saying whose they are.
user_embeddings.to_csv('user_embeddings15_glove.csv', sep='\t', index=True)