### Collaborative filtering


In [173]:
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from datetime import timedelta

In [174]:
#path to the mindsmall training data
training_dataset = '../MINDsmall_train'

validation_dataset = '../MINDsmall_train'

entity_embedding = pd.read_csv("../MINDsmall_train/entity_embedding.vec", sep='\t')
relation_embedding = pd.read_csv("../MINDsmall_train/relation_embedding.vec", sep='\t')


news_data = pd.read_csv("../MINDsmall_train/news.tsv",
    sep='\t',
    names=["newsId", "category", "subcategory", "title","abstract", "url", "title_entities","abstract_entities"]
)

behaviors_data = pd.read_csv(
    "../MINDsmall_train/behaviors.tsv",
    sep='\t',
    names=["impressionId", "userId", "timestamp", "click_history", "impressions"],
    parse_dates=['timestamp'] 
)

### Preprocess data

In [175]:
behaviors_data['timestamp'] = pd.to_datetime(behaviors_data['timestamp'], format='%Y-%m-%d %H:%M:%S')
behaviors_data['hour_of_day'] = behaviors_data['timestamp'].dt.hour
behaviors_data['clicks'] = behaviors_data['click_history'].str.split().str.len()
behaviors_data['impressions_list'] = behaviors_data['impressions'].str.split()
behaviors_data['impressions_count'] = behaviors_data['impressions_list'].str.len()

# remove null values
news_data = news_data.dropna().reset_index(drop=True)


### Remove the null values for the clicks & impressions

In [176]:
null_clicks_rows = behaviors_data[behaviors_data['clicks'].isnull() & behaviors_data['click_history'].isnull()]

# Remove the rows with null values
behaviors_data = behaviors_data.dropna().reset_index(drop=True)

In [177]:
# sort behaviors_data by timestamp
behaviors_data = behaviors_data.sort_values(by='timestamp')

In [178]:
news_data.head()

Unnamed: 0,newsId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [179]:
behaviors_data.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,hour_of_day,clicks,impressions_list,impressions_count
19705,20112,U65916,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...,N54300-0 N46057-1 N57005-0 N52154-0 N57099-0 N...,0,15.0,"[N54300-0, N46057-1, N57005-0, N52154-0, N5709...",31
13531,13807,U49985,2019-11-09 00:01:13,N5056 N29975 N53234 N39603 N50032 N8422 N53580...,N20602-0 N50059-0 N57768-1 N50135-1 N15134-0 N...,0,93.0,"[N20602-0, N50059-0, N57768-1, N50135-1, N1513...",29
27115,27660,U25550,2019-11-09 00:02:44,N17260 N38298 N33976 N47719 N14888 N18870 N4607,N50135-0 N15134-0 N52433-1 N20602-0 N64536-0,0,7.0,"[N50135-0, N15134-0, N52433-1, N20602-0, N6453...",5
149080,152217,U19710,2019-11-09 00:02:50,N3530 N48284 N43019 N62546 N138 N13138 N10676 ...,N57099-0 N30295-0 N21086-0 N5379-0 N57005-0 N4...,0,8.0,"[N57099-0, N30295-0, N21086-0, N5379-0, N57005...",56
41348,42166,U38106,2019-11-09 00:03:09,N16874 N264 N48697 N51366,N3491-0 N20602-0 N25785-0 N23575-0 N38783-0 N1...,0,4.0,"[N3491-0, N20602-0, N25785-0, N23575-0, N38783...",48


### Sliding window
Our sliding window is split into 6 hours for training and 3 hours for validation.

In [180]:
train_window_size = 6
test_window_size = 3

start_time = 0
end_time = train_window_size
recommendations = []


def sliding_window(start_time, training_window_size, test_window_size=3):
    end_time = training_window_size
    training_data = behaviors_data[(behaviors_data['timestamp'] >= start_time) & (behaviors_data['timestamp'] < end_time)]
    test_data = behaviors_data[(behaviors_data['timestamp'] >= end_time) & (behaviors_data['timestamp'] < end_time + test_window_size)]

    return training_data, test_data


### Interaction matrix
For each article in the click history of each user interaction in the behaviors data, a 1 is added to the matrix.

In [181]:
def user_clicks(training_data):
    click_history = training_data.dropna(subset=['click_history'])
    all_clicks = []
    for index, row in click_history.iterrows():
        user_id = row['userId']
        for news_id in row['click_history'].split():
            all_clicks.append([user_id, news_id, 1])
    clicks_df = pd.DataFrame(all_clicks, columns=['userId', 'newsId', 'click'])
    if clicks_df.empty:
        print("Not enough clicks yet")
    return clicks_df

### Cosine similarity

In [182]:
def create_matrix(clicks_df):
    interaction_matrix = clicks_df.pivot_table(index='userId', columns='newsId', values='click', fill_value=0)
    return interaction_matrix

def get_similarity_df(interaction_matrix):
    interaction_sparse = csr_matrix(interaction_matrix.values)
    item_similarity = cosine_similarity(interaction_sparse.T)
    item_similarity_df = pd.DataFrame(item_similarity, index=interaction_matrix.columns, columns=interaction_matrix.columns)
    return item_similarity_df

### Collaborative filtering with sliding window
We allowed the sliding window to grow up to 72 hours as it allows for detailed data, and can give us more insight into evaluation metrics such as the coverage.

In [183]:
def collaborative_filtering(news_id, top_n=5):
    INITIAL_WINDOW_SIZE = 3  
    MAX_WINDOW_SIZE = 72     
    SLIDE_SIZE = 3           

    current_time = behaviors_data['timestamp'].min()
    end_time = current_time + pd.Timedelta(hours=INITIAL_WINDOW_SIZE)
    final_time = behaviors_data['timestamp'].max()
    
    all_clicks_df = pd.DataFrame()
    recommendations = []

    while current_time + pd.Timedelta(hours=SLIDE_SIZE) <= final_time:
        new_data = behaviors_data[(behaviors_data['timestamp'] >= current_time) & (behaviors_data['timestamp'] < end_time)]
        new_clicks_df = user_clicks(new_data)

        # The new data is appended to the existing data up to the maximum window size
        all_clicks_df = pd.concat([all_clicks_df, new_clicks_df]).drop_duplicates()

        # If the window is at full capacity, drop the oldest data
        if (end_time - current_time) >= pd.Timedelta(hours=MAX_WINDOW_SIZE):
            all_clicks_df = all_clicks_df[all_clicks_df['timestamp'] >= current_time + pd.Timedelta(hours=SLIDE_SIZE)]
            current_time += pd.Timedelta(hours=SLIDE_SIZE)

        end_time += pd.Timedelta(hours=SLIDE_SIZE)

        # Update the interaction matrix and get the similarity scores
        if not all_clicks_df.empty:
            interaction_matrix = create_matrix(all_clicks_df)
            if not interaction_matrix.empty:
                item_similarity_df = get_similarity_df(interaction_matrix)
                similar_scores = item_similarity_df.get(news_id)
                if similar_scores is not None and not similar_scores.empty:
                    top_articles = similar_scores.nlargest(top_n + 1).index.tolist()
                    if news_id in top_articles:
                        top_articles.remove(news_id)
                    recommendations.append((current_time, top_articles))
                    print(f"Top {top_n} articles similar to {news_id} from {current_time} to {end_time} are: {top_articles}")
                else:
                    print(f"No similarity data available for {news_id} in window: [{current_time}, {end_time})")
            else:
                print("The interaction matrix is empty.")
        else:
            print("No clicks data available.")

    return recommendations

result = collaborative_filtering('N55189', 5)
print(result)

[(0, ['N306', 'N29177', 'N871', 'N42620', 'N43142']), (3, ['N306', 'N29177', 'N871', 'N42620', 'N43142'])]
Top 5 articles similar to N55189 from 2019-11-09 00:00:19 to 2019-11-09 06:00:19 are: ['N21746', 'N44110', 'N63248', 'N21623', 'N10343']
Top 5 articles similar to N55189 from 2019-11-09 00:00:19 to 2019-11-09 09:00:19 are: ['N27676', 'N42620', 'N51112', 'N21623', 'N54225']
Top 5 articles similar to N55189 from 2019-11-09 00:00:19 to 2019-11-09 12:00:19 are: ['N42620', 'N27676', 'N29177', 'N306', 'N10059']
Top 5 articles similar to N55189 from 2019-11-09 00:00:19 to 2019-11-09 15:00:19 are: ['N42620', 'N27676', 'N871', 'N306', 'N10059']
Top 5 articles similar to N55189 from 2019-11-09 00:00:19 to 2019-11-09 18:00:19 are: ['N42620', 'N10059', 'N306', 'N871', 'N45729']
Top 5 articles similar to N55189 from 2019-11-09 00:00:19 to 2019-11-09 21:00:19 are: ['N10059', 'N306', 'N42620', 'N29177', 'N871']
Top 5 articles similar to N55189 from 2019-11-09 00:00:19 to 2019-11-10 00:00:19 are:

KeyboardInterrupt: 