### Collaborative filtering


In [93]:
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from datetime import timedelta

In [94]:
#path to the mindsmall training data
training_dataset = '../MINDsmall_train'

validation_dataset = '../MINDsmall_train'

entity_embedding = pd.read_csv("../MINDsmall_train/entity_embedding.vec", sep='\t')
relation_embedding = pd.read_csv("../MINDsmall_train/relation_embedding.vec", sep='\t')


news_data = pd.read_csv("../MINDsmall_train/news.tsv",
    sep='\t',
    names=["newsId", "category", "subcategory", "title","abstract", "url", "title_entities","abstract_entities"]
)

behaviors_data = pd.read_csv(
    "../MINDsmall_train/behaviors.tsv",
    sep='\t',
    names=["impressionId", "userId", "timestamp", "click_history", "impressions"],
    parse_dates=['timestamp'] 
)

### Preprocess data

In [95]:
behaviors_data['timestamp'] = pd.to_datetime(behaviors_data['timestamp'], format='%Y-%m-%d %H:%M:%S')
behaviors_data['clicks'] = behaviors_data['click_history'].str.split().str.len()
behaviors_data['impressions_list'] = behaviors_data['impressions'].str.split()
behaviors_data['impressions_count'] = behaviors_data['impressions_list'].str.len()

# remove null values
news_data = news_data.dropna().reset_index(drop=True)


### Remove the null values for the clicks & impressions

In [96]:
null_clicks_rows = behaviors_data[behaviors_data['clicks'].isnull() & behaviors_data['click_history'].isnull()]

# Remove the rows with null values
behaviors_data = behaviors_data.dropna().reset_index(drop=True)

In [97]:
# sort behaviors_data by timestamp
behaviors_data = behaviors_data.sort_values(by='timestamp')

In [98]:
news_data.head()

Unnamed: 0,newsId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [99]:
behaviors_data.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,clicks,impressions_list,impressions_count
19705,20112,U65916,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...,N54300-0 N46057-1 N57005-0 N52154-0 N57099-0 N...,15.0,"[N54300-0, N46057-1, N57005-0, N52154-0, N5709...",31
13531,13807,U49985,2019-11-09 00:01:13,N5056 N29975 N53234 N39603 N50032 N8422 N53580...,N20602-0 N50059-0 N57768-1 N50135-1 N15134-0 N...,93.0,"[N20602-0, N50059-0, N57768-1, N50135-1, N1513...",29
27115,27660,U25550,2019-11-09 00:02:44,N17260 N38298 N33976 N47719 N14888 N18870 N4607,N50135-0 N15134-0 N52433-1 N20602-0 N64536-0,7.0,"[N50135-0, N15134-0, N52433-1, N20602-0, N6453...",5
149080,152217,U19710,2019-11-09 00:02:50,N3530 N48284 N43019 N62546 N138 N13138 N10676 ...,N57099-0 N30295-0 N21086-0 N5379-0 N57005-0 N4...,8.0,"[N57099-0, N30295-0, N21086-0, N5379-0, N57005...",56
41348,42166,U38106,2019-11-09 00:03:09,N16874 N264 N48697 N51366,N3491-0 N20602-0 N25785-0 N23575-0 N38783-0 N1...,4.0,"[N3491-0, N20602-0, N25785-0, N23575-0, N38783...",48


### Sliding window

In [100]:
train_window_size = 6
test_window_size = 3

behaviors_data['timestamp'] = behaviors_data['timestamp'].dt.hour

start_time = 0
end_time = train_window_size
recommendations = []


def sliding_window(start_time, training_window_size, test_window_size=3):
    end_time = training_window_size
    training_data = behaviors_data[(behaviors_data['timestamp'] >= start_time) & (behaviors_data['timestamp'] < end_time)]
    test_data = behaviors_data[(behaviors_data['timestamp'] >= end_time) & (behaviors_data['timestamp'] < end_time + test_window_size)]
    
    # start_time += test_window_size
    # end_time = start_time + train_window_size
    return training_data, test_data


In [101]:
def user_clicks(training_data):
    click_history = training_data.dropna(subset=['click_history'])
    all_clicks = []
    for index, row in click_history.iterrows():
        user_id = row['userId']
        for news_id in row['click_history'].split():
            all_clicks.append([user_id, news_id, 1])
    return pd.DataFrame(all_clicks, columns=['userId', 'newsId', 'click'])

In [102]:
def create_matrix(clicks_df):
    interaction_matrix = clicks_df.pivot_table(index='userId', columns='newsId', values='click', fill_value=0)
    return interaction_matrix

def get_similarity_df(interaction_matrix):
    interaction_sparse = csr_matrix(interaction_matrix.values)
    item_similarity = cosine_similarity(interaction_sparse.T)
    item_similarity_df = pd.DataFrame(item_similarity, index=interaction_matrix.columns, columns=interaction_matrix.columns)
    return item_similarity_df

In [103]:
# Recommend articles with sliding window

def collaborative_filtering(news_id, top_n=5):
    start_time = 0
    training_window_size = 6
    test_window_size = 3

    training_data, test_data = sliding_window(start_time, training_window_size, test_window_size)
    clicks_df = user_clicks(training_data)
    interaction_matrix = create_matrix(clicks_df)
    item_similarity_df = get_similarity_df(interaction_matrix)

    while start_time + train_window_size < behaviors_data['timestamp'].max():
        training_data, test_data = sliding_window(start_time, training_window_size, test_window_size)
        clicks_df = user_clicks(training_data)
        interaction_matrix = create_matrix(clicks_df)
        item_similarity_df = get_similarity_df(interaction_matrix)
        start_time += test_window_size
        end_time = start_time + train_window_size


        similar_scores = item_similarity_df[news_id]
        top_articles = similar_scores.nlargest(top_n + 1).index.tolist()
        top_articles.remove(news_id) 
        print(f"Top {top_n} articles similar to {news_id} are: {top_articles}")
        # print("The time is now: ", start_time + "out of ", behaviors_data['timestamp'].max())

# Example: Get top 5 articles similar to 'N55528' (example newsId)
input_news_id = 'N55189'
result = collaborative_filtering('N55189', 5)

print(result)

Top 5 articles similar to N55189 are: ['N306', 'N29177', 'N871', 'N42620', 'N43142']


TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [None]:
# print title of article 'N55528'
print("Title of input article:\n", news_data[news_data['newsId'] == 'N55528']['title'].values[0])
print("\n")
print("Title of recommended articles:\n")
print(news_data[news_data['newsId'] == result[0]]['title'].values[0])

print(news_data[news_data['newsId'] == result[1]]['title'].values[0])

print(news_data[news_data['newsId'] == result[2]]['title'].values[0])

print(news_data[news_data['newsId'] == result[3]]['title'].values[0])

print(news_data[news_data['newsId'] == result[4]]['title'].values[0])

Title of input article:
 The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By


Title of recommended articles:

Kevin Spacey Won't Be Charged in Sexual Assault Case After Accuser Dies
Miguel Cervantes' Wife Reveals Daughter, 3, 'Died in My Arms' After Entering Hospice Care
Woman, suspect dead at 'Tarzan' actor Ron Ely's California residence
Heidi Klum's 2019 Halloween Costume Transformation Is Mind-Blowing   But, Like, What Is It?
Former NBA first-round pick Jim Farmer arrested in sex sting operation


### Collaborative filtering

In [None]:
        

def collaborative_filtering(training_data, test_data):

    # Function to create a user-item interaction matrix from behaviors data
    def create_interaction_matrix(data):
        all_clicks = []
        for index, row in data.dropna(subset=['click_history']).iterrows():
            user_id = row['userId']
            for news_id in row['click_history'].split():
                all_clicks.append([user_id, news_id, 1])
        clicks_df = pd.DataFrame(all_clicks, columns=['userId', 'newsId', 'interaction'])
        return clicks_df.pivot_table(index='userId', columns='newsId', values='interaction', fill_value=0)

    # Create interaction matrices for training and testing
    train_interaction_matrix = create_interaction_matrix(train_data)
    test_interaction_matrix = create_interaction_matrix(test_data)


