### Collaborative filtering


In [None]:
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from datetime import timedelta

In [None]:
#path to the mindsmall training data
training_dataset = '../MINDsmall_train'

validation_dataset = '../MINDsmall_train'

entity_embedding = pd.read_csv("../MINDsmall_train/entity_embedding.vec", sep='\t')
relation_embedding = pd.read_csv("../MINDsmall_train/relation_embedding.vec", sep='\t')


news_data = pd.read_csv("../MINDsmall_train/news.tsv",
    sep='\t',
    names=["newsId", "category", "subcategory", "title","abstract", "url", "title_entities","abstract_entities"]
)

behaviors_data = pd.read_csv(
    "../MINDsmall_train/behaviors.tsv",
    sep='\t',
    names=["impressionId", "userId", "timestamp", "click_history", "impressions"],
    parse_dates=['timestamp'] 
)

### Preprocess data

In [None]:
behaviors_data['timestamp'] = pd.to_datetime(behaviors_data['timestamp'], format='%Y-%m-%d %H:%M:%S')
behaviors_data['hour_of_day'] = behaviors_data['timestamp'].dt.hour
behaviors_data['clicks'] = behaviors_data['click_history'].str.split().str.len()
behaviors_data['impressions_list'] = behaviors_data['impressions'].str.split()
behaviors_data['impressions_count'] = behaviors_data['impressions_list'].str.len()

# remove null values
news_data = news_data.dropna().reset_index(drop=True)


### Remove the null values for the clicks & impressions

In [None]:
null_clicks_rows = behaviors_data[behaviors_data['clicks'].isnull() & behaviors_data['click_history'].isnull()]

# Remove the rows with null values
behaviors_data = behaviors_data.dropna().reset_index(drop=True)

In [None]:
# sort behaviors_data by timestamp
behaviors_data = behaviors_data.sort_values(by='timestamp')

In [None]:
news_data.head()

In [None]:
behaviors_data.head()

### Sliding window
Our sliding window is split into 6 hours for training and 3 hours for validation.

In [None]:
train_window_size = 6
test_window_size = 3

start_time = 0
end_time = train_window_size
recommendations = []


def sliding_window(start_time, training_window_size, test_window_size=3):
    end_time = training_window_size
    training_data = behaviors_data[(behaviors_data['timestamp'] >= start_time) & (behaviors_data['timestamp'] < end_time)]
    test_data = behaviors_data[(behaviors_data['timestamp'] >= end_time) & (behaviors_data['timestamp'] < end_time + test_window_size)]

    return training_data, test_data


### Interaction matrix
For each article in the click history of each user interaction in the behaviors data, a 1 is added to the matrix.

In [None]:
def user_clicks(training_data):
    all_clicks = []
    for index, row in training_data.iterrows():
        user_id = row['userId']
        timestamp = row['timestamp']
        for news_id in row['click_history'].split():
            all_clicks.append([user_id, news_id, 1, timestamp])
    clicks_df = pd.DataFrame(all_clicks, columns=['userId', 'newsId', 'click', 'timestamp'])
    return clicks_df

### Cosine similarity

In [None]:
def create_matrix(clicks_df):
    interaction_matrix = clicks_df.pivot_table(index='userId', columns='newsId', values='click', fill_value=0)
    return interaction_matrix

def get_similarity_df(interaction_matrix):
    interaction_sparse = csr_matrix(interaction_matrix.values)
    item_similarity = cosine_similarity(interaction_sparse.T)
    item_similarity_df = pd.DataFrame(item_similarity, index=interaction_matrix.columns, columns=interaction_matrix.columns)
    # use SVD to reduce dimensionality
    return item_similarity_df

### Collaborative filtering with sliding window
We allowed the sliding window to grow up to 72 hours as it allows for detailed data, and can give us more insight into evaluation metrics such as the coverage.

In [11]:
INITIAL_WINDOW_SIZE = 6
MAX_WINDOW_SIZE = 72
SLIDE_SIZE = 3
# behaviors_data = behaviors_data.head(10000)
hits_per_window = {}
#key = (start_time, end_time), value = (hits, total_rows)
#smaller behaviors_data 

start_time = behaviors_data['timestamp'].min()
end_time = start_time + pd.Timedelta(hours=INITIAL_WINDOW_SIZE)
print("~~~~")
print(start_time)
print(end_time)
print(end_time + pd.Timedelta(test_window_size))
print("~~~~")

training_data = behaviors_data[(behaviors_data['timestamp'] >= start_time) & (behaviors_data['timestamp'] < end_time)]
evaluation_data = behaviors_data[(behaviors_data['timestamp'] >= end_time) & (behaviors_data['timestamp'] < end_time + pd.Timedelta(hours=test_window_size))]
print(training_data.shape)
print(evaluation_data.shape)

def get_data_split(start_time, end_time, test_window_size=SLIDE_SIZE):
    # returns a training and evaluation split
    training_data = behaviors_data[(behaviors_data['timestamp'] >= start_time) & (behaviors_data['timestamp'] < end_time)]
    evaluation_data = behaviors_data[(behaviors_data['timestamp'] >= end_time) & (behaviors_data['timestamp'] < end_time + pd.Timedelta(hours=test_window_size))]
    return training_data, evaluation_data

def collaborative_filtering():
    #create matrix based on training split created by sliding window
    similarity_matrix = []
    return similarity_matrix

def recommend(similarity_matrix, evaluation_row):
    #look at the news id in the row. Recommend the top 5 similar news articles
    #last news id in the 'clicks' column is the one that was last clicked
    recommendations = []
    article_id = evaluation_row['click_history'].split()[-1]

    similar_scores = similarity_matrix.get(article_id)
    print(f"Similar scores for article {article_id}: {similar_scores}")

    if similar_scores is not None and not similar_scores.empty:
        recommendations = similar_scores.nlargest(5).index.tolist()
        if article_id in recommendations:
            recommendations.remove(article_id)
    return recommendations

print("TEST")
print(end_time)
print(pd.Timedelta(hours=SLIDE_SIZE))
print(behaviors_data['timestamp'].max())
      

#The sliding window
while end_time + pd.Timedelta(hours=SLIDE_SIZE) <= behaviors_data['timestamp'].max():
    print(f"Start time: {start_time}, End time: {end_time}")
    training_data, evaluation_data = get_data_split(start_time, end_time)

    interaction_matrix = create_matrix(user_clicks(training_data))
    similarity_matrix = get_similarity_df(interaction_matrix)
    
    #recommend for the rows in the evaluation data based on the similiarity matrix we created with the training data
    #evaluate the hit rate based on the recommendations and the impressions in the evaluation row
    hits = 0
    total = 0
    print(f'Shape of evaluation data {evaluation_data.shape}')
    for index, row in evaluation_data.iterrows():
        total += 1
        found_article = False

        clicked_article = []
        recommendations = recommend(similarity_matrix, row)

        [clicked_article.append(n.split("-")[0]) for n in row['impressions_list'] if n.split("-")[1] == "1"]
        # for clicked articles check if any of them are in the recommendations
        for article in clicked_article:
            if article in recommendations:
                found_article = True
        if found_article:
            hits += 1
            print(f'Hit for user {row['userId']} with articles {clicked_article} and recommendations {recommendations}')



        # if clicked_article in recommendations:
        #     hits += 1
        #     print(f'Hit for user {index} with article {clicked_article} and recommendations {recommendations}')

        print(f'Recommendations for user {row['userId']}: {recommendations}')
        print(f'Actual click for user {row['userId']}: {clicked_article}')
    hits_per_window[(start_time, end_time)] = (hits, total)

    print(f"Start time: {start_time}, End time: {end_time}")
    end_time += pd.Timedelta(hours=SLIDE_SIZE)
    if (end_time - start_time) > pd.Timedelta(hours = MAX_WINDOW_SIZE):
        print("\n")
        print("Increasing start time")
        print("\n")
        #remove data from the beginning of the training data
        start_time += pd.Timedelta(hours=SLIDE_SIZE)



Recommendations for user U339: ['N11842', 'N28821', 'N39177', 'N55656']
Actual click for user U339: ['N35729']
Similar scores for article N41881: newsId
N100      0.0
N1000     0.0
N10000    0.0
N10001    0.0
N10002    0.0
         ... 
N9984     0.0
N9987     0.0
N9990     0.0
N9992     0.0
N9993     0.0
Name: N41881, Length: 27960, dtype: float64
Recommendations for user U91517: ['N53198', 'N11279', 'N1129', 'N11939']
Actual click for user U91517: ['N15194']
Similar scores for article N37262: newsId
N100      0.000000
N1000     0.000000
N10000    0.000000
N10001    0.000000
N10002    0.000000
            ...   
N9984     0.229416
N9987     0.000000
N9990     0.000000
N9992     0.000000
N9993     0.000000
Name: N37262, Length: 27960, dtype: float64
Recommendations for user U72521: ['N48286', 'N2621', 'N3702', 'N62706']
Actual click for user U72521: ['N29001']
Similar scores for article N15676: newsId
N100      0.018019
N1000     0.000000
N10000    0.023262
N10001    0.000000
N10002   

In [None]:
print(hits_per_window)

In [None]:
# Visualize the hits per window
import matplotlib.pyplot as plt


x = [str(k) for k in hits_per_window.keys()]
y = [v[0] for v in hits_per_window.values()]

plt.figure(figsize=(10, 5))
plt.plot(x, y, marker='o')  


plt.xticks([]) 
plt.xlabel("Intervals")  
plt.ylabel("Hits")  

plt.tight_layout()  
plt.show()

In [None]:
for k in hits_per_window.keys():
    print(f"Window: {k}, Hits: {hits_per_window[k]}")

#key = (start_time, end_time), value = (hits, total_rows)
