### Collaborative Filtering

In [2]:
import pandas as pd
import numpy as np 
import random
import matplotlib.pyplot as plt
from scipy.sparse import dok_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
news = pd.read_csv("../MINDsmall_train/news.tsv", sep='\t', names=["newsId", "category", "subcategory", "title","abstract", "url", "title_entities","abstract_entities"])
news.head()

Unnamed: 0,newsId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [4]:
behaviors = pd.read_csv("../MINDsmall_train/behaviors.tsv", sep='\t', names=["ImpressionID", "Userid", "Time","History", "Impressions", "Title_entities"])
behaviors.head()

Unnamed: 0,ImpressionID,Userid,Time,History,Impressions,Title_entities
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,


### Preprocess data

In [5]:
behaviors['Time'] = pd.to_datetime(behaviors['Time'], format='%m/%d/%Y %I:%M:%S %p')
behaviors['Clicks'] = behaviors['History'].str.split().str.len()
behaviors['Impressions_list'] = behaviors['Impressions'].str.split()
print(behaviors['Impressions_list'])
behaviors['Impressions_count'] = behaviors['Impressions_list'].str.len()

# Sort behaviors_data by timestamp
behaviors = behaviors.sort_values(by='Time')

# Remove null values
news = news.dropna().reset_index(drop=True)

0                                      [N55689-1, N35729-0]
1         [N20678-0, N39317-0, N58114-0, N20495-0, N4297...
2         [N50014-0, N23877-0, N35389-0, N49712-0, N1684...
3                  [N35729-0, N33632-0, N49685-1, N27581-0]
4         [N39985-0, N36050-0, N16096-0, N8400-1, N22407...
                                ...                        
156960    [N2235-0, N22975-0, N64037-0, N47652-0, N11378...
156961    [N3841-0, N61571-0, N58813-0, N28213-0, N4428-...
156962    [N55913-0, N62318-0, N53515-0, N10960-0, N9135...
156963    [N6219-0, N3663-0, N31147-0, N58363-0, N4107-0...
156964             [N61233-0, N33828-1, N19661-0, N41934-0]
Name: Impressions_list, Length: 156965, dtype: object


### Collaborative filtering 2

In [None]:
behaviors_subset = behaviors.head(1000)

users = behaviors_subset['Userid'].unique()
items = news['url'].unique()

# Initiate ratings matrix
n_users = len(users)
n_items = len(items)

ratings = np.zeros((n_users, n_items))
print(f'Rating matrix shape: {n_users} x {n_items}')

Rating matrix shape: 902 x 48612


In [None]:
# Fill the ratings matrix with interaction data
for index, row in behaviors_subset.iterrows():
    user_id = np.where(users == row['Userid'])[0][0] # Use np.where to find the index
    for impression in row['Impressions_list']:
        news_id, click = impression.split('-')
        if click == '1':
            # Check if the news_id exists in the items array
            item_indices = np.where(items == news_id)[0]
            if item_indices.size > 0: # Check if the array is not empty
                item_id = item_indices[0] # Use the first index if found
                ratings[user_id, item_id] = 1
                print(f"{user_id} and {item_id} is set to 1")
            else:
                print(f"News ID {news_id} not found in items array.")

#- Focus on the results, what would you do next 
#- Methods - decisions on ring buffer, collaborative filtering, bag of words, 
#    - collaborative have many paramters - item neigborbased
#- 10-15 pages
#-Figure: hit rate, timeframe in certain intervals. Does the 
#- Hit: if the user clicks on what we predict, this is a hit. 
#- half a day intervals


News ID N46057 not found in items array.
News ID N55582 not found in items array.
News ID N41858 not found in items array.
News ID N57768 not found in items array.
News ID N50135 not found in items array.
News ID N16560 not found in items array.
News ID N25785 not found in items array.
News ID N3491 not found in items array.
News ID N52433 not found in items array.
News ID N43083 not found in items array.
News ID N48925 not found in items array.
News ID N3652 not found in items array.
News ID N3128 not found in items array.
News ID N3491 not found in items array.
News ID N47020 not found in items array.
News ID N59030 not found in items array.
News ID N33981 not found in items array.
News ID N64536 not found in items array.
News ID N57768 not found in items array.
News ID N34520 not found in items array.
News ID N48925 not found in items array.
News ID N41881 not found in items array.
News ID N42488 not found in items array.
News ID N4247 not found in items array.
News ID N26066 not fo

In [None]:
# Assuming behaviors is your DataFrame containing the behaviors data
# and it's already sorted by time

# Define window sizes
train_window_size = 6 # hours
predict_window_size = 3 # hours

# Convert time to hours for easier windowing
behaviors['Time'] = behaviors['Time'].dt.hour

# Initialize variables
start_time = 0
end_time = train_window_size
predictions = []

while end_time <= behaviors['Time'].max():
    # Slice the data for the current window
    train_data = behaviors[(behaviors['Time'] >= start_time) & (behaviors['Time'] < end_time)]
    predict_data = behaviors[(behaviors['Time'] >= end_time) & (behaviors['Time'] < end_time + predict_window_size)]
    
    # Train your model on train_data
    # For example, fill the ratings matrix with train_data
    # ratings = ...
    
    # Make predictions on predict_data
    # For example, recommend items for each user in predict_data
    # predictions = ...
    
    # Evaluate the predictions against the actual interactions in predict_data
    # This step depends on how you define a "hit" or "miss"
    
    # Move the windows forward
    start_time += predict_window_size
    end_time += predict_window_size

# At this point, predictions contains the recommendations for each prediction window

### Collaborative filtering 1

In [13]:
# Merge DataFrames on news ID to enrich impression logs with news details
merged_data = pd.merge(behaviors_subset, news, left_on='Impressions', right_on='newsId', how='left')

# Aggregate interactions
user_item_interactions = []
for _, row in merged_data.iterrows():
    user_id = row['Userid']
    clicked_articles = row['Impressions'].split()
    click_history = row['History'].split() if isinstance(row['History'], str) else []
    all_interactions = list(set(clicked_articles + click_history))
    user_item_interactions.append({'user_id': user_id, 'clicked_articles': all_interactions})

In [14]:

# Create a list of unique articles
unique_articles_list = sorted(set(article for interaction in user_item_interactions for article in interaction['clicked_articles']))

# Create a sparse matrix
user_ids = [interaction['user_id'] for interaction in user_item_interactions]
user_item_matrix_sparse = dok_matrix((len(user_item_interactions), len(unique_articles_list)), dtype=int)
for idx, interaction in enumerate(user_item_interactions):
    for article in interaction['clicked_articles']:
        user_item_matrix_sparse[idx, unique_articles_list.index(article)] = 1

In [12]:
# Identify non-zero elements
non_zero_indices = np.nonzero(user_item_matrix_sparse)

for user_idx, article_idx in zip(non_zero_indices[0], non_zero_indices[1]):
    user_id = user_ids[user_idx]
    article_id = unique_articles_list[article_idx]
    print(f"User {user_id} clicked on article {article_id}")

# Convert user IDs to a pandas Index
user_ids_index = pd.Index(user_ids)

# Convert the sparse matrix to a DataFrame
user_item_matrix = pd.DataFrame.sparse.from_spmatrix(user_item_matrix_sparse, index=user_ids_index, columns=unique_articles_list)

print(user_item_matrix)

(array([ 0,  0,  0, ..., 33, 33, 33]), array([7321, 9697, 3262, ..., 7978, 8219, 7926]))
User U65916 clicked on article N5184
User U65916 clicked on article N6868-0
User U65916 clicked on article N28684-0
User U65916 clicked on article N33291-0
User U65916 clicked on article N51706
User U65916 clicked on article N53652-0
User U65916 clicked on article N52154-0
User U65916 clicked on article N37377-0
User U65916 clicked on article N5379-0
User U65916 clicked on article N41858-1
User U65916 clicked on article N64536-0
User U65916 clicked on article N38802
User U65916 clicked on article N55582-1
User U65916 clicked on article N12096
User U65916 clicked on article N54827
User U65916 clicked on article N57099-0
User U65916 clicked on article N23653
User U65916 clicked on article N36443
User U65916 clicked on article N43142
User U65916 clicked on article N11143
User U65916 clicked on article N31002-0
User U65916 clicked on article N54300-0
User U65916 clicked on article N40767
User U65916 cl