### Collaborative Filtering

In [2]:
import pandas as pd
import numpy as np 
import random
import matplotlib.pyplot as plt
from scipy.sparse import dok_matrix


In [3]:
news = pd.read_csv("../MINDsmall_train/news.tsv", sep='\t', names=["newsId", "category", "subcategory", "title","abstract", "url", "title_entities","abstract_entities"])
news.head()

Unnamed: 0,newsId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [4]:
behaviors = pd.read_csv("../MINDsmall_train/behaviors.tsv", sep='\t', names=["ImpressionID", "Userid", "Time","History", "Impressions", "Title_entities"])
behaviors.head()

Unnamed: 0,ImpressionID,Userid,Time,History,Impressions,Title_entities
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,


### Preprocess data

In [17]:
behaviors['Time'] = pd.to_datetime(behaviors['Time'], format='%m/%d/%Y %I:%M:%S %p')
behaviors['Clicks'] = behaviors['History'].str.split().str.len()
behaviors['Impressions_list'] = behaviors['Impressions'].str.split()
behaviors['Impressions_count'] = behaviors['Impressions_list'].str.len()

# sort behaviors_data by timestamp
behaviors = behaviors.sort_values(by='Time')

# remove null values
news = news.dropna().reset_index(drop=True)

### Collaborative filtering 2

In [19]:
behaviors_subset = behaviors.head(1000)

users = behaviors_subset['Userid'].unique()
items = news['url'].unique()

# Initiate ratings matrix
n_users = len(users)
n_items = len(items)

print(f'Rating matrix shape: {n_users} x {n_items}')
ratings = np.zeros((n_users, n_items))
print(f'Rating matrix shape: {n_users} x {n_items}')

Rating matrix shape: 902 x 48612
Rating matrix shape: 902 x 48612


### Collaborative filtering 1

In [11]:
# Merge DataFrames on news ID to enrich impression logs with news details
merged_data = pd.merge(behaviors, news, left_on='Impressions', right_on='newsId', how='left')

# Aggregate interactions
user_item_interactions = []
for _, row in merged_data.iterrows():
    user_id = row['Userid']
    clicked_articles = row['Impressions'].split()
    click_history = row['History'].split() if isinstance(row['History'], str) else []
    all_interactions = list(set(clicked_articles + click_history))
    user_item_interactions.append({'user_id': user_id, 'clicked_articles': all_interactions})

In [12]:

# Create a list of unique articles
unique_articles_list = sorted(set(article for interaction in user_item_interactions for article in interaction['clicked_articles']))

# Create a sparse matrix
user_ids = [interaction['user_id'] for interaction in user_item_interactions]
user_item_matrix_sparse = dok_matrix((len(user_item_interactions), len(unique_articles_list)), dtype=int)
for idx, interaction in enumerate(user_item_interactions):
    for article in interaction['clicked_articles']:
        user_item_matrix_sparse[idx, unique_articles_list.index(article)] = 1

KeyboardInterrupt: 

In [10]:
# Identify non-zero elements
non_zero_indices = np.nonzero(user_item_matrix_sparse)

# Print non-zero elements
print("Non-zero elements:")
for user_idx, article_idx in zip(non_zero_indices[0], non_zero_indices[1]):
    user_id = user_ids[user_idx]
    article_id = unique_articles_list[article_idx]
    print(f"User {user_id} clicked on article {article_id}")

# Convert user IDs to a pandas Index
user_ids_index = pd.Index(user_ids)

# Convert the sparse matrix to a DataFrame
user_item_matrix = pd.DataFrame.sparse.from_spmatrix(user_item_matrix_sparse, index=user_ids_index, columns=unique_articles_list)

print(user_item_matrix)

KeyboardInterrupt: 