## Load the dataset

In [1]:
import ast

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

# Relative path prefix that is prepended to the "data/raw/..." file locations read below.
rootpath = "../"

In [2]:
# Read every raw CSV into its own DataFrame, printing a progress line before each read.
print("Loading big matrix...")
big_matrix = pd.read_csv(rootpath + "data/raw/big_matrix.csv")
print("Loading small matrix...")
small_matrix = pd.read_csv(rootpath + "data/raw/small_matrix.csv")

print("Loading social network...")
social_network = pd.read_csv(rootpath + "data/raw/social_network.csv")
# The column is read as text and parsed into Python objects (presumably list
# literals such as "[1, 2]" - verify against the file). ast.literal_eval only
# accepts literals, whereas eval would execute any expression found in the CSV.
social_network["friend_list"] = social_network["friend_list"].map(ast.literal_eval)

print("Loading item features...")
item_categories = pd.read_csv(rootpath + "data/raw/item_categories.csv")
# Same literal parsing as for friend_list above (no arbitrary code execution).
item_categories["feat"] = item_categories["feat"].map(ast.literal_eval)

print("Loading user features...")
user_features = pd.read_csv(rootpath + "data/raw/user_features.csv")

print("Loading items' daily features...")
item_daily_features = pd.read_csv(rootpath + "data/raw/item_daily_features.csv")

print("All data loaded.")
# `interactions` is an alias (not a copy) of small_matrix; the analysis below uses it.
interactions = small_matrix

Loading big matrix...
Loading small matrix...
Loading social network...
Loading item features...
Loading user features...
Loading items' daily features...
All data loaded.


In [3]:
# Headline counts for the interaction table
n_interactions = len(interactions)
n_unique_users = interactions['user_id'].nunique()
n_unique_videos = interactions['video_id'].nunique()
print(f"Total interactions: {n_interactions:,}")
print(f"Unique users: {n_unique_users:,}")
print(f"Unique videos: {n_unique_videos:,}")

# Reshape the long interaction table into a wide users x videos table of watch_ratio.
# NOTE(review): pivot_table aggregates with the mean by default, so repeated
# (user_id, video_id) rows - if any exist - are averaged. Missing pairs become 0.
print("\nCreating user-video matrix...")
user_video_matrix = interactions.pivot_table(
    values='watch_ratio',
    index='user_id',
    columns='video_id',
    fill_value=0,
)

matrix_rows, matrix_cols = user_video_matrix.shape
print(f"Matrix shape: {matrix_rows:,} users × {matrix_cols:,} videos")
print("\nSample of user-video matrix:")
print(user_video_matrix.iloc[:5, :5])

Total interactions: 4,676,570
Unique users: 1,411
Unique videos: 3,327

Creating user-video matrix...
Matrix shape: 1,411 users × 3,327 videos

Sample of user-video matrix:
video_id       103       109       120       122       128
user_id                                                   
14        0.429126  1.482039  0.728738  0.477810  0.439333
19        0.624466  1.070684  1.006064  0.759092  0.882691
21        1.415049  1.028840  1.809125  0.688823  0.588365
23        0.169223  2.549891  0.247487  0.438669  0.114338
24        0.345049  0.449337  0.802936  0.797411  1.875599


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute pairwise cosine similarity between all users (rows of the user-video matrix)
similarity_matrix = cosine_similarity(user_video_matrix.values)

print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"  ({similarity_matrix.shape[0]:,} users × {similarity_matrix.shape[1]:,} users)")

# Set diagonal to NaN (user's similarity with themselves = 1.0, not interesting).
# This happens BEFORE the DataFrame is built on purpose: fill_diagonal mutates the
# array in place, and whether a DataFrame created earlier would see that mutation
# depends on whether the pandas constructor copied the array (this differs between
# pandas versions / Copy-on-Write mode). Masking first makes the array and the
# DataFrame agree in every case.
np.fill_diagonal(similarity_matrix, np.nan)

# Convert to DataFrame for easier handling (rows and columns are both user_id)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=user_video_matrix.index,
    columns=user_video_matrix.index
)

# Analyze similarity distribution over the off-diagonal entries.
# The matrix holds both (i, j) and (j, i), so the count printed below is
# N * (N - 1), i.e. every unordered user pair appears twice.
similarities = similarity_matrix[~np.isnan(similarity_matrix)]

print(f"\nSimilarity statistics (all {len(similarities):,} user pairs):")
print(f"  Mean:   {np.mean(similarities):.4f}")
print(f"  Median: {np.median(similarities):.4f}")
print(f"  Std:    {np.std(similarities):.4f}")
print(f"  Min:    {np.min(similarities):.4f}")
print(f"  Max:    {np.max(similarities):.4f}")


Similarity matrix shape: (1411, 1411)
  (1,411 users × 1,411 users)

Similarity statistics (all 1,989,510 user pairs):
  Mean:   0.5710
  Median: 0.5750
  Std:    0.2036
  Min:    0.0228
  Max:    0.9935


### Build Network with Threshold

In [5]:
# Minimum cosine similarity for two users to be connected by an edge
THRESHOLD = 0.5

print(f"Building network with similarity threshold = {THRESHOLD}")
print("="*70)

G = nx.Graph()

# Add all users as nodes
users = user_video_matrix.index.tolist()
G.add_nodes_from(users)
print(f"Added {G.number_of_nodes():,} nodes")

# Add edges for user pairs with similarity >= threshold.
# Instead of ~N^2/2 scalar .loc lookups, pull the label-aligned values out once
# and select the strict upper triangle (k=1 skips the diagonal; the graph is
# undirected so each pair is needed only once).
sim_values = similarity_df.loc[users, users].to_numpy()
row_pos, col_pos = np.triu_indices(len(users), k=1)
pair_sims = sim_values[row_pos, col_pos]
keep = pair_sims >= THRESHOLD

# triu_indices yields pairs in row-major order, i.e. the same order as nested
# `for i ... for j > i` loops, so edges are inserted in the same sequence.
for i, j, sim in zip(row_pos[keep].tolist(), col_pos[keep].tolist(), pair_sims[keep].tolist()):
    G.add_edge(users[i], users[j], weight=sim)
edges_added = int(keep.sum())

print(f"Added {edges_added:,} edges (similarity ≥ {THRESHOLD})")

# Network statistics
N = G.number_of_nodes()
E = G.number_of_edges()
max_edges = N * (N - 1) // 2  # number of unordered node pairs; N*(N-1) is always even
density = nx.density(G)

print(f"\nNetwork Properties:")
print(f"  Nodes (N):        {N:,}")
print(f"  Edges (E):        {E:,}")
print(f"  Density:          {density:.4f} ({100*density:.2f}%)")
print(f"  Max possible edges: {int(max_edges):,}")


Building network with similarity threshold = 0.5
Added 1,411 nodes
Added 604,542 edges (similarity ≥ 0.5)

Network Properties:
  Nodes (N):        1,411
  Edges (E):        604,542
  Density:          0.6077 (60.77%)
  Max possible edges: 994,755


In [None]:
def get_neighbor_users(user_id, similarity_df, threshold=0.5):
    """
    Look up the users whose similarity to `user_id` reaches `threshold`.

    Args:
        user_id: Row label to look up in `similarity_df`
        similarity_df: DataFrame of similarity scores; the row labelled
            `user_id` is read and its column labels identify the other users
        threshold: Minimum similarity (inclusive) to keep (default 0.5)
    Returns:
        Series mapping other user label -> similarity score. The entry
        labelled `user_id` itself is never included. An empty float Series
        is returned when `user_id` is not a row label of `similarity_df`.
    """
    if user_id in similarity_df.index:
        row = similarity_df.loc[user_id]

        # Drop the user's own entry first, then apply the similarity cut-off
        is_other_user = row.index != user_id
        others = row[is_other_user]
        reaches_threshold = others >= threshold
        return others[reaches_threshold]

    # Unknown user: nothing to look up
    return pd.Series(dtype=float)

# Store similar users in a hashmap: user_id -> Series of (neighbor id: similarity).
# THRESHOLD is reused (rather than a second hard-coded 0.5) so this map always
# uses the same cut-off as the similarity graph G built above.
neighbor_map = {
    user_id: get_neighbor_users(user_id, similarity_df, threshold=THRESHOLD)
    for user_id in small_matrix['user_id'].unique()
}


In [None]:

def get_neighbors_who_watched(similarity_matrix, network, user, video):
    """
    Get the rows of `network` in which a neighbor of `user` watched `video`.

    Neighbors are read from the module-level `neighbor_map` dict, which must
    already exist when this function is called.

    Args:
        similarity_matrix: Not used by this function; kept so existing calls
            that pass it positionally keep working
        network: DataFrame compound network at time t; must have 'user_id'
            and 'video_id' columns
        user: user_id to find neighbors for
        video: video_id to check
    Returns:
        DataFrame with the rows of `network` whose 'video_id' equals `video`
        and whose 'user_id' is one of the user's neighbors. Empty (same
        columns) when `user` has no entry in `neighbor_map`.
    """
    # An explicit dtype avoids the empty-Series dtype deprecation warning of
    # older pandas and matches the empty result of get_neighbor_users.
    neighbors = neighbor_map.get(user, pd.Series(dtype=float))

    watched_video = network['video_id'] == video
    by_neighbor = network['user_id'].isin(neighbors.index)
    neighbors_who_watched = network[watched_video & by_neighbor]

    return neighbors_who_watched


In [26]:
# Walk through the data in fixed time steps and, at each step, show which
# neighbors of one example user have already watched one example video.
timestamps = small_matrix['timestamp'].dropna().unique()

TIME_BIN_SECONDS = 43200   # time bins of 43200 seconds (12 hours)
SECONDS_PER_DAY = 86400
EXAMPLE_USER_ID = 6190
EXAMPLE_VIDEO_ID = 9559

t = timestamps.min()
# Named t_max (not `max`) so the builtin max() stays usable in later cells.
t_max = timestamps.max()
print(f"The time range of the data is {t_max - t} seconds {(t_max - t) / SECONDS_PER_DAY} days.")

while t <= t_max:
    # Cumulative snapshot: every interaction with a timestamp up to and including t
    compound = small_matrix[small_matrix['timestamp'] <= t]
    res = get_neighbors_who_watched(similarity_matrix, compound, EXAMPLE_USER_ID, EXAMPLE_VIDEO_ID)
    print(res)
    t += TIME_BIN_SECONDS
    print("------------------------------------------------")


The time range of the data is 5520837.623000145 seconds 63.89858359953872 days.
Empty DataFrame
Columns: [user_id, video_id, play_duration, video_duration, time, date, timestamp, watch_ratio]
Index: []
------------------------------------------------
Empty DataFrame
Columns: [user_id, video_id, play_duration, video_duration, time, date, timestamp, watch_ratio]
Index: []
------------------------------------------------
         user_id  video_id  play_duration  video_duration  \
3967160     6046      9559           1785           15034   
4659987     7142      9559           9470           15034   

                            time        date     timestamp  watch_ratio  
3967160  2020-07-05 01:57:17.572  20200705.0  1.593885e+09     0.118731  
4659987  2020-07-05 01:12:43.964  20200705.0  1.593883e+09     0.629906  
------------------------------------------------
         user_id  video_id  play_duration  video_duration  \
125912       226      9559           7003           15034   
5