In [1]:
# Core data analysis packages
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

from scipy.sparse import csr_matrix 
from sklearn.neighbors import NearestNeighbors
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the tab-separated .dat files from the data/ directory into DataFrames.

# Listening records; later cells read the userID, artistID and weight columns.
user_artists = pd.read_csv('data/user_artists.dat', sep='\t')

# Artist table, restricted to the id and name columns.
artists = pd.read_csv(
    'data/artists.dat',
    sep='\t',
    usecols=['id', 'name'],
)

# Tag table; the only file here that is read with an explicit latin-1 encoding.
tags = pd.read_csv('data/tags.dat', sep='\t', encoding='latin-1')

# Tagged-artist tables; judging by the file names, the second one carries timestamps.
uta = pd.read_csv('data/user_taggedartists.dat', sep='\t')
utat = pd.read_csv('data/user_taggedartists-timestamps.dat', sep='\t')

# Friendship table (per the file name).
friends = pd.read_csv('data/user_friends.dat', sep='\t')

## Step 2: Exploratory Data Analysis (EDA) and Preprocessing

In [3]:
# Count the distinct users and artists that appear in the listening records.
num_users, num_artists = (
    user_artists[column].nunique() for column in ('userID', 'artistID')
)
print(f"Total users: {num_users}, Total artists: {num_artists}")

Total users: 1892, Total artists: 17632


In [4]:
# Binary indicator: 1 when the user has played the artist strictly more than
# 1000 times (weight > 1000), otherwise 0. Meant to be used later when deciding
# the common neighbors.
user_artists['played'] = (user_artists.weight > 1000).astype(int)

In [5]:
# Summary statistics of the play counts. Per the output below, 75% of the
# user-artist records have a weight of at most 614; the maximum is 352698.
user_artists['weight'].describe()

count     92834.00000
mean        745.24393
std        3751.32208
min           1.00000
25%         107.00000
50%         260.00000
75%         614.00000
max      352698.00000
Name: weight, dtype: float64

## Step 3: Building the Graph Representation

In [6]:
# Bipartite user-artist graph. User nodes are the raw userID values; artist
# nodes are "artist_<id>" strings, which keeps them distinct from the user IDs.
B = nx.Graph()

users = user_artists['userID'].unique()
artist_ids = user_artists['artistID'].unique()
artist_nodes = [f"artist_{artist}" for artist in artist_ids]

B.add_nodes_from(users, bipartite='users')
B.add_nodes_from(artist_nodes, bipartite='artists')

# One (user, artist, played) triple per listening record; the binary `played`
# flag is stored as the edge weight.
# NOTE(review): rows with played == 0 still produce an edge (with weight 0), so
# they still connect the user to the artist -- confirm this is intended given
# the > 1000 thresholding applied when `played` was created.
edges = [
    (user_id, f"artist_{artist_id}", played)
    for user_id, artist_id, played in zip(
        user_artists['userID'],
        user_artists['artistID'],
        user_artists['played'],
    )
]
B.add_weighted_edges_from(edges)

print("Total nodes in bipartite graph:", B.number_of_nodes())
print("Total edges in bipartite graph:", B.number_of_edges())


Total nodes in bipartite graph: 19524
Total edges in bipartite graph: 92834


In [7]:
from networkx.algorithms import bipartite

# Collapse the bipartite graph onto its user side. Per the networkx docs, two
# users end up linked when they share at least one artist neighbour.
user_graph = bipartite.weighted_projected_graph(B, nodes=users)

# Sanity check: the projection should hold exactly one node per unique user.
projected_node_count = user_graph.number_of_nodes()
unique_user_count = len(users)
print("Total nodes in user graph:", projected_node_count)
print("Unique users:", unique_user_count)

KeyboardInterrupt: 

In [9]:
# Report the size of the projected user graph.
n_user_nodes = user_graph.number_of_nodes()
n_user_edges = user_graph.number_of_edges()
print("Total nodes in user graph:", n_user_nodes)
print("Total edges in user graph:", n_user_edges)

Total nodes in user graph: 1892
Total edges in user graph: 1014138


In [None]:
# Draw the projected user graph on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 12))

# Spring layout is randomly initialised; a fixed seed makes the node positions
# (and therefore the figure) reproducible across kernel restarts.
pos = nx.spring_layout(user_graph, k=0.1, iterations=20, seed=42)

# NOTE(review): the earlier output reports over a million edges for this graph,
# so drawing every edge may be slow and hard to read -- consider plotting a
# subgraph if this cell does not finish in reasonable time.
nx.draw_networkx_nodes(
    user_graph, pos, node_size=50, node_color='blue', alpha=0.7, ax=ax
)
nx.draw_networkx_edges(user_graph, pos, alpha=0.5, ax=ax)

ax.set_title("Visualization of the Projected User Graph")
ax.set_axis_off()
plt.show()

In [None]:
def common_neighbors_score(u, v, G):
    """Return how many neighbours nodes ``u`` and ``v`` have in common in ``G``.

    Parameters
    ----------
    u, v : node
        The two nodes to compare; forwarded unchanged to ``nx.common_neighbors``.
    G : graph
        The graph forwarded to ``nx.common_neighbors``.

    Returns
    -------
    int
        Number of items produced by ``nx.common_neighbors(G, u, v)``.
    """
    shared = nx.common_neighbors(G, u, v)
    # Count by iteration so no intermediate list has to be built.
    return sum(1 for _ in shared)

# Demo: common-neighbour count for users 1 and 2; skipped unless both IDs are
# nodes of the user graph.
u, v = 1, 2
if all(node in user_graph for node in (u, v)):
    score = common_neighbors_score(u, v, user_graph)
    print(f"Common Neighbors between {u} and {v}: {score}")