In [2]:
import pandas as pd
import networkx as nx
import os
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
path = os.getcwd() + '/ml1m/'

In [4]:
# Load the user rating dataset
ratings = pd.read_csv(f'{path}preprocessed/ratings.txt', sep='\t')
ratings

Unnamed: 0,uid,pid,rating,timestamp
0,0,872,5,978300760
1,0,537,3,978302109
2,0,679,3,978301968
3,0,2606,4,978300275
4,0,1790,5,978824291
...,...,...,...,...
940958,6039,810,1,956716541
940959,6039,813,5,956704887
940960,6039,477,5,956704746
940961,6039,815,4,956715648


In [5]:
movies = pd.read_csv(f'{path}/movies.dat',  sep="::", names=["movie_id", "movie_name", "genre"], header=None, encoding='latin-1')
movies

  movies = pd.read_csv(f'{path}/movies.dat',  sep="::", names=["movie_id", "movie_name", "genre"], header=None, encoding='latin-1')


Unnamed: 0,movie_id,movie_name,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [6]:
movies_map = pd.read_csv(f'{path}/preprocessed/products.txt', sep='\t', index_col='new_id')
movies_map

Unnamed: 0_level_0,raw_dataset_id
new_id,Unnamed: 1_level_1
0,2
1,3
2,4
3,5
4,6
...,...
3025,3947
3026,3948
3027,3949
3028,3950


In [6]:
# Create a networkx graph
G = nx.Graph()
print(G)

Graph with 0 nodes and 0 edges


In [7]:
# Add nodes to the graph for each user
G.add_nodes_from(ratings['uid'].unique())
print(G)

Graph with 6040 nodes and 0 edges


In [9]:
# Compute the cosine similarity between users
user_similarity = cosine_similarity(ratings.pivot_table(index='uid', columns='pid', values='rating').fillna(0))
user_similarity

array([[1.        , 0.08854972, 0.12862658, ..., 0.        , 0.19404654,
        0.12812071],
       [0.08854972, 1.        , 0.15904176, ..., 0.07030104, 0.07271236,
        0.21023644],
       [0.12862658, 0.15904176, 1.        , ..., 0.12556651, 0.10174285,
        0.14523598],
       ...,
       [0.        , 0.07030104, 0.12556651, ..., 1.        , 0.17599525,
        0.09171212],
       [0.19404654, 0.07271236, 0.10174285, ..., 0.17599525, 1.        ,
        0.23228967],
       [0.12812071, 0.21023644, 0.14523598, ..., 0.09171212, 0.23228967,
        1.        ]])

In [10]:
# Add edges to the graph for each pair of similar users
for i, row in enumerate(user_similarity):
    for j, similarity in enumerate(row):
        if similarity > 0:
            G.add_edge(i, j, weight=similarity)

In [11]:
print(G)

Graph with 6040 nodes and 17445323 edges


In [12]:
# Choose a user to find the most similar user
user_id = 10

In [13]:
# Get the index of the user in the graph
user_index = ratings['uid'].unique().tolist().index(user_id)

In [14]:
# Use the PageRank algorithm to find the most similar user
pagerank = nx.pagerank(G, weight='weight')
most_similar_user_index = max(pagerank.items(), key=lambda x: x[1])[0]

In [15]:
# Get the ID of the most similar user
most_similar_user_id = ratings['uid'].unique()[most_similar_user_index]

In [16]:
# Print the ID of the most similar user
print("Most similar user to user ", user_id, ": ", most_similar_user_id)

Most similar user to user  10 :  5366


In [23]:
user_10 = ratings[ratings['uid'] == 10]
user_10 = user_10.sort_values('pid')
user_10

Unnamed: 0,uid,pid,rating,timestamp
1205,10,32,3,978902405
1211,10,42,3,978902560
1219,10,45,5,978219607
1250,10,73,4,978904663
1241,10,87,5,978903811
...,...,...,...,...
1202,10,2681,2,978902477
1186,10,2722,1,978903278
1173,10,2761,4,978903365
1243,10,2877,3,978219385


In [24]:
user_5366 = ratings[ratings['uid'] == 5366]
user_5366 = user_5366[user_5366['pid'].isin(user_10['pid'])]
user_5366 = user_5366.sort_values('pid')
user_5366

Unnamed: 0,uid,pid,rating,timestamp
836473,5366,32,4,960506564
836498,5366,42,4,960501806
836509,5366,45,4,960501707
836554,5366,73,1,960505092
836883,5366,87,4,960504127
...,...,...,...,...
836229,5366,2681,3,960510152
836442,5366,2722,4,960503958
836717,5366,2761,3,960503860
837068,5366,2877,3,965079265


In [25]:
import pickle

In [26]:
with open('graph.pickle', 'wb') as file:
    pickle.dump(G, file)

In [None]:
with open('graph.pickle', 'rb') as file:
    G = pickle.load(file)