In [1]:
import networkx as nx
from node2vec import Node2Vec
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
from torch.utils.data import DataLoader, TensorDataset
from scripts.data.tabular_dataset_handler import TabularDatasetHandler
from sklearn.model_selection import train_test_split

In [2]:
# Load the MovieLens dataset into a NetworkX graph
def load_movielens_data_tdh():
    tdh = TabularDatasetHandler()
    tdh.preprocess_datasets()
    ratings = tdh.get_users_ratings_df_deepcopy()

    # Create a directed graph from the ratings data
    G = nx.DiGraph()
    
    for _, row in ratings.iterrows():
        user_id, movie_id, rating, _ = row['userId'], row['movieId'], row['rating'], row['timestamp']
        G.add_edge(user_id, movie_id, weight=rating)

    return G


# Utility functions

def get_movie_title_by_id_tdh(movieId):
    tdh = TabularDatasetHandler()
    tdh.preprocess_datasets()
    
    ratings = tdh.get_users_ratings_df_deepcopy()
    movies = tdh.get_movies_df_deepcopy()
    
    movies["id"] = movies["id"].apply(lambda x: f"movie_{x}")
    ratings["rating"] = ratings["rating"].apply(lambda x: float(x))
    ratings["movieId"] = ratings["movieId"].apply(lambda x: f"movie_{x}")
    
    return list(movies[movies.movieId == movieId].title)[0]
    
def get_movie_id_by_title_tdh(title):
    tdh = TabularDatasetHandler()
    tdh.preprocess_datasets()
    
    ratings = tdh.get_users_ratings_df_deepcopy()
    movies = tdh.get_movies_df_deepcopy()
    
    ratings["rating"] = ratings["rating"].apply(lambda x: float(x))
    ratings["movieId"] = ratings["movieId"].apply(lambda x: f"movie_{x}")
    
    return list(movies[movies.title == title].movieId)[0]
    

# Load the MovieLens dataset directly from files into a NetworkX graph

def load_movielens_data():
    movies = pd.read_csv("datasets/ml-latest-small/movies.csv")
    movies["movieId"] = movies["movieId"].apply(lambda x: f"movie_{x}")
    
    ratings = pd.read_csv("datasets/ml-latest-small/ratings.csv")
    ratings["rating"] = ratings["rating"].apply(lambda x: float(x))
    ratings["movieId"] = ratings["movieId"].apply(lambda x: f"movie_{x}")

    # Create a directed graph from the ratings data
    G = nx.DiGraph()
    
    for _, row in ratings.iterrows():
        user_id, movie_id, rating, _ = row['userId'], row['movieId'], row['rating'], row['timestamp']
        G.add_edge(user_id, movie_id, weight=rating)

    return G

# Utility functions

def get_ratings():
    movies = pd.read_csv("datasets/ml-latest-small/movies.csv")
    movies["movieId"] = movies["movieId"].apply(lambda x: f"movie_{x}")
    
    ratings = pd.read_csv("datasets/ml-latest-small/ratings.csv")
    ratings["rating"] = ratings["rating"].apply(lambda x: float(x))
    ratings["movieId"] = ratings["movieId"].apply(lambda x: f"movie_{x}")
    
    ground_truth_ratings = ratings[['userId', 'movieId', 'rating']]

    return ground_truth_ratings
    
def get_movie_title_by_id(movieId):
    movies = pd.read_csv("datasets/ml-latest-small/movies.csv")
    movies["movieId"] = movies["movieId"].apply(lambda x: f"movie_{x}")
    
    ratings = pd.read_csv("datasets/ml-latest-small/ratings.csv")
    ratings["rating"] = ratings["rating"].apply(lambda x: float(x))
    ratings["movieId"] = ratings["movieId"].apply(lambda x: f"movie_{x}")
    
    return list(movies[movies.movieId == movieId].title)[0]
    
def get_movie_id_by_title(title):
    movies = pd.read_csv("datasets/ml-latest-small/movies.csv")
    movies["movieId"] = movies["movieId"].apply(lambda x: f"movie_{x}")
    
    ratings = pd.read_csv("datasets/ml-latest-small/ratings.csv")
    ratings["rating"] = ratings["rating"].apply(lambda x: float(x))
    ratings["movieId"] = ratings["movieId"].apply(lambda x: f"movie_{x}")
    
    return list(movies[movies.title == title].movieId)[0]


# Compute node embeddings using node2vec
def compute_node_embeddings(graph):
    # Generate walks
    node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)

    # Train node2vec model
    model = node2vec.fit(window=10, min_count=1, batch_words=4)

    # Get node embeddings
    node_embeddings = {str(node): model.wv[str(node)] for node in graph.nodes}

    return node_embeddings


# Find relative movies
def find_related_movies(movie_id, embeddings, top_n=10):
    # Ensure movie_id is a string
    movie_id = str(movie_id)

    # Get the embedding for the input movie
    input_embedding = embeddings.get(movie_id)
    if input_embedding is None:
        print(f"Embedding not found for movie {movie_id}.")
        print(input_embedding)
        return []

    # Compute cosine similarity between the input movie and all other movies
    similarities = {}
    for node, embedding in embeddings.items():
        if node != movie_id:
            similarity = cosine_similarity([input_embedding], [embedding])[0][0]
            similarities[node] = similarity

    # Sort movies by similarity in descending order
    sorted_movies = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

    # Return the top N related movies
    related_movies = sorted_movies[:top_n]
    return related_movies


In [3]:
# Load MovieLens dataset into a graph
movie_lens_graph = load_movielens_data()
#movie_lens_graph = load_movielens_data_tdh()

# Compute node embeddings using node2vec
embeddings = compute_node_embeddings(movie_lens_graph)

# Print embeddings for the first 5 nodes
for node, embedding in list(embeddings.items())[:5]:
    print(f"Node {node}: {embedding}")

Computing transition probabilities:   0%|          | 0/10334 [00:00<?, ?it/s]

Generating walks (CPU: 4): 100%|██████████| 50/50 [00:01<00:00, 27.13it/s]





Node 1: [ 0.39252117 -0.38909853  0.27725434  0.42842025  0.13963728 -0.424206
  0.39884582  0.01826996 -0.39699328 -0.155771    0.25103742 -0.33385864
 -0.01234346 -0.3469177   0.02139869  0.10984892 -0.2984878  -0.1260501
  0.03992636  0.36842224  0.01352923  0.3941507   0.48010162 -0.58138376
 -0.31896     0.34280923 -0.21268915  0.10859128  0.19903034 -0.17831206
  0.06370841  0.06804914  0.14612828 -0.17831317 -0.11268654 -0.3030779
 -0.05935834  0.04363752  0.62407833 -0.3096654  -0.0643036   0.10896574
 -0.16182384 -0.06318349 -0.13789676 -0.48912218  0.345889    0.0725305
 -0.17097992  0.5000836   0.17678733  0.2733285   0.40404177  0.3067778
 -0.10828014 -0.01243125  0.07321912 -0.43620348 -0.321581    0.14079632
 -0.22758391 -0.10366874 -0.2919674   0.3482479 ]
Node movie_1: [ 0.8521859  -0.72784483  0.5922699   0.76694256  0.267807   -0.9853236
  0.84129006 -0.03467785 -0.9222602  -0.30211586  0.6343607  -0.7058072
 -0.07820743 -0.72859454  0.04188198  0.52048504 -0.56471664

In [8]:
# Movie suggestion
def give_suggestions(query_movie):
    print("Query movie: ", query_movie)
    
    related_movies = find_related_movies(get_movie_id_by_title(query_movie), embeddings)
    
    # Print the results
    print(f"Related movies for query movie:")
    for movie, similarity in related_movies:
        print(f"Movie {movie}, Similarity: {similarity}, Title: {get_movie_title_by_id(movie)}")

In [9]:
# Example of suggestion
query_movies = [
    "Matrix, The (1999)",
    "Star Wars: Episode IV - A New Hope (1977)",
    "Lion King, The (1994)",
    "Terminator 2: Judgment Day (1991)",
    "Godfather, The (1972)",
]
for query in query_movies:
    give_suggestions(query)

Query movie:  Matrix, The (1999)
Related movies for query movie:
Movie movie_2959, Similarity: 0.9986423254013062, Title: Fight Club (1999)
Movie movie_2028, Similarity: 0.9984390139579773, Title: Saving Private Ryan (1998)
Movie movie_260, Similarity: 0.9984116554260254, Title: Star Wars: Episode IV - A New Hope (1977)
Movie movie_4993, Similarity: 0.9982384443283081, Title: Lord of the Rings: The Fellowship of the Ring, The (2001)
Movie movie_1198, Similarity: 0.9982157945632935, Title: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Movie movie_1196, Similarity: 0.9980239868164062, Title: Star Wars: Episode V - The Empire Strikes Back (1980)
Movie movie_527, Similarity: 0.9979533553123474, Title: Schindler's List (1993)
Movie movie_318, Similarity: 0.9978317022323608, Title: Shawshank Redemption, The (1994)
Movie movie_6377, Similarity: 0.9977293014526367, Title: Finding Nemo (2003)
Movie movie_5952, Similarity: 0.9976209402084351, Title: Lord of the R

In [10]:
# Check if CUDA is available:
if torch.cuda.is_available():
    print("CUDA is available.")
    
    # Get the number of available GPUs:
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Get information about each GPU:
    for i in range(torch.cuda.device_count()):
        gpu = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {gpu.name}, Compute Capability: {gpu.major}.{gpu.minor}")
    
    # Get the currently selected GPU:
    current_gpu = torch.cuda.current_device()
    print(f"Currently selected GPU number: {current_gpu}")
else:
    print("CUDA is not available.")

CUDA is available.
Number of GPUs available: 1
GPU 0: NVIDIA GeForce GTX 1060, Compute Capability: 6.1
Currently selected GPU number: 0


In [11]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(get_ratings(), test_size=0.2, random_state=42)

# Convert the embeddings dictionary values to PyTorch tensor
embeddings_tensor = torch.tensor(list(embeddings.values()), dtype=torch.float32)

# Extracting user IDs, movie IDs, and ratings for training and testing
train_user_ids = train_data['userId'].astype('category').cat.codes.values
train_movie_ids = train_data['movieId'].astype('category').cat.codes.values
train_ratings = train_data['rating'].values.astype('float32')

test_user_ids = test_data['userId'].astype('category').cat.codes.values
test_movie_ids = test_data['movieId'].astype('category').cat.codes.values
test_ratings = test_data['rating'].values.astype('float32')

# Convert to PyTorch tensors for training
user_ids_tensor_train = torch.tensor(train_user_ids, dtype=torch.long)
movie_ids_tensor_train = torch.tensor(train_movie_ids, dtype=torch.long)
ratings_tensor_train = torch.tensor(train_ratings, dtype=torch.float)

# Convert to PyTorch tensors for testing
user_ids_tensor_test = torch.tensor(test_user_ids, dtype=torch.long)
movie_ids_tensor_test = torch.tensor(test_movie_ids, dtype=torch.long)
ratings_tensor_test = torch.tensor(test_ratings, dtype=torch.float)


  embeddings_tensor = torch.tensor(list(embeddings.values()), dtype=torch.float32)


In [12]:
# The regression model
class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.fc(x)

# DataLoader for training
dataset_train = TensorDataset(user_ids_tensor_train, movie_ids_tensor_train, ratings_tensor_train)
dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True)

# Initialize the model
input_dim = 2 * embeddings_tensor.size(1)
model = Model(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
num_epochs = 10
for epoch in range(num_epochs):
    for batch_user_ids, batch_movie_ids, batch_ratings in dataloader_train:
        optimizer.zero_grad()

        # Get embeddings for both users and movies
        user_embeddings = embeddings_tensor[batch_user_ids]
        movie_embeddings = embeddings_tensor[batch_movie_ids]
        input_embeddings = torch.cat([user_embeddings, movie_embeddings], dim=1)

        output = model(input_embeddings)
        loss = criterion(output.squeeze(), batch_ratings)
        loss.backward()
        optimizer.step()

    # Print the loss for each epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")


Epoch 1/10, Loss: 3.379289388656616
Epoch 2/10, Loss: 2.4223029613494873
Epoch 3/10, Loss: 1.2358391284942627
Epoch 4/10, Loss: 1.1484616994857788
Epoch 5/10, Loss: 1.1456538438796997
Epoch 6/10, Loss: 1.0022051334381104
Epoch 7/10, Loss: 1.205405831336975
Epoch 8/10, Loss: 0.9681271314620972
Epoch 9/10, Loss: 0.8801055550575256
Epoch 10/10, Loss: 0.8700295686721802


In [13]:
# Evaluation on the testing set
model.eval()
with torch.no_grad():
    # Get embeddings for both users and movies
    user_embeddings_test = embeddings_tensor[user_ids_tensor_test]
    movie_embeddings_test = embeddings_tensor[movie_ids_tensor_test]
    input_embeddings_test = torch.cat([user_embeddings_test, movie_embeddings_test], dim=1)

    predictions_test = model(input_embeddings_test).squeeze()

# Calculate MRSE and MAE of testing set
mse_test = nn.MSELoss()(predictions_test, ratings_tensor_test).item()
mae_test = nn.L1Loss()(predictions_test, ratings_tensor_test).item()

print("\nEvaluation on Testing Set:")
print(f"Mean Squared Error (MSE): {mse_test}")
print(f"Mean Absolute Error (MAE): {mae_test}")


Evaluation on Testing Set:
Mean Squared Error (MSE): 1.079466462135315
Mean Absolute Error (MAE): 0.8298935294151306
