# Movie Recommendation System

## Environment settings

In [1]:
import os
import sys

# Add the path to the root directory of the project to the system path
ROOT_PATH = "D:\\Internship"
sys.path.append(os.path.join(ROOT_PATH, "recsys\\movie_recommendation_system\\src"))

# Dependencies
import pandas as pd
import torch
from datetime import datetime

# My scripts
from movie_recommender.data.graph_dataset_handler import HeterogeneousGraphDatasetHandler
from movie_recommender.data.expandable_graph_dataset_handler import ExpandableHeterogeneousGraphDatasetHandler
from movie_recommender.models.gnn_retrain_strategies import GNNRetrainModelHandler
from movie_recommender.recommenders.collaborative_filtering import CollaborativeFiltering

# Remove warnings
import warnings; warnings.simplefilter('ignore')

# Data and trained models paths
data_path = os.path.join(ROOT_PATH, "resources", "movielens")
processed_data_path = os.path.join(ROOT_PATH, "resources", "movielens_processed")
trained_models_path = os.path.join(ROOT_PATH, "trained_models")
updated_models_path = os.path.join(trained_models_path, "updated_models")

gdh_filepath = os.path.join(processed_data_path, "gdh_instance.pkl")
GraphSAGE_filepath = os.path.join(trained_models_path, "GraphSAGE_based_model.pth")

In [2]:
# # Check if CUDA is available:
# if torch.cuda.is_available():
#     print("CUDA is available.")
    
#     # Get the number of available GPUs:
#     num_gpus = torch.cuda.device_count()
#     print(f"Number of GPUs available: {num_gpus}")
    
#     # Get information about each GPU:
#     for i in range(torch.cuda.device_count()):
#         gpu = torch.cuda.get_device_properties(i)
#         print(f"GPU {i}: {gpu.name}, Compute Capability: {gpu.major}.{gpu.minor}")
    
#     # Get the currently selected GPU:
#     current_gpu = torch.cuda.current_device()
#     print(f"Currently selected GPU number: {current_gpu}")
# else:
#     print("CUDA is not available.")

## Load a pre-built graph dataset instance

In [3]:
# Load a graph dataset handler
gdh = HeterogeneousGraphDatasetHandler.load_class_instance(filepath=gdh_filepath)

In [4]:
dataset = gdh.get_graph_dataset()
dataset

HeteroData(
  user={
    node_id=[672],
    x=[672, 600],
  },
  movie={
    node_id=[45433],
    x=[45433, 405],
  },
  (user, rating, movie)={
    edge_index=[2, 45004],
    edge_label=[45004],
    y=[45004],
  },
  (movie, rev_rating, user)={
    edge_index=[2, 45004],
    edge_label=[45004],
    y=[45004],
  }
)

## Create new movies

In [5]:
# Print column information
# def print_movie_details(row_number):
#     if 0 <= row_number < len(gdh._movies_df):
#         row = gdh._movies_df.iloc[row_number]
#         for column, value in row.items():
#             print(f"{column}: {value}")
#     else:
#         print("Invalid row number. Please provide a valid row index.")

# print("Column content")
# print_movie_details(0)
# print("\nColumns types")
# gdh._movies_df.dtypes

In [6]:
new_movies_df = pd.DataFrame([
    # New movie
    {
        "adult": False,
        "belongs_to_collection": "Spider-Verse Collection",
        "budget": "100000000",
        "genres": ["Animation", "Action", "Adventure", "Science Fiction"],
        "homepage": "https://www.acrossthespiderverse.movie/",
        "id": 414906,
        "original_language": "en",
        "original_title": "Spider-Man: Across the Spider-Verse",
        "overview": "Miles Morales catapults across the Multiverse, where he encounters a team of Spider-People charged with protecting its very existence. When the heroes clash on how to handle a new threat, Miles must redefine what it means to be a hero.",
        "popularity": "89.543",
        "production_companies": ["Columbia Pictures", "Sony Pictures Animation", "Marvel Entertainment"],
        "production_countries": ["United States of America"],
        "release_date": "2023-06-02",
        "revenue": 690000000.0,
        "runtime": 140.0,
        "spoken_languages": [{"iso_639_1": "en", "name": "English"}],
        "status": "Released",
        "tagline": "It's how you wear the mask that matters.",
        "title": "Spider-Man: Across the Spider-Verse",
        "vote_average": 8.7,
        "vote_count": 20000.0,
        "year": "2023",
    },
    # New movie
    {
        "adult": False,
        "belongs_to_collection": "Dune Collection",
        "budget": "190000000",
        "genres": ["Science Fiction", "Adventure", "Drama"],
        "homepage": "https://www.dunemovie.com/",
        "id": 693134,
        "original_language": "en",
        "original_title": "Dune: Part Two",
        "overview": "Paul Atreides unites with Chani and the Fremen while seeking revenge against those who destroyed his family, facing a choice between love and the fate of the universe.",
        "popularity": "95.678",
        "production_companies": ["Legendary Pictures", "Warner Bros. Pictures"],
        "production_countries": ["United States of America"],
        "release_date": "2024-03-01",
        "revenue": 720000000.0,
        "runtime": 166.0,
        "spoken_languages": [{"iso_639_1": "en", "name": "English"}],
        "status": "Released",
        "tagline": "Long live the fighters.",
        "title": "Dune: Part Two",
        "vote_average": 8.9,
        "vote_count": 50000.0,
        "year": "2024",
    },
    # Duplicate
    {
        "adult": False,
        "belongs_to_collection": "Toy Story Collection",
        "budget": "30000000",
        "genres": ["Animation", "Comedy", "Family"],
        "homepage": "http://toystory.disney.com/toy-story",
        "id": 862,  # Same ID as original
        "original_language": "en",
        "original_title": "Toy Story",
        "overview": "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
        "popularity": "21.946943",
        "production_companies": ["Pixar Animation Studios"],
        "production_countries": ["United States of America"],
        "release_date": "1995-10-30",
        "revenue": 373554033.0,
        "runtime": 81.0,
        "spoken_languages": [{"iso_639_1": "en", "name": "English"}],
        "status": "Released",
        "tagline": "nan",
        "title": "Toy Story",
        "vote_average": 7.7,
        "vote_count": 5415.0,
        "year": "1995",
    }
])
new_movies_df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,year
0,False,Spider-Verse Collection,100000000,"[Animation, Action, Adventure, Science Fiction]",https://www.acrossthespiderverse.movie/,414906,en,Spider-Man: Across the Spider-Verse,"Miles Morales catapults across the Multiverse,...",89.543,...,2023-06-02,690000000.0,140.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It's how you wear the mask that matters.,Spider-Man: Across the Spider-Verse,8.7,20000.0,2023
1,False,Dune Collection,190000000,"[Science Fiction, Adventure, Drama]",https://www.dunemovie.com/,693134,en,Dune: Part Two,Paul Atreides unites with Chani and the Fremen...,95.678,...,2024-03-01,720000000.0,166.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Long live the fighters.,Dune: Part Two,8.9,50000.0,2024
2,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,1995


In [7]:
new_movies_df.dtypes

adult                       bool
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                         int64
original_language         object
original_title            object
overview                  object
popularity                object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
vote_average             float64
vote_count               float64
year                      object
dtype: object

## Create new user-movie ratings

In [8]:
# Create a new user
user_id = 1000 #gdh.users_ratings_df["userId"].max() + 1

In [9]:
# Create new user-movie ratings
movies_list = [
    {"title": "Top Gun", "rating": 5},
    {"title": "Titanic", "rating": 4.5},
    {"title": "Jumanji", "rating": 4.5},
    {"title": "Toy Story", "rating": 3.5},
    {"title": "Pulp Fiction", "rating": 2},
    {"title": "Pirates of the Caribbean: The Curse of the Black Pearl", "rating": 4},
    {"title": "Harry Potter and the Philosopher's Stone", "rating": 5},
    {"title": "Harry Potter and the Chamber of Secrets", "rating": 5},
    {"title": "Harry Potter and the Prisoner of Azkaban", "rating": 5},
    {"title": "Harry Potter and the Goblet of Fire", "rating": 5},
    {"title": "The Lord of the Rings: The Fellowship of the Ring", "rating": 5},
    {"title": "The Lord of the Rings: The Two Towers", "rating": 5},
    {"title": "The Hobbit: An Unexpected Journey", "rating": 5},
    {"title": "The Notebook", "rating": 3.5},
    {"title": "Mean Girls", "rating": 3.5},
    {"title": "The Social Network", "rating": 4.5},
    {"title": "Grease", "rating": 2},
    {"title": "Rocky", "rating": 3.5},
    {"title": "La La Land", "rating": 1.5},
    {"title": "Dirty Dancing", "rating": 2.5},
]

# Convert the user-movie ratings to tuples
timestamp_now = int(datetime.now().timestamp())

movies_tuples = [
    (
        user_id,
        gdh.movies_df[gdh.movies_df["title"] == movie["title"]]["id"].values[0],
        movie["rating"],
        timestamp_now
    ) for idx, movie in enumerate(movies_list)
]

# Output result
for entry in movies_tuples:
    print(entry)

(1000, 744, 5, 1743771267)
(1000, 597, 4.5, 1743771267)
(1000, 8844, 4.5, 1743771267)
(1000, 862, 3.5, 1743771267)
(1000, 680, 2, 1743771267)
(1000, 22, 4, 1743771267)
(1000, 671, 5, 1743771267)
(1000, 672, 5, 1743771267)
(1000, 673, 5, 1743771267)
(1000, 674, 5, 1743771267)
(1000, 120, 5, 1743771267)
(1000, 121, 5, 1743771267)
(1000, 49051, 5, 1743771267)
(1000, 11036, 3.5, 1743771267)
(1000, 10625, 3.5, 1743771267)
(1000, 37799, 4.5, 1743771267)
(1000, 621, 2, 1743771267)
(1000, 1366, 3.5, 1743771267)
(1000, 313369, 1.5, 1743771267)
(1000, 88, 2.5, 1743771267)


In [10]:
# Convert the list of tuples to a DataFrame
new_users_ratings_df = pd.DataFrame(movies_tuples, columns=["userId", "movieId", "rating", "timestamp"])
new_users_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1000,744,5.0,1743771267
1,1000,597,4.5,1743771267
2,1000,8844,4.5,1743771267
3,1000,862,3.5,1743771267
4,1000,680,2.0,1743771267


## Expandable graph dataset handler

In [11]:
# Wrap the graph dataset handler with the expandable dataset handler
egdh = ExpandableHeterogeneousGraphDatasetHandler(gdh)

In [12]:
# Check the graph dataset before adding new movies and ratings
egdh.get_graph_dataset()

HeteroData(
  user={
    node_id=[672],
    x=[672, 600],
  },
  movie={
    node_id=[45433],
    x=[45433, 405],
  },
  (user, rating, movie)={
    edge_index=[2, 45004],
    edge_label=[45004],
    y=[45004],
  },
  (movie, rev_rating, user)={
    edge_index=[2, 45004],
    edge_label=[45004],
    y=[45004],
  }
)

In [13]:
# Test expandable graph dataset handler with new movies
updated_movies_df = egdh.movies_df
print("Number of movie records before updating", len(egdh.movies_df))
egdh.add_new_movies(new_movies_df)
print("Number of movie records after updating", len(egdh.movies_df))

Number of movie records before updating 45433
Found 2 new movies to add to 'movies_df': Movies ids [414906, 693134]
Number of movie records after updating 45435


In [14]:
# Check whether the new movies were added to the graph dataset
dataset = egdh.get_graph_dataset()
dataset

HeteroData(
  user={
    node_id=[672],
    x=[672, 600],
  },
  movie={
    node_id=[45435],
    x=[45435, 405],
  },
  (user, rating, movie)={
    edge_index=[2, 45004],
    edge_label=[45004],
    y=[45004],
  },
  (movie, rev_rating, user)={
    edge_index=[2, 45004],
    edge_label=[45004],
    y=[45004],
  }
)

In [15]:
# Test expandable graph dataset handler with new ratings
updated_users_ratings_df = egdh.users_ratings_df
print("Number of rating records before updating", len(egdh.users_ratings_df))
egdh.add_new_user_movie_ratings(new_users_ratings_df)
print("Number of rating records after updating", len(egdh.users_ratings_df))

Number of rating records before updating 45004
Found 20 new ratings to add to 'users_ratings_df'
Found 1 new users to add to 'users_ratings_df': Users ids [1000]
Number of rating records after updating 45024


In [16]:
# Check whether the new ratings were added to the graph dataset
dataset = egdh.get_graph_dataset()
dataset

HeteroData(
  user={
    node_id=[673],
    x=[673, 600],
  },
  movie={
    node_id=[45435],
    x=[45435, 405],
  },
  (user, rating, movie)={
    edge_index=[2, 45024],
    edge_label=[45024],
    y=[45024],
  },
  (movie, rev_rating, user)={
    edge_index=[2, 45024],
    edge_label=[45024],
    y=[45024],
  }
)

In [17]:
# Check the subgraph dataset
subgraph_dataset = egdh.get_subgraph_dataset()
subgraph_dataset

HeteroData(
  movie={
    node_id=[45435],
    x=[45435, 405],
  },
  user={
    node_id=[673],
    x=[673, 600],
  },
  (user, rating, movie)={
    edge_index=[2, 20],
    edge_label=[20],
    y=[20],
  },
  (movie, rev_rating, user)={
    edge_index=[2, 20],
    edge_label=[20],
    y=[20],
  }
)

In [18]:
# Print dataset information
print("Dataset size: ", dataset.size())
print("\nDataset type: ", type(dataset))
print("\nDataset metadata: ", dataset.metadata())
print("\nDataset to dict: ", dataset.to_dict())

Dataset size:  (46108, 46108)

Dataset type:  <class 'torch_geometric.data.hetero_data.HeteroData'>

Dataset metadata:  (['user', 'movie'], [('user', 'rating', 'movie'), ('movie', 'rev_rating', 'user')])

Dataset to dict:  {'_global_store': {}, 'user': {'node_id': tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 

## Add data in pre-trained model training set

In [19]:
# Load a pretrained model
GraphSAGE_model = GNNRetrainModelHandler.load_pretrained_model(
    pretrained_model_filepath=GraphSAGE_filepath
)

In [20]:
GraphSAGE_model._egdh.get_graph_dataset()

HeteroData(
  user={
    node_id=[672],
    x=[672, 600],
  },
  movie={
    node_id=[45433],
    x=[45433, 405],
  },
  (user, rating, movie)={
    edge_index=[2, 45004],
    edge_label=[45004],
    y=[45004],
  },
  (movie, rev_rating, user)={
    edge_index=[2, 45004],
    edge_label=[45004],
    y=[45004],
  }
)

In [21]:
GraphSAGE_model._egdh.get_subgraph_dataset()

HeteroData()

In [22]:
# Set the new train set for the re-training of the model
GraphSAGE_model.add_new_train_data(new_movies_df=new_movies_df, new_ratings_df=new_users_ratings_df)

Found 2 new movies to add to 'movies_df': Movies ids [414906, 693134]
Found 20 new ratings to add to 'users_ratings_df'
Found 1 new users to add to 'users_ratings_df': Users ids [1000]


In [23]:
GraphSAGE_model._egdh.get_graph_dataset()

HeteroData(
  user={
    node_id=[673],
    x=[673, 600],
  },
  movie={
    node_id=[45435],
    x=[45435, 405],
  },
  (user, rating, movie)={
    edge_index=[2, 45024],
    edge_label=[45024],
    y=[45024],
  },
  (movie, rev_rating, user)={
    edge_index=[2, 45024],
    edge_label=[45024],
    y=[45024],
  }
)

In [24]:
GraphSAGE_model._egdh.get_subgraph_dataset()

HeteroData(
  movie={
    node_id=[45435],
    x=[45435, 405],
  },
  user={
    node_id=[673],
    x=[673, 600],
  },
  (user, rating, movie)={
    edge_index=[2, 20],
    edge_label=[20],
    y=[20],
  },
  (movie, rev_rating, user)={
    edge_index=[2, 20],
    edge_label=[20],
    y=[20],
  }
)

## Create test new user-movie ratings

In [25]:
# Test data
# Create new user-movie ratings
test_movies_list = [
    {"title": "The Lord of the Rings: The Return of the King", "rating": 5},
    {"title": "The Hobbit: The Desolation of Smaug", "rating": 5},
    {"title": "The Hobbit: The Battle of the Five Armies", "rating": 5},
    {"title": "Harry Potter and the Order of the Phoenix", "rating": 5},
    {"title": "Harry Potter and the Half-Blood Prince", "rating": 5},
    {"title": "Harry Potter and the Deathly Hallows: Part 1", "rating": 5},
    {"title": "Harry Potter and the Deathly Hallows: Part 2", "rating": 5},
    {"title": "Avatar", "rating": 3},
    {"title": "Cast Away", "rating": 4},
    {"title": "Catwoman", "rating": 2},
    {"title": "Iron Man", "rating": 5},
    {"title": "Serena", "rating": 1},
    {"title": "The Great Gatsby", "rating": 2.5},
]

# Convert the user-movie ratings to tuples
timestamp_now = int(datetime.now().timestamp())

test_movies_tuples = [
    (
        user_id,
        gdh.movies_df[gdh.movies_df["title"] == movie["title"]]["id"].values[0],
        movie["rating"],
        timestamp_now
    ) for idx, movie in enumerate(test_movies_list)
]

# Output result
for entry in test_movies_tuples:
    print(entry)

(1000, 122, 5, 1743771285)
(1000, 57158, 5, 1743771285)
(1000, 122917, 5, 1743771285)
(1000, 675, 5, 1743771285)
(1000, 767, 5, 1743771285)
(1000, 12444, 5, 1743771285)
(1000, 12445, 5, 1743771285)
(1000, 19995, 3, 1743771285)
(1000, 8358, 4, 1743771285)
(1000, 314, 2, 1743771285)
(1000, 1726, 5, 1743771285)
(1000, 164251, 1, 1743771285)
(1000, 11034, 2.5, 1743771285)


In [26]:
# Convert the list of tuples to a DataFrame
test_new_users_ratings_df = pd.DataFrame(test_movies_tuples, columns=["userId", "movieId", "rating", "timestamp"])
test_new_users_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1000,122,5.0,1743771285
1,1000,57158,5.0,1743771285
2,1000,122917,5.0,1743771285
3,1000,675,5.0,1743771285
4,1000,767,5.0,1743771285


## Utils for testing re-training performance 

In [27]:
# Extract the subset of movies that were rated by the new user
movies_df = GraphSAGE_model._egdh._movies_df
new_rated_movies_df = movies_df[movies_df["id"].isin(new_users_ratings_df["movieId"])]
new_rated_movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,year
0,False,Toy Story Collection,30000000,Animation|Comedy|Family,http://toystory.disney.com/toy-story,862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,1995
1,False,,65000000,Adventure|Fantasy|Family,,8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995
292,False,,8000000,Thriller|Crime,,680,en,Pulp Fiction,"A burger-loving hit man, his philosophical par...",140.950236,...,1994-09-10,213928762.0,154.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Just because you are a character doesn't mean ...,Pulp Fiction,8.3,8670.0,1994
1056,False,Dirty Dancing Collection,6000000,Drama|Music|Romance,http://lionsgateathome.com/dirty-dancing,88,en,Dirty Dancing,Expecting the usual tedium that accompanies a ...,14.044122,...,1987-08-21,213954274.0,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Have the time of your life.,Dirty Dancing,7.1,1371.0,1987
1069,False,,15000000,Action|Romance|War,,744,en,Top Gun,For Lieutenant Pete 'Maverick' Mitchell and hi...,20.301019,...,1986-05-16,356830601.0,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Up there with the best of the best.,Top Gun,6.7,1736.0,1986


In [28]:
# Extract the subset of test movies that were rated by the new user
movies_df = GraphSAGE_model._egdh._movies_df
test_new_rated_movies_df = movies_df[movies_df["id"].isin(test_new_users_ratings_df["movieId"])]
test_new_rated_movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,year
3897,False,,90000000,Adventure|Drama,,8358,en,Cast Away,"Chuck, a top international manager for FedEx, ...",21.296343,...,2000-12-22,429632100.0,143.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}, {'iso...",Released,"At the edge of the world, his journey begins.",Cast Away,7.5,3304.0,2000
7000,False,The Lord of the Rings Collection,94000000,Adventure|Fantasy|Action,http://www.lordoftherings.net,122,en,The Lord of the Rings: The Return of the King,Aragorn is revealed as the heir to the ancient...,29.324358,...,2003-12-01,1118889000.0,201.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The eye of the enemy is moving.,The Lord of the Rings: The Return of the King,8.1,8226.0,2003
7025,False,,6500000,Drama|Romance,,11034,en,The Great Gatsby,"Nick Carraway, a young Midwesterner now living...",5.702638,...,1974-03-27,26533200.0,144.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Gone is the romance that was so divine.,The Great Gatsby,6.2,144.0,1974
7939,False,,100000000,Action|Crime,,314,en,Catwoman,Liquidated after discovering a corporate consp...,13.340272,...,2004-07-22,82102380.0,104.0,"[{'iso_639_1': 'es', 'name': 'Español'}, {'iso...",Released,CATch her in IMAX,Catwoman,4.2,833.0,2004
11927,False,Harry Potter Collection,150000000,Adventure|Fantasy|Family|Mystery,http://www.harrypotterorderofthephoenix.com/,675,en,Harry Potter and the Order of the Phoenix,Returning for his fifth year of study at Hogwa...,21.3643,...,2007-06-28,938212700.0,138.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Evil Must Be Confronted.,Harry Potter and the Order of the Phoenix,7.4,5633.0,2007


In [29]:
def compute_rmse(pred_ratings):
    """
    Compute the Root Mean Squared Error (RMSE) for predicted and ground truth ratings.

    Parameters:
        - pred_ratings (pd.DataFrame): A DataFrame containing 'predicted_rating' and 
            'ground_truth_rating' columns.

    Returns:
        float: The computed RMSE value.
    """
    from sklearn.metrics import mean_squared_error
    import numpy as np

    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(
        pred_ratings["ground_truth_rating"], 
        pred_ratings["predicted_rating"]
    ))
    return rmse

In [30]:
def evaluate_model_performance(GraphSAGE_model):
    """
    Evaluate the performance of the GraphSAGE model on both training and test data.

    Parameters:
        - GraphSAGE_model: The pre-trained or retrained GraphSAGE model to evaluate.
    """
    from IPython.display import display, HTML

    # Initialize the recommender
    GraphSAGE_recommender = CollaborativeFiltering(model_handler=GraphSAGE_model)

    # Evaluate performance specifically over the new user training data
    pred_ratings_train = GraphSAGE_recommender.predict_ratings(user_id, new_rated_movies_df)
    rmse_train = compute_rmse(pred_ratings_train)

    # Evaluate performance specifically over the new user test data
    pred_ratings_test = GraphSAGE_recommender.predict_ratings(user_id, test_new_rated_movies_df)
    if "ground_truth_rating" in pred_ratings_test.columns:
        pred_ratings_test.drop(columns=["ground_truth_rating"], inplace=True)
    pred_ratings_test = pred_ratings_test.merge(
        test_new_users_ratings_df.rename(columns={"rating": "ground_truth_rating"})[["movieId", "ground_truth_rating"]],
        on="movieId",
        how="left"
    )
    rmse_test = compute_rmse(pred_ratings_test)

    # Display results side by side
    display(HTML(f"""
    <div style="display: flex; justify-content: space-around;">
        <div>
            <h4>Training Data</h4>
            <p>RMSE: {rmse_train:.4f}</p>
            {pred_ratings_train.to_html(index=False)}
        </div>
        <div>
            <h4>Test Data</h4>
            <p>RMSE: {rmse_test:.4f}</p>
            {pred_ratings_test.to_html(index=False)}
        </div>
    </div>
    """))

## Performance pre-trained model

In [31]:
# Load a pretrained model
GraphSAGE_model = GNNRetrainModelHandler.load_pretrained_model(
    pretrained_model_filepath=GraphSAGE_filepath
)

In [32]:
# Set the new train set for the re-training of the model
GraphSAGE_model.add_new_train_data(new_movies_df=new_movies_df, new_ratings_df=new_users_ratings_df)

Found 2 new movies to add to 'movies_df': Movies ids [414906, 693134]
Found 20 new ratings to add to 'users_ratings_df'
Found 1 new users to add to 'users_ratings_df': Users ids [1000]


In [33]:
# Evaluate general performance pre-trained model
GraphSAGE_model.evaluate_performance()

Device: 'cuda
'
Test RMSE: 0.9055, Test MAE: 0.6928

      userId  movieId  pred_rating  gt_rating
0        514     9645     3.909420        2.5
1         26      286     4.455622        5.0
2        599    44114     3.718285        5.0
3        101     7008     4.035489        4.0
4        101    11443     4.162599        3.0
...      ...      ...          ...        ...
4495     246     2244     3.258173        2.0
4496     104    10299     3.125103        3.0
4497     469     5044     3.438445        3.0
4498     129    36979     3.477707        4.0
4499     563    10009     2.232373        1.0

[4500 rows x 4 columns]


In [34]:
# Evaluate performance specifically over the new training data
evaluate_model_performance(GraphSAGE_model)

movieId,predicted_rating,ground_truth_rating
862,3.370536,3.5
8844,1.124036,4.5
680,4.661665,2.0
88,4.22953,2.5
744,4.997815,5.0
621,4.363821,2.0
597,4.681252,4.5
1366,4.714817,3.5
671,4.916984,5.0
120,4.346409,5.0

movieId,predicted_rating,ground_truth_rating
8358,4.165007,4.0
122,4.423152,5.0
11034,3.915348,2.5
314,4.699201,2.0
675,3.950711,5.0
1726,3.454071,5.0
767,4.676386,5.0
19995,3.80511,3.0
12444,3.999969,5.0
12445,3.963425,5.0


In [35]:
# Test addition new ratings for already existing user
GraphSAGE_model.add_new_train_data(new_movies_df=new_movies_df, new_ratings_df=test_new_users_ratings_df)

No movie added: Movies ids [414906, 693134, 862] are all already present in 'movies_df'.
Found 13 new ratings to add to 'users_ratings_df'
No new users to add. Users [1000] are already present in 'users_ratings_df'.


## Full re-training

In [36]:
# Load a pretrained model
GraphSAGE_model = GNNRetrainModelHandler.load_pretrained_model(
    pretrained_model_filepath=GraphSAGE_filepath
)

In [37]:
# Set the new train set for the re-training of the model
GraphSAGE_model.add_new_train_data(new_movies_df=new_movies_df, new_ratings_df=new_users_ratings_df)

Found 2 new movies to add to 'movies_df': Movies ids [414906, 693134]
Found 20 new ratings to add to 'users_ratings_df'
Found 1 new users to add to 'users_ratings_df': Users ids [1000]


In [38]:
# Full retrain the model
GraphSAGE_model.full_retrain(
    num_epochs=350,
    lr=0.01,
    model_name="full_retrained_model",
    trained_model_path=trained_models_path
)

Device: 'cuda'
Adaptive patience set to 32 epochs based on num_epochs=350.
Epoch: 001, Train loss: 13.0628, Train RMSE: 3.2571, Train MAE: 3.0822, Val RMSE: 3.2751, Val MAE: 3.1021
Epoch: 002, Train loss: 10.6090, Train RMSE: 2.2620, Train MAE: 2.0575, Val RMSE: 2.2830, Val MAE: 2.0815
Epoch: 003, Train loss: 5.1166, Train RMSE: 1.3227, Train MAE: 1.0207, Val RMSE: 1.2968, Val MAE: 0.9975
Epoch: 004, Train loss: 1.7555, Train RMSE: 1.7438, Train MAE: 1.4041, Val RMSE: 1.7169, Val MAE: 1.3734
Epoch: 005, Train loss: 3.7963, Train RMSE: 1.0604, Train MAE: 0.8576, Val RMSE: 1.0601, Val MAE: 0.8587
Epoch: 006, Train loss: 1.1244, Train RMSE: 1.5085, Train MAE: 1.3140, Val RMSE: 1.5291, Val MAE: 1.3394
Epoch: 007, Train loss: 2.2756, Train RMSE: 1.6800, Train MAE: 1.4830, Val RMSE: 1.7018, Val MAE: 1.5100
Epoch: 008, Train loss: 2.8225, Train RMSE: 1.4703, Train MAE: 1.2790, Val RMSE: 1.4914, Val MAE: 1.3040
Epoch: 009, Train loss: 2.1618, Train RMSE: 1.0991, Train MAE: 0.9022, Val RMSE: 1.

In [39]:
# Evaluate general performance after full retraining
GraphSAGE_model.evaluate_performance()

Device: 'cuda
'
Test RMSE: 0.9105, Test MAE: 0.7000

      userId  movieId  pred_rating  gt_rating
0        514     9645     3.878598        2.5
1         26      286     4.348916        5.0
2        599    44114     3.761158        5.0
3        101     7008     4.153367        4.0
4        101    11443     4.113421        3.0
...      ...      ...          ...        ...
4495     246     2244     3.379130        2.0
4496     104    10299     2.938054        3.0
4497     469     5044     3.506150        3.0
4498     129    36979     3.285060        4.0
4499     563    10009     1.145944        1.0

[4500 rows x 4 columns]


In [40]:
# Evaluate performance specifically over the new training data
evaluate_model_performance(GraphSAGE_model)

movieId,predicted_rating,ground_truth_rating
862,3.588421,3.5
8844,4.4585,4.5
680,3.975657,2.0
88,3.400517,2.5
744,5.0,5.0
621,3.555361,2.0
597,3.912991,4.5
1366,4.333863,3.5
671,4.355219,5.0
120,4.093032,5.0

movieId,predicted_rating,ground_truth_rating
8358,3.318212,4.0
122,3.751835,5.0
11034,3.406344,2.5
314,4.249714,2.0
675,3.869349,5.0
1726,3.338596,5.0
767,4.268681,5.0
19995,2.599612,3.0
12444,3.950366,5.0
12445,3.881575,5.0


 ## Incremental training

In [41]:
# Load a pretrained model
GraphSAGE_model = GNNRetrainModelHandler.load_pretrained_model(
    pretrained_model_filepath=GraphSAGE_filepath
)

In [42]:
# Set the new train set for the re-training of the model
GraphSAGE_model.add_new_train_data(new_movies_df=new_movies_df, new_ratings_df=new_users_ratings_df)

Found 2 new movies to add to 'movies_df': Movies ids [414906, 693134]
Found 20 new ratings to add to 'users_ratings_df'
Found 1 new users to add to 'users_ratings_df': Users ids [1000]


In [43]:
# Incremental train the model
GraphSAGE_model.incremental_train(
    num_epochs=15,
    lr=0.001,
    model_name="incremental_trained_model",
    trained_model_path=trained_models_path
)

Device: 'cuda'
Adaptive patience set to 15 epochs based on num_epochs=15.
Epoch: 001, Train loss: 1.5888, Train RMSE: 1.0700, Train MAE: 0.7461, Val RMSE: 0.9088, Val MAE: 0.6886
Epoch: 002, Train loss: 1.2983, Train RMSE: 1.0069, Train MAE: 0.6909, Val RMSE: 0.9031, Val MAE: 0.6894
Epoch: 003, Train loss: 1.0906, Train RMSE: 0.9528, Train MAE: 0.6677, Val RMSE: 0.9059, Val MAE: 0.7031
Epoch: 004, Train loss: 0.9232, Train RMSE: 0.9116, Train MAE: 0.6663, Val RMSE: 0.9255, Val MAE: 0.7321
Epoch: 005, Train loss: 0.8362, Train RMSE: 0.8956, Train MAE: 0.6589, Val RMSE: 0.9509, Val MAE: 0.7631
Epoch: 006, Train loss: 0.8030, Train RMSE: 0.8802, Train MAE: 0.6533, Val RMSE: 0.9649, Val MAE: 0.7793
Epoch: 007, Train loss: 0.7748, Train RMSE: 0.8554, Train MAE: 0.6259, Val RMSE: 0.9634, Val MAE: 0.7770
Epoch: 008, Train loss: 0.7318, Train RMSE: 0.8234, Train MAE: 0.5785, Val RMSE: 0.9519, Val MAE: 0.7623
Epoch: 009, Train loss: 0.6781, Train RMSE: 0.7928, Train MAE: 0.5316, Val RMSE: 0.938

In [44]:
# Evaluate general performance after incremental training
GraphSAGE_model.evaluate_performance()

Device: 'cuda
'
Test RMSE: 0.9368, Test MAE: 0.7280

      userId  movieId  pred_rating  gt_rating
0        514     9645     3.692862        2.5
1         26      286     4.035707        5.0
2        599    44114     3.519161        5.0
3        101     7008     3.927359        4.0
4        101    11443     4.028891        3.0
...      ...      ...          ...        ...
4495     246     2244     3.171138        2.0
4496     104    10299     3.245615        3.0
4497     469     5044     3.291425        3.0
4498     129    36979     3.431199        4.0
4499     563    10009     3.432747        1.0

[4500 rows x 4 columns]


In [45]:
# Evaluate performance specifically over the new training data
evaluate_model_performance(GraphSAGE_model)

movieId,predicted_rating,ground_truth_rating
862,3.77708,3.5
8844,3.360795,4.5
680,4.100924,2.0
88,4.06573,2.5
744,4.644041,5.0
621,3.858717,2.0
597,4.277199,4.5
1366,4.368412,3.5
671,4.634444,5.0
120,4.317852,5.0

movieId,predicted_rating,ground_truth_rating
8358,3.816863,4.0
122,4.193461,5.0
11034,3.608039,2.5
314,4.332638,2.0
675,3.7256,5.0
1726,3.961396,5.0
767,4.515242,5.0
19995,3.587844,3.0
12444,3.753611,5.0
12445,3.718725,5.0


In [46]:
# Load a pretrained model
GraphSAGE_model = GNNRetrainModelHandler.load_pretrained_model(
    pretrained_model_filepath="D:\\Internship\\recsys\data\\temp\\online\\online_updated_GNN_user_bdd62e39-8999-468b-be8a-c36277a93bdc.pth"
    #pretrained_model_filepath="D:\\Internship\\recsys\\data\\temp\\offline\\offline_updated_GNN_model.pth"
    #pretrained_model_filepath="D:\\Internship\\recsys\\data\\temp\\init\\init_GNN_model.pth"
)

In [47]:
# Set the new train set for the re-training of the model
GraphSAGE_model.add_new_train_data(new_movies_df=new_movies_df, new_ratings_df=new_users_ratings_df)

Found 2 new movies to add to 'movies_df': Movies ids [414906, 693134]
Found 5 new ratings to add to 'users_ratings_df'
No new users to add. Users [1000] are already present in 'users_ratings_df'.


## Distillation training

In [48]:
# Load a pretrained model
GraphSAGE_model = GNNRetrainModelHandler.load_pretrained_model(
    pretrained_model_filepath=GraphSAGE_filepath
)

In [49]:
# Set the new train set for the re-training of the model
GraphSAGE_model.add_new_train_data(new_movies_df=new_movies_df, new_ratings_df=new_users_ratings_df)

Found 2 new movies to add to 'movies_df': Movies ids [414906, 693134]
Found 20 new ratings to add to 'users_ratings_df'
Found 1 new users to add to 'users_ratings_df': Users ids [1000]


In [50]:
# Incremental train the model
GraphSAGE_model.distillation_train(
    num_epochs=250,
    lr=0.01,
    temperature=1.0,
    alpha=0.5,
    model_name="distillation_trained_model",
    trained_model_path=trained_models_path
)

Device: 'cuda'
Adaptive patience set to 29 epochs based on num_epochs=250.
Epoch: 001, Loss: 0.7944, Train RMSE: 1.2271, Train MAE: 0.8411, Val RMSE: 1.3346, Val MAE: 1.0302
Epoch: 002, Loss: 3.0079, Train RMSE: 1.2951, Train MAE: 1.1559, Val RMSE: 1.4067, Val MAE: 1.2278
Epoch: 003, Loss: 0.8481, Train RMSE: 2.0002, Train MAE: 1.8913, Val RMSE: 1.9317, Val MAE: 1.7462
Epoch: 004, Loss: 2.0082, Train RMSE: 2.0474, Train MAE: 1.9209, Val RMSE: 1.9550, Val MAE: 1.7701
Epoch: 005, Loss: 2.1032, Train RMSE: 1.7005, Train MAE: 1.5820, Val RMSE: 1.7210, Val MAE: 1.5399
Epoch: 006, Loss: 1.4533, Train RMSE: 1.1118, Train MAE: 0.9624, Val RMSE: 1.3268, Val MAE: 1.1503
Epoch: 007, Loss: 0.6267, Train RMSE: 0.7226, Train MAE: 0.5909, Val RMSE: 0.9797, Val MAE: 0.7846
Epoch: 008, Loss: 0.3492, Train RMSE: 0.8581, Train MAE: 0.6167, Val RMSE: 1.0368, Val MAE: 0.7758
Epoch: 009, Loss: 0.8356, Train RMSE: 0.9152, Train MAE: 0.6739, Val RMSE: 1.0567, Val MAE: 0.7898
Epoch: 010, Loss: 0.8741, Train RM

In [51]:
# Evaluate general performance after distillation training
GraphSAGE_model.evaluate_performance()

Device: 'cuda
'
Test RMSE: 1.1394, Test MAE: 0.9418

      userId  movieId  pred_rating  gt_rating
0        514     9645     2.967564        2.5
1         26      286     3.241055        5.0
2        599    44114     2.779964        5.0
3        101     7008     3.402693        4.0
4        101    11443     3.565405        3.0
...      ...      ...          ...        ...
4495     246     2244     2.929942        2.0
4496     104    10299     3.018107        3.0
4497     469     5044     2.708227        3.0
4498     129    36979     3.044476        4.0
4499     563    10009     3.815342        1.0

[4500 rows x 4 columns]


In [52]:
# Evaluate performance specifically over the new training data
evaluate_model_performance(GraphSAGE_model)

movieId,predicted_rating,ground_truth_rating
862,3.622602,3.5
8844,4.60875,4.5
680,2.833967,2.0
88,3.217451,2.5
744,5.0,5.0
621,1.801937,2.0
597,3.590517,4.5
1366,3.388369,3.5
671,3.983413,5.0
120,4.412644,5.0

movieId,predicted_rating,ground_truth_rating
8358,3.02536,4.0
122,3.841735,5.0
11034,2.850649,2.5
314,3.407903,2.0
675,3.429032,5.0
1726,3.881182,5.0
767,4.490969,5.0
19995,2.98753,3.0
12444,3.381851,5.0
12445,3.37461,5.0


## Fine-tuning

In [53]:
# Load a pretrained model
GraphSAGE_model = GNNRetrainModelHandler.load_pretrained_model(
    pretrained_model_filepath=GraphSAGE_filepath
)

In [54]:
# Set the new train set for the re-training of the model
GraphSAGE_model.add_new_train_data(new_movies_df=new_movies_df, new_ratings_df=new_users_ratings_df)

Found 2 new movies to add to 'movies_df': Movies ids [414906, 693134]
Found 20 new ratings to add to 'users_ratings_df'
Found 1 new users to add to 'users_ratings_df': Users ids [1000]


In [55]:
# Incremental train the model
GraphSAGE_model.fine_tune(
    num_epochs=15,
    lr=0.001,
    model_name="fine_tuned_model",
    trained_model_path=trained_models_path
)

Device: 'cuda'
Adaptive patience set to 20 epochs based on num_epochs=15.
Epoch: 001, Loss: 2.0803, Train RMSE: 1.4003, Train MAE: 1.0229, Val RMSE: 0.9028, Val MAE: 0.6842
Epoch: 002, Loss: 1.9809, Train RMSE: 1.3675, Train MAE: 0.9572, Val RMSE: 0.9128, Val MAE: 0.6902
Epoch: 003, Loss: 1.9271, Train RMSE: 1.3483, Train MAE: 0.9099, Val RMSE: 0.9208, Val MAE: 0.6958
Epoch: 004, Loss: 1.9004, Train RMSE: 1.3368, Train MAE: 0.8895, Val RMSE: 0.9220, Val MAE: 0.6968
Epoch: 005, Loss: 1.8791, Train RMSE: 1.3283, Train MAE: 0.8819, Val RMSE: 0.9171, Val MAE: 0.6935
Epoch: 006, Loss: 1.8514, Train RMSE: 1.3211, Train MAE: 0.8840, Val RMSE: 0.9089, Val MAE: 0.6883
Epoch: 007, Loss: 1.8168, Train RMSE: 1.3151, Train MAE: 0.8950, Val RMSE: 0.9008, Val MAE: 0.6840
Epoch: 008, Loss: 1.7794, Train RMSE: 1.3110, Train MAE: 0.9098, Val RMSE: 0.8954, Val MAE: 0.6827
Epoch: 009, Loss: 1.7445, Train RMSE: 1.3086, Train MAE: 0.9259, Val RMSE: 0.8938, Val MAE: 0.6855
Epoch: 010, Loss: 1.7148, Train RMS

In [56]:
# Evaluate general performance after fine-tuning
GraphSAGE_model.evaluate_performance()

Device: 'cuda
'
Test RMSE: 0.9391, Test MAE: 0.7428

      userId  movieId  pred_rating  gt_rating
0        514     9645     3.756391        2.5
1         26      286     4.042635        5.0
2        599    44114     3.379977        5.0
3        101     7008     3.770851        4.0
4        101    11443     3.880891        3.0
...      ...      ...          ...        ...
4495     246     2244     3.132987        2.0
4496     104    10299     2.900756        3.0
4497     469     5044     2.986651        3.0
4498     129    36979     3.180464        4.0
4499     563    10009     2.379834        1.0

[4500 rows x 4 columns]


In [57]:
# Evaluate performance specifically over the new training data
evaluate_model_performance(GraphSAGE_model)

movieId,predicted_rating,ground_truth_rating
862,3.758345,3.5
8844,1.99801,4.5
680,4.534894,2.0
88,4.329314,2.5
744,4.795349,5.0
621,4.367586,2.0
597,4.55353,4.5
1366,4.574782,3.5
671,4.702748,5.0
120,4.379584,5.0

movieId,predicted_rating,ground_truth_rating
8358,4.129676,4.0
122,4.363535,5.0
11034,3.97705,2.5
314,4.54845,2.0
675,3.992866,5.0
1726,3.709986,5.0
767,4.558208,5.0
19995,3.876826,3.0
12444,4.026474,5.0
12445,4.000809,5.0
