In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split

# Load MovieLens Dataset
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
df = pd.read_csv(url, sep="\t", names=["user_id", "movie_id", "rating", "timestamp"])
df.drop(columns=["timestamp"], inplace=True)

# Convert user & movie IDs to zero-based index
df["user_id"] -= 1
df["movie_id"] -= 1

num_users = df["user_id"].nunique()
num_movies = df["movie_id"].nunique()
num_nodes = num_users + num_movies

# Create Edges (User-Movie Interactions)
edges = torch.tensor([[u, num_users + m] for u, m in zip(df["user_id"], df["movie_id"])], dtype=torch.long).t()

# Define Graph Data
data = Data(edge_index=edges)

# Split data into train & test
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_users = torch.tensor(train_data["user_id"].values, dtype=torch.long)
train_movies = torch.tensor(train_data["movie_id"].values, dtype=torch.long)
train_ratings = torch.tensor(train_data["rating"].values, dtype=torch.float32)

test_users = torch.tensor(test_data["user_id"].values, dtype=torch.long)
test_movies = torch.tensor(test_data["movie_id"].values, dtype=torch.long)
test_ratings = torch.tensor(test_data["rating"].values, dtype=torch.float32)

# GNN for Generating User & Movie Embeddings
class GNNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GNNLayer, self).__init__()
        self.conv = SAGEConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index).relu()

# Neural Collaborative Filtering (NCF) Model
class HybridGNN_NCF(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=32):
        super(HybridGNN_NCF, self).__init__()

        self.embedding_dim = embedding_dim
        self.gnn = GNNLayer(embedding_dim, embedding_dim)  # Keep GNN layer

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        self.fc1 = nn.Linear(embedding_dim, 64)  # Fix input size
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, user, movie, edge_index):
        # Get individual embeddings
        user_emb = self.user_embedding(user)
        movie_emb = self.movie_embedding(movie)

        # Graph embedding processing
        node_features = torch.cat([self.user_embedding.weight, self.movie_embedding.weight], dim=0)
        x_gnn = self.gnn(node_features, edge_index)

        # Extract updated embeddings for specific users & movies
        user_gnn = x_gnn[user]
        movie_gnn = x_gnn[num_users + movie]  # Movies are after users

        # Combine embeddings
        x_final = user_gnn + movie_gnn  # Can use other aggregations like Hadamard product

        # Pass through MLP
        x_final = self.relu(self.fc1(x_final))
        x_final = self.relu(self.fc2(x_final))

        return self.fc3(x_final).squeeze()

# Initialize Model, Loss, Optimizer
embedding_dim = 32
model = HybridGNN_NCF(num_users, num_movies, embedding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train Model
epochs = 200
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(train_users, train_movies, data.edge_index)
    loss = criterion(predictions, train_ratings)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}")

# Evaluate Model
model.eval()
with torch.no_grad():
    test_predictions = model(test_users, test_movies, data.edge_index)
    test_loss = criterion(test_predictions, test_ratings)
print(f"\nTest Loss: {test_loss.item():.4f}")

Epoch 1/200, Loss: 13.591998
Epoch 11/200, Loss: 11.630446
Epoch 21/200, Loss: 7.706192
Epoch 31/200, Loss: 2.342462
Epoch 41/200, Loss: 2.153544
Epoch 51/200, Loss: 1.385339
Epoch 61/200, Loss: 1.388433
Epoch 71/200, Loss: 1.280532
Epoch 81/200, Loss: 1.233403
Epoch 91/200, Loss: 1.201785
Epoch 101/200, Loss: 1.167929
Epoch 111/200, Loss: 1.139678
Epoch 121/200, Loss: 1.114314
Epoch 131/200, Loss: 1.090986
Epoch 141/200, Loss: 1.069610
Epoch 151/200, Loss: 1.050119
Epoch 161/200, Loss: 1.032242
Epoch 171/200, Loss: 1.015859
Epoch 181/200, Loss: 1.000894
Epoch 191/200, Loss: 0.987133

Test Loss: 1.0002



Recommended Movies for User 0: [1114, 199, 1666, 1519, 1524]



Recommended Movies for User 0: [1658, 1659, 1626, 1234, 1319]


In [18]:
import pandas as pd

# Load Movie Titles
movies = pd.read_csv(
    "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
    sep="|", encoding="latin-1", names=["movie_id", "title"], usecols=[0, 1]
)

# Convert movie_id to zero-based index
movies["movie_id"] -= 1
movie_id_to_name = dict(zip(movies["movie_id"], movies["title"]))


In [35]:
def recommend_movies(user_id, top_n=5):
    model.eval()
    with torch.no_grad():
        user_tensor = torch.tensor([user_id] * num_movies, dtype=torch.long)
        movie_tensor = torch.arange(num_movies, dtype=torch.long)
        scores = model(user_tensor, movie_tensor, data.edge_index)
        top_movies = torch.argsort(scores, descending=True)[:top_n].tolist()

    # Convert movie IDs to movie names
    recommended_movies = [movie_id_to_name[movie] for movie in top_movies]
    return recommended_movies, top_movies

In [36]:
# Example: Recommend for User 0
print("\nRecommended Movies for User 0:", recommend_movies(0))


Recommended Movies for User 0: (['Aiqing wansui (1994)', 'Marlene Dietrich: Shadow and Light (1996) ', 'Sleepover (1995)', 'Faust (1994)', 'Death in Brunswick (1991)'], [1535, 1200, 1459, 1366, 1592])


In [37]:
def get_user_rated_movies(user_id, min_rating=4):
    """Return the list of movies the user has rated at least `min_rating`."""
    user_movies = df[(df["user_id"] == user_id) & (df["rating"] >= min_rating)]["movie_id"].tolist()
    return [movie_id_to_name[movie] for movie in user_movies]

# Example: Check User 0's past highly-rated movies
print("User 0's Highly Rated Movies:", get_user_rated_movies(0))


User 0's Highly Rated Movies: ['Three Colors: White (1994)', 'Desperado (1995)', 'Glengarry Glen Ross (1992)', 'Angels and Insects (1995)', 'Groundhog Day (1993)', 'Delicatessen (1991)', 'Hunt for Red October, The (1990)', 'Ed Wood (1994)', 'Star Trek: First Contact (1996)', 'Pillow Book, The (1995)', 'Horseman on the Roof, The (Hussard sur le toit, Le) (1995)', 'Star Trek VI: The Undiscovered Country (1991)', 'So I Married an Axe Murderer (1993)', 'Shawshank Redemption, The (1994)', 'Star Trek: The Wrath of Khan (1982)', 'Independence Day (ID4) (1996)', 'Wallace & Gromit: The Best of Aardman Animation (1996)', 'Wizard of Oz, The (1939)', 'Citizen Kane (1941)', 'Silence of the Lambs, The (1991)', 'Blues Brothers, The (1980)', 'Breaking the Waves (1996)', "Robert A. Heinlein's The Puppet Masters (1994)", 'Three Colors: Blue (1993)', 'Good, The Bad and The Ugly, The (1966)', 'Raiders of the Lost Ark (1981)', 'Jurassic Park (1993)', 'Pulp Fiction (1994)', 'Hot Shots! Part Deux (1993)', 'S

In [38]:
from sklearn.metrics import mean_squared_error

# Get model predictions on test data
model.eval()
with torch.no_grad():
    test_preds = model(test_users, test_movies, data.edge_index)

# Calculate RMSE
rmse = mean_squared_error(test_ratings.numpy(), test_preds.numpy()) ** 0.5
print(f"RMSE of the Model: {rmse:.4f}")

RMSE of the Model: 0.1937


In [39]:
def get_recommendation_diversity(user_id, top_n=5):
    recommended = recommend_movies(user_id, top_n)  # List of movie IDs
    unique_genres = set()

    for movie_id in recommended:
        genres = movies.get(movie_id, [])  # Get genres for the movie
        unique_genres.update(genres)  # Add all genres to the set

    return len(unique_genres) / top_n  # Higher means better diversity

print("Diversity Score:", get_recommendation_diversity(0))

Diversity Score: 0.0


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split

# Load MovieLens Dataset
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
df = pd.read_csv(url, sep="\t", names=["user_id", "movie_id", "rating", "timestamp"])
df.drop(columns=["timestamp"], inplace=True)

# Convert user & movie IDs to zero-based index
df["user_id"] -= 1
df["movie_id"] -= 1

num_users = df["user_id"].nunique()
num_movies = df["movie_id"].nunique()
num_nodes = num_users + num_movies

# Create Edges (User-Movie Interactions)
edges = torch.tensor([[u, num_users + m] for u, m in zip(df["user_id"], df["movie_id"])], dtype=torch.long).t()

# Define Graph Data
data = Data(edge_index=edges)

# Split data into train & test
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_users = torch.tensor(train_data["user_id"].values, dtype=torch.long)
train_movies = torch.tensor(train_data["movie_id"].values, dtype=torch.long)
train_ratings = torch.tensor(train_data["rating"].values, dtype=torch.float32) / 5.0  # Normalize ratings

test_users = torch.tensor(test_data["user_id"].values, dtype=torch.long)
test_movies = torch.tensor(test_data["movie_id"].values, dtype=torch.long)
test_ratings = torch.tensor(test_data["rating"].values, dtype=torch.float32) / 5.0  # Normalize ratings

# GNN for Embeddings
class GNNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GNNLayer, self).__init__()
        self.conv = SAGEConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index).relu()

# Improved Hybrid Model
class HybridGNN_NCF(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=64):
        super(HybridGNN_NCF, self).__init__()

        self.embedding_dim = embedding_dim
        self.gnn = GNNLayer(embedding_dim, embedding_dim)

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        self.dropout = nn.Dropout(0.2)
        self.batch_norm = nn.BatchNorm1d(embedding_dim * 2)  # Fix BatchNorm size

        self.fc1 = nn.Linear(embedding_dim * 2, 128)  # Fix input size
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, user, movie, edge_index):
        user_emb = self.user_embedding(user)
        movie_emb = self.movie_embedding(movie)

        # Apply GNN to enhance embeddings
        node_features = torch.cat([self.user_embedding.weight, self.movie_embedding.weight], dim=0)
        x_gnn = self.gnn(node_features, edge_index)

        user_gnn = x_gnn[user]
        movie_gnn = x_gnn[num_users + movie]

        # **Fix: Use concatenation instead of Hadamard product**
        x_final = torch.cat([user_gnn, movie_gnn], dim=1)

        # **Fix: BatchNorm input dimension**
        x_final = self.batch_norm(x_final)
        x_final = self.dropout(self.relu(self.fc1(x_final)))
        x_final = self.dropout(self.relu(self.fc2(x_final)))

        return self.fc3(x_final).squeeze()

# Initialize Model
embedding_dim = 768  # Increased
model = HybridGNN_NCF(num_users, num_movies, embedding_dim)

criterion = nn.SmoothL1Loss()  # Changed from MSELoss
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # L2 Regularization

# Train Model
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(train_users, train_movies, data.edge_index)
    loss = criterion(predictions, train_ratings)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}")

# Evaluate Model
model.eval()
with torch.no_grad():
    test_predictions = model(test_users, test_movies, data.edge_index)
    test_loss = criterion(test_predictions, test_ratings)
print(f"\nTest Loss: {test_loss.item():.4f}")


Epoch 1/100, Loss: 0.295332
Epoch 11/100, Loss: 0.032594
Epoch 21/100, Loss: 0.027297
Epoch 31/100, Loss: 0.024529
Epoch 41/100, Loss: 0.021798
Epoch 51/100, Loss: 0.020063
Epoch 61/100, Loss: 0.018574
Epoch 71/100, Loss: 0.017687
Epoch 81/100, Loss: 0.016729
Epoch 91/100, Loss: 0.015948

Test Loss: 0.0183


In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split

# Load MovieLens Dataset
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
df = pd.read_csv(url, sep="\t", names=["user_id", "movie_id", "rating", "timestamp"])
df.drop(columns=["timestamp"], inplace=True)

# Convert user & movie IDs to zero-based index
df["user_id"] -= 1
df["movie_id"] -= 1

num_users = df["user_id"].nunique()
num_movies = df["movie_id"].nunique()
num_nodes = num_users + num_movies

# Create Edges (User-Movie Interactions)
edges = torch.tensor([[u, num_users + m] for u, m in zip(df["user_id"], df["movie_id"])], dtype=torch.long).t()

# Define Graph Data
data = Data(edge_index=edges)

# Split data into train & test
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_users = torch.tensor(train_data["user_id"].values, dtype=torch.long)
train_movies = torch.tensor(train_data["movie_id"].values, dtype=torch.long)
train_ratings = torch.tensor(train_data["rating"].values, dtype=torch.float32) / 5.0  # Normalize ratings

test_users = torch.tensor(test_data["user_id"].values, dtype=torch.long)
test_movies = torch.tensor(test_data["movie_id"].values, dtype=torch.long)
test_ratings = torch.tensor(test_data["rating"].values, dtype=torch.float32) / 5.0  # Normalize ratings

# GNN for Embeddings
class GNNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GNNLayer, self).__init__()
        self.conv = SAGEConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index).relu()

# Improved Hybrid Model
class HybridGNN_NCF(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=64):
        super(HybridGNN_NCF, self).__init__()

        self.embedding_dim = embedding_dim
        self.gnn = GNNLayer(embedding_dim, embedding_dim)

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        self.dropout = nn.Dropout(0.2)
        self.batch_norm = nn.BatchNorm1d(embedding_dim * 2)  # Fix BatchNorm size

        self.fc1 = nn.Linear(embedding_dim * 2, 128)  # Fix input size
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, user, movie, edge_index):
        user_emb = self.user_embedding(user)
        movie_emb = self.movie_embedding(movie)

        # Apply GNN to enhance embeddings
        node_features = torch.cat([self.user_embedding.weight, self.movie_embedding.weight], dim=0)
        x_gnn = self.gnn(node_features, edge_index)

        user_gnn = x_gnn[user]
        movie_gnn = x_gnn[num_users + movie]

        # **Fix: Use concatenation instead of Hadamard product**
        x_final = torch.cat([user_gnn, movie_gnn], dim=1)

        # **Fix: BatchNorm input dimension**
        x_final = self.batch_norm(x_final)
        x_final = self.dropout(self.relu(self.fc1(x_final)))
        x_final = self.dropout(self.relu(self.fc2(x_final)))

        return self.fc3(x_final).squeeze()

# Initialize Model
embedding_dim = 768  # Increased
model = HybridGNN_NCF(num_users, num_movies, embedding_dim)

criterion = nn.SmoothL1Loss()  # Changed from MSELoss
optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-5)  # L2 Regularization

# Train Model
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(train_users, train_movies, data.edge_index)
    loss = criterion(predictions, train_ratings)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}")

# Evaluate Model
model.eval()
with torch.no_grad():
    test_predictions = model(test_users, test_movies, data.edge_index)
    test_loss = criterion(test_predictions, test_ratings)
print(f"\nTest Loss: {test_loss.item():.4f}")


Epoch 1/100, Loss: 0.243402
Epoch 11/100, Loss: 0.048323
Epoch 21/100, Loss: 0.029930
Epoch 31/100, Loss: 0.025213
Epoch 41/100, Loss: 0.022965
Epoch 51/100, Loss: 0.021440
Epoch 61/100, Loss: 0.020300
Epoch 71/100, Loss: 0.019200
Epoch 81/100, Loss: 0.018523
Epoch 91/100, Loss: 0.017702

Test Loss: 0.0188


In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split

# Load MovieLens Dataset
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
df = pd.read_csv(url, sep="\t", names=["user_id", "movie_id", "rating", "timestamp"])
df.drop(columns=["timestamp"], inplace=True)

# Convert user & movie IDs to zero-based index
df["user_id"] -= 1
df["movie_id"] -= 1

num_users = df["user_id"].nunique()
num_movies = df["movie_id"].nunique()
num_nodes = num_users + num_movies

# Create Edges (User-Movie Interactions)
edges = torch.tensor([[u, num_users + m] for u, m in zip(df["user_id"], df["movie_id"])], dtype=torch.long).t()

# Define Graph Data
data = Data(edge_index=edges)

# Split data into train & test
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_users = torch.tensor(train_data["user_id"].values, dtype=torch.long)
train_movies = torch.tensor(train_data["movie_id"].values, dtype=torch.long)
train_ratings = torch.tensor(train_data["rating"].values, dtype=torch.float32) / 5.0  # Normalize ratings

test_users = torch.tensor(test_data["user_id"].values, dtype=torch.long)
test_movies = torch.tensor(test_data["movie_id"].values, dtype=torch.long)
test_ratings = torch.tensor(test_data["rating"].values, dtype=torch.float32) / 5.0  # Normalize ratings

# GNN for Embeddings
class GNNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GNNLayer, self).__init__()
        self.conv = SAGEConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index).relu()

# Improved Hybrid Model
class HybridGNN_NCF(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=64):
        super(HybridGNN_NCF, self).__init__()

        self.embedding_dim = embedding_dim
        self.gnn = GNNLayer(embedding_dim, embedding_dim)

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        self.dropout = nn.Dropout(0.2)
        self.batch_norm = nn.BatchNorm1d(embedding_dim * 2)  # Fix BatchNorm size

        self.fc1 = nn.Linear(embedding_dim * 2, 786)  # Fix input size
        self.fc2 = nn.Linear(786, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, user, movie, edge_index):
        user_emb = self.user_embedding(user)
        movie_emb = self.movie_embedding(movie)

        # Apply GNN to enhance embeddings
        node_features = torch.cat([self.user_embedding.weight, self.movie_embedding.weight], dim=0)
        x_gnn = self.gnn(node_features, edge_index)

        user_gnn = x_gnn[user]
        movie_gnn = x_gnn[num_users + movie]

        # **Fix: Use concatenation instead of Hadamard product**
        x_final = torch.cat([user_gnn, movie_gnn], dim=1)

        # **Fix: BatchNorm input dimension**
        x_final = self.batch_norm(x_final)
        x_final = self.dropout(self.relu(self.fc1(x_final)))
        x_final = self.dropout(self.relu(self.fc2(x_final)))
        x_final = self.dropout(self.relu(self.fc3(x_final)))
        x_final = self.dropout(self.relu(self.fc4(x_final)))

        return self.fc5(x_final).squeeze()

# Initialize Model
embedding_dim = 768  # Increased
model = HybridGNN_NCF(num_users, num_movies, embedding_dim)

criterion = nn.SmoothL1Loss()  # Changed from MSELoss
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)  # L2 Regularization

# Train Model
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(train_users, train_movies, data.edge_index)
    loss = criterion(predictions, train_ratings)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}")

# Evaluate Model
model.eval()
with torch.no_grad():
    test_predictions = model(test_users, test_movies, data.edge_index)
    test_loss = criterion(test_predictions, test_ratings)
print(f"\nTest Loss: {test_loss.item():.4f}")


Epoch 1/100, Loss: 0.358624
Epoch 11/100, Loss: 0.055116
Epoch 21/100, Loss: 0.032197


KeyboardInterrupt: 

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import torch

# Load movie metadata (MovieLens 100K dataset)
movies_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"
movies_df = pd.read_csv(movies_url, sep="|", encoding="latin-1", header=None, usecols=[0, 1], names=["movie_id", "title"])

# Load Sentence-BERT (MiniLM)
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for movie titles
movie_embeddings = sbert_model.encode(movies_df["title"].tolist(), convert_to_tensor=True)

# Convert to PyTorch embedding layer
movie_embedding_layer = nn.Embedding.from_pretrained(movie_embeddings, freeze=True)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load MovieLens Dataset (100K)
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
df = pd.read_csv(url, sep="\t", names=["user_id", "movie_id", "rating", "timestamp"])
df.drop(columns=["timestamp"], inplace=True)

# Convert user & movie IDs to zero-based index
df["user_id"] -= 1
df["movie_id"] -= 1

num_users = df["user_id"].nunique()
num_movies = df["movie_id"].nunique()
num_nodes = num_users + num_movies

# Create Graph Edges (User-Movie Interactions)
edges = torch.tensor([[u, num_users + m] for u, m in zip(df["user_id"], df["movie_id"])], dtype=torch.long).t()
data = Data(edge_index=edges).to(device)

# Load Movie Titles
movies_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"
movies_df = pd.read_csv(movies_url, sep="|", encoding="latin-1", header=None, usecols=[0, 1], names=["movie_id", "title"])
movies_df["movie_id"] -= 1  # Zero-based index
movies_df = movies_df.sort_values("movie_id").reset_index(drop=True)

# Load Sentence-BERT for Movie Titles (Ensure 768-D output)
sbert_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")  # 768-D output
movie_embeddings = sbert_model.encode(movies_df["title"].tolist(), convert_to_tensor=True)

# Check BERT output size
embedding_dim = movie_embeddings.shape[1]  # 768

# Create movie embedding layer
movie_embedding_layer = nn.Embedding.from_pretrained(movie_embeddings, freeze=True).to(device)

# Split data into train & test
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Convert to PyTorch tensors and move to device
train_users = torch.tensor(train_data["user_id"].values, dtype=torch.long).to(device)
train_movies = torch.tensor(train_data["movie_id"].values, dtype=torch.long).to(device)
train_ratings = torch.tensor(train_data["rating"].values, dtype=torch.float32, device=device) / 5.0  # Normalize

test_users = torch.tensor(test_data["user_id"].values, dtype=torch.long).to(device)
test_movies = torch.tensor(test_data["movie_id"].values, dtype=torch.long).to(device)
test_ratings = torch.tensor(test_data["rating"].values, dtype=torch.float32, device=device) / 5.0  # Normalize

# Define GNN Layer
class GNNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GNNLayer, self).__init__()
        self.conv = SAGEConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index).relu()

# Hybrid Model with BERT & GNN
class HybridBERT_GNN_NCF(nn.Module):
    def __init__(self, num_users, embedding_dim=768):  # Ensure 768-D embedding
        super(HybridBERT_GNN_NCF, self).__init__()

        self.embedding_dim = embedding_dim
        self.gnn = GNNLayer(embedding_dim, embedding_dim)

        # GNN-based User Embeddings (Match BERT size)
        self.user_embedding = nn.Embedding(num_users, embedding_dim)

        # Pretrained BERT Movie Embeddings
        self.movie_embedding = movie_embedding_layer

        # Fully Connected Layers
        self.fc1 = nn.Linear(embedding_dim * 2, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.batch_norm2 = nn.BatchNorm1d(256)

    def forward(self, user, movie, edge_index):
        # User embeddings from GNN
        user_emb = self.user_embedding(user)

        # Movie embeddings from pre-trained BERT
        movie_emb = self.movie_embedding(movie)

        # Apply GNN for user embeddings
        node_features = torch.cat([self.user_embedding.weight, self.movie_embedding.weight], dim=0)
        x_gnn = self.gnn(node_features, edge_index)

        user_gnn = x_gnn[user]
        movie_gnn = movie_emb  # Directly use BERT embeddings

        # Concatenate user & movie embeddings
        x_final = torch.cat([user_gnn, movie_gnn], dim=1)

        # Fully connected layers
        x_final = self.relu(self.fc1(x_final))
        x_final = self.batch_norm1(x_final)
        x_final = self.dropout(self.relu(self.fc2(x_final)))
        x_final = self.batch_norm2(x_final)
        x_final = self.dropout(self.relu(self.fc3(x_final)))

        return self.fc4(x_final).squeeze()

# Initialize Model
model = HybridBERT_GNN_NCF(num_users, embedding_dim).to(device)

# Loss and Optimizer
criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)

# Train Model
epochs = 50
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(train_users, train_movies, data.edge_index)
    loss = criterion(predictions, train_ratings)
    loss.backward()
    optimizer.step()

    if epoch % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}")

# Evaluate Model
model.eval()
with torch.no_grad():
    test_predictions = model(test_users, test_movies, data.edge_index)
    test_loss = criterion(test_predictions, test_ratings)

print(f"\nTest Loss: {test_loss.item():.4f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch 1/50, Loss: 0.473459
Epoch 6/50, Loss: 0.100004


In [2]:
pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.0.0+cpu.html


Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_scatter-2.1.2%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (494 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.0/494.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_sparse-0.6.18%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_cluster-1.6.3%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (750 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.9/750.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-spline-conv
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_spline_conv-1.

In [47]:
model = HybridBERT_GNN_NCF(num_users, embedding_dim).to(device)

In [48]:
train_users = train_users.to(device)
train_movies = train_movies.to(device)
train_ratings = train_ratings.to(device)

test_users = test_users.to(device)
test_movies = test_movies.to(device)
test_ratings = test_ratings.to(device)

data.edge_index = data.edge_index.to(device)


In [49]:
movie_embedding_layer = nn.Embedding.from_pretrained(movie_embeddings.to(device), freeze=True)


In [51]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [52]:
print(f"train_users device: {train_users.device}")
print(f"train_movies device: {train_movies.device}")
print(f"train_ratings device: {train_ratings.device}")
print(f"Edge index device: {data.edge_index.device}")


train_users device: cpu
train_movies device: cpu
train_ratings device: cpu
Edge index device: cpu
