In [6]:
# Mount Google Drive so the CSV files under /content/drive/MyDrive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Display the installed torch build; the PyG extension wheels installed below are specific to it.
import torch
torch.__version__

'2.0.1+cu118'

In [8]:
!pip install sentence-transformers
!pip install torch_geometric
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.1+cu118.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-2.0.1+cu118.html


In [24]:
import os
import re
import csv
import pandas as pd
from sentence_transformers import SentenceTransformer
from torch import Tensor
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import HeteroConv, GCNConv, SAGEConv, GATConv, Linear, to_hetero
import torch.nn.functional as F
import tqdm
from sklearn.metrics import roc_auc_score

# Preprocessing: merge the FOUR "combined_data_*.txt" files into ONE "ratings.csv" file

In [5]:
def process_data(input_files, output_file):
    """Merge movie-grouped rating files into a single flat CSV.

    Each input file is expected to consist of ``<movie_id>:`` header lines, each
    followed by ``<user_id>,<rating>,<YYYY-MM-DD>`` rows that belong to that
    movie. The output CSV has the columns ``user_id, rating, date, movie_id``.

    Rows are written as they are parsed, so memory use stays constant no matter
    how large the inputs are. If an error is raised, the output file may already
    contain the rows parsed before the offending line.

    Args:
        input_files: Iterable of paths to the raw text files, processed in order.
        output_file: Path of the CSV file to create (overwritten if it exists).

    Raises:
        ValueError: If a non-blank line is neither a movie header nor a rating
            row, or if a rating row appears before any movie header in its file.
    """
    pattern_movie_id = re.compile(r'^(\d+):\s*$')
    pattern_data = re.compile(r'^(\d+),\s*([\d.]+),\s*(\d{4}-\d{2}-\d{2})$')

    with open(output_file, 'w', newline='') as w:
        writer = csv.writer(w)
        writer.writerow(['user_id', 'rating', 'date', 'movie_id'])

        for file in input_files:
            # Reset per file so a header from the previous file never leaks over.
            movie_id = None
            print(file)

            with open(file, 'r') as f:
                for line_no, line in enumerate(f, start=1):
                    line = line.strip()
                    if not line:
                        # Blank lines (e.g. a trailing newline) carry no data.
                        continue

                    match_movie_id = pattern_movie_id.match(line)
                    if match_movie_id:
                        movie_id = match_movie_id.group(1)
                        continue

                    match_data = pattern_data.match(line)
                    if match_data is None:
                        raise ValueError(
                            f'Found neither MovieId nor Data in {file} '
                            f'at line {line_no}: {line!r}')
                    if movie_id is None:
                        # Without a preceding header the row cannot be attributed.
                        raise ValueError(
                            f'Data row before any MovieId in {file} '
                            f'at line {line_no}: {line!r}')

                    cust_id, rating, date = match_data.groups()
                    writer.writerow([cust_id, rating, date, movie_id])

    print("CSV file created successfully.")

# Raw rating files to merge and the name of the CSV to produce.
# Note: these are Kaggle paths; change them to the correct data path and output name.
input_files = ['/kaggle/input/netflix-prize-data/combined_data_1.txt', 
               '/kaggle/input/netflix-prize-data/combined_data_2.txt', 
               '/kaggle/input/netflix-prize-data/combined_data_3.txt', 
               '/kaggle/input/netflix-prize-data/combined_data_4.txt']
output_file = 'ratings.csv'

# Uncomment the following line to run the preprocessing
# process_data(input_files, output_file)

In [6]:
# Read the sampled ratings file (columns: user_id, rating, date, movie_id).
# NOTE(review): the displayed frame also carries an `Unnamed: 0` column, so the CSV was
# presumably saved together with its index -- confirm before relying on column positions.
ratings_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/111_2_SNA/sampled_ratings.csv')
# The commented-out lines below look like the one-off sampling step that produced the file
# (sort by user, keep the first 100154 rows, save) -- kept for reference only.
# ratings_df = ratings_df.sort_values(by='user_id').reset_index(drop=True)
# ratings_df = ratings_df.iloc[:100154]
# ratings_df.to_csv('sampled_ratings.csv', encoding='utf-8', index=False)
ratings_df

Unnamed: 0,user_id,rating,date,movie_id
0,6,2,2005-12-04,14358
1,6,4,2005-01-12,6134
2,6,4,2005-10-26,5926
3,6,3,2004-11-10,6797
4,6,3,2005-12-04,3905
...,...,...,...,...
100149,2996,5,2004-10-18,17157
100150,2996,5,2004-10-14,11573
100151,2996,4,2004-10-14,10123
100152,2996,4,2005-02-25,14103


# Handle malformed lines in the movie_titles.csv file

In [7]:
def handle_bad_lines(line):
    """Repair a movie-titles row that was split into more than three fields.

    A title that itself contains commas is split by the CSV parser into several
    fields. This handler rebuilds the row as ``(movie_id, release_year, title)``
    by gluing every field after the second back together.

    Args:
        line: Sequence of raw field values for the malformed row, in file order.

    Returns:
        Tuple ``(movie_id, release_year, combined_title)`` with the two leading
        fields converted to ``int`` and the title stripped of outer whitespace.

    Raises:
        ValueError: If the first or second field is not an integer literal.
        IndexError: If the row has fewer than two fields.
    """
    fields = [str(field) for field in line]
    movie_id = int(fields[0])
    release_year = int(fields[1])
    # Re-insert the commas the parser consumed; joining with '' would silently
    # drop them and alter the title (e.g. "Love, Actually" -> "Love Actually").
    combined_title = ','.join(fields[2:]).strip()
    return movie_id, release_year, combined_title

In [9]:
# ratings_df.to_csv('sampled_ratings.csv', encoding='utf-8')

In [8]:
# Read movie_titles.csv
# movie_titles_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/111_2_SNA/movie_titles.csv', 
#                               names = ['movie_id', 'release_year', 'movie_title'], 
#                               encoding='ISO-8859-1', 
#                               engine='python', 
#                               on_bad_lines=handle_bad_lines)
# movie_titles_df['release_year'] = movie_titles_df['release_year'].astype('Int64')
# movie_titles_df

Unnamed: 0,movie_id,release_year,movie_title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [None]:
# movie_titles_df.to_csv('/content/drive/MyDrive/Colab Notebooks/111_2_SNA/movie_titles.csv', 
#                        encoding='utf-8',
#                        index=False)

# Read node and edge data from the two CSV files
＊The movie features are created at the same time

In [1]:
# Locations of the two CSV inputs on the mounted Google Drive.
movie_path = '/content/drive/MyDrive/Colab Notebooks/111_2_SNA/movie_titles.csv'
rating_path = '/content/drive/MyDrive/Colab Notebooks/111_2_SNA/sampled_ratings.csv'

In [2]:
def load_node_csv(path, index_col, encoders=None, **kwargs):
    """Load a node table and index its distinct identifiers.

    Args:
        path: CSV file to read.
        index_col: Column holding the raw node identifier; used as the frame index.
        encoders: Optional dict mapping a column name to a callable that turns
            that column into a tensor. The results are concatenated along the
            last dimension.
        **kwargs: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        Tuple ``(x, mapping)``: ``x`` is the concatenated feature tensor (or
        ``None`` when no encoders are given) and ``mapping`` maps each distinct
        raw identifier to a consecutive integer in order of first appearance.
    """
    frame = pd.read_csv(path, index_col=index_col, **kwargs)

    # Distinct identifiers, in first-seen order, numbered 0..n-1.
    unique_ids = frame.index.unique()
    mapping = dict(zip(unique_ids, range(len(unique_ids))))

    if encoders is None:
        return None, mapping

    encoded_columns = []
    for column, encode in encoders.items():
        encoded_columns.append(encode(frame[column]))
    return torch.cat(encoded_columns, dim=-1), mapping

## Create movie node data
＊Using a Language Model to encode 'movie_title' as a feature of movie nodes

In [3]:
class SequenceEncoder:
    """Callable that embeds a column of text with a SentenceTransformer model.

    Args:
        model_name: Name of the SentenceTransformer checkpoint to load.
        device: Device passed both to the model constructor and to ``encode``.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, df):
        """Encode ``df.values`` and return the embeddings as a CPU tensor."""
        # Gradients are never needed for the frozen text encoder.
        with torch.no_grad():
            embeddings = self.model.encode(
                df.values,
                show_progress_bar=True,
                convert_to_tensor=True,
                device=self.device,
            )
            return embeddings.cpu()

In [10]:
# Encode every movie title into a sentence embedding (`movie_x`) and build
# `movie_mapping`, the raw movie_id -> consecutive index lookup.
movie_x, movie_mapping = load_node_csv(
    movie_path, index_col='movie_id', encoders={
        'movie_title': SequenceEncoder()
    })

Batches:   0%|          | 0/556 [00:00<?, ?it/s]

## Create user node data

In [11]:
# No encoders are passed for users, so the feature output is discarded; only the
# raw user_id -> consecutive index mapping is kept.
_, user_mapping = load_node_csv(rating_path, index_col='user_id')

## Create edge data

In [14]:
def load_edge_csv(path, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    """Load an edge table and translate raw identifiers into node indices.

    Args:
        path: CSV file with one row per edge.
        src_index_col: Column holding the raw identifier of the source node.
        src_mapping: Dict from raw source identifier to consecutive node index.
        dst_index_col: Column holding the raw identifier of the destination node.
        dst_mapping: Dict from raw destination identifier to consecutive node index.
        encoders: Optional dict mapping a column name to a callable that turns
            that column into a tensor with one row per edge.
        **kwargs: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        Tuple ``(edge_index, edge_attr)``: ``edge_index`` is a ``torch.long``
        tensor of shape ``(2, num_edges)``; ``edge_attr`` is the encoder outputs
        concatenated along the last dimension, or ``None`` without encoders.

    Raises:
        KeyError: If an identifier in the file is missing from its mapping.
    """
    df = pd.read_csv(path, **kwargs)

    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    # Pin the dtype so an empty edge list still yields an integer index tensor.
    edge_index = torch.tensor([src, dst], dtype=torch.long)

    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        # Concatenate along the feature axis (as `load_node_csv` does) so several
        # encoders give (num_edges, k) rather than stacking rows to (k * num_edges, 1).
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr

In [15]:
class IdentityEncoder:
    """Callable that passes a numeric column through as a column-vector tensor.

    Args:
        dtype: Target dtype handed to ``Tensor.to`` after reshaping.
    """

    def __init__(self, dtype=None):
        self.dtype = dtype

    def __call__(self, df):
        """Return ``df.values`` as a tensor of shape ``(len(df), 1)``."""
        flat = torch.from_numpy(df.values)
        column_vector = flat.view(-1, 1)
        return column_vector.to(self.dtype)

In [16]:
# Build the user -> movie edge list from the ratings file. The `rating` column is
# returned alongside as an integer tensor with one row per edge (`edge_label`).
edge_index, edge_label = load_edge_csv(
    rating_path,
    src_index_col='user_id',
    src_mapping=user_mapping,
    dst_index_col='movie_id',
    dst_mapping=movie_mapping,
    encoders={'rating': IdentityEncoder(dtype=torch.long)},
)

# Create a Heterogeneous Graph
＊Nodes: {user, movie: ['feature']}; edges: {(user, rates, movie)}

In [17]:
# Assemble the bipartite user/movie graph.
data = HeteroData()

# Save node indices:
data["user"].node_id = torch.arange(len(user_mapping))
data["movie"].node_id = torch.arange(len(movie_mapping))

# Add the node features and edge indices:
# (only movies receive a feature matrix `x`; no features are assigned to users here)
data["movie"].x = movie_x
data["user", "rates", "movie"].edge_index = edge_index

# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

print(data)

HeteroData(
  [1muser[0m={ node_id=[524] },
  [1mmovie[0m={
    node_id=[17770],
    x=[17770, 384]
  },
  [1m(user, rates, movie)[0m={ edge_index=[2, 100154] },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 100154] }
)


# Defining Edge-level Training Splits

In [18]:
# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:
# NOTE(review): no random seed is set in this notebook, so the split presumably
# differs between runs -- confirm whether reproducibility is required.
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user") 
)

# The transform returns one graph per split; only train and validation are printed.
train_data, val_data, test_data = transform(data)
print("Training data:")
print("==============")
print(train_data)
print()
print("Validation data:")
print("================")
print(val_data)

Training data:
HeteroData(
  [1muser[0m={ node_id=[524] },
  [1mmovie[0m={
    node_id=[17770],
    x=[17770, 384]
  },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 56087],
    edge_label=[24037],
    edge_label_index=[2, 24037]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 56087] }
)

Validation data:
HeteroData(
  [1muser[0m={ node_id=[524] },
  [1mmovie[0m={
    node_id=[17770],
    x=[17770, 384]
  },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 80124],
    edge_label=[30045],
    edge_label_index=[2, 30045]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 80124] }
)


# Defining Mini-batch Loaders

In [19]:
# In the first hop, we sample at most 20 neighbors.
# In the second hop, we sample at most 10 neighbors.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
# We can make use of the `loader.LinkNeighborLoader` from PyG:
# (this name is already imported in the imports cell; the re-import is redundant but harmless)
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
# NOTE: this rebinds the notebook-level name `edge_label`, which previously held
# the ratings tensor returned by `load_edge_csv`.
edge_label_index = train_data["user", "rates", "movie"].edge_label_index
edge_label = train_data["user", "rates", "movie"].edge_label

# With batch_size=128 and neg_sampling_ratio=2.0 each batch carries 128 positive
# seed edges plus 256 sampled negatives (384 labels, as the printed sample shows).
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True,
)

# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

Sampled mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[511],
    n_id=[511]
  },
  [1mmovie[0m={
    node_id=[2680],
    x=[2680, 384],
    n_id=[2680]
  },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 15738],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[15738],
    input_id=[128]
  },
  [1m(movie, rev_rates, user)[0m={
    edge_index=[2, 6516],
    e_id=[6516]
  }
)


# Creating a Heterogeneous Link-level GNN

In [20]:
class GNN(torch.nn.Module):
    """Two stacked `SAGEConv` layers with a ReLU in between.

    Args:
        hidden_channels: Input and output width of both convolution layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()

        # Both layers keep the embedding width at `hidden_channels`.
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        """Run both convolutions over `edge_index`; no activation after the last."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)

# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    """Scores each candidate edge by the dot product of its endpoint embeddings."""

    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        """Return one score per column of `edge_label_index`.

        Row 0 of `edge_label_index` indexes into `x_user`, row 1 into `x_movie`.
        """
        user_rows, movie_rows = edge_label_index[0], edge_label_index[1]

        # Gather the two endpoint embeddings of every supervision edge ...
        paired = x_user[user_rows] * x_movie[movie_rows]

        # ... and reduce over the feature axis to finish the dot product.
        return torch.sum(paired, dim=-1)


class Model(torch.nn.Module):
    """Link-prediction model: input embeddings -> heterogeneous GNN -> dot-product scorer.

    The constructor reads the notebook-level `data` graph for the node counts
    and the metadata handed to `to_hetero`, so that graph must exist beforehand.

    Args:
        hidden_channels: Width of all learned embeddings and GNN layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Movies combine a linear projection of their 384-dim feature vector with
        # a learned per-movie embedding; users only have a learned embedding.
        self.movie_lin = torch.nn.Linear(384, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)

        # Build the homogeneous GNN and convert it to a heterogeneous variant in one step.
        self.gnn = to_hetero(GNN(hidden_channels), metadata=data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        """Return one raw score per supervision edge of the (mini-batch) graph."""
        user_store = data["user"]
        movie_store = data["movie"]

        # Initial features per node type, looked up via the `n_id` node indices.
        user_x = self.user_emb(user_store.n_id)
        movie_x = self.movie_lin(movie_store.x) + self.movie_emb(movie_store.n_id)

        # The GNN consumes a dict of features per node type plus the dict of
        # edge indices per edge type.
        node_embeddings = self.gnn({"user": user_x, "movie": movie_x}, data.edge_index_dict)

        supervision_edges = data["user", "rates", "movie"].edge_label_index
        return self.classifier(
            node_embeddings["user"],
            node_embeddings["movie"],
            supervision_edges,
        )

        
# Instantiate the model with 64-dimensional embeddings and show its layers.
model = Model(hidden_channels=64)

print(model)

Model(
  (movie_lin): Linear(in_features=384, out_features=64, bias=True)
  (user_emb): Embedding(524, 64)
  (movie_emb): Embedding(17770, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)


# Training the Heterogeneous Link-level GNN

In [22]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Five passes over the training loader.
for epoch in range(1, 6):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        sampled_data.to(device)
        pred = model(sampled_data)

        # `pred` holds raw logits, hence the "with_logits" form of the loss.
        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

        loss.backward()
        optimizer.step()
        # Weight each batch loss by its number of predictions so the value
        # printed per epoch is an average over examples, not over batches.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


100%|██████████| 188/188 [00:10<00:00, 18.73it/s]


Epoch: 001, Loss: 0.2363


100%|██████████| 188/188 [00:11<00:00, 16.86it/s]


Epoch: 002, Loss: 0.2263


100%|██████████| 188/188 [00:13<00:00, 13.59it/s]


Epoch: 003, Loss: 0.2178


100%|██████████| 188/188 [00:08<00:00, 21.37it/s]


Epoch: 004, Loss: 0.2129


100%|██████████| 188/188 [00:10<00:00, 17.10it/s]

Epoch: 005, Loss: 0.2052





# Evaluating the Heterogeneous Link-level GNN

In [23]:
# Define the validation seed edges:
# (this rebinds `edge_label_index` / `edge_label`, which held the training seeds before)
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label

# No `neg_sampling_ratio` here: the loader only iterates over the labelled edges
# already stored in `val_data`. Batches are 3 * 128 edges and are not shuffled.
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=3 * 128,
    shuffle=False,
)

# Inspect a sample:
sampled_data = next(iter(val_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

Sampled mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[506],
    n_id=[506]
  },
  [1mmovie[0m={
    node_id=[2452],
    x=[2452, 384],
    n_id=[2452]
  },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 17511],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[17511],
    input_id=[384]
  },
  [1m(movie, rev_rates, user)[0m={
    edge_index=[2, 6603],
    e_id=[6603]
  }
)


In [25]:
# Collect the model scores and the matching labels over the whole validation loader.
preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(val_loader):
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)

# Concatenate the per-batch results and move them to NumPy for scikit-learn.
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
# The raw (un-squashed) model scores are passed directly to the AUC computation.
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")

100%|██████████| 79/79 [00:01<00:00, 45.57it/s]


Validation AUC: 0.9480



