In [None]:
!pip install torch_geometric

In [None]:
import pandas as pd
import numpy as np
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, GATConv, to_hetero, LayerNorm,BatchNorm
from torch_geometric.data import HeteroData
from torch.nn import Linear, Dropout
import torch.optim as optim
from torch_geometric.loader import DataLoader
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

In [None]:
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE encoder.

    Applies ``SAGEConv(in_channels -> hidden_channels)``, a ReLU, and then
    ``SAGEConv(hidden_channels -> out_channels)``. No activation follows the
    second layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the second layer's output for node features ``x``."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

class LinkPredictor(torch.nn.Module):
    """Heterogeneous GraphSAGE encoder with a cosine-similarity link decoder.

    Args:
        in_channels: Input feature size passed to the GraphSAGE encoder.
        hidden_channels: Hidden size of the encoder.
        out_channels: Embedding size produced by the encoder.
        metadata: ``(node_types, edge_types)`` tuple handed to ``to_hetero``.
            When omitted, the metadata of the notebook-level ``data`` graph is
            used, so that graph must already exist in that case.

    Raises:
        NameError: If ``metadata`` is omitted and no global ``data`` exists.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, metadata=None):
        super().__init__()
        if metadata is None:
            # Backward-compatible fallback: read the graph defined at notebook level.
            try:
                metadata = data.metadata()
            except NameError as exc:
                raise NameError(
                    "LinkPredictor needs graph metadata: pass `metadata=` or "
                    "build the global `data` graph before creating the model."
                ) from exc
        self.gnn = to_hetero(
            GraphSAGE(in_channels, hidden_channels, out_channels), metadata
        )

    def encode(self, x_dict, edge_index_dict):
        """Return the per-node-type embedding dict produced by the encoder."""
        return self.gnn(x_dict, edge_index_dict)

    def decode(self, z_dict, edge_label_index):
        """Score candidate user -> movie links.

        Row 0 of ``edge_label_index`` indexes ``z_dict['user']`` and row 1
        indexes ``z_dict['movie']``.

        Returns:
            The cosine similarity of each (user, movie) pair, a value in
            [-1, 1] that is used as an un-squashed score (logit).
        """
        user_emb = z_dict['user'][edge_label_index[0]]
        item_emb = z_dict['movie'][edge_label_index[1]]

        # No sigmoid here: train()/evaluate() feed this output to
        # binary_cross_entropy_with_logits and torch.sigmoid, both of which
        # apply the sigmoid themselves. Squashing here as well would do it twice.
        return F.cosine_similarity(user_emb, item_emb, dim=1)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode the graph, then score the edges in ``edge_label_index``."""
        z_dict = self.encode(x_dict, edge_index_dict)
        return self.decode(z_dict, edge_label_index)

In [None]:
# NOTE(review): LinkPredictor reads the notebook-level `data` graph while it is
# being constructed, but `data` is only assigned in a later cell. On a fresh
# kernel this cell raises NameError; it has to run after the graph is built.
# in_channels=10 must equal the width of the node feature matrices.
model = LinkPredictor(in_channels=10, hidden_channels=32, out_channels=10)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)  # nn.Module.to moves parameters in place and returns the module
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Sentiment-labelled tables, stacked row-wise under a fresh 0..N-1 index.
# NOTE(review): 'data_movie_1m4_3rd.csv' is read twice (first and third part).
# This looks like a copy-paste slip — confirm whether another file was intended.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_movie_1m4_3rd.csv',
        'data_movie_1m_1st.csv',
        'data_movie_1m4_3rd.csv',
    )
)
df = pd.concat([df1, df2, df3], ignore_index=True)

# Attach the sentiment label as `sent_score`. The assignment aligns on the row
# index — assumes df_movie rows line up with the stacked tables; TODO confirm.
df_movie = (
    pd.read_csv('df_movie.csv')
    .assign(rating=df['sentiment_label_int'])
    .rename(columns={'rating': 'sent_score'})
)

df_item_embed = pd.read_csv('item_embeddings_movie_sen_only.csv')
df_user_embed = pd.read_csv('user_embeddings_movie_sen_only.csv')

In [None]:
# Keep an interaction only when its user and its item both appear in the
# embedding tables and its sentiment score is at least 3.
df_movie = df_movie[
    df_movie['userID'].isin(df_user_embed['userID'])
    & df_movie['itemID'].isin(df_item_embed['itemID'])
    & (df_movie['sent_score'] >= 3)
]

In [None]:
# Seeded generator so the random node features are identical on every run
# (the previous unseeded np.random.rand made results irreproducible).
feature_rng = np.random.default_rng(42)

# Map every remaining user id to a contiguous row index 0..num_users-1.
user_movie = df_movie['userID'].unique()
num_users = len(user_movie)
user_movie_map = {user_id: i for i, user_id in enumerate(user_movie)}
# NOTE(review): node features are uniform random values in [0, 1); the loaded
# embedding tables are only used for filtering — confirm this is intended.
# The width (10) has to match `in_channels` of the model.
user_vectors = feature_rng.random((num_users, 10))
user_vectors = torch.from_numpy(user_vectors).to(torch.float)

# Same contiguous indexing and random features for movies.
movie_ids = df_movie['itemID'].unique()
num_movies = len(movie_ids)
movie_map = {movie_id: i for i, movie_id in enumerate(movie_ids)}
item_vectors = feature_rng.random((num_movies, 10))
item_vectors = torch.from_numpy(item_vectors).to(torch.float)

# Edge index of shape (2, num_interactions): row 0 = user index, row 1 = movie index.
user_movie_ids = df_movie['userID'].map(user_movie_map).to_numpy()
item_movie_ids = df_movie['itemID'].map(movie_map).to_numpy()
movie_edges_matrix = np.vstack((user_movie_ids, item_movie_ids))
movie_edges_matrix = torch.from_numpy(movie_edges_matrix).to(torch.long)

In [None]:
# Assemble the bipartite user-movie graph: one feature matrix per node type and
# a single ('user', 'rates', 'movie') edge type.
graph = HeteroData()
graph['user'].x = user_vectors
graph['movie'].x = item_vectors
graph['user', 'rates', 'movie'].edge_index = movie_edges_matrix

# Add the reverse direction as well; the link-split cell refers to that edge
# type as ('movie', 'rev_rates', 'user').
data = T.ToUndirected()(graph)
data

In [None]:
def train(loader):
    """Run one optimisation pass over ``loader`` and report loss and AUC.

    Uses the notebook-level ``model``, ``optimizer`` and ``device``.

    Args:
        loader: Iterable of heterogeneous graph batches whose
            ``('user', 'rates', 'movie')`` store carries ``edge_label`` and
            ``edge_label_index``.

    Returns:
        ``(mean_loss, auc)``: the average of the per-batch losses and the
        ROC AUC computed over every labelled edge seen in the pass.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    all_labels = []
    all_preds = []

    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        rates = batch['user', 'rates', 'movie']
        edge_label = rates.edge_label

        out = model(batch.x_dict, batch.edge_index_dict, rates.edge_label_index)
        loss = F.binary_cross_entropy_with_logits(out, edge_label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        all_labels.extend(edge_label.cpu().numpy())
        all_preds.extend(torch.sigmoid(out).detach().cpu().numpy())

    auc_score = roc_auc_score(all_labels, all_preds)
    # Each loss is already a mean over its batch, so average over the number of
    # batches (dividing by the number of graphs in the dataset would shrink the
    # value whenever a batch holds more than one graph).
    return total_loss / num_batches, auc_score

def evaluate(loader):
    """Compute loss and ROC AUC over ``loader`` without updating the model.

    Uses the notebook-level ``model`` and ``device``; gradients are disabled.

    Args:
        loader: Iterable of heterogeneous graph batches whose
            ``('user', 'rates', 'movie')`` store carries ``edge_label`` and
            ``edge_label_index``.

    Returns:
        ``(mean_loss, auc)``: the average of the per-batch losses and the
        ROC AUC computed over every labelled edge in the loader.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            rates = batch['user', 'rates', 'movie']
            edge_label = rates.edge_label

            out = model(batch.x_dict, batch.edge_index_dict, rates.edge_label_index)
            loss = F.binary_cross_entropy_with_logits(out, edge_label)

            total_loss += loss.item()
            num_batches += 1

            all_labels.extend(edge_label.cpu().numpy())
            all_preds.extend(torch.sigmoid(out).detach().cpu().numpy())

    auc_score = roc_auc_score(all_labels, all_preds)
    # Per-batch losses are means already, so average over the batch count
    # rather than over the number of graphs in the dataset.
    return total_loss / num_batches, auc_score


def train_test(train_loader, val_loader, test_loader, num_epochs=1500, checkpoint_every=1):
    """Train the global model, log metrics every epoch, and plot the curves.

    Each epoch runs ``train`` on ``train_loader`` and ``evaluate`` on the
    validation and test loaders, printing loss and AUC for all three. After
    the last epoch the train/validation loss and AUC histories are plotted.

    Args:
        train_loader: Loader passed to ``train``.
        val_loader: Loader passed to ``evaluate`` for validation metrics.
        test_loader: Loader passed to ``evaluate`` for test metrics (printed
            only, not plotted).
        num_epochs: Number of epochs to run. Defaults to 1500.
        checkpoint_every: Save ``model.state_dict()`` to
            ``model_epoch_{epoch}.pth`` every this many epochs. The default of
            1 writes one file per epoch; pass 0 or ``None`` to disable.

    Returns:
        None.
    """
    train_losses = []
    train_aucs = []
    val_losses = []
    val_aucs = []

    for epoch in range(num_epochs):
        train_loss, train_auc = train(train_loader)
        print(f'Epoch: {epoch:03d}, Train Loss: {train_loss}, Train AUC: {train_auc}')

        if checkpoint_every and epoch % checkpoint_every == 0:
            torch.save(model.state_dict(), f'model_epoch_{epoch}.pth')

        train_losses.append(train_loss)
        train_aucs.append(train_auc)

        val_loss, val_auc = evaluate(val_loader)
        print(f'Val loss: {val_loss}, Val AUC: {val_auc}')

        val_losses.append(val_loss)
        val_aucs.append(val_auc)

        test_loss, test_auc = evaluate(test_loader)
        print(f'Test loss: {test_loss}, Test AUC: {test_auc}\n')

    # Explicit axes instead of the pyplot state machine; left = loss, right = AUC.
    fig, (loss_ax, auc_ax) = plt.subplots(1, 2, figsize=(10, 6))

    loss_ax.plot(train_losses, label='Train Loss')
    loss_ax.plot(val_losses, label='Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.legend()

    auc_ax.plot(train_aucs, label='Train AUC')
    auc_ax.plot(val_aucs, label='Validation AUC')
    auc_ax.set_xlabel('Epoch')
    auc_ax.set_ylabel('AUC')
    auc_ax.set_title('Training and Validation AUC')
    auc_ax.legend()

    fig.tight_layout()

    plt.show()

In [None]:
# Edge-level split of the ('user', 'rates', 'movie') links: 10% validation,
# 10% test, remainder for training.
# NOTE(review): no random seed is set before the split, so the partition and
# the sampled negatives presumably differ between runs — confirm and seed if
# reproducibility matters.
transform = T.RandomLinkSplit(
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"),
    num_val=0.1,
    num_test=0.1,
    # Fraction of training edges used as supervision labels only.
    disjoint_train_ratio=0.5,
    # One negative edge per positive edge, for the training split as well.
    neg_sampling_ratio=1.0,
    add_negative_train_samples=True,
)

train_data, val_data, test_data = transform(data)

In [None]:
# Every loader wraps a one-element list (the whole split graph), so each one
# yields a single batch per pass; only the training loader shuffles.
train_loader, val_loader, test_loader = (
    DataLoader([split], batch_size=512, shuffle=is_train)
    for split, is_train in (
        (train_data, True),
        (val_data, False),
        (test_data, False),
    )
)
train_test(train_loader, val_loader, test_loader)