### Imports

In [1]:
import networkx as nx
import pandas as pd
from transformers import AutoModel,AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.metrics.distance import edit_distance,jaccard_distance
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv,GATConv,RGCNConv,RGATConv,SAGEConv
from ast import literal_eval
from sklearn.preprocessing import LabelEncoder
from numpy import array as np_array,zeros as np_zeros,hstack as np_hstack

  from .autonotebook import tqdm as notebook_tqdm


### Data loading

Source dataset (Kaggle, "TMDB Movies Dataset 2023 – 930k Movies"): https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies

In [2]:
# Location of the CSV file that every later cell reads its movies from.
DATASET_CSV = "./ng-datasets/fin-ds.csv"
mov_dataset = pd.read_csv(DATASET_CSV)

In [3]:
# Show the column labels of the loaded frame; later cells select columns by
# these names, so this is the reference for valid keys.
mov_dataset.columns

Index(['title', 'adult', 'original_language', 'overview', 'tagline', 'genres',
       'production_country', 'year'],
      dtype='object')

In [None]:
# Sentence encoder shared by every embedding computed in this notebook.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
embmodel = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
def get_embs(sentences,batch_size=128):
    """Embed a list of sentences with the module-level tokenizer/model.

    Each sentence embedding is the mean of the model's last hidden states
    over the *real* tokens of that sentence: positions added by padding are
    excluded via the attention mask, so the result for a sentence does not
    depend on which other sentences share its batch.

    Args:
        sentences: list of strings to embed. A single bare string is treated
            as a one-element list (otherwise slicing it below would cut the
            text into ``batch_size``-character pieces).
        batch_size: number of sentences sent through the model at once.

    Returns:
        numpy array with one row per input sentence (an empty array when
        ``sentences`` is empty).
    """
    if isinstance(sentences, str):
        sentences = [sentences]
    embs = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True,max_length=256)
        with torch.no_grad():
            outputs = embmodel(**inputs)
        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        batch_embs = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
        embs.extend(batch_embs)
    return np_array(embs)

In [None]:
def _embed_literal_field(raw):
    """Return one embedding vector for a cell holding a Python literal.

    The cell is parsed once with ``literal_eval``. A parsed string is embedded
    as a single text; a parsed sequence is embedded element-wise and averaged.
    An empty sequence (or a non-string cell such as a missing value) falls
    back to the embedding of the empty string so every row yields a vector of
    the same length.
    """
    # NOTE(review): assumes the column stores Python literals (e.g. "['a', 'b']")
    # because the original code parsed it with literal_eval - confirm against the CSV.
    parsed = literal_eval(raw) if isinstance(raw, str) else []
    texts = [parsed] if isinstance(parsed, str) else [str(entry) for entry in parsed]
    if not texts:
        texts = [""]
    # Averaging a single row returns that row, so one code path covers 1..n items.
    return get_embs(texts).mean(axis=0)


# One 1-D vector per movie. get_embs returns a (1, dim) array for a single
# sentence, so take row 0 to keep the same shape as the tagline/genre vectors.
ovw_emb = [
    get_embs([item if isinstance(item, str) else ""])[0]
    for item in mov_dataset["overview"].to_list()
]
tag_emb = [_embed_literal_field(item) for item in mov_dataset["tagline"].to_list()]
genre_emb = [_embed_literal_field(item) for item in mov_dataset["genres"].to_list()]

In [None]:
# Per-feature weights; the scalar ones scale the columns below, and later
# cells reuse the whole dict when building node features.
weights = {"adult":1.0,"original_language":2.0,"overview":2.5,"tagline":3.0,"genres":4.0,"year":1.5}
embs = {"overview":ovw_emb,"tagline":tag_emb,"genres":genre_emb}
size = len(mov_dataset)

# Scalar features, one row per movie. The frame's column is "adult" (see the
# columns listing above); the previous key "is_adult" does not exist.
# NOTE(review): assumes "adult", "original_language" and "year" are already
# numeric/boolean in the CSV - the conversion below raises otherwise. Confirm.
scalar_cols = ["adult", "original_language", "year"]
scalar_mat = mov_dataset[scalar_cols].to_numpy(dtype=float) * np_array(
    [weights[col] for col in scalar_cols]
)

# Pairwise similarity = cosine(scalar features) + sum of cosine(text embeddings),
# computed for all pairs at once instead of a Python loop over every (i, j).
sim_mat = cosine_similarity(scalar_mat)
for feature in ["overview", "tagline", "genres"]:
    # reshape flattens per-movie entries of shape (1, dim) as well as (dim,).
    feature_mat = np_array(embs[feature]).reshape(size, -1)
    # Cosine similarity is unchanged by multiplying a vector by a positive
    # scalar, so the per-feature text weights never affected this sum and are
    # not applied here.
    # NOTE(review): if weighting was intended, scale each similarity matrix by
    # weights[feature] instead (and revisit the edge threshold used later).
    sim_mat += cosine_similarity(feature_mat)

# The original loop only filled i != j, leaving self-similarity at zero.
diag = list(range(size))
sim_mat[diag, diag] = 0.0

### Graph creation

In [None]:
# Undirected movie graph: one node per row (keyed by row position, carrying
# the title) and an edge wherever pairwise similarity exceeds the threshold.
grph = nx.Graph()
thres = 0.35
grph.add_nodes_from(
    (node_id, {"title": movie_title})
    for node_id, movie_title in enumerate(mov_dataset["title"])
)
# nonzero() lists the qualifying cells in row-major order; keeping only
# src < dst visits each unordered pair once, in the same order as a nested loop.
above_rows, above_cols = (sim_mat > thres).nonzero()
for src, dst in zip(above_rows.tolist(), above_cols.tolist()):
    if src < dst:
        grph.add_edge(src, dst, weight=sim_mat[src, dst])

In [None]:
# --- Edges -------------------------------------------------------------
edge_list = list(grph.edges)
# reshape(-1, 2) keeps a valid (2, 0) index even when the graph has no edges.
edge_index = torch.tensor(edge_list, dtype=torch.long).reshape(-1, 2).t().contiguous()
edge_weight = torch.tensor([grph[u][v]["weight"] for u, v in edge_list], dtype=torch.float)
# networkx lists each undirected edge once, but PyG treats edge_index as
# directed: add the reverse direction so messages flow both ways.
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)
edge_weight = torch.cat([edge_weight, edge_weight])

# --- Node features -----------------------------------------------------
# Weighted scalar columns followed by the weighted text embeddings.
# The frame's column is "adult" (see the columns listing); "is_adult" does not exist.
# NOTE(review): assumes the three scalar columns are numeric/boolean - confirm.
scalar_cols = ["adult", "original_language", "year"]
scalar_block = mov_dataset[scalar_cols].to_numpy(dtype=float) * np_array(
    [weights[col] for col in scalar_cols]
)
text_blocks = [
    # reshape flattens per-movie entries of shape (1, dim) as well as (dim,),
    # so every movie contributes a flat row.
    np_array(embs[feature]).reshape(size, -1) * weights[feature]
    for feature in ["overview", "tagline", "genres"]
]
# Build one dense array first; converting a list of arrays to a tensor is slow.
node_features = torch.tensor(np_hstack([scalar_block] + text_blocks), dtype=torch.float)
grdata = Data(x=node_features, edge_index=edge_index, edge_attr=edge_weight)

### Training

In [None]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional encoder producing one vector per node.

    Args:
        input_dim: size of each input node feature vector.
        hidden_dim: width of the intermediate layer.
        output_dim: size of the returned node embedding.
    """
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.gcn1 = GCNConv(input_dim, hidden_dim)
        self.gcn2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return node embeddings for ``data`` (needs ``x`` and ``edge_index``).

        When ``data.edge_attr`` holds one scalar per edge it is passed to the
        convolutions as the edge weight; otherwise edges are unweighted.
        """
        x, edge_index = data.x, data.edge_index
        edge_weight = getattr(data, "edge_attr", None)
        if edge_weight is not None and edge_weight.dim() != 1:
            edge_weight = None  # GCNConv expects a flat per-edge weight
        x = self.gcn1(x, edge_index, edge_weight)
        x = torch.relu(x)
        x = self.gcn2(x, edge_index, edge_weight)
        return x


def _link_reconstruction_loss(z, edge_index):
    """Unsupervised loss: score real edges high and random node pairs low.

    The score of a pair is the inner product of its two node embeddings;
    binary cross-entropy is taken over the existing edges (target 1) and an
    equal number of uniformly sampled pairs (target 0). Sampled pairs may
    occasionally coincide with real edges.
    """
    if edge_index.numel() == 0:
        raise ValueError("Cannot train without edges; lower the similarity threshold.")
    src, dst = edge_index
    pos_logits = (z[src] * z[dst]).sum(dim=-1)
    neg_src = torch.randint(0, z.size(0), (src.numel(),), device=z.device)
    neg_dst = torch.randint(0, z.size(0), (src.numel(),), device=z.device)
    neg_logits = (z[neg_src] * z[neg_dst]).sum(dim=-1)
    logits = torch.cat([pos_logits, neg_logits])
    targets = torch.cat([torch.ones_like(pos_logits), torch.zeros_like(neg_logits)])
    return torch.nn.functional.binary_cross_entropy_with_logits(logits, targets)


model = GCN(node_features.shape[1],hidden_dim=64,output_dim=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
epochs = 8
# The graph is built without labels or a train mask, so the supervised loss
# is only used when both are present; otherwise train on edge reconstruction.
labels = getattr(grdata, "y", None)
train_mask = getattr(grdata, "train_mask", None)
supervised = labels is not None and train_mask is not None
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    out = model(grdata)
    if supervised:
        loss = torch.nn.functional.cross_entropy(out[train_mask], labels[train_mask])
    else:
        loss = _link_reconstruction_loss(out, grdata.edge_index)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')

In [None]:
def get_similar_recs(prompt_titles,mov_dataset,gnn_model,data,k=4):
    """Recommend the ``k`` movies whose node embeddings are closest to each prompt.

    Args:
        prompt_titles: titles to look up. An exact match is preferred; if there
            is none, a case-insensitive match is used. Unknown titles are skipped.
        mov_dataset: DataFrame with a "title" column; row *position* must
            correspond to the node order of the model output.
        gnn_model: module called as ``gnn_model(data)`` returning one
            embedding row per movie.
        data: graph object forwarded to ``gnn_model`` unchanged.
        k: number of recommendations per matched prompt.

    Returns:
        dict mapping each matched prompt title (as given) to a list of up to
        ``k`` titles ordered by decreasing cosine similarity, never including
        the prompt's own row.

    Raises:
        ValueError: if none of the prompt titles is found.
    """
    gnn_model.eval()
    with torch.no_grad():
        node_embeddings = gnn_model(data)

    # Map titles to row positions (not index labels) because both the
    # embeddings and the title list below are addressed positionally.
    titles = mov_dataset["title"].tolist()
    title_to_index = {title: pos for pos, title in enumerate(titles)}
    folded_to_index = {str(title).casefold(): pos for pos, title in enumerate(titles)}

    # Keep (prompt, position) pairs together so each similarity row stays
    # aligned with its prompt even when some prompts are not found.
    matched = []
    for title in prompt_titles:
        pos = title_to_index.get(title)
        if pos is None:
            pos = folded_to_index.get(str(title).casefold())
        if pos is not None:
            matched.append((title, pos))
    if not matched: raise ValueError("None of the provided titles were found in the dataset.")

    # Cosine similarity = dot product of L2-normalised rows.
    unit = torch.nn.functional.normalize(node_embeddings.float(), p=2, dim=1)
    similarities = (unit[[pos for _, pos in matched]] @ unit.t()).cpu().numpy()

    top_k_results = {}
    for row, (title, node_idx) in enumerate(matched):
        # Take k + 1 candidates so k remain after dropping the prompt itself.
        ranked = similarities[row].argsort()[::-1][:k + 1]
        top_k_indices = [int(idx) for idx in ranked if idx != node_idx][:k]
        top_k_results[title] = [titles[idx] for idx in top_k_indices]
    return top_k_results

In [None]:
# Demo: list the closest movies for each prompt title.
inps = ["inception"]
top_k = 4  # single source for both the query size and the printed header
sim_movs = get_similar_recs(inps,mov_dataset,model,grdata,k=top_k)
for title, similar_titles in sim_movs.items():
    print(f"Top {top_k} similar titles to '{title}':")
    for similar_title in similar_titles:
        print(f"- {similar_title}")