In [1]:
import os, random,re, numpy as np, torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData
from sentence_transformers import SentenceTransformer
from torch_geometric.transforms import RandomLinkSplit
import json
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
current_dir = os.getcwd()
print(current_dir)

c:\Users\mosca\Documents\Maestria\Tercer_trimestre\AA2\TP_AA2\tp_yelp


In [3]:
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

Device: cuda


### Carga de reviews INICIAL - Si no modificamos la data, saltear al siguiente paso

In [None]:
def cargar_reviews(current_dir,nombre_archivo="review_philadelphia.json"):

    """
    Carga un archivo de reseñas JSON y devuelve un df de pandas.
    """
    path = os.path.join(current_dir, "yelp_dataset", nombre_archivo)
    print(f"Leyendo archivo desde: {path}")

        
    reviews = []
    _word_re = re.compile(r"[A-Za-z0-9]+", re.UNICODE)
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):

            # ### Solo para empezar a trabajar la data
            # if i >= 100:   # detener después de 'limit' líneas
            #     break

            line = line.strip()
            if line:  # ignora líneas vacías
                dicc=json.loads(line)
                
                #para evitar tener reviews vacias
                if len(_word_re.findall(dicc['text'])) == 0: 
                    continue

                # nos quedamos con la data que queremos
                for key in ['review_id','funny','cool','useful','date']:
                    del dicc[f'{key}']
                reviews.append(dicc)
    
    print(f"Se cargaron {len(reviews)} reseñas desde {path}")


    rv_df = pd.DataFrame(reviews)
    # Convertir stars a int
    rv_df["stars"] = rv_df["stars"].astype(int)
    # Restar 1 para que queden entre 0 y 4
    rv_df["stars"] = rv_df["stars"] - 1

    return rv_df

In [None]:
reviews=cargar_reviews(current_dir)
reviews.head()

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)


In [None]:
embeddings = model.encode(reviews['text'], convert_to_tensor=True, show_progress_bar=True)

In [None]:
embeddings_np = embeddings.cpu().numpy()

reviews['embedding'] = list(embeddings_np)



In [None]:
path = os.path.join(current_dir, "yelp_dataset", 'reviews_con_embeddings.pkl')
print(f"Guardando archivo como: {path}")


In [None]:
reviews.to_pickle(path)

### Carga de data de Usuarios y locales

In [4]:
business = []
with open(os.path.join(current_dir, "yelp_dataset", "business_philadelphia.json"), "r", encoding="utf-8") as f:
    for line in f:
        business.append(json.loads(line))

df_business = pd.DataFrame(business)

users = []
with open(os.path.join(current_dir, "yelp_dataset", "user_philadelphia.json"), "r", encoding="utf-8") as f:
    for line in f:
        users.append(json.loads(line))

df_users = pd.DataFrame(users)

In [5]:
# Paso 1: limpiar la columna
df_business['categories'] = df_business['categories'].fillna('')
df_business['categories_list'] = df_business['categories'].apply(
    lambda x: [c.strip() for c in x.split(',') if c.strip() != '']
)

# Paso 2: ajustar el binarizador
mlb = MultiLabelBinarizer()
mlb.fit(df_business['categories_list'])

# Paso 3: transformar a vectores
category_vectors = mlb.transform(df_business['categories_list'])

# Cada fila de category_vectors es un vector numpy del negocio
df_business['category_vector'] = list(category_vectors)

In [6]:
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,categories_list,category_vector
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,4.0,245,1,"{'RestaurantsReservations': 'True', 'Restauran...","Sushi Bars, Restaurants, Japanese","{'Tuesday': '13:30-22:0', 'Wednesday': '13:30-...","[Sushi Bars, Restaurants, Japanese]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147,39.943223,-75.162568,4.5,205,1,"{'NoiseLevel': 'u'quiet'', 'GoodForMeal': '{'d...","Korean, Restaurants","{'Monday': '11:30-20:30', 'Tuesday': '11:30-20...","[Korean, Restaurants]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,QdN72BWoyFypdGJhhI5r7g,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,4.0,65,0,"{'Smoking': 'u'no'', 'NoiseLevel': 'u'average'...","Cocktail Bars, Bars, Italian, Nightlife, Resta...","{'Monday': '16:0-0:0', 'Tuesday': '16:0-0:0', ...","[Cocktail Bars, Bars, Italian, Nightlife, Rest...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Mjboz24M9NlBeiOJKLEd_Q,DeSandro on Main,4105 Main St,Philadelphia,PA,19127,40.022466,-75.218314,3.0,41,0,"{'RestaurantsReservations': 'False', 'Caters':...","Pizza, Restaurants, Salad, Soup","{'Tuesday': '17:0-21:30', 'Wednesday': '17:0-1...","[Pizza, Restaurants, Salad, Soup]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
df_users.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1221,2005-03-14 20:26:35,14953,9940,11211,200620072008200920102011201220132014,"xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWtg...",1357,...,163,191,361,147,1212,5696,2543,2543,815,323
3,AUi8MPWJ0mLkMfwbui27lg,John,109,2010-01-07 18:32:04,154,20,23,,"gy5fWeSv3Gamuq9Ox4MV4g, lMr3LWU6kPFLTmCpDkACxg...",4,...,0,0,0,0,1,6,3,3,0,0
4,1McG5Rn_UDkmlkZOrsdptg,Teresa,7,2009-05-26 16:11:11,18,3,13,,"piejMEdRkGB7-1aL4lL5NQ, X0zFOU6iG95-feQKOXkgrA...",1,...,0,0,0,0,1,0,2,2,0,0


### Preparamos la data para la contruccion del grafo

In [6]:
path = os.path.join(current_dir, "yelp_dataset", 'reviews_con_embeddings.pkl')
reviews_cargados = pd.read_pickle(path)

In [7]:
reviews_cargados.head()

Unnamed: 0,user_id,business_id,stars,text,embedding
0,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,4,I've taken a lot of spin classes over the year...,"[-0.011747047, 0.10525637, -0.0076568783, -0.0..."
1,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,4,"Wow! Yummy, different, delicious. Our favo...","[-0.08979228, -0.04855581, 0.005521066, 0.0004..."
2,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,0,I am a long term frequent customer of this est...,"[0.027325869, -0.037775017, -0.03003162, -0.04..."
3,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,3,Good food--loved the gnocchi with marinara\nth...,"[-0.022265958, 0.026531707, 0.027565729, -0.00..."
4,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,4,Tremendous service (Big shout out to Douglas) ...,"[0.021217695, -0.03051979, 0.0039557144, 0.025..."


In [8]:
#cargamso los embeddings de reseñas
embeddings_car_np = np.stack(reviews_cargados['embedding'].values)
edge_embeddings = torch.from_numpy(embeddings_car_np)

#cargamos los vectores de los negocios
business_vec_np = np.stack(df_business['category_vector'].values)
business_Vec = torch.from_numpy(business_vec_np)


In [9]:
# Mapeo de ID unico a indice (0, 1, 2, ...)
user_mapping = {id: i for i, id in enumerate(reviews_cargados['user_id'].unique())}
place_mapping = {id: i for i, id in enumerate(reviews_cargados['business_id'].unique())}

# Aplicamos el mapeo al df para obtener los indices de las aristas
user_indices_mapped = reviews_cargados['user_id'].map(user_mapping)
place_indices_mapped = reviews_cargados['business_id'].map(place_mapping)


# Preparamos grafo
NUM_NODES_USER = len(user_mapping) 
NUM_NODES_PLACE = len(place_mapping)
NUM_RATINGS = reviews_cargados.shape[0]

data = HeteroData()
data['user'].num_nodes = NUM_NODES_USER
data['place'].num_nodes = NUM_NODES_PLACE

# Datos para las aristas
ratings = reviews_cargados['stars']

# Convertimos series de pandas a tensores de PyTorch antes de apilarlas

data['user', 'rates', 'place'].edge_index = torch.stack([
    torch.from_numpy(user_indices_mapped.values).long(),
    torch.from_numpy(place_indices_mapped.values).long()
], dim=0)

# Agragamos label de buena arista
data['user', 'rates', 'place'].edge_attr_label  = torch.from_numpy((ratings > 3).values).long()

# Cargamos embedding de reseña a la arista
data['user', 'rates', 'place'].edge_attr = edge_embeddings

In [10]:
# Asignar los vectores de categorías como features de los nodos de "place"
data['place'].x = business_Vec.float()

In [11]:
print(f"Forma del edge_attr: {data['user', 'rates', 'place'].edge_attr_label.shape}")
print(f"Forma del atributos de business: {data['place'].x.shape}")

Forma del edge_attr: torch.Size([967506])
Forma del atributos de business: torch.Size([14569, 1027])


In [12]:
data

HeteroData(
  user={ num_nodes=279837 },
  place={
    num_nodes=14569,
    x=[14569, 1027],
  },
  (user, rates, place)={
    edge_index=[2, 967506],
    edge_attr_label=[967506],
    edge_attr=[967506, 384],
  }
)

### Dividimos la data

In [13]:
transform = RandomLinkSplit(
    num_val=0.1, num_test=0.1, is_undirected=False,
    edge_types=[('user', 'rates', 'place')],
    add_negative_train_samples= False
)
train_data_raw, val_data_raw, test_data_raw = transform(data)

In [14]:
train_data_raw

HeteroData(
  user={ num_nodes=279837 },
  place={
    num_nodes=14569,
    x=[14569, 1027],
  },
  (user, rates, place)={
    edge_index=[2, 774006],
    edge_attr_label=[774006],
    edge_attr=[774006, 384],
    edge_label=[774006],
    edge_label_index=[2, 774006],
  }
)

In [15]:
val_data_raw

HeteroData(
  user={ num_nodes=279837 },
  place={
    num_nodes=14569,
    x=[14569, 1027],
  },
  (user, rates, place)={
    edge_index=[2, 774006],
    edge_attr_label=[774006],
    edge_attr=[774006, 384],
    edge_label=[193500],
    edge_label_index=[2, 193500],
  }
)

Creamso una funcion para crear los vectores de usuarios en base a los lugares que visitaron

In [16]:
def compute_user_vectors(data):
    edge_index = data['user', 'rates', 'place'].edge_index
    place_vecs = data['place'].x  # [num_places, dim]
    
    num_users = data['user'].num_nodes
    dim = place_vecs.size(1)

    # Inicializamos un acumulador y un contador
    user_sum = torch.zeros((num_users, dim))
    user_count = torch.zeros(num_users)

    # Descomponemos aristas
    user_idx = edge_index[0]
    place_idx = edge_index[1]

    # Sumamos vectores de lugares visitados
    user_sum.index_add_(0, user_idx, place_vecs[place_idx])
    user_count.index_add_(0, user_idx, torch.ones_like(user_idx, dtype=torch.float))

    # Promediamos, evitando divisiones por cero
    user_mean = user_sum / user_count.clamp(min=1).unsqueeze(1)

    return user_mean

In [17]:
train_user_vec = compute_user_vectors(train_data_raw)
val_user_vec   = compute_user_vectors(val_data_raw)
test_user_vec  = compute_user_vectors(test_data_raw)

In [18]:
train_data_raw['user'].x = train_user_vec
val_data_raw['user'].x   = val_user_vec
test_data_raw['user'].x  = test_user_vec

train_data_raw = T.ToUndirected()(train_data_raw)
val_data_raw = T.ToUndirected()(val_data_raw)
test_data_raw = T.ToUndirected()(test_data_raw)

In [19]:

train_data_raw

HeteroData(
  user={
    num_nodes=279837,
    x=[279837, 1027],
  },
  place={
    num_nodes=14569,
    x=[14569, 1027],
  },
  (user, rates, place)={
    edge_index=[2, 774006],
    edge_attr_label=[774006],
    edge_attr=[774006, 384],
    edge_label=[774006],
    edge_label_index=[2, 774006],
  },
  (place, rev_rates, user)={
    edge_index=[2, 774006],
    edge_attr_label=[774006],
    edge_attr=[774006, 384],
    edge_label=[774006],
  }
)

In [21]:

val_data_raw

HeteroData(
  user={
    num_nodes=279837,
    x=[279837, 1027],
  },
  place={
    num_nodes=14569,
    x=[14569, 1027],
  },
  (user, rates, place)={
    edge_index=[2, 774006],
    edge_attr_label=[774006],
    edge_attr=[774006, 384],
    edge_label=[193500],
    edge_label_index=[2, 193500],
  },
  (place, rev_rates, user)={
    edge_index=[2, 774006],
    edge_attr_label=[774006],
    edge_attr=[774006, 384],
  }
)

In [20]:
def scatter_mean_torch(src, index, dim_size):


    out = torch.zeros(dim_size, src.size(1), device=src.device)
    out = out.index_add(0, index, src)

    # contadores por nodo
    count = torch.zeros(dim_size, device=src.device).index_add(0, index, torch.ones(src.size(0), device=src.device))

    # evitar división por cero
    count = count.clamp(min=1).unsqueeze(1)

    return out / count


def vec_aristas(data_input):
    data = data_input

    # Edges user → place
    src = data["user", "rates", "place"].edge_index[0]  
    edge_attr = data["user", "rates", "place"].edge_attr 
    user_review_mean = scatter_mean_torch(edge_attr, src, dim_size=data['user'].num_nodes)

    # Edges place → user
    src_p = data["place", "rev_rates", "user"].edge_index[0]  # places
    edge_attr_p = data["place", "rev_rates", "user"].edge_attr  # [774006, 384]
    place_review_mean = scatter_mean_torch(edge_attr_p, src_p, dim_size=data['place'].num_nodes)

    # Concatenar a las features de usuario
    data['user'].x = torch.cat([data['user'].x, user_review_mean], dim=1)
    data['place'].x = torch.cat([data['place'].x, place_review_mean], dim=1)

    return data


In [21]:
train_data_raw=vec_aristas(train_data_raw)
val_data_raw=vec_aristas(val_data_raw)
test_data_raw=vec_aristas(test_data_raw)

In [22]:

train_data_raw


HeteroData(
  user={
    num_nodes=279837,
    x=[279837, 1411],
  },
  place={
    num_nodes=14569,
    x=[14569, 1411],
  },
  (user, rates, place)={
    edge_index=[2, 774006],
    edge_attr_label=[774006],
    edge_attr=[774006, 384],
    edge_label=[774006],
    edge_label_index=[2, 774006],
  },
  (place, rev_rates, user)={
    edge_index=[2, 774006],
    edge_attr_label=[774006],
    edge_attr=[774006, 384],
    edge_label=[774006],
  }
)

In [23]:
import torch
import torch.nn as nn
from torch_geometric.nn import HGTConv
from torch_geometric.nn import Linear

class HGTBinaryModel(nn.Module):
    def __init__(self, metadata, hidden_channels=128, heads=4):
        super().__init__()

        # Proyecciones iniciales para cada tipo de nodo
        self.node_emb = nn.ModuleDict()
        for node_type in metadata[0]:
            self.node_emb[node_type] = Linear(-1, hidden_channels)

        # Dos capas heterogéneas
        self.conv1 = HGTConv(
            in_channels={nt: hidden_channels for nt in metadata[0]},
            out_channels=hidden_channels,
            metadata=metadata,
            heads=heads
        )


        self.conv2 = HGTConv(
            in_channels={nt: hidden_channels for nt in metadata[0]},
            out_channels=hidden_channels,
            metadata=metadata,
            heads=heads
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(4 * hidden_channels, hidden_channels),
            nn.BatchNorm1d(hidden_channels),
            nn.ReLU(),
            nn.Dropout(p=0.3)
        )

        self.out_layer = nn.Linear(hidden_channels + 4 * hidden_channels, 1)

    def forward(self, data):
        x_dict = {}

        # Inicial
        for nt, x in data.x_dict.items():
            x_dict[nt] = self.node_emb[nt](x)

        h = self.conv1(x_dict, data.edge_index_dict)
        for nt in h:
            h[nt] = h[nt] + x_dict[nt]   # residual simple
        x_dict = h

        h = self.conv2(x_dict, data.edge_index_dict)

        for nt in h:
            h[nt] = h[nt] + x_dict[nt]   # residual simple
        x_dict = h

        user_idx, place_idx = data['user', 'rates', 'place'].edge_label_index

        u = x_dict["user"][user_idx]
        p = x_dict["place"][place_idx]

        edge_repr = torch.cat([
            u,
            p,
            u * p,
            torch.abs(u - p)
        ], dim=1)

        h = self.classifier(edge_repr)
        h = torch.cat([h, edge_repr], dim=1)  # Residual concatenado
        logit = self.out_layer(h).squeeze()

        return logit

In [24]:
model = HGTBinaryModel(
    metadata=train_data_raw.metadata(),
    hidden_channels=128,
    heads=4
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

train_data_raw = train_data_raw.to(device)
val_data_raw   = val_data_raw.to(device)


# edge_attr_label = tus labels binarios (0/1) según tu criterio
y_train = train_data_raw["user","rates","place"].edge_attr_label[train_data_raw["user","rates","place"].edge_label_index[1]].float().to(device)
y_val   = val_data_raw["user","rates","place"].edge_attr_label[val_data_raw["user","rates","place"].edge_label_index[1]].float().to(device)

val_edge_index = val_data_raw["user", "rates", "place"].edge_label_index
train_edge_index = train_data_raw["user", "rates", "place"].edge_label_index


# un desbalanceo en la clases nos estafa afectando el entraenamiento
float(y_train.mean())
pos_weight = torch.tensor([(1 - 0.4222) / 0.4222]).to(device)
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)


In [25]:

import gc
torch.cuda.empty_cache()

torch.cuda.synchronize()


gc.collect()
torch.cuda.empty_cache()

In [26]:
epochs = 600

for epoch in range(epochs):
    # ===== Entrenamiento =====
    model.train()
    optimizer.zero_grad()

    logits_train = model(train_data_raw)
    loss = loss_fn(logits_train, y_train)

    loss.backward()
    optimizer.step()

    with torch.no_grad():
        prob = torch.sigmoid(logits_train)
        pred = (prob > 0.5).long()
        correct = (pred == y_train.long()).sum().item()
        acc = correct / y_train.size(0)

    model.eval()
    with torch.no_grad():
        val_logits = model(val_data_raw)

        val_loss = loss_fn(val_logits, y_val)

        val_prob = torch.sigmoid(val_logits)
        val_pred = (val_prob > 0.5).long()
        val_acc = (val_pred == y_val.long()).sum().item() / y_val.size(0)

    print(f"Epoch {epoch:03d} | "
        f"Train Loss: {loss.item():.4f} | Train Acc: {acc:.4f} | "
        f"Val Loss: {val_loss.item():.4f} | Val Acc: {val_acc:.4f}")

Epoch 000 | Train Loss: 0.8076 | Train Acc: 0.4655 | Val Loss: 0.8018 | Val Acc: 0.5778
Epoch 001 | Train Loss: 0.7733 | Train Acc: 0.5977 | Val Loss: 0.8025 | Val Acc: 0.5778
Epoch 002 | Train Loss: 0.7503 | Train Acc: 0.6417 | Val Loss: 0.8016 | Val Acc: 0.5778
Epoch 003 | Train Loss: 0.7280 | Train Acc: 0.6624 | Val Loss: 0.8001 | Val Acc: 0.5781
Epoch 004 | Train Loss: 0.7070 | Train Acc: 0.6738 | Val Loss: 0.7986 | Val Acc: 0.5834
Epoch 005 | Train Loss: 0.6874 | Train Acc: 0.6829 | Val Loss: 0.7972 | Val Acc: 0.5936
Epoch 006 | Train Loss: 0.6694 | Train Acc: 0.6930 | Val Loss: 0.7958 | Val Acc: 0.6048
Epoch 007 | Train Loss: 0.6527 | Train Acc: 0.7037 | Val Loss: 0.7943 | Val Acc: 0.6093
Epoch 008 | Train Loss: 0.6367 | Train Acc: 0.7154 | Val Loss: 0.7928 | Val Acc: 0.6113
Epoch 009 | Train Loss: 0.6213 | Train Acc: 0.7267 | Val Loss: 0.7910 | Val Acc: 0.6130
Epoch 010 | Train Loss: 0.6053 | Train Acc: 0.7384 | Val Loss: 0.7891 | Val Acc: 0.6146
Epoch 011 | Train Loss: 0.5892 |

KeyboardInterrupt: 

In [45]:

print(float(y_train.mean()))


0.41949155926704407


In [39]:
y_val.mean()

tensor(0.4222, device='cuda:0')

In [40]:
print(train_data_raw.x_dict["user"].shape)
print(train_data_raw.x_dict["place"].shape)

torch.Size([279837, 1411])
torch.Size([14569, 1411])


In [40]:
print("Train edges:", train_data_raw["user", "rates", "place"].edge_index.size(1))
print("Val edges:", val_data_raw["user", "rates", "place"].edge_index.size(1))
print("Labels train:", y_train.size(0))
print("Labels val:", y_val.size(0))

Train edges: 774006
Val edges: 774006
Labels train: 774006
Labels val: 193500


In [None]:
validación y métricas (AUC, Recall@K, NDCG)
cómo usar edge_attr dentro de HAN
(2 capas, residual, dropout)

In [None]:
from pyvis.network import Network

def plot_interactive_graph(data, num_edges=10000):
    edge_index = data['user', 'rates', 'place'].edge_index
    idx = torch.randint(0, edge_index.size(1), (num_edges,))
    edges_sample = edge_index[:, idx]

    net = Network(
    notebook=True,
    height="800px",
    width="100%",
    bgcolor="#ffffff",
    font_color="black",
    cdn_resources='in_line'  # evita el warning
    )

    for u, p in edges_sample.t().tolist():
        net.add_node(f"user_{u}", color="#1f77b4", title=f"User {u}")
        net.add_node(f"place_{p}", color="#ff7f0e", title=f"Place {p}")
        net.add_edge(f"user_{u}", f"place_{p}")


    import webbrowser
    webbrowser.open("graph.html")

# Ejemplo
plot_interactive_graph(train_data_raw, num_edges=10000)