In [1]:
import numpy as np
import pandas as pd
import copy
import torch
from CNN_classification import *

from sklearn.model_selection import train_test_split
import torch.optim as optim

import sys
sys.path.append("../Processing/")  # folder containing your file

from utils import *  # or import specific functions

In [3]:
cora = pd.read_csv("../Hyperedges/Cora_dataset.csv")

# I need to parse all the embeddings first
for col in ['BoW', 'Abstract_W2V', 'bert_embedding']:
    cora[col] = cora[col].apply(parse_embedding_flexible)

In [2]:
cora = pd.read_csv("./Cora_dataset_new_clsBERT.csv")
# I need to parse all the embeddings first
for col in ['bert_cls_embedding2']:
    cora[col] = cora[col].apply(parse_embedding_flexible)

In [5]:
X = np.stack(cora['bert_embedding'].values)  # shape: (num_samples, embedding_dim)
y = np.array(cora['topic2'].factorize()[0])  # convert labels to integers

# Split into train/test (or use your dataset split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = CNNDataset(X_train, y_train)
test_dataset  = CNNDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=32)

In [6]:
input_dim = X.shape[1]       
output_dim = 7 # number of classes oin cora

In [7]:
model = CNNClassifier(input_dim, output_dim)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training
patience = 50
to_patience = 0
num_epochs = 500
best_val_acc = 0

train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    if to_patience >= patience:
        break
    # --- Training ---
    model.train()
    total_train_loss = 0
    correct_train = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct_train += (preds == y_batch).sum().item()
    
    avg_train_loss = total_train_loss / len(train_loader)
    train_acc = correct_train / len(train_dataset)
    
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_acc)

    # --- Validation ---
    model.eval()
    total_val_loss = 0
    correct_val = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_val_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct_val += (preds == y_batch).sum().item()
    
    avg_val_loss = total_val_loss / len(test_loader)
    val_acc = correct_val / len(test_dataset)
    
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_acc)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        to_patience = 0
        best_model_wts = copy.deepcopy(model.state_dict())
    else:
        to_patience += 1

    if epoch % 20 == 0:
        print(f"Epoch {epoch}/{num_epochs} | "
            f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f} | "
            f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}")
    

model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), "best_model_CNN_BERTCLS.pth")
print(f"Best val accuracy: {best_val_acc:.4f}")


Epoch 0/500 | Train Loss: 3.4080, Train Acc: 0.1800 | Val Loss: 1.9123, Val Acc: 0.3038
Epoch 20/500 | Train Loss: 1.9985, Train Acc: 0.2207 | Val Loss: 1.9476, Val Acc: 0.3038
Epoch 40/500 | Train Loss: 1.9126, Train Acc: 0.2709 | Val Loss: 1.8524, Val Acc: 0.3038
Best val accuracy: 0.3038
