# ENV

pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116

pip install pyg_lib torch_sparse torch_scatter torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-1.13.0+cu116.html

pip install 'numpy<2'



In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import os
os.chdir("/home/dalai/GNN_E")

from scripts.models import *
import pandas as pd
from scripts.utils import *
from math import ceil
import gc
from tqdm import tqdm


import torch
import torch.optim as optim

print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.__version__)

  from .autonotebook import tqdm as notebook_tqdm


True
cuda
1.13.1+cu116


In [2]:
# Load all movies with labels csv
df_all_movies = pd.read_csv("data/processed/all_movies_labelled_13_single.csv")
num_classes = 13 # based on df_movie choosen
batch_size = 16
half_wind_size = 4

In [3]:
#df_train, df_test = split_train_test_vertically(df_all_movies, test_movies_dict = {"Sintel": 7, "TearsOfSteel": 10, "Superhero": 9})

df_train, df_val, df_test = split_train_val_test_horizontally(
    df_all_movies, 
    percentage_train=0.8, 
    percentage_val=0.0, #0 to not have nay val set
    path_pickle_delay="data/raw/labels/run_onsets.pkl",
    path_movie_title_mapping="data/raw/labels/category_mapping_movies.csv", 
    tr_len=1.3
)

# Create a dataset
dataset_train = DatasetEmo(df=df_train, node_feat="symmetricwindow", sizewind=half_wind_size)
dataset_test = DatasetEmo(df=df_test, node_feat="symmetricwindow", sizewind=half_wind_size)

# Extarct the list of graphs of each dataset
graphs_list_train = dataset_train.get_graphs_list()
graphs_list_test = dataset_test.get_graphs_list()

# Create a dataloader
loader_train = pyg.loader.DataLoader(graphs_list_train, batch_size=batch_size, num_workers=4, persistent_workers=True)
loader_test = pyg.loader.DataLoader(graphs_list_test, batch_size=batch_size, num_workers=4, persistent_workers=True)

#Claulte number of batches
num_batches_train = ceil(len(graphs_list_train) / batch_size)
num_batches_test = ceil(len(graphs_list_test) / batch_size)

print(f"There are {len(graphs_list_train)} graphs in the train set.")
print(f"There are {len(graphs_list_test)} graphs in the test set.")
print(f"N batches in train: {num_batches_train}")
print(f"N batches in test: {num_batches_test}")




Movie: AfterTheRain
  Start Time (TR)+4: 80
  Total Length (TR): 382
  Train End (TR): 385
  Validation End (TR): 385
  Movie End (TR): 462

Movie: BetweenViewings
  Start Time (TR)+4: 79
  Total Length (TR): 622
  Train End (TR): 576
  Validation End (TR): 576
  Movie End (TR): 701

Movie: BigBuckBunny
  Start Time (TR)+4: 79
  Total Length (TR): 377
  Train End (TR): 380
  Validation End (TR): 380
  Movie End (TR): 456

Movie: Chatter
  Start Time (TR)+4: 79
  Total Length (TR): 312
  Train End (TR): 328
  Validation End (TR): 328
  Movie End (TR): 391

Movie: FirstBite
  Start Time (TR)+4: 79
  Total Length (TR): 461
  Train End (TR): 447
  Validation End (TR): 447
  Movie End (TR): 540

Movie: LessonLearned
  Start Time (TR)+4: 79
  Total Length (TR): 513
  Train End (TR): 489
  Validation End (TR): 489
  Movie End (TR): 592

Movie: Payload
  Start Time (TR)+4: 79
  Total Length (TR): 775
  Train End (TR): 699
  Validation End (TR): 699
  Movie End (TR): 854

Movie: Sintel
  Start

In [17]:
def train(model, train_loader, num_epochs=10, learning_rate=0.001):

    losses = []

    # Set the model to training mode
    model.train()
    
    # Define the loss function (CrossEntropyLoss for multi-class classification)
    criterion = torch.nn.CrossEntropyLoss()
    
    # Define the optimizer (Adam)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        total_loss = 0
        
        # Progress bar for batches within the current epoch
        batch_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch", leave=True)
        
        for batch in batch_bar:
            batch = batch.to(device)

            optimizer.zero_grad()  # Zero the gradients
            
            # Forward pass: Get predictions and attention weights
            out = model(batch)  # The model outputs the log-softmax scores
            
            # Compute the loss (CrossEntropyLoss automatically applies softmax internally)
            loss = criterion(out, batch.y)  # batch.y are the labels for the graph
            
            # Backward pass: Compute gradients
            loss.backward()
            
            # Update model parameters
            optimizer.step()
            
            # Accumulate loss for logging
            total_loss += loss.item()

            # GPU memory monitoring
            current_memory = torch.cuda.memory_allocated(device) / 1e6  # Convert bytes to MB
            peak_memory = torch.cuda.max_memory_allocated(device) / 1e6

            # Update progress bar with metrics
            batch_bar.set_postfix(
                loss=loss.item(),
                mem_used=f"{current_memory:.2f}MB",
                peak_mem=f"{peak_memory:.2f}MB"
            )

            del batch, out, loss  # Clean up memory
            torch.cuda.empty_cache()  # Optionally, free unused memory

        gc.collect()  # Clean up CPU memory
        
        # Print the loss after each epoch
        print(f"\tEpoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}\n")

        # Append the loss for this epoch
        losses.append(total_loss / len(train_loader))

    return losses


In [16]:
n_feat_per_node = graphs_list_train[0].x.shape[1]
MyGat = GATModel(input_dim = n_feat_per_node, 
                hidden_dim = num_classes, 
                output_dim = num_classes, 
                num_heads=num_classes)
MyGat = MyGat.to(device)

print(next(MyGat.parameters()).device)

print(torch.cuda.memory_summary(device=None, abbreviated=False))


cuda:0
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  607333 KB |    8061 MB |   51565 GB |   51565 GB |
|       from large pool |  605094 KB |    8060 MB |   51538 GB |   51537 GB |
|       from small pool |    2239 KB |       3 MB |      27 GB |      27 GB |
|---------------------------------------------------------------------------|
| Active memory         |  607333 KB |    8061 MB |   51565 GB |   51565 GB |
|       from large pool |  605094 KB |    8060 MB |   51538 GB |   51537 GB |
|       from small pool |    2239 KB |       3 MB |      27 GB |      27 GB |
|--------------------------------------------------------

In [18]:
torch.autograd.set_detect_anomaly(True)

losses_train = train(model=MyGat, train_loader=loader_train)

Epoch 1:   0%|          | 0/11069 [00:00<?, ?batch/s]

Epoch 1: 100%|██████████| 11069/11069 [42:09<00:00,  4.38batch/s, loss=3.55, mem_used=646.77MB, peak_mem=8854.30MB] 


	Epoch 1/10, Loss: 2.2578321152160306



Epoch 2: 100%|██████████| 11069/11069 [42:05<00:00,  4.38batch/s, loss=4, mem_used=425.58MB, peak_mem=8854.30MB]    


	Epoch 2/10, Loss: 2.1481567927466165



Epoch 3: 100%|██████████| 11069/11069 [42:05<00:00,  4.38batch/s, loss=4.46, mem_used=425.58MB, peak_mem=8854.30MB] 


	Epoch 3/10, Loss: 2.1425604642713734



Epoch 4: 100%|██████████| 11069/11069 [42:05<00:00,  4.38batch/s, loss=4.29, mem_used=425.58MB, peak_mem=8854.30MB] 


	Epoch 4/10, Loss: 2.138376322576849



Epoch 5: 100%|██████████| 11069/11069 [42:05<00:00,  4.38batch/s, loss=4.58, mem_used=425.58MB, peak_mem=8854.30MB] 


	Epoch 5/10, Loss: 2.140890210638073



Epoch 6: 100%|██████████| 11069/11069 [42:05<00:00,  4.38batch/s, loss=4.25, mem_used=425.58MB, peak_mem=8854.30MB] 


	Epoch 6/10, Loss: 2.1391676898753915



Epoch 7: 100%|██████████| 11069/11069 [42:49<00:00,  4.31batch/s, loss=4.16, mem_used=425.58MB, peak_mem=8854.30MB] 


	Epoch 7/10, Loss: 2.138584665844397



Epoch 8: 100%|██████████| 11069/11069 [42:11<00:00,  4.37batch/s, loss=4.51, mem_used=425.58MB, peak_mem=8854.30MB] 


	Epoch 8/10, Loss: 2.142617443753986



Epoch 9:  56%|█████▌    | 6178/11069 [23:29<18:35,  4.38batch/s, loss=2.72, mem_used=456.03MB, peak_mem=8854.30MB] 


KeyboardInterrupt: 

# ***

In [None]:
#del MyGat

In [None]:
batch_size = 16
loader_train = pyg.loader.DataLoader(graphs_list_train, batch_size=batch_size, num_workers=4, persistent_workers=True)
loader_test = pyg.loader.DataLoader(graphs_list_test, batch_size=batch_size, num_workers=4, persistent_workers=True)

#Claulte number of batches
num_batches_train = ceil(len(graphs_list_train) / batch_size)
num_batches_test = ceil(len(graphs_list_test) / batch_size)

print(f"There are {len(graphs_list_train)} graphs in the train set.")
print(f"There are {len(graphs_list_test)} graphs in the test set.")
print(f"N batches in train: {num_batches_train}")
print(f"N batches in test: {num_batches_test}")