# Importation

In [30]:
import gc
import os
import pickle
import sys
from multiprocessing import Pool

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch_geometric.nn import summary
from tqdm.notebook import tqdm

import models

# Paramètres

In [None]:

# setup parameters

SEED = 1234  # seed given to the PyTorch RNGs
DATA_DIR = 'data'
DATASET_PATH = 'java-small-preprocessed-code2vec/java-small'
DATASET_NAME = 'java-small'
EMBEDDING_DIM = 128  # passed to the model as embedding_dim
DROPOUT = 0.25
BATCH_SIZE = 128
MAX_LENGTH = 200  # examples with more path contexts than this are skipped by load_data
LOG_EVERY = 1000  # intended logging interval in batches; not referenced by the cells of this notebook
N_EPOCHS = 20  # number of the last epoch to run (inclusive)
START_EPOCHS = 0  # number of epochs considered already done when nothing is restored
LOG_DIR = 'logs'
SAVE_DIR = 'checkpoints'
LOG_PATH = os.path.join(LOG_DIR, f'{DATASET_NAME}-log.txt')
# Plain string: there is nothing to interpolate in this file name.
STATE_FILE = os.path.join(SAVE_DIR, "state_file.pth")
# Set True to reload the weights of the last saved epoch; the checkpoint path is
# derived from SAVE_DIR and DATASET_NAME in the weight-loading cell.
LOAD = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## Log func

In [32]:
def logfunc(log):
    """Append `log` to the run log file (LOG_PATH) and echo it on stdout.

    Args:
        log: text to record; a newline is added after it in the file.
    """
    # Explicit UTF-8: the logged text is not always ASCII (the model summary table
    # uses box-drawing characters), and the platform default encoding may not cover it.
    with open(LOG_PATH, 'a', encoding='utf-8') as f:
        f.write(log + '\n')
    print(log)

## Dir init

In [33]:
# Create the checkpoint and log directories. exist_ok=True makes the call a no-op
# when the directory is already there, so no separate isdir() check is needed.
os.makedirs(SAVE_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# The log file is kept between runs (logfunc appends to it).
# To start from an empty log instead, uncomment:
# if os.path.exists(LOG_PATH):
#     os.remove(LOG_PATH)

' if os.path.exists(LOG_PATH):\n    os.remove(LOG_PATH) '

# Chargement des données

## Dict des word (variables), path, target

In [34]:
# NOTE(review): pickle.load can execute arbitrary code; only open dictionary files
# produced by a trusted preprocessing run.
dict_path = f'{DATA_DIR}/{DATASET_PATH}/{DATASET_NAME}.dict.c2v'
with open(dict_path, 'rb') as file:
    # Four objects are pickled back to back in the file; they are read in this order.
    word2count, path2count, target2count, n_training_examples = (
        pickle.load(file) for _ in range(4)
    )


def _build_vocab(tokens):
    """Return a token -> index dict: '<unk>' is 0, '<pad>' is 1, then each token gets the next index."""
    vocab = {'<unk>': 0, '<pad>': 1}
    for token in tokens:
        vocab[token] = len(vocab)
    return vocab


# create vocabularies, initialized with unk and pad tokens
word2idx = _build_vocab(word2count.keys())
path2idx = _build_vocab(path2count.keys())
target2idx = _build_vocab(target2count.keys())

# Reverse lookups (index -> token).
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
idx2path = dict(zip(path2idx.values(), path2idx.keys()))
idx2target = dict(zip(target2idx.values(), target2idx.keys()))

In [35]:
# Removes the `pickle` name from the notebook namespace. After this cell, the
# dictionary-loading cell above cannot be re-run without re-running the import cell.
del pickle

In [36]:
# Log the size of each vocabulary (targets, terminal tokens, paths); the counts include the <unk> and <pad> entries.
logfunc(f"nb_target : {len(idx2target)}, nb_var : {len(idx2word)}, nb_path {len(idx2path)}")

nb_target : 199749, nb_var : 507272, nb_path 807139


## File Reading

In [37]:
def load_data(file_path, max_length=None):
    """Read a `.c2v` file into `(target_name, contexts)` tuples of raw strings.

    NOTE: a later cell defines another `load_data` (returning index tensors); when the
    notebook is run top to bottom, that definition replaces this one.

    Each non-empty line is `name ctx ctx ...` separated by single spaces, and each
    context is a comma-separated group of fields.

    Args:
        file_path: path of the text file to read.
        max_length: lines with more than this many space-separated entries after the
            name are skipped. Defaults to the notebook constant MAX_LENGTH.

    Returns:
        A list of `(name, [fields, ...])` tuples, where each `fields` is the list of
        strings obtained by splitting one context on ','.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    examples = []
    with open(file_path, 'r') as f:
        for line in f:
            # Strip first: otherwise the line terminator stays attached to the last
            # context. The line is also split only once instead of three times.
            fields = line.strip().split(' ')
            if not fields[0] or len(fields) - 1 > max_length:
                continue
            contexts = [tok.split(',') for tok in fields[1:] if tok.strip()]
            examples.append((fields[0], contexts))
    return examples

In [38]:
def load_data(file_path):
    """Read a `.c2v` file and turn every kept line into index tensors.

    A line is `name ctx ctx ...` (space separated); each context is split on ','
    into three fields (left token, path, right token). Lines with more than
    MAX_LENGTH entries after the name are dropped. Names, tokens and paths missing
    from the vocabularies map to the corresponding '<unk>' index.

    Returns:
        A list of `(name, left, path, right)` tuples of `torch.long` tensors; `name`
        is a scalar tensor, the three others are 1-D with one entry per context.
    """

    def encode(tokens, vocab):
        # Vocabulary lookup with '<unk>' fallback, packed as a long tensor.
        return torch.tensor([vocab.get(tok, vocab['<unk>']) for tok in tokens], dtype=torch.long)

    samples = []

    with open(file_path, 'r') as handle:
        for raw_line in tqdm(handle.readlines(), f"load {file_path}"):
            fields = raw_line.strip().split(' ')
            if len(fields) - 1 <= MAX_LENGTH:
                target_idx = target2idx.get(fields[0], target2idx['<unk>'])

                triples = [tuple(tok.split(',')) for tok in fields[1:] if tok.strip()]
                if triples:
                    starts, paths, ends = zip(*triples)
                else:
                    starts, paths, ends = [], [], []

                starts_tensor = encode(starts, word2idx)
                paths_tensor = encode(paths, path2idx)
                ends_tensor = encode(ends, word2idx)

                samples.append((
                    torch.tensor(target_idx, dtype=torch.long),
                    starts_tensor,
                    paths_tensor,
                    ends_tensor,
                ))

    return samples

In [39]:
# Parse and index the test split; the resulting tensors are all held in memory.
data_test = load_data(f'{DATA_DIR}/{DATASET_PATH}/{DATASET_NAME}.test.c2v')

load data/java-small-preprocessed-code2vec/java-small/java-small.test.c2v:   0%|          | 0/56165 [00:00<?, …

In [40]:
# Parse and index the validation split; the resulting tensors are all held in memory.
data_val = load_data(f'{DATA_DIR}/{DATASET_PATH}/{DATASET_NAME}.val.c2v')

load data/java-small-preprocessed-code2vec/java-small/java-small.val.c2v:   0%|          | 0/23505 [00:00<?, ?…

In [41]:
# Parse and index the training split; the resulting tensors are all held in memory.
data_train = load_data(f'{DATA_DIR}/{DATASET_PATH}/{DATASET_NAME}.train.c2v')

load data/java-small-preprocessed-code2vec/java-small/java-small.train.c2v:   0%|          | 0/665115 [00:00<?…

In [42]:
# Log how many examples each split holds once load_data has applied the MAX_LENGTH filter.
logfunc(f"len(data_test)={len(data_test)}, len(data_val)={len(data_val)}, len(data_train)={len(data_train)}")

len(data_test)=56165, len(data_val)=23505, len(data_train)=665115


In [43]:
# Display the training-example count that was read from the dictionary file.
n_training_examples

665115

## Data Loader

In [44]:
def collate_fn(samples):
    """Assemble `(name, left, path, right)` samples into one padded batch.

    The three sequence tensors of every sample are right-padded with the '<pad>'
    index of their vocabulary, up to the longest `left` sequence of the batch.

    Returns:
        `(name_idx, left_tensor, path_tensor, right_tensor)`: the stacked names and
        the three padded sequence tensors, each with one row per sample.
    """
    names, lefts, paths, rights = zip(*samples)
    name_idx = torch.stack(names)

    width = max(len(seq) for seq in lefts)

    def pad_tensor(tensor_list, pad_value):
        # Extend every sequence to `width` entries, then stack them row by row.
        rows = []
        for seq in tensor_list:
            filler = torch.full((width - len(seq),), pad_value)
            rows.append(torch.cat([seq, filler]))
        return torch.stack(rows)

    return (
        name_idx,
        pad_tensor(lefts, word2idx['<pad>']),
        pad_tensor(paths, path2idx['<pad>']),
        pad_tensor(rights, word2idx['<pad>']),
    )


In [45]:
# Options shared by the three loaders. Batches are built in the main process
# (num_workers=0), so there is nothing to prefetch. Pinned host memory only serves
# host-to-GPU copies, so it is requested only when CUDA is available (this avoids
# the DataLoader warning on CPU-only machines).
_loader_options = dict(
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    pin_memory=torch.cuda.is_available(),
    num_workers=0,
    prefetch_factor=None,
)

# Only the training data is reshuffled at every pass.
train_loader = DataLoader(data_train, shuffle=True, **_loader_options)
test_loader = DataLoader(data_test, shuffle=False, **_loader_options)
eval_loader = DataLoader(data_val, shuffle=False, **_loader_options)

In [46]:
# Number of batches in each loader (train, test, eval).
len(train_loader), len(test_loader), len(eval_loader)

(5197, 439, 184)

In [47]:
# Sanity check over one full pass of the training loader: for each of the four batch
# tensors (name, left, path, right), count the entries equal to 0, the '<unk>' index.
c = [0] * 4
for ts in tqdm(train_loader, "test for 0 in train tensor"):
    c = [total + int((tensor == 0).sum()) for total, tensor in zip(c, ts)]
print(c)

test for 0 in train tensor:   0%|          | 0/5197 [00:00<?, ?it/s]

[0, 0, 625202, 0]


In [48]:
# Drop the counters and the last batch left behind by the sanity-check cell above.
del c, ts

In [49]:
# Smallest, largest and mean occurrence count over all paths of the dictionary.
path_counts = list(path2count.values())
lowest = min([sys.maxsize, *path_counts])
highest = max([0, *path_counts])
print(f"path count range : {(lowest, highest)}, mean : {sum(path_counts)/len(path_counts)}")
del lowest, highest, path_counts

path count range : (5, 852233), mean : 130.4382007019874


In [50]:
# Attach a short label to each loader (referenced in evaluate's progress-bar description).
for _loader, _label in ((train_loader, "train"), (test_loader, "test"), (eval_loader, "eval")):
    _loader.desc = _label

# Instanciation

## Seed Fixing

In [51]:
# Seed the PyTorch RNG, and the CUDA one as well when a GPU is present.
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
# cuDNN determinism is left disabled:
# torch.backends.cudnn.deterministic = True

In [52]:
def save_state(filepath: str, curent_epoch: int):
    """Save the PyTorch CPU/CUDA RNG states together with the epoch number.

    Args:
        filepath: destination file, written with `torch.save`.
        curent_epoch: number of the epoch that has just been completed.

    The file holds a dict with the keys 'torch_state', 'torch_cuda_state'
    (None when CUDA is not available) and 'curent_epoch'.
    """
    states = {
        # The RNG accessors are get_rng_state / get_rng_state_all; torch has no
        # `get_state` or `cuda.get_state_all`.
        'torch_state': torch.get_rng_state(),
        'torch_cuda_state': torch.cuda.get_rng_state_all() if torch.cuda.is_available() else None,
        'curent_epoch': curent_epoch,
    }
    torch.save(states, filepath)
    logfunc(f"RNG states saved to {filepath}")

def load_state(filepath: str):
    """Restore the PyTorch CPU/CUDA RNG states and return the saved epoch number.

    Args:
        filepath: file previously written by `save_state`.

    Returns:
        The value stored under 'curent_epoch'.

    Raises:
        FileNotFoundError: if `filepath` does not exist (raised by `torch.load`).
    """
    # RNG states are CPU byte tensors; map_location keeps them on the CPU on any machine.
    states = torch.load(filepath, map_location='cpu')

    curent_epoch = states['curent_epoch']
    # The RNG setters are set_rng_state / set_rng_state_all; torch has no
    # `set_state` or `cuda.set_state_all`.
    torch.set_rng_state(states['torch_state'])

    if torch.cuda.is_available() and states['torch_cuda_state'] is not None:
        torch.cuda.set_rng_state_all(states['torch_cuda_state'])

    logfunc(f"RNG states loaded from {filepath}")
    return curent_epoch

In [53]:
# Instantiate the Code2Vec network from the project-local `models` module and move it to `device`.
model = models.Code2Vec(
    nodes_dim=      len(word2idx),      # number of terminal tokens ("var") in the vocabulary
    paths_dim=      len(path2idx),      # number of paths in the vocabulary
    embedding_dim=  EMBEDDING_DIM,      # embedding size (original note said "to be split" — TODO clarify)
    output_dim=     len(target2idx),    # number of classes (target names)
    dropout=        DROPOUT).to(device)

## Weight loading, `curent_epoch` and RNG state restore

In [None]:
# Restore the RNG state and the number of the last completed epoch when a state file
# exists; otherwise fall back to START_EPOCHS.
try:
    curent_epoch = load_state(STATE_FILE)
except FileNotFoundError:
    logfunc(f"No state file at {STATE_FILE}; starting from epoch {START_EPOCHS}")
    curent_epoch = START_EPOCHS
except Exception as err:
    # Still best effort (an unusable state file does not abort the notebook), but the
    # reason is logged instead of being swallowed by a bare `except:`.
    logfunc(f"Could not restore state from {STATE_FILE} ({type(err).__name__}: {err}); "
            f"starting from epoch {START_EPOCHS}")
    curent_epoch = START_EPOCHS

if LOAD and (curent_epoch != START_EPOCHS):
    MODEL_SAVE_PATH = os.path.join(SAVE_DIR, f'{DATASET_NAME}-{curent_epoch:02}-model.pt')

    logfunc(f'Loading model from {MODEL_SAVE_PATH}')
    # map_location lets a checkpoint saved on a GPU be reloaded on a CPU-only machine.
    model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))


Loading model from checkpoints\java-small-01-model.pt


In [None]:
# Adam over all model parameters. The training loop of this notebook checkpoints only the
# model weights and the RNG state, so the optimizer's internal state starts fresh after a resume.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# `criterion` (default reduction) drives the optimisation step; `eval_criterion` sums the
# loss over a batch because evaluate() divides the accumulated loss by the number of examples.
criterion = nn.CrossEntropyLoss().to(device)
eval_criterion = nn.CrossEntropyLoss(reduction='sum').to(device)

## Overview

In [56]:
# Log the module tree of the instantiated model.
logfunc(f"\nModel structure: {model}\n")


Model structure: Code2Vec(
  (node_embedding): Embedding(507272, 128)
  (path_embedding): Embedding(807139, 128)
  (out): Linear(in_features=128, out_features=199749, bias=False)
  (do): Dropout(p=0.25, inplace=False)
)



In [57]:
# Take a single training batch and use its three input tensors (everything but the
# target names) to produce the per-layer summary table.
a = next(iter(train_loader))
logfunc(summary(model, *[b.to(device) for b in a[1:]]))
logfunc(f"shape for sumary: {[i.shape for i in a]}")
logfunc("\n")

+-----------------------------+------------------------------------+-----------------+-------------+
| Layer                       | Input Shape                        | Output Shape    | #Param      |
|-----------------------------+------------------------------------+-----------------+-------------|
| Code2Vec                    | [128, 200], [128, 200], [128, 200] | [128, 199749]   | 193,861,760 |
| ├─(node_embedding)Embedding | [128, 200]                         | [128, 200, 128] | 64,930,816  |
| ├─(path_embedding)Embedding | [128, 200]                         | [128, 200, 128] | 103,313,792 |
| ├─(out)Linear               | [128, 128]                         | [128, 199749]   | 25,567,872  |
| ├─(do)Dropout               | [128, 200, 384]                    | [128, 200, 384] | --          |
+-----------------------------+------------------------------------+-----------------+-------------+
shape for sumary: [torch.Size([128]), torch.Size([128, 200]), torch.Size([128, 200]), torch

# Training

## métrique

In [None]:
def calculate_accuracy(fx:torch.Tensor, y:torch.Tensor):
    """
    Top-1 accuracy of a batch, returned as a 0-dim float tensor.

    fx = [batch size, output dim]  (one score per class)
     y = [batch size]              (expected class indices)
    """
    _, top1 = fx.max(dim=1, keepdim=True)
    hits = (top1 == y.view_as(top1)).sum()
    return hits.float() / top1.shape[0]

def calculate_f1(fx, y):
    """
    Sub-token precision, recall and F1 of the top-1 predictions of a batch.

    Predicted and expected indices are turned back into names with `idx2target`,
    each name is split on '|', and sub-tokens are compared pair by pair:
    a predicted sub-token found in the expected name is a true positive, otherwise
    a false positive; an expected sub-token absent from the prediction is a false
    negative. When a ratio cannot be computed, all three scores are 0.

    fx = [batch size, output dim]
     y = [batch size]
    """
    top1 = fx.max(1)[1]
    true_positive = false_positive = false_negative = 0

    for predicted_idx, expected_idx in zip(top1.tolist(), y.tolist()):
        predicted_subtokens = idx2target[predicted_idx].split('|')
        original_subtokens = idx2target[expected_idx].split('|')

        matched = sum(1 for subtok in predicted_subtokens if subtok in original_subtokens)
        true_positive += matched
        false_positive += len(predicted_subtokens) - matched
        false_negative += sum(1 for subtok in original_subtokens if subtok not in predicted_subtokens)

    try:
        precision = true_positive / (true_positive + false_positive)
        recall = true_positive / (true_positive + false_negative)
        f1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        return 0, 0, 0
    return precision, recall, f1


def get_metrics(tensor_n, tensor_l, tensor_p, tensor_r, model, criterion):
    """
    Run one forward pass and return `(loss, acc, precision, recall, f1)` for the batch.

    The function only measures: it does not call backward() and does not update
    any parameter.

    tensor_n: expected target indices, compared against the model output
    tensor_l, tensor_p, tensor_r: the three inputs passed to `model`, in that order
    """
    scores = model(tensor_l, tensor_p, tensor_r)
    batch_loss = criterion(scores, tensor_n)
    batch_acc = calculate_accuracy(scores, tensor_n)
    return (batch_loss, batch_acc, *calculate_f1(scores, tensor_n))

## Eval func

In [None]:
def evaluate(model: nn.Module, eval_loader: DataLoader, criterion, device: torch.device):
    """
    Evaluation loop over a DataLoader, run under `torch.no_grad()`.

    Args:
        model: network called as `model(left, path, right)`.
        eval_loader: loader yielding `(name, left, path, right)` batches. If it has a
            `desc` attribute, it is shown in the progress bar.
        criterion: loss called as `criterion(scores, names)`. The accumulated loss is
            divided by the number of examples, so a sum-reduced criterion is needed
            for the result to be a per-example mean.
        device: device the batch tensors are moved to.

    Returns:
        `(loss, accuracy, precision, recall, f1)` as Python numbers: loss and top-1
        accuracy averaged over `len(eval_loader.dataset)`; precision, recall and F1
        computed on '|'-separated sub-tokens over the whole loader (0 when a ratio
        cannot be computed).
    """

    model.eval()  # Set model to evaluation mode

    cuml_loss, cuml_acc = 0.0, 0
    true_positive, false_positive, false_negative = 0, 0, 0

    nb_ex = len(eval_loader.dataset)

    # f-string (the original literal printed "{eval_loader.desc}" verbatim); getattr
    # keeps loaders without a `desc` attribute usable.
    desc = f"eval for {getattr(eval_loader, 'desc', 'eval')} batch"

    with torch.no_grad():
        for tensor_n, tensor_l, tensor_p, tensor_r in tqdm(eval_loader, desc=desc, position=1):
            # Move tensors to the device. No explicit synchronize is needed: the
            # forward pass is queued after the copies.
            tensor_n = tensor_n.to(device, non_blocking=True)
            tensor_l = tensor_l.to(device, non_blocking=True)
            tensor_p = tensor_p.to(device, non_blocking=True)
            tensor_r = tensor_r.to(device, non_blocking=True)

            fx = model(tensor_l, tensor_p, tensor_r)

            cuml_loss += criterion(fx, tensor_n).item()

            # top-1 prediction
            pred_idxs = fx.max(1)[1]
            cuml_acc += int(pred_idxs.eq(tensor_n.view_as(pred_idxs)).sum())

            # One bulk transfer per batch instead of one .item() call per element.
            predicted = pred_idxs.tolist()
            expected = tensor_n.view(-1).tolist()
            for p_idx, o_idx in zip(predicted, expected):
                tp, fp, fn = _subtoken_counts(idx2target[p_idx], idx2target[o_idx])
                true_positive += tp
                false_positive += fp
                false_negative += fn

    try:
        precision = true_positive / (true_positive + false_positive)
        recall = true_positive / (true_positive + false_negative)
        f1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        precision, recall, f1 = 0, 0, 0

    return cuml_loss / nb_ex, cuml_acc / nb_ex, precision, recall, f1


def _subtoken_counts(predicted_name, original_name):
    """Return (true_positive, false_positive, false_negative) for one pair of '|'-separated names."""
    predicted_subtokens = predicted_name.split('|')
    original_subtokens = original_name.split('|')
    tp = sum(1 for subtok in predicted_subtokens if subtok in original_subtokens)
    fp = len(predicted_subtokens) - tp
    fn = sum(1 for subtok in original_subtokens if subtok not in predicted_subtokens)
    return tp, fp, fn


## Training func

In [None]:
def train(model: nn.Module, train_loader: DataLoader, criterion, device: torch.device, *, optimizer=None):
    """
    Run one training epoch, streaming batches from a DataLoader.

    Args:
        model: network called as `model(left, path, right)`.
        train_loader: loader yielding `(name, left, path, right)` batches.
        criterion: loss called as `criterion(scores, names)`.
        device: device the batch tensors are moved to.
        optimizer: keyword-only. Optimizer stepped after every batch; when omitted,
            the notebook-level `optimizer` variable is used, as before.

    Returns:
        None. The model parameters are updated in place.
    """
    if optimizer is None:
        # Backward-compatible default: the optimizer created at notebook level.
        optimizer = globals()['optimizer']

    model.train()

    for tensor_n, tensor_l, tensor_p, tensor_r in tqdm(train_loader, desc="batch - trainning", position=1):
        # Move tensors to the device. No explicit synchronize is needed: the forward
        # pass is queued after the copies.
        tensor_n = tensor_n.to(device, non_blocking=True)
        tensor_l = tensor_l.to(device, non_blocking=True)
        tensor_p = tensor_p.to(device, non_blocking=True)
        tensor_r = tensor_r.to(device, non_blocking=True)

        # Forward pass
        optimizer.zero_grad()

        fx = model(tensor_l, tensor_p, tensor_r)
        loss = criterion(fx, tensor_n)

        # Backpropagation
        loss.backward()
        optimizer.step()


## Training Loop

In [32]:
# Collect unreachable objects left by the previous cells before the long training
# run; `gc` is imported with the other modules in the import cell.
collected = gc.collect()
print(collected)

39


In [None]:
best_valid_loss = float('inf')

# When a checkpoint was restored, resume right after its epoch; otherwise start
# after START_EPOCHS.
first_epoch = (curent_epoch if LOAD else START_EPOCHS) + 1

for epoch in tqdm(range(first_epoch, N_EPOCHS+1), desc="epoch", position=0):
    logfunc(f"Epoch: {epoch:02} - Training")
    # train() takes (model, loader, criterion, device) and steps the notebook-level optimizer.
    train(model, train_loader, criterion, device)

    logfunc(f"Epoch: {epoch:02} - Validation - train dataset")
    # Train metrics come from the training loader (eval_loader here would simply
    # duplicate the validation figures).
    train_loss, train_acc, train_p, train_r, train_f1 = evaluate(model, train_loader, eval_criterion, device)

    logfunc(f"Epoch: {epoch:02} - Validation - valid dataset")
    valid_loss, valid_acc, valid_p, valid_r, valid_f1 = evaluate(model, eval_loader, eval_criterion, device)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    # A checkpoint is written after every epoch, followed by the RNG state and epoch
    # number needed to resume.
    torch.save(model.state_dict(), os.path.join(SAVE_DIR, f'{DATASET_NAME}-{epoch:02}-model.pt'))

    save_state(STATE_FILE, epoch)

    log = f"| Epoch: {epoch:02} |\n"
    log += f"| Train Loss: {train_loss:.3f} | Train Precision: {train_p:.3f} | Train Recall: {train_r:.3f} | Train F1: {train_f1:.3f} | Train Acc: {train_acc * 100:.2f}% |\n"
    log += f"| Val. Loss: {valid_loss:.3f} | Val. Precision: {valid_p:.3f} | Val. Recall: {valid_r:.3f} | Val. F1: {valid_f1:.3f} | Val. Acc: {valid_acc * 100:.2f}% |"
    logfunc(log)


epoch:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch: 03 - Training


batch - trainning:   0%|          | 0/5197 [00:00<?, ?it/s]

	| Batches: 1000 | Completion: 19.245% |
	| Loss: 9.027 | Acc.: 0.168 | P: 0.321 | R: 0.240 | F1: 0.274
	| Batches: 2000 | Completion: 38.490% |
	| Loss: 9.090 | Acc.: 0.174 | P: 0.332 | R: 0.244 | F1: 0.281
	| Batches: 3000 | Completion: 57.734% |
	| Loss: 9.009 | Acc.: 0.183 | P: 0.346 | R: 0.252 | F1: 0.292
	| Batches: 4000 | Completion: 76.979% |
	| Loss: 8.919 | Acc.: 0.190 | P: 0.356 | R: 0.259 | F1: 0.300
	| Batches: 5000 | Completion: 96.224% |
	| Loss: 8.814 | Acc.: 0.197 | P: 0.366 | R: 0.267 | F1: 0.308
Epoch: 03 - Validation


eval for {eval_loader.desc} batch:   0%|          | 0/439 [00:00<?, ?it/s]

| Epoch: 03 |
| Train Loss: 8.793 | Train Precision: 0.367 | Train Recall: 0.268 | Train F1: 0.310 | Train Acc: 19.81% |
| Val. Loss: 12.142 | Val. Precision: 0.095 | Val. Recall: 0.126 | Val. F1: 0.108 | Val. Acc: 6.10% |
Epoch: 04 - Training


batch - trainning:   0%|          | 0/5197 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Testing

In [None]:
logfunc('Testing')

# model.load_state_dict(torch.load(MODEL_SAVE_PATH))

# evaluate() divides the accumulated loss by the number of examples, so it needs the
# sum-reduced criterion (as in the training loop); the mean-reduced one would
# under-report the test loss.
test_loss, test_acc, test_p, test_r, test_f1 = evaluate(model, test_loader, eval_criterion, device)

logfunc(f'| Test Loss: {test_loss:.3f} | Test Precision: {test_p:.3f} | Test Recall: {test_r:.3f} | Test F1: {test_f1:.3f} | Test Acc: {test_acc*100:.2f}% |')