# Importation

In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import os
import random 
import pickle

import models

from tqdm.notebook import tqdm
from multiprocessing import Pool

from torch_geometric.nn import summary

# Paramètres

In [25]:
# Setup parameters: every tunable value used by the notebook is defined in this cell.

SEED = 1234                 # seed passed to the Python and PyTorch random number generators
DATA_DIR = 'data'           # root directory; dataset files are read from DATA_DIR/DATASET/
DATASET = 'java-small'      # dataset name, also used to build the log and checkpoint file names
EMBEDDING_DIM = 128
DROPOUT = 0.25
BATCH_SIZE = 256
CHUNKS = 10                 # NOTE(review): not referenced in the cells visible here - confirm it is still needed
MAX_LENGTH = 200            # examples with more path contexts than this are dropped when loading
LOG_EVERY = 100             # print log of results after every LOG_EVERY batches
N_EPOCHS = 50
LOG_DIR = 'logs'
SAVE_DIR = 'checkpoints'
LOG_PATH = os.path.join(LOG_DIR, f'{DATASET}-log.txt')
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, f'{DATASET}-model.pt')
LOAD = False                # set to True to load the model weights from MODEL_SAVE_PATH

# Use the GPU when one is available and fall back to the CPU otherwise, so that the
# notebook does not fail on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Show the index of the currently selected CUDA device.
torch.cuda.current_device()

0

# Seed fixing

In [4]:
# Seed the Python RNG, the PyTorch CPU RNG and the PyTorch CUDA RNG with the same value.
for seed_rng in (random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(SEED)
# cuDNN determinism is left disabled; uncomment the next line to turn it on.
# torch.backends.cudnn.deterministic = True

# Chargement des données

## Dictionnaires des words (variables), paths et targets

In [5]:
# NOTE: pickle.load can execute arbitrary code - only open dictionary files from a trusted source.
dict_path = f'{DATA_DIR}/{DATASET}/{DATASET}.dict.c2v'
with open(dict_path, 'rb') as file:
    # The file holds four pickled objects stored back to back; they are read in this order.
    word2count, path2count, target2count, n_training_examples = (
        pickle.load(file) for _ in range(4)
    )


def _build_vocab(counts):
    """Give every key of `counts` the next free integer id, after the reserved <unk>=0 and <pad>=1."""
    vocab = {'<unk>': 0, '<pad>': 1}
    for token in counts.keys():
        vocab[token] = len(vocab)
    return vocab


def _invert(mapping):
    """Return the index -> token view of a token -> index mapping."""
    return dict(zip(mapping.values(), mapping.keys()))


# create vocabularies, initialized with unk and pad tokens
word2idx = _build_vocab(word2count)
path2idx = _build_vocab(path2count)
target2idx = _build_vocab(target2count)

# reverse lookups, used to turn predicted indices back into strings
idx2word = _invert(word2idx)
idx2path = _invert(path2idx)
idx2target = _invert(target2idx)

In [6]:
# Size of the target vocabulary, including the reserved <unk> and <pad> entries.
len(idx2target)

11318

In [7]:
# Preview the first few target -> index pairs only: printing the whole mapping
# (thousands of entries) floods the cell output and bloats the saved notebook.
print(dict(list(target2idx.items())[:10]))



## File Reading

In [8]:
def load_data(file_path, max_length=None):
    """
    Parse a .c2v file into a list of (name, contexts) examples.

    Each non-blank line has the form ``name ctx ctx ...`` where the fields are
    separated by single spaces and every ``ctx`` is a comma-separated group
    (split into a list of strings). Empty fields produced by padding spaces
    are ignored, and blank lines are skipped instead of yielding an example
    with an empty name.

    file_path  = path of the file to read
    max_length = maximum number of contexts an example may have to be kept;
                 defaults to the notebook-level MAX_LENGTH when None

    Returns a list of tuples ``(name, [[field, ...], ...])``.
    """
    limit = MAX_LENGTH if max_length is None else max_length

    examples = []
    with open(file_path, 'r') as f:
        # Stream the file line by line so the raw text is never held in memory as a whole.
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue  # blank line: nothing to parse
            name, *fields = line.split(' ')
            contexts = [field.split(',') for field in fields if field.strip()]
            if len(contexts) <= limit:
                examples.append((name, contexts))
    return examples

In [9]:
# Parse the test split of the dataset.
data_test = load_data(f'{DATA_DIR}/{DATASET}/{DATASET}.test.c2v')

In [10]:
# Parse the validation split of the dataset.
data_val = load_data(f'{DATA_DIR}/{DATASET}/{DATASET}.val.c2v')

In [11]:
# Parse the training split of the dataset.
data_train = load_data(f'{DATA_DIR}/{DATASET}/{DATASET}.train.c2v')

In [12]:
# Number of examples kept in the test, validation and training splits.
len(data_test), len(data_val), len(data_train)

(57088, 23844, 555075)

In [13]:
# Training-example count as stored in the dictionary file (its fourth pickled object).
n_training_examples

1000

## Data Loader

In [14]:
def collate_fn(samples):
    """
    Turn a list of (name, contexts) examples into four index tensors.

    Names are looked up in target2idx, the first and third field of every
    context in word2idx and the second field in path2idx; anything missing
    from a vocabulary maps to its <unk> id. Context rows are right-padded
    with the <pad> id up to the longest example of the batch.

    Returns (name_idx, left_tensor, path_tensor, right_tensor).
    """
    unk_target = target2idx['<unk>']
    name_idx = torch.tensor([target2idx.get(sample[0], unk_target) for sample in samples])

    unk_word, unk_path = word2idx['<unk>'], path2idx['<unk>']
    left_rows, path_rows, right_rows = [], [], []
    for sample in samples:
        contexts = sample[1]
        left_rows.append([word2idx.get(left, unk_word) for left, _, _ in contexts])
        path_rows.append([path2idx.get(path, unk_path) for _, path, _ in contexts])
        right_rows.append([word2idx.get(right, unk_word) for _, _, right in contexts])

    # Every row is padded to the width of the longest example in this batch
    width = max(len(row) for row in left_rows)

    def to_padded_tensor(rows, fill):
        return torch.tensor([row + [fill] * (width - len(row)) for row in rows])

    word_pad, path_pad = word2idx['<pad>'], path2idx['<pad>']
    left_tensor = to_padded_tensor(left_rows, word_pad)
    path_tensor = to_padded_tensor(path_rows, path_pad)
    right_tensor = to_padded_tensor(right_rows, word_pad)

    return name_idx, left_tensor, path_tensor, right_tensor

In [27]:
def _make_loader(dataset, shuffle):
    """Build a DataLoader over `dataset` with the settings shared by all three splits."""
    return DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn,
                      pin_memory=True, shuffle=shuffle, num_workers=0, prefetch_factor=None)


# Only the training split is shuffled; the other two keep their file order.
train_loader = _make_loader(data_train, shuffle=True)
test_loader = _make_loader(data_test, shuffle=False)
eval_loader = _make_loader(data_val, shuffle=False)

In [16]:
# For each of the four tensors of a batch, count the entries equal to 0
# (the <unk> id in every vocabulary) over one full pass of the training loader.
c = [0] * 4
for batch in train_loader:
    for slot, tensor in enumerate(batch):
        c[slot] += int((tensor == 0).sum())
c

[465970, 8982284, 24025891, 11479783]

# Instanciation

In [17]:
# Instantiate the model from the vocabulary sizes and move it to the selected device.
model = models.Code2Vec(len(word2idx), len(path2idx), EMBEDDING_DIM, len(target2idx), DROPOUT).to(device)

if LOAD:
    print(f'Loading model from {MODEL_SAVE_PATH}')
    # map_location remaps the stored tensors onto `device`, so a checkpoint saved on
    # one device (e.g. a GPU) can still be loaded on another (e.g. the CPU).
    model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))

optimizer = optim.Adam(model.parameters(), lr=0.001)

criterion = nn.CrossEntropyLoss().to(device)

In [18]:
# Make sure the checkpoint and log directories exist (no error when they already do).
for directory in (SAVE_DIR, LOG_DIR):
    os.makedirs(directory, exist_ok=True)

# Start from a clean log: the logging code only ever appends to LOG_PATH.
if os.path.exists(LOG_PATH):
    os.remove(LOG_PATH)

## Overview

In [19]:
# Print the string representation of the instantiated model.
print(f"Model structure: {model}\n\n")

Model structure: Code2Vec(
  (node_embedding): Embedding(73906, 128)
  (path_embedding): Embedding(323, 128)
  (out): Linear(in_features=128, out_features=11318, bias=True)
  (do): Dropout(p=0.25, inplace=False)
)




In [20]:
# Take the first training batch only, then stop iterating.
for a in train_loader:
    break
# The first tensor of a batch holds the targets; the model is summarised on the remaining three.
moved = [tensor.to(device) for tensor in a]
print(summary(model, *moved[1:]))

+-----------------------------+------------------------------------+-----------------+------------+
| Layer                       | Input Shape                        | Output Shape    | #Param     |
|-----------------------------+------------------------------------+-----------------+------------|
| Code2Vec                    | [256, 197], [256, 197], [256, 197] | [256, 11318]    | 11,010,614 |
| ├─(node_embedding)Embedding | [256, 197]                         | [256, 197, 128] | 9,459,968  |
| ├─(path_embedding)Embedding | [256, 197]                         | [256, 197, 128] | 41,344     |
| ├─(out)Linear               | [256, 128]                         | [256, 11318]    | 1,460,022  |
| ├─(do)Dropout               | [256, 197, 384]                    | [256, 197, 384] | --         |
+-----------------------------+------------------------------------+-----------------+------------+


# Training

## Métriques

In [25]:
def calculate_accuracy(fx, y):
    """
    Top-1 accuracy of a batch of predictions.

    fx = [batch size, output dim]
     y = [batch size]

    Returns a 0-dim float tensor: the fraction of rows of `fx` whose
    highest-scoring column equals the corresponding entry of `y`.
    """
    _, top1 = torch.max(fx, dim=1, keepdim=True)
    n_correct = (top1 == y.view_as(top1)).sum()
    return n_correct.float() / top1.size(0)


def calculate_f1(fx, y):
    """
    Calculate precision, recall and F1 score over sub-tokens
    - Takes top-1 predictions
    - Converts predicted and true indices to strings through idx2target
    - Splits the strings into sub-tokens on '|'
    - Counts TP, FP and FN over the whole batch
    - Calculates precision, recall and F1 score from those counts

    fx = [batch size, output dim]
     y = [batch size]

    Returns (precision, recall, f1); all three are 0 when any denominator is
    zero (e.g. no sub-token of the batch was predicted correctly).
    """
    # One bulk transfer per tensor: calling .item() on every element of a CUDA
    # tensor forces a separate device-to-host synchronisation per example.
    pred_idxs = fx.max(1)[1].tolist()
    true_idxs = y.reshape(-1).tolist()

    true_positive, false_positive, false_negative = 0, 0, 0
    for pred_idx, true_idx in zip(pred_idxs, true_idxs):
        predicted_subtokens = idx2target[pred_idx].split('|')
        original_subtokens = idx2target[true_idx].split('|')
        for subtok in predicted_subtokens:
            if subtok in original_subtokens:
                true_positive += 1
            else:
                false_positive += 1
        for subtok in original_subtokens:
            if subtok not in predicted_subtokens:
                false_negative += 1
    try:
        precision = true_positive / (true_positive + false_positive)
        recall = true_positive / (true_positive + false_negative)
        f1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        # Empty batch, or precision and recall are both zero
        precision, recall, f1 = 0, 0, 0
    return precision, recall, f1


def get_metrics(tensor_n, tensor_l, tensor_p, tensor_r, model, criterion):
    """
    Run the model on one batch and score its output.

    tensor_n is the target passed to `criterion` and to the metric helpers;
    tensor_l, tensor_p and tensor_r are the three inputs passed to `model`.

    This function only performs the forward pass and the metric computation:
    it neither calls backward() nor touches an optimizer.

    Returns (loss, acc, precision, recall, f1).
    """
    logits = model(tensor_l, tensor_p, tensor_r)
    batch_loss = criterion(logits, tensor_n)
    batch_acc = calculate_accuracy(logits, tensor_n)
    return (batch_loss, batch_acc, *calculate_f1(logits, tensor_n))

## Eval func

In [None]:
def evaluate(model, eval_loader, criterion, device):
    """
    Evaluation loop using DataLoader.
    Wraps computations in `torch.no_grad()` to avoid unnecessary gradient calculations.

    model       = module called on the three input tensors of each batch
    eval_loader = iterable of (targets, left, path, right) tensor batches
    criterion   = loss function applied to the model output and the targets
    device      = device the batch tensors are moved to

    Every LOG_EVERY batches the running averages are printed and appended to LOG_PATH.

    Returns (loss, acc, precision, recall, f1) as Python floats, each one the
    mean of the per-batch values (every batch weighs the same, whatever its size).
    """

    model.eval()  # Set model to evaluation mode

    # Running sums are plain Python floats, not device tensors
    epoch_loss, epoch_acc, epoch_p, epoch_r, epoch_f1 = 0.0, 0.0, 0.0, 0.0, 0.0
    n_batches = 0

    with torch.no_grad():
        for tensor_n, tensor_l, tensor_p, tensor_r in eval_loader:
            # Move tensors to the target device. No explicit torch.cuda.synchronize()
            # is needed: the copies and the kernels that consume them are queued on
            # the same stream, and the call would fail on a CPU-only machine.
            tensor_n = tensor_n.to(device, non_blocking=True)
            tensor_l = tensor_l.to(device, non_blocking=True)
            tensor_p = tensor_p.to(device, non_blocking=True)
            tensor_r = tensor_r.to(device, non_blocking=True)

            # Forward pass
            loss, acc, p, r, f1 = get_metrics(tensor_n, tensor_l, tensor_p, tensor_r, model, criterion)

            # Accumulate metrics
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_p += p
            epoch_r += r
            epoch_f1 += f1
            n_batches += 1

            if n_batches % LOG_EVERY == 0:
                log = f"\t| Batches: {n_batches} |\n"
                log += f"\t| Loss: {epoch_loss / n_batches:.3f} | Acc.: {epoch_acc / n_batches:.3f} | P: {epoch_p / n_batches:.3f} | R: {epoch_r / n_batches:.3f} | F1: {epoch_f1 / n_batches:.3f}"

                with open(LOG_PATH, 'a+') as f:
                    f.write(log + '\n')
                print(log)

    return epoch_loss / n_batches, epoch_acc / n_batches, epoch_p / n_batches, epoch_r / n_batches, epoch_f1 / n_batches


## Training func

In [None]:
def train(model, train_loader, optimizer, criterion, device):
    """
    Training loop using DataLoader for batch streaming

    model        = module called on the three input tensors of each batch
    train_loader = sized iterable of (targets, left, path, right) tensor batches
    optimizer    = optimizer stepped once per batch
    criterion    = loss function applied to the model output and the targets
    device       = device the batch tensors are moved to

    Every LOG_EVERY batches the running averages are printed and appended to LOG_PATH.

    Returns (loss, acc, precision, recall, f1) as Python floats, each one the
    mean of the per-batch values over the epoch.
    """
    model.train()

    # Running sums are plain Python floats: adding the loss tensor itself would keep
    # the autograd history of every batch alive until the end of the epoch.
    epoch_loss, epoch_acc, epoch_p, epoch_r, epoch_f1 = 0.0, 0.0, 0.0, 0.0, 0.0
    n_batches = 0
    # Progress is measured against the loader actually passed in
    total_batches = len(train_loader)

    for tensor_n, tensor_l, tensor_p, tensor_r in train_loader:
        # Move tensors to the target device. No explicit torch.cuda.synchronize() is
        # needed: the copies and the kernels that consume them are queued on the same
        # stream, and the call would fail on a CPU-only machine.
        tensor_n = tensor_n.to(device, non_blocking=True)
        tensor_l = tensor_l.to(device, non_blocking=True)
        tensor_p = tensor_p.to(device, non_blocking=True)
        tensor_r = tensor_r.to(device, non_blocking=True)

        # Forward pass
        optimizer.zero_grad()
        loss, acc, p, r, f1 = get_metrics(tensor_n, tensor_l, tensor_p, tensor_r, model, criterion)

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Update metrics
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_p += p
        epoch_r += r
        epoch_f1 += f1
        n_batches += 1

        if n_batches % LOG_EVERY == 0:
            log = f"\t| Batches: {n_batches} | Completion: {(n_batches / total_batches) * 100:.3f}% |\n"
            log += f"\t| Loss: {epoch_loss / n_batches:.3f} | Acc.: {epoch_acc / n_batches:.3f} | P: {epoch_p / n_batches:.3f} | R: {epoch_r / n_batches:.3f} | F1: {epoch_f1 / n_batches:.3f}"
            with open(LOG_PATH, 'a+') as f:
                f.write(log + '\n')
            print(log)

    return epoch_loss / n_batches, epoch_acc / n_batches, epoch_p / n_batches, epoch_r / n_batches, epoch_f1 / n_batches


## Log func

In [32]:
def logfunc(log):
    """Append `log` followed by a newline to the file at LOG_PATH, then print it."""
    with open(LOG_PATH, 'a+') as log_file:
        log_file.writelines((log, '\n'))
    print(log)

## Training Loop

In [None]:
# Lowest validation loss seen so far; a checkpoint is written each time it improves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    logfunc(f"Epoch: {epoch+1:02} - Training")
    train_loss, train_acc, train_p, train_r, train_f1 = train(model, train_loader, optimizer, criterion, device)

    logfunc(f"Epoch: {epoch+1:02} - Validation")
    # Model selection uses the validation split (eval_loader is built from data_val):
    # the test split must not influence which checkpoint is kept.
    valid_loss, valid_acc, valid_p, valid_r, valid_f1 = evaluate(model, eval_loader, criterion, device)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_state = model.state_dict()
        # Per-epoch snapshot of the improved model
        torch.save(best_state, os.path.join(SAVE_DIR, f'{DATASET}-{epoch:02}-model.pt'))
        # Also refresh MODEL_SAVE_PATH, the path the notebook loads weights from
        torch.save(best_state, MODEL_SAVE_PATH)

    log = f"| Epoch: {epoch+1:02} |\n"
    log += f"| Train Loss: {train_loss:.3f} | Train Precision: {train_p:.3f} | Train Recall: {train_r:.3f} | Train F1: {train_f1:.3f} | Train Acc: {train_acc * 100:.2f}% |\n"
    log += f"| Val. Loss: {valid_loss:.3f} | Val. Precision: {valid_p:.3f} | Val. Recall: {valid_r:.3f} | Val. F1: {valid_f1:.3f} | Val. Acc: {valid_acc * 100:.2f}% |"
    logfunc(log)


Epoch: 01 - Training
	| Batches: 100 | Completion: 4.612% |
	| Loss: 2.946 | Acc.: 0.732 | P: 0.732 | R: 0.732 | F1: 0.732
	| Batches: 200 | Completion: 9.224% |
	| Loss: 2.140 | Acc.: 0.784 | P: 0.784 | R: 0.784 | F1: 0.784
	| Batches: 300 | Completion: 13.836% |
	| Loss: 1.866 | Acc.: 0.802 | P: 0.802 | R: 0.802 | F1: 0.802
	| Batches: 400 | Completion: 18.448% |
	| Loss: 1.732 | Acc.: 0.810 | P: 0.810 | R: 0.810 | F1: 0.810
	| Batches: 500 | Completion: 23.060% |
	| Loss: 1.643 | Acc.: 0.816 | P: 0.816 | R: 0.816 | F1: 0.816
	| Batches: 600 | Completion: 27.672% |
	| Loss: 1.586 | Acc.: 0.820 | P: 0.820 | R: 0.820 | F1: 0.820
	| Batches: 700 | Completion: 32.284% |
	| Loss: 1.542 | Acc.: 0.823 | P: 0.823 | R: 0.823 | F1: 0.823
	| Batches: 800 | Completion: 36.896% |
	| Loss: 1.508 | Acc.: 0.825 | P: 0.825 | R: 0.825 | F1: 0.825
	| Batches: 900 | Completion: 41.508% |
	| Loss: 1.488 | Acc.: 0.826 | P: 0.826 | R: 0.826 | F1: 0.826
	| Batches: 1000 | Completion: 46.120% |
	| Loss: 1.46

KeyboardInterrupt: 

# Evaluation

In [None]:
logfunc('Testing')

# Reload the checkpoint stored at MODEL_SAVE_PATH; map_location remaps the stored
# tensors onto `device`, so a checkpoint saved on another device still loads.
model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))

# The reported test metrics are computed on the test split (test_loader is built from data_test).
test_loss, test_acc, test_p, test_r, test_f1 = evaluate(model, test_loader, criterion, device)

logfunc(f'| Test Loss: {test_loss:.3f} | Test Precision: {test_p:.3f} | Test Recall: {test_r:.3f} | Test F1: {test_f1:.3f} | Test Acc: {test_acc*100:.2f}% |')