In [1]:
%load_ext autoreload
%autoreload 2

import os
import pickle
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import *

import pandas as pd
import plotly.express as px

from spot.data import GitRepo
from spot.utils import cst, proj_root, pushover_alert, run_long_task, tqdm

# Make the project root the working directory for the rest of the notebook.
os.chdir(proj_root())

# `Path(None)` raises an opaque TypeError, so check the environment variable
# explicitly and say which one is missing.
datadir_env = os.getenv("datadir")
if datadir_env is None:
    raise RuntimeError(
        "The 'datadir' environment variable is not set; "
        "it must point to the directory that contains 'SPOT-data'."
    )
datadir = Path(datadir_env)
repos_dir = datadir / "SPOT-data/repos"

# SECURITY: `pickle.load` can execute arbitrary code while unpickling.
# Only load these files if they come from a trusted source.
useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
with useful_repos_path.open("rb") as f:
    useful_repos: list[GitRepo] = pickle.load(f)

# NOTE(review): `repos_split` is assigned again by the `load_datasets(...)`
# call in a later cell of this notebook, which replaces the value loaded here.
repos_split_path = datadir / "SPOT-data/repos-processed-with_margin/repos_split.pkl"
with repos_split_path.open("rb") as f:
    repos_split: dict[str, list[GitRepo]] = pickle.load(f)

In [2]:
import numpy as np
import torch
from datasets import Dataset

from spot.data import load_datasets
from spot.model import CtxArgs, DecodingArgs, ModelSPOT, ModelWrapper, TokenizerSPOT
from spot.utils import TaskLoggingMonitor

# ---- R0 configuration ----
# True: start from the pretrained "Salesforce/codet5-base" weights;
# False: load the previously saved R0 checkpoint.
train_r0 = False
with_margin = True
data_reduction = 1

# The margin tag appears in both the dataset directory name and the model name.
if with_margin:
    margin_tag = "with_margin"
else:
    margin_tag = "no_margin"

r0_data_dir = datadir / f"SPOT-data/repos-processed-{margin_tag}"
r0_datasets, repos_split = load_datasets(r0_data_dir)

if data_reduction == 1:
    data_tag = "data_full"
else:
    data_tag = f"data_1-{data_reduction}"
n_train = len(r0_datasets["train"].data) // data_reduction

r0_model_name = f"SPOT-{margin_tag}-{data_tag}"
r0_model_path = (
    "Salesforce/codet5-base"
    if train_r0
    else datadir / f"checkpoints/saved/{r0_model_name}"
)

# Use the first GPU when CUDA is available, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# The tokenizer and the model are both loaded from the same location.
tokenizer: TokenizerSPOT = TokenizerSPOT.from_pretrained(r0_model_path)
r0_model: ModelSPOT = ModelSPOT.from_pretrained(r0_model_path).to(device)

r0_monitor = TaskLoggingMonitor("R0")
r0_ctx_args = CtxArgs(ctx_size=512, ctx_margin=128, types_in_ctx=True)
r0_args = DecodingArgs(
    sampling_batch_size=512,
    ctx_args=r0_ctx_args,
    max_workers=20,
)
r0_wrapper = ModelWrapper(r0_model, tokenizer, r0_args, r0_monitor)



In [3]:
import wandb
from spot.model import ModelTrainingArgs

# Hyper-parameters and trainer for the R0 model. The trainer is built
# unconditionally; training only runs when `train_r0` is set.
r0_train_args = ModelTrainingArgs(train_batch_size=42, eval_batch_size=256, max_epochs=3)
r0_ckpt_dir = datadir / "checkpoints" / r0_model_name
r0_trainer = r0_wrapper.build_trainer(
    r0_ckpt_dir,
    r0_train_args,
    dataset=r0_datasets["train"].data,
    eval_dataset=r0_datasets["valid"].data,
)

if train_r0:
    r0_wandb_config = {"r0_decoding_args": r0_args, "r0_train_args": r0_train_args}
    wandb.init(project=r0_model_name, dir=str(datadir), config=r0_wandb_config)

    # Evaluate once before training, then train.
    with run_long_task(f"Training {r0_model_name}"):
        init_perf = r0_trainer.evaluate(max_length=r0_args.generation_max_length)
        print("initial eval loss:", init_perf)
        r0_trainer.train()

    # Log the timings accumulated by the R0 monitor.
    r0_time_stats = r0_monitor.timer.total_times()
    wandb.log({"time_stats": r0_time_stats})

    # Evaluate again after training and close the wandb run.
    final_perf = r0_trainer.evaluate(max_length=r0_args.generation_max_length)
    print("final eval loss:", final_perf)
    wandb.finish()


Using amp half precision backend


In [None]:
from spot.data import preds_to_accuracies, pretty_print_accuracies

# Predict on the R0 test split and print the resulting accuracies.
r0_test_set = r0_datasets["test"]
r0_preds = r0_wrapper.predict(r0_test_set, tqdm_args={})
r0_accs = preds_to_accuracies(r0_preds, r0_test_set)
pretty_print_accuracies(r0_accs)

predict:   0%|          | 0/3974 [00:00<?, ?it/s]

partial_acc: 76.09%
partial_acc_wo_any: 76.54%
partial_accs:
   FuncArg: 72.92%
   FuncReturn: 82.47%
   ClassAtribute: 71.09%
   GlobalVar: 78.50%
   LocalVar: 78.96%
full_acc: 68.64%
full_accs:
   FuncArg: 66.84%
   FuncReturn: 76.65%
   ClassAtribute: 60.33%
   GlobalVar: 47.66%
   LocalVar: 53.96%
n_labels: 17756


In [4]:
# ---- R1 configuration ----
# True: start from the pretrained "Salesforce/codet5-base" weights;
# False: load the previously saved R1 checkpoint.
train_r1 = True

r1_model_name = f"SPOT-R1-{margin_tag}-{data_tag}"
r1_model_path = (
    "Salesforce/codet5-base"
    if train_r1
    else datadir / f"checkpoints/saved/{r1_model_name}"
)

r1_model: ModelSPOT = ModelSPOT.from_pretrained(r1_model_path).to(device)
r1_monitor = TaskLoggingMonitor("R1")

# Same decoding settings as R0 except that `types_in_ctx` is disabled.
r1_ctx_args = CtxArgs(ctx_size=512, ctx_margin=128, types_in_ctx=False)
r1_args = DecodingArgs(
    sampling_batch_size=512,
    ctx_args=r1_ctx_args,
    max_workers=20,
)
# The R1 wrapper reuses the tokenizer that was loaded for R0.
r1_wrapper = ModelWrapper(r1_model, tokenizer, r1_args, r1_monitor)

loading configuration file https://huggingface.co/Salesforce/codet5-base/resolve/main/config.json from cache at /mnt/data0/jiayi/hugface_cache/transformers/f1adf9032ebe26d0dd0b9c4917416e2db960b7e8b8e68f0612e8e5d5379488f5.20220fde7ff6c94c24bdcd615678f6a4374f3176abdc061beecc43a906725837
Model config T5Config {
  "_name_or_path": "/content/drive/MyDrive/CodeT5/pretrained_models/codet5_base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_atte

In [5]:
import pickle

from spot.data import TypeInfDataset, save_datasets

test_r1_parsing = False
use_file_level_feedback = False

# Directory name pieces: an optional "test-" prefix and "-per_file" suffix.
r1_dir_prefix = "test-" if test_r1_parsing else ""
feedback_tag = "-per_file" if use_file_level_feedback else ""
r1_data_path = datadir / f"SPOT-data/{r1_dir_prefix}repos-processed-R1-{margin_tag}{feedback_tag}"
r1_datasets: Dict[str, TypeInfDataset]

# Reuse saved R1 datasets only outside of test mode and only if they exist.
load_r1_data = (not test_r1_parsing) and (r1_data_path / "train-extra.pkl").exists()

if not load_r1_data:
    # Obtain the R0 predictions first (from cache when possible) so they
    # survive a failure in the later R1 input generation step.
    r0_cache_path = r1_data_path / "r0_predictions.pkl"

    if not r0_cache_path.exists():
        r0_predictions = dict()
        for name in ("valid", "test", "train"):
            print("Predicting on:", name)
            r0_data = r0_datasets[name]
            if test_r1_parsing:
                # Test mode only predicts on the first 64 entries.
                r0_data = r0_data[:64]
            r0_predictions[name] = r0_wrapper.predict(r0_data, tqdm_args={"leave": False})

        # Persist the predictions at the cache location checked above.
        r1_data_path.mkdir(parents=True, exist_ok=True)
        with r0_cache_path.open("wb") as cache_file:
            pickle.dump(r0_predictions, cache_file)
    else:
        print(f"Loading R0 predictions from {r0_cache_path}...")
        with r0_cache_path.open("rb") as cache_file:
            r0_predictions = pickle.load(cache_file)
else:
    print(f"Loading R1 datasets from {r1_data_path}...")
    r1_datasets, _ = load_datasets(r1_data_path)

Loading R1 datasets from /mnt/data0/jiayi/SPOT-data/repos-processed-R1-with_margin...


In [6]:
# Build the R1 datasets from the R0 predictions unless they were loaded from disk.
if not load_r1_data:
    r1_datasets = {}
    for name in ("test", "train", "valid"):
        print("Processing dataset:", name)
        repos = [repo.repo_dir(repos_dir) for repo in repos_split[name]]
        r0_data, r0_preds = r0_datasets[name], r0_predictions[name]
        if test_r1_parsing:
            # Test mode only processes the first 16 entries and predictions.
            r0_data, r0_preds = r0_data[:16], r0_preds[:16]
        r1_datasets[name] = r1_wrapper.generate_r1_inputs(
            repos,
            r0_data,
            r0_preds,
            tqdm_args={"leave": False},
            use_file_level_feedback=use_file_level_feedback,
        )
    # Save all splits once every one of them has been generated.
    save_datasets(r1_datasets, repos_split, r1_data_path)

In [7]:
# Hyper-parameters and trainer for the R1 model. The trainer is built
# unconditionally; training only runs when `train_r1` is set.
r1_train_args = ModelTrainingArgs(train_batch_size=38, eval_batch_size=200, max_epochs=3)
r1_ckpt_dir = datadir / "checkpoints" / r1_model_name
r1_trainer = r1_wrapper.build_trainer(
    r1_ckpt_dir,
    r1_train_args,
    dataset=r1_datasets["train"].data,
    eval_dataset=r1_datasets["valid"].data,
)

if train_r1:
    r1_wandb_config = {"r1_decoding_args": r1_args, "r1_train_args": r1_train_args}
    wandb.init(project=r1_model_name, dir=str(datadir), config=r1_wandb_config)

    # Evaluate once before training, then train.
    with run_long_task(f"Training {r1_model_name}"):
        init_perf = r1_trainer.evaluate(max_length=r1_args.generation_max_length)
        print("initial performance:", init_perf)
        r1_trainer.train()

    # Log the timings accumulated by the R1 monitor.
    r1_time_stats = r1_monitor.timer.total_times()
    wandb.log({"time_stats": r1_time_stats})

    # Evaluate again after training and close the wandb run.
    final_perf = r1_trainer.evaluate(max_length=r1_args.generation_max_length)
    print("final performance:", final_perf)
    wandb.finish()

PyTorch: setting up devices
Using amp half precision backend
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrvplusone[0m. Use [1m`wandb login --relogin`[0m to force relogin


***** Running Evaluation *****
  Num examples = 5347
  Batch size = 200


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
***** Running training *****
  Num examples = 81003
  Num Epochs = 3
  Instantaneous batch size per device = 36
  Total train batch size (w. parallel, distributed & accumulation) = 36
  Gradient Accumulation steps = 1
  Total optimization steps = 6753


initial performance: {'eval_loss': 2.967663049697876, 'eval_runtime': 45.7404, 'eval_samples_per_second': 116.899, 'eval_steps_per_second': 0.59}


Step,Training Loss,Validation Loss


In [None]:
from spot.data import preds_to_accuracies, pretty_print_accuracies

# Predict on the R1 test split and print the resulting accuracies.
r1_test_set = r1_datasets["test"]
r1_preds = r1_wrapper.predict(r1_test_set, tqdm_args={})
r1_accs = preds_to_accuracies(r1_preds, r1_test_set)
pretty_print_accuracies(r1_accs)

predict:   0%|          | 0/22 [00:00<?, ?it/s]

partial_acc: 2.53%
partial_acc_wo_any: 2.53%
partial_accs:
   FuncArg: 4.35%
full_acc: 1.27%
full_accs:
   FuncArg: 2.17%
n_labels: 79


In [9]:
from spot.visualization import display_code_sequence, visualize_batch

# Render the first 10 R1 test entries together with their predictions.
# The display call stays last so its return value is shown as the cell output.
n_examples_shown = 10
r1_batch_views = [
    visualize_batch(r1_datasets["test"], idx, r1_preds, tokenizer, r1_args.ctx_args)
    for idx in range(n_examples_shown)
]
display_code_sequence(r1_batch_views)

Tab(children=(HTML(value="<pre style='line-height: 1.2; padding: 10px; color: rgb(212,212,212); background-col…