# CodeLM

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
transformers.logging.set_verbosity_error()

# from rank_bm25 import BM25Okapi, BM25L, BM25Plus
# from sentence_transformers import SentenceTransformer

import re
from tqdm import tqdm
from glob import glob
from itertools import combinations

import warnings
warnings.filterwarnings('ignore')

from argparse import ArgumentParser

# import wandb
# from pytorch_lightning.loggers import WandbLogger

# wandb.init(project="DACON_236228", name="CodeLM")
# wandb_logger = WandbLogger(project="DACON_236228", name="CodeLM")

# Run configuration, declared as an argparse parser so the same options can be
# reused from a script.  Options are registered from a table, in the same
# order as before; the parser is then fed an empty argument list, so the
# defaults below are the values actually in effect.
parser = ArgumentParser(description="CodeLM")

_OPTION_SPECS = [
    ('--text_pretrained_model', dict(default="graphcodebert-base", type=str)),
    ('--text_len', dict(default=512, type=int)),
    ('--truncation_side', dict(default='left', type=str)),  # right or left
    ('--optimizer', dict(default="adamw", type=str)),
    ('--learning_rate', dict(default=0.00003, type=float)),
    ('--scheduler', dict(default="none", type=str)),
    ('--batch_size', dict(default=32, type=int)),
    ('--epochs', dict(default=10, type=int)),
    ('--cv', dict(default=5, type=int)),
    ('--seed', dict(default=826, type=int)),
    ('--mixed_precision', dict(default=16, type=int)),
    ('--device', dict(nargs='+', default=[0], type=int)),
    ('--num_workers', dict(default=0, type=int)),
]
for _flag, _spec in _OPTION_SPECS:
    parser.add_argument(_flag, **_spec)

args = parser.parse_args('')

# wandb.config.update(args)

# Pick the CUDA device when one is visible to torch, otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Short module-level aliases for the parsed options used throughout the file.
text_len, BATCH_SIZE, EPOCHS, CV, SEED = (
    args.text_len,
    args.batch_size,
    args.epochs,
    args.cv,
    args.seed,
)

def set_seeds(seed=SEED):
    """Seed every random number generator used by this file.

    Seeds Python's ``random``, NumPy, torch (CPU and all CUDA devices) and
    Lightning, and switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: integer seed; defaults to the module-level ``SEED``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; `--device` accepts
    # several device ids.  This call is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Use the argument rather than the global SEED so that an explicit
    # `set_seeds(n)` seeds Lightning with `n` as well.
    pl.seed_everything(seed)

# Seed all RNGs once, before any model or data loader is created.
set_seeds()

# Run identifier derived from the backbone name.
idx = str(args.text_pretrained_model)
idx

## pretrain.py

In [None]:
# Supported backbones: option value -> (Hugging Face checkpoint id, hidden size).
# The trailing numbers are kept from the original notes
# (presumably the maximum sequence length of each model -- TODO confirm).
_PRETRAINED_MODELS = {
    "unixcoder-base": ("microsoft/unixcoder-base", 768),  # 1024
    "graphcodebert-base": ("microsoft/graphcodebert-base", 768),  # 512
    "codebert-base": ("microsoft/codebert-base", 768),  # 512
}

# Fail immediately on an unsupported name instead of hitting a NameError on
# `txt_model_name` further down.
try:
    txt_model_name, latent_dim = _PRETRAINED_MODELS[args.text_pretrained_model]
except KeyError:
    raise ValueError(
        f"Unsupported --text_pretrained_model {args.text_pretrained_model!r}; "
        f"expected one of {sorted(_PRETRAINED_MODELS)}"
    ) from None

tokenizer = AutoTokenizer.from_pretrained(txt_model_name)

# NOTE(review): in the visible code this bare encoder is not used before
# `model` is rebound inside the training loop -- confirm whether it is needed.
model = AutoModel.from_pretrained(txt_model_name)

In [None]:
# Training table.  The commented line below is an alternative, preprocessed
# input file.
df = pd.read_csv(filepath_or_buffer="data/sample_train.csv")
# df = pd.read_csv("data/pp_train_graphcodebert-base_bm25plus_frac0.02.csv")
df.head()

In [None]:
# Test table.  The commented line below is an alternative, preprocessed
# parquet input.
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")
# test_df = pd.read_parquet('data/pp_test_graphcodebert-base_bm25plus.parquet', engine='pyarrow')
test_df.head()

## data_loader.py

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes one ``code1``/``code2`` pair per row.

    Each item is the tokenizer output for the pair, padded and truncated to
    the module-level ``text_len``, with the leading batch dimension removed.
    For non-test data the item is ``(encoding, label)``, where the label is
    the row's ``similar`` value as a float tensor.

    Args:
        df: DataFrame with ``code1`` and ``code2`` columns, plus ``similar``
            when ``is_test`` is False.
        is_test: when True, ``__getitem__`` returns only the encoding.
    """

    def __init__(self, df, is_test=False):
        self.df = df
        self.is_test = is_test
        self.tokenizer = tokenizer
        # The tokenizer is a shared module-level object, so a side set by an
        # earlier run would otherwise stick.  Apply the configured side for
        # both supported values; any other value leaves the tokenizer as is.
        if args.truncation_side in ("left", "right"):
            self.tokenizer.truncation_side = args.truncation_side

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            row['code1'], row['code2'],
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=text_len,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch dimension of size 1.  Drop
        # only that dimension: a bare squeeze() would also collapse any other
        # size-1 dimension (e.g. when text_len == 1).
        for key in list(encoding.keys()):
            encoding[key] = encoding[key].squeeze(0)

        if not self.is_test:
            labels = torch.tensor(row["similar"], dtype=torch.float)
            return encoding, labels

        return encoding

## model.py

In [None]:
class TextModel(nn.Module):
    """Pretrained encoder followed by a single-logit linear head.

    The encoder is loaded from the module-level ``txt_model_name``; the head
    maps a ``latent_dim``-sized vector to one output value.
    """

    def __init__(self):
        super().__init__()
        encoder = AutoModel.from_pretrained(txt_model_name)
        self.txt_model = encoder
        head = nn.Sequential(nn.Linear(latent_dim, 1))
        self.classifier = head

    def forward(self, inputs):
        """Return the head's output for the encoder state at sequence position 0.

        Args:
            inputs: mapping of keyword arguments forwarded to the encoder.
        """
        hidden_states = self.txt_model(**inputs).last_hidden_state
        first_token = hidden_states[:, 0, :]
        return self.classifier(first_token)

In [None]:
class TextClassifier(pl.LightningModule):
    """Lightning module that trains ``backbone`` as a binary classifier.

    The backbone is expected to return one logit per example (last dimension
    of size 1).  Loss is ``BCEWithLogitsLoss``; a logit above 0 counts as the
    positive class when computing accuracy.

    Args:
        backbone: module called as ``backbone(x)`` that returns the logits.
        args: run configuration; ``optimizer``, ``learning_rate``,
            ``scheduler`` and ``epochs`` are read in ``configure_optimizers``.
    """

    def __init__(self, backbone, args):
        super().__init__()
        self.backbone = backbone
        # Use the configuration that was passed in rather than the global.
        self.args = args

    def forward(self, x):
        outputs = self.backbone(x)
        return outputs

    @staticmethod
    def _accuracy(y_hat, y):
        """Return the fraction of examples whose thresholded logit equals ``y``."""
        y_pred = (y_hat > 0).float().squeeze(-1)
        return (y_pred == y).float().mean()

    def step(self, batch):
        """Run one forward pass on ``(inputs, labels)``; return ``(loss, y, y_hat)``."""
        x = batch[0]
        y = batch[1]
        y_hat = self.forward(x)
        # Drop only the trailing logit dimension.  A bare squeeze() would turn
        # a batch of one into a 0-d tensor whose shape no longer matches `y`.
        loss = nn.BCEWithLogitsLoss()(y_hat.squeeze(-1), y)
        return loss, y, y_hat

    def training_step(self, batch, batch_idx):
        loss, y, y_hat = self.step(batch)
        acc = self._accuracy(y_hat, y)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, y, y_hat = self.step(batch)
        acc = self._accuracy(y_hat, y)
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log("val_acc", acc, on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        loss, y, y_hat = self.step(batch)
        return loss

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        # Prediction batches carry no labels, so the batch is the model input.
        y_hat = self.forward(batch)
        return y_hat

    def configure_optimizers(self):
        """Build the optimizer, and the scheduler if one is configured.

        Returns:
            The optimizer alone when the scheduler option is ``"none"``,
            otherwise ``([optimizer], [scheduler])``.

        Raises:
            ValueError: if the optimizer or scheduler name is not supported.
        """
        cfg = self.args
        lr = cfg.learning_rate

        if cfg.optimizer == "sgd":
            optimizer = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        elif cfg.optimizer == "adam":
            optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        elif cfg.optimizer == "adamw":
            optimizer = torch.optim.AdamW(self.parameters(), lr=lr)
        else:
            raise ValueError(f"Unsupported optimizer: {cfg.optimizer!r}")

        if cfg.scheduler == "none":
            return optimizer
        if cfg.scheduler == "cosine":
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer=optimizer,
                # Guard against T_max == 0 when epochs is 1.
                T_max=max(1, cfg.epochs // 2),
                # True division: `lr // 10` floor-divides a small float to 0.0.
                eta_min=lr / 10,
            )
            return [optimizer], [scheduler]
        raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r}")

## train.py

In [None]:
## preprocessing.py

# Stratified K-fold training: for each fold, fit a fresh model, reload its
# best checkpoint (by val_acc), record validation accuracy and collect the
# test-set predictions.
val_acc_list = []
preds_list = []

skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=SEED)

for i, (train_index, val_index) in enumerate(skf.split(df, df['similar'])):

    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    ## data_loaders.py

    train_ds = TextDataset(train_df, False)
    val_ds = TextDataset(val_df, False)
    test_ds = TextDataset(test_df, True)

    train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=args.num_workers)
    val_dataloader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=args.num_workers)
    test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=args.num_workers)

    ## train.py

    model = TextClassifier(TextModel(), args)

    # Keep a handle on the checkpoint callback so the path it actually wrote
    # can be read back after training.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath="saved/", filename=f"{idx}_{i}",
        monitor="val_acc", mode="max"
    )
    callbacks = [
        # pl.callbacks.EarlyStopping(
        #     monitor="val_acc", patience=3, mode="max"
        # ),
        checkpoint_callback,
    ]

    trainer = pl.Trainer(
        max_epochs=EPOCHS, accelerator="auto", callbacks=callbacks,
        precision=args.mixed_precision, #logger=wandb_logger,
        devices=args.device, #strategy='ddp_find_unused_parameters_true'
    )

    trainer.fit(model, train_dataloader, val_dataloader)

    # Load the file the callback reports as best.  A hard-coded
    # "saved/{idx}_{i}.ckpt" can point at a stale checkpoint from an earlier
    # run, because ModelCheckpoint adds a version suffix when the target
    # file already exists.
    ckpt = torch.load(checkpoint_callback.best_model_path)
    model.load_state_dict(ckpt['state_dict'])

    ## test.py

    eval_dict = trainer.validate(model, dataloaders=val_dataloader)[0]
    val_acc_list.append(eval_dict["val_acc"])

    preds = trainer.predict(model, dataloaders=test_dataloader)
    preds_list.append(np.vstack(preds))

    # NOTE(review): this stops after the first fold, so the "mean" below
    # covers a single fold -- presumably a deliberate shortcut; confirm.
    break

val_acc_mean = np.mean(val_acc_list)

print(f"val_acc_mean: {val_acc_mean}")
# wandb.log({'val_acc_mean': val_acc_mean})

In [None]:
# Stack the per-fold prediction arrays and average them across folds (axis 0).
preds = np.asarray(preds_list).mean(axis=0)

preds.shape

## Submission

In [None]:
# Start from the sample submission file and overwrite its `similar` column.
submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')
# An averaged logit above 0 becomes label 1, anything else label 0.
hard_labels = np.where(preds > 0, 1, 0)
submission['similar'] = hard_labels
submission.to_csv(f'{idx}.csv', index=False)

submission.head()

In [None]:
# wandb.finish()