In [None]:
# Checkpoints to ensemble. Each entry is a path relative to
# `Config.working_dir`; the inference loop assigns it to `config.prev_model`
# and `get_model` loads it. The per-checkpoint cell orders are then averaged
# in the final ensemble cell.
PREV_MODELS = [
    "0726-1049-md-graphcodebert-fts-from-graphcodebert-base-scaler-fts/ckpt_003.pth",
    "0724-1749-md-graphcodebert-fts-from-graphcodebert-base-scaler-fts/ckpt_003.pth",
]

In [None]:
import json
import os
from os import path as osp
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import GroupShuffleSplit
from torch.nn import DataParallel
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

import transformers as tx

In [None]:
class Config:
    """Static settings shared by every cell of this inference notebook.

    The inference loop additionally sets ``prev_model`` on the instance before
    each call to ``get_model``.
    """

    device = "cuda"
    input_dir = Path("../input/AI4Code/")
    working_dir = Path("../input/ai4code-model/")

    # Which cells receive model predictions: "md", "py" or "all".
    data_type = "md"

    # Identifier handed to `AutoTokenizer.from_pretrained` and
    # `AutoModel.from_pretrained`. Every checkpoint listed in PREV_MODELS is
    # named after graphcodebert, so a single backbone serves the ensemble.
    # NOTE(review): on a kernel without internet access, point this at a
    # local copy of the backbone instead of the hub id.
    model_path = "microsoft/graphcodebert-base"

    # Token budget for the markdown cell, for each sampled code cell, and for
    # the final concatenated sequence.
    md_max_len = 128
    py_max_len = 23
    total_max_len = 512

    # DataLoader settings.
    num_workers = 2
    batch_size = 64

    seed = 42


config = Config()

In [None]:
def preprocess(config):
    """Read every test notebook and build the frames used for inference.

    Args:
        config: object exposing ``input_dir``; notebooks are read from
            ``input_dir / "test" / *.json``.

    Returns:
        A tuple ``(df_test, df_test_md, df_test_py, fts_test)``: the frame
        with all cells, its markdown-only and code-only subsets (each with a
        fresh index), and the per-notebook features from ``get_features``.
    """
    json_files = list((config.input_dir / "test").glob("*.json"))
    frames = []
    for json_file in tqdm(json_files, desc="Reading notebooks"):
        frames.append(read_notebook(json_file))

    # Index by (id, cell_id), ordered by notebook id while keeping the
    # within-notebook cell order untouched.
    stacked = pd.concat(frames)
    stacked = stacked.set_index("id", append=True)
    stacked = stacked.swaplevel()
    stacked = stacked.sort_index(level="id", sort_remaining=False)

    df_test = stacked.reset_index().dropna()

    # "rank" is the running position of a cell among cells of the same type
    # in its notebook; "pred" starts out as that position's percentile.
    group_keys = ["id", "cell_type"]
    df_test["rank"] = df_test.groupby(group_keys).cumcount()
    df_test["pred"] = df_test.groupby(group_keys)["rank"].rank(pct=True)
    df_test["pct_rank"] = 0

    code_rows = df_test["cell_type"] == "code"
    markdown_rows = df_test["cell_type"] == "markdown"
    df_test_py = df_test[code_rows].dropna().reset_index(drop=True)
    df_test_md = df_test[markdown_rows].dropna().reset_index(drop=True)

    fts_test = get_features(df_test)

    return df_test, df_test_md, df_test_py, fts_test


def read_notebook(path):
    """Load one notebook JSON file as a DataFrame indexed by ``cell_id``.

    Args:
        path: ``pathlib.Path`` of the JSON file; its stem becomes the value
            of the added ``id`` column.

    Returns:
        DataFrame with the file's columns plus ``id``; the index is named
        ``cell_id``.
    """
    column_types = {"cell_type": "category", "source": "str"}
    cells = pd.read_json(path, dtype=column_types)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis("cell_id")


def get_ranks(base, derived):
    """Return, for each element of ``derived``, its first index in ``base``.

    Raises whatever ``base.index`` raises when an element is absent.
    """
    positions = []
    for element in derived:
        positions.append(base.index(element))
    return positions


def clean_code(cell):
    """Stringify ``cell`` and turn literal backslash-n pairs into newlines."""
    text = str(cell)
    pieces = text.split("\\n")
    return "\n".join(pieces)


def sample_cells(cells, sample_size=20):
    """Pick up to ``sample_size`` evenly spaced cells after cleaning them.

    When there are no more than ``sample_size`` cells, all of them are
    returned, each clipped to its first 200 characters. Otherwise cells are
    taken at a fractional stride of ``len(cells) / sample_size`` (unclipped),
    and the last picked entry is replaced by the final cell if that cell was
    not already picked.
    """
    cleaned = list(map(clean_code, cells))
    total = len(cleaned)

    if total <= sample_size:
        return [text[:200] for text in cleaned]

    stride = total / sample_size
    picked = []
    cursor = 0
    while True:
        position = int(np.round(cursor))
        if position >= total:
            break
        picked.append(cleaned[position])
        cursor += stride

    assert cleaned[0] in picked
    # Make sure the notebook's final cell is represented.
    if cleaned[-1] not in picked:
        picked[-1] = cleaned[-1]
    return picked


def get_features(df):
    """Build per-notebook features keyed by notebook ``id``.

    Each value is a dict with ``total_code`` and ``total_md`` (row counts per
    cell type) and ``codes`` (the result of ``sample_cells`` on the notebook's
    code sources, capped at 20 samples).
    """
    ordered = df.sort_values("rank").reset_index(drop=True)

    features = {}
    for notebook_id, cells in tqdm(ordered.groupby("id")):
        markdown_cells = cells[cells.cell_type == "markdown"]
        code_cells = cells[cells.cell_type == "code"]
        features[notebook_id] = {
            "total_code": code_cells.shape[0],
            "total_md": markdown_cells.shape[0],
            "codes": sample_cells(code_cells.source.values, 20),
        }
    return features

In [None]:
class NotebookDataset(Dataset):
    """Dataset yielding ``(ids, mask, fts)`` for one cell of ``df`` at a time.

    Args:
        df: frame with at least ``id`` and ``source`` columns; one row per
            cell to score.
        fts: mapping from notebook id to a dict with ``total_md``,
            ``total_code`` and ``codes`` (see ``get_features``).
        config: object exposing ``model_path``, ``md_max_len``,
            ``py_max_len`` and ``total_max_len``.
    """

    def __init__(self, df, fts, config):
        super().__init__()

        self.df = df.reset_index(drop=True)
        self.fts = fts
        self.config = config

        self.tokenizer = tx.AutoTokenizer.from_pretrained(config.model_path)

    def _pack(self, head, tails, pad_value):
        """Concatenate ``head`` with each tail minus its last position, then
        truncate or right-pad with ``pad_value`` to ``total_max_len``."""
        limit = self.config.total_max_len
        packed = list(head)
        for tail in tails:
            packed.extend(tail[:-1])
        packed = packed[:limit]
        packed.extend([pad_value] * (limit - len(packed)))
        return torch.LongTensor(packed)

    def __getitem__(self, idx):
        """Return ``(ids, mask, fts)`` for row ``idx``.

        ``ids`` and ``mask`` are LongTensors of length ``total_max_len``;
        ``fts`` is a one-element FloatTensor holding the notebook's markdown
        share ``n_md / (n_md + n_code)`` (0 when the notebook has no cells).
        """
        item = self.df.iloc[idx]
        notebook = self.fts[item["id"]]

        inputs = self.tokenizer.encode_plus(
            item["source"],
            None,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
            max_length=self.config.md_max_len,
        )
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(code) for code in notebook["codes"]],
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.config.py_max_len,
        )

        # The code count must come from "total_code"; reading "total_md" for
        # both made the ratio a constant 0.5.
        # NOTE(review): confirm the checkpoints were trained with this same
        # feature definition.
        n_md = notebook["total_md"]
        n_code = notebook["total_code"]
        n_cells = n_md + n_code
        ratio = n_md / n_cells if n_cells else 0
        fts = torch.FloatTensor([ratio])

        ids = self._pack(
            inputs["input_ids"],
            code_inputs["input_ids"],
            self.tokenizer.pad_token_id,
        )
        # Padding must be 0 in the attention mask; the tokenizer's
        # pad_token_id is a vocabulary id and is not guaranteed to be 0.
        mask = self._pack(
            inputs["attention_mask"],
            code_inputs["attention_mask"],
            0,
        )

        return ids, mask, fts

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.df.shape[0]

In [None]:
def get_model(config):
    """Build a ``NotebookArranger`` and restore a saved checkpoint into it.

    Args:
        config: object exposing ``model_path`` (backbone identifier),
            ``working_dir`` and ``prev_model`` (checkpoint location, joined
            as ``working_dir / prev_model``) and ``device``.

    Returns:
        The model with the checkpoint's weights, moved to ``config.device``.
    """
    backbone = tx.AutoModel.from_pretrained(config.model_path)
    model = NotebookArranger(backbone)

    # Deserialize onto the CPU so loading does not depend on the device the
    # checkpoint was saved from; the model is moved to the target afterwards.
    state_dict = torch.load(
        config.working_dir / config.prev_model, map_location="cpu"
    )
    model.load_state_dict(state_dict)
    return model.to(config.device)


class NotebookArranger(nn.Module):
    """Backbone encoder followed by a linear head producing one score.

    The head consumes the backbone's first-position hidden state concatenated
    with the extra feature column ``fts``.

    Args:
        pretrained_model: module called as ``model(ids, mask)`` whose first
            output is indexed as ``[batch, position, hidden]``.
    """

    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        # Size the head from the backbone's own config when it has one, so
        # backbones of other widths work; 768 (the previously hard-coded
        # width) is the fallback, which keeps existing checkpoints loadable.
        backbone_config = getattr(pretrained_model, "config", None)
        hidden_size = getattr(backbone_config, "hidden_size", 768)
        self.fc = nn.Linear(hidden_size + 1, 1)

    def forward(self, ids, mask, fts):
        """Score a batch.

        Args:
            ids: token ids passed to the backbone.
            mask: attention mask passed to the backbone.
            fts: extra features, concatenated along dim 1 with the
                first-position hidden state.

        Returns:
            Output of the linear head, one value per batch row.
        """
        hidden = self.model(ids, mask)[0]
        first_token = hidden[:, 0, :]
        return self.fc(torch.cat((first_token, fts), 1))

In [None]:
def test(model, dataloader, config):
    """Run ``model`` over ``dataloader`` in eval mode and collect its outputs.

    Args:
        model: callable as ``model(ids=..., mask=..., fts=...)``.
        dataloader: yields batches whose first three items are ids, mask and
            fts.
        config: object exposing ``device``.

    Returns:
        1-D numpy array with the flattened outputs of all batches, in
        iteration order.
    """
    model.eval()

    progress = tqdm(dataloader, total=len(dataloader))
    device = config.device

    outputs = []
    with torch.no_grad():
        for batch in progress:
            ids, mask, fts = (batch[k].to(device) for k in range(3))
            scores = model(ids=ids, mask=mask, fts=fts).view(-1)
            outputs.append(scores.detach().cpu().numpy().ravel())

    return np.concatenate(outputs)

In [None]:
# Load Data
df_test, df_test_md, df_test_py, fts_test = preprocess(config)

# Select the rows the model will score, according to config.data_type.
if config.data_type == "all":
    df_testset = df_test
elif config.data_type == "md":
    df_testset = df_test_md
elif config.data_type == "py":
    df_testset = df_test_py
else:
    # Fail here with a clear message instead of a NameError on df_testset.
    raise ValueError(
        f"Unsupported config.data_type {config.data_type!r}; "
        'expected "all", "md" or "py".'
    )

testset = NotebookDataset(df_testset, fts=fts_test, config=config)
# shuffle=False and drop_last=False keep one prediction per row, in row
# order, which the positional assignment of predictions relies on.
testloader = DataLoader(
    testset,
    batch_size=config.batch_size,
    shuffle=False,
    num_workers=config.num_workers,
    drop_last=False,
)

In [None]:
# Load Model and Test
num_models = len(PREV_MODELS)
dfs = []
for i in range(num_models):
    prev_model = PREV_MODELS[i]
    config.prev_model = prev_model

    model = get_model(config)

    y_test = test(model, testloader, config)

    # Write the predictions into the rows that were scored; the other rows
    # keep the "pred" value they already had.
    scored_cell_type = {"md": "markdown", "py": "code"}.get(config.data_type)
    if config.data_type == "all":
        df_test["pred"] = y_test
    elif scored_cell_type is not None:
        df_test.loc[df_test["cell_type"] == scored_cell_type, "pred"] = y_test

    # One row per notebook: its cell ids joined in ascending "pred" order.
    cells_by_notebook = df_test.sort_values("pred").groupby("id")["cell_id"]
    df_pred = (
        cells_by_notebook.apply(lambda cell_ids: " ".join(cell_ids))
        .reset_index()
        .rename(columns={"cell_id": "cell_order"})
    )

    dfs.append(df_pred)

In [None]:
# Ensemble
# For every notebook, average each cell's position across the per-model
# orders and re-sort the cells by that average. Rows are matched across the
# frames by position.
num_dfs = len(dfs)
df_1 = dfs[0]
len_df_1 = len(df_1)

ensembled_order = []
for idx in range(len_df_1):
    # Start from the first model's positions (already scaled by 1/num_dfs).
    ensembled_sample = {}  # TBD weighted based on performance?
    first_cells = df_1.iloc[idx]["cell_order"].split(" ")
    for position, cell_id in enumerate(first_cells):
        ensembled_sample[cell_id] = position / num_dfs

    # Add the scaled positions from every other model.
    for df in dfs[1:]:
        sample = {}
        other_cells = df.iloc[idx]["cell_order"].split(" ")
        for position, cell_id in enumerate(other_cells):
            sample[cell_id] = position / num_dfs
        for key in ensembled_sample:
            ensembled_sample[key] += sample[key]

    ranked = sorted(ensembled_sample.items(), key=lambda pair: pair[1])
    ensembled_order.append(" ".join([pair[0] for pair in ranked]))
df_1["cell_order"] = ensembled_order

df_1.to_csv("submission.csv", index=False)