In [None]:
# Checkpoint files to run inference with; the inference cell iterates over
# this list and produces one prediction frame per entry.
MODEL_PATHS = [
    "0716-1258-md-codebert-from-codebert-base-scaler/ckpt_002.pth",
]

In [None]:
import json
import os
from os import path as osp
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import GroupShuffleSplit
from torch.nn import DataParallel
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

import transformers as tx

In [None]:
class Config:
    """Settings shared by preprocessing, the dataset, model loading and inference.

    NOTE(review): ``NotebookDataset`` and ``get_model`` also read
    ``config.model_path``, ``config.base_model`` and ``config.prev_model``;
    none of them is defined on this class, so they have to be provided before
    those cells run.
    """

    # --- run selection ---
    mode = "test"  # "train" or "test"; selects the preprocessing branch
    data_type = "md"  # "all", "md" or "py"; selects the frame to predict on
    device = "cuda"

    # --- filesystem ---
    input_dir = Path("../input") / "AI4Code"
    working_dir = Path("../input") / "ai4code-model"

    # --- data split / loader ---
    seed = 42
    valid_ratio = 0.1
    max_len = 256  # tokenizer max_length for the cell being ranked
    batch_size = 192
    num_workers = 2

    # --- optimisation (not read by the inference code visible in this notebook) ---
    optim = "AdamW"
    loss = "MSE"
    lr = 3e-4
    num_epochs = 3
    accum_steps = 4


config = Config()

In [None]:
def _dump_json(obj, file_path):
    """Serialise ``obj`` as JSON into ``file_path`` and close the file."""
    with open(file_path, "w") as fp:
        # json.dump takes the object first and the file handle second.
        json.dump(obj, fp)


def _load_json(file_path):
    """Read and return the JSON document stored at ``file_path``."""
    with open(file_path, "r") as fp:
        return json.load(fp)


def _read_cell_orders(config):
    """Load ``train_orders.csv`` as a Series of cell-id lists indexed by ``id``."""
    return (
        pd.read_csv(config.input_dir / "train_orders.csv", index_col="id")
        # ``read_csv(squeeze=True)`` no longer exists in pandas 2.x; squeezing
        # the single remaining column yields the same Series.
        .squeeze("columns")
        .str.split()
    )


def _split_by_cell_type(df):
    """Return ``(markdown_rows, code_rows)`` of ``df`` without ``parent_id``."""

    def _select(cell_type):
        return (
            df[df["cell_type"] == cell_type]
            # Test frames never get a ``parent_id`` column, so do not fail
            # when it is absent.
            .drop(columns="parent_id", errors="ignore")
            .dropna()
            .reset_index(drop=True)
        )

    return _select("markdown"), _select("code")


def _build_train_frames(config, df):
    """Create, cache and return the train/valid frames and features."""
    df_orders = _read_cell_orders(config)

    df_orders_ = df_orders.to_frame().join(
        df.reset_index("cell_id").groupby("id")["cell_id"].apply(list),
        how="right",
    )

    ranks = {
        id_: {
            "cell_id": cell_id,
            "rank": get_ranks(cell_order, cell_id),
        }
        for id_, cell_order, cell_id in df_orders_.itertuples()
    }
    df_ranks = (
        pd.DataFrame.from_dict(ranks, orient="index")
        .rename_axis("id")
        .apply(pd.Series.explode)
        .set_index("cell_id", append=True)
    )

    df_ancestors = pd.read_csv(
        config.input_dir / "train_ancestors.csv", index_col="id"
    )
    df = (
        df.reset_index()
        .merge(df_ranks, on=["id", "cell_id"])
        .merge(df_ancestors, on=["id"])
    )
    df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform(
        "count"
    )

    # Split by ancestor so related notebooks stay on the same side.
    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=config.valid_ratio,
        random_state=config.seed,
    )
    idx_train, idx_valid = next(splitter.split(df, groups=df["ancestor_id"]))

    # NOTE(review): dropna() runs while ``parent_id`` (presumably coming from
    # train_ancestors.csv) is still a column, so every row with a missing
    # parent_id is discarded here -- confirm this is intended.
    df_train = df.loc[idx_train].reset_index(drop=True).dropna()
    df_valid = df.loc[idx_valid].reset_index(drop=True).dropna()

    df_train_md, df_train_py = _split_by_cell_type(df_train)
    df_valid_md, df_valid_py = _split_by_cell_type(df_valid)

    fts_train = get_features(df_train)
    fts_valid = get_features(df_valid)

    df_train.to_csv(config.input_dir / "train.csv", index=False)
    df_valid.to_csv(config.input_dir / "valid.csv", index=False)
    df_train_md.to_csv(config.input_dir / "train_md.csv", index=False)
    df_valid_md.to_csv(config.input_dir / "valid_md.csv", index=False)
    df_train_py.to_csv(config.input_dir / "train_py.csv", index=False)
    df_valid_py.to_csv(config.input_dir / "valid_py.csv", index=False)
    _dump_json(fts_train, config.input_dir / "train_fts.json")
    _dump_json(fts_valid, config.input_dir / "valid_fts.json")

    return (
        df_train,
        df_valid,
        df_train_md,
        df_valid_md,
        df_train_py,
        df_valid_py,
        fts_train,
        fts_valid,
        df_orders,
    )


def _build_test_frames(config, df):
    """Create, cache and return the test frames and features."""
    # Keep ``id`` and ``cell_id`` as columns: the groupby below and the
    # submission code both need them.
    df_test = df.reset_index().dropna().reset_index(drop=True)

    # Position of each cell among the cells of the same type in its notebook,
    # and that position as a percentile, used as the initial prediction.
    df_test["rank"] = df_test.groupby(["id", "cell_type"]).cumcount()
    df_test["pred"] = df_test.groupby(["id", "cell_type"])["rank"].rank(
        pct=True
    )
    df_test["pct_rank"] = 0

    df_test_md, df_test_py = _split_by_cell_type(df_test)

    fts_test = get_features(df_test)

    df_test.to_csv(config.input_dir / "test.csv", index=False)
    df_test_md.to_csv(config.input_dir / "test_md.csv", index=False)
    df_test_py.to_csv(config.input_dir / "test_py.csv", index=False)
    _dump_json(fts_test, config.input_dir / "test_fts.json")

    return df_test, df_test_md, df_test_py, fts_test


def _load_cached_frames(config):
    """Reload the files written by an earlier ``preprocess`` run."""

    def _csv(name):
        return pd.read_csv(config.input_dir / name)

    if config.mode == "train":
        return (
            _csv("train.csv"),
            _csv("valid.csv"),
            _csv("train_md.csv"),
            _csv("valid_md.csv"),
            _csv("train_py.csv"),
            _csv("valid_py.csv"),
            _load_json(config.input_dir / "train_fts.json"),
            _load_json(config.input_dir / "valid_fts.json"),
            _read_cell_orders(config),
        )

    return (
        _csv("test.csv"),
        _csv("test_md.csv"),
        _csv("test_py.csv"),
        _load_json(config.input_dir / "test_fts.json"),
    )


def preprocess(config):
    """Build, or reload from cache, the frames and features for ``config.mode``.

    When ``config.input_dir / f"{config.mode}.csv"`` exists, the previously
    written CSV/JSON files are loaded. Otherwise every ``*.json`` notebook
    under ``config.input_dir / config.mode`` is read, the derived frames are
    written to ``config.input_dir`` and returned.

    Args:
        config: object exposing ``input_dir`` and ``mode``; the training
            branch additionally reads ``valid_ratio`` and ``seed``.

    Returns:
        For ``mode == "train"``: ``(df_train, df_valid, df_train_md,
        df_valid_md, df_train_py, df_valid_py, fts_train, fts_valid,
        df_orders)``.
        For ``mode == "test"``: ``(df_test, df_test_md, df_test_py,
        fts_test)``.

    Raises:
        ValueError: if ``config.mode`` is neither ``"train"`` nor ``"test"``.
    """
    if config.mode not in ("train", "test"):
        raise ValueError(
            f"config.mode must be 'train' or 'test', got {config.mode!r}"
        )

    if osp.exists(config.input_dir / f"{config.mode}.csv"):
        return _load_cached_frames(config)

    data_path = list((config.input_dir / config.mode).glob("*.json"))
    notebooks = [
        read_notebook(path)
        for path in tqdm(data_path, desc="Reading notebooks")
    ]

    # Index by (id, cell_id); sorting on ``id`` only keeps the cell order
    # found inside each notebook file.
    df = (
        pd.concat(notebooks)
        .set_index("id", append=True)
        .swaplevel()
        .sort_index(level="id", sort_remaining=False)
    )

    if config.mode == "train":
        return _build_train_frames(config, df)
    return _build_test_frames(config, df)


def read_notebook(path):
    """Load one notebook JSON file into a frame indexed by ``cell_id``.

    Args:
        path: ``pathlib.Path`` of the notebook file; its stem is stored in the
            ``id`` column of every row.

    Returns:
        DataFrame with ``cell_type`` (category), ``source`` (str) and ``id``
        columns, whose index is named ``cell_id``.
    """
    column_types = {"cell_type": "category", "source": "str"}
    cells = pd.read_json(path, dtype=column_types)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis("cell_id")


def get_ranks(base, derived):
    """Return, for each element of ``derived``, its first position in ``base``.

    Raises:
        ValueError: if an element of ``derived`` does not occur in ``base``.
    """
    return list(map(base.index, derived))


def clean_code(cell):
    """Stringify ``cell`` and turn each literal backslash-n pair into a newline."""
    text = str(cell)
    return "\n".join(text.split("\\n"))


def sample_cells(cells, sample_size=20):
    """Pick at most about ``sample_size`` evenly spaced cells from ``cells``.

    Every cell is first passed through ``clean_code``. When there are no more
    than ``sample_size`` cells, all of them are returned, each cut to its
    first 200 characters. Otherwise cells are taken at a fixed fractional
    stride (rounded to the nearest index) and returned uncut; the last cell
    replaces the final pick when it was not selected.
    """
    cleaned = list(map(clean_code, cells))
    total = len(cleaned)
    if total <= sample_size:
        return [text[:200] for text in cleaned]

    stride = total / sample_size
    picked = []
    cursor = 0
    while True:
        position = int(np.round(cursor))
        if position >= total:
            break
        picked.append(cleaned[position])
        cursor += stride

    assert cleaned[0] in picked
    # Make sure the notebook's final cell is always represented.
    if cleaned[-1] not in picked:
        picked[-1] = cleaned[-1]
    return picked


def get_features(df):
    """Collect per-notebook cell counts and a sample of code cells.

    Args:
        df: frame with ``id``, ``rank``, ``cell_type`` and ``source`` columns.

    Returns:
        Dict keyed by notebook ``id``; each value holds ``total_code``,
        ``total_md`` and ``codes`` (up to 20 code cells chosen by
        ``sample_cells`` after ordering the rows by ``rank``).
    """
    ordered = df.sort_values("rank").reset_index(drop=True)
    features = {}
    for notebook_id, cells in tqdm(ordered.groupby("id")):
        code_cells = cells[cells.cell_type == "code"]
        markdown_cells = cells[cells.cell_type == "markdown"]
        features[notebook_id] = {
            "total_code": len(code_cells),
            "total_md": len(markdown_cells),
            "codes": sample_cells(code_cells.source.values, 20),
        }
    return features

In [None]:
class NotebookDataset(Dataset):
    """Dataset yielding one tokenised cell followed by sampled code context.

    Each item concatenates the token ids of the row's ``source`` with the
    token ids of the notebook's sampled code cells (``fts[id]["codes"]``),
    cut or padded to ``total_max_len``.

    Args:
        df: frame with ``id`` and ``source`` columns (plus ``pct_rank`` when
            ``config.mode == "train"``).
        fts: per-notebook features as produced by ``get_features``.
        config: object exposing ``model_path``, ``max_len`` and ``mode``;
            an optional ``total_max_len`` overrides the class default.
    """

    # Length of the concatenated sequence when ``config`` has no
    # ``total_max_len``. The attribute was read in ``__getitem__`` but never
    # assigned, so every item access raised AttributeError.
    # NOTE(review): 512 is an assumed default (the usual position limit of
    # BERT/RoBERTa-style encoders) -- confirm against the training setup.
    total_max_len = 512

    # Token budget (special tokens included) for each sampled code cell.
    code_max_len = 23

    def __init__(self, df, fts, config):
        super().__init__()

        self.df = df.reset_index(drop=True)
        self.fts = fts
        self.config = config
        self.total_max_len = getattr(
            config, "total_max_len", type(self).total_max_len
        )

        self.tokenizer = tx.AutoTokenizer.from_pretrained(config.model_path)

    def _join_and_pad(self, head, tails, pad_value):
        """Append each tail minus its last element to ``head``; fit to ``total_max_len``."""
        merged = list(head)
        for sequence in tails:
            merged.extend(sequence[:-1])
        merged = merged[: self.total_max_len]
        merged += [pad_value] * (self.total_max_len - len(merged))
        return torch.LongTensor(merged)

    def __getitem__(self, idx):
        """Return ``(ids, mask, fts)`` plus ``target`` when in train mode."""
        item = self.df.iloc[idx]
        notebook_fts = self.fts[item.id]

        inputs = self.tokenizer.encode_plus(
            item.source,
            None,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
            max_length=self.config.max_len,
        )
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in notebook_fts["codes"]],
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.code_max_len,
        )

        n_md = notebook_fts["total_md"]
        # NOTE(review): this reads "total_md" again, so the ratio below is
        # always 0.5 (or 0); "total_code" looks intended. Left unchanged
        # because a checkpoint trained with this same code expects that
        # value -- fix it together with retraining.
        n_code = notebook_fts["total_md"]

        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])

        ids = self._join_and_pad(
            inputs["input_ids"],
            code_inputs["input_ids"],
            self.tokenizer.pad_token_id,
        )
        # NOTE(review): the mask tail is filled with pad_token_id rather than
        # 0, which marks the padding as attended whenever that id is
        # non-zero. Kept as is for parity with the presumably identical
        # training pipeline -- confirm before changing.
        mask = self._join_and_pad(
            inputs["attention_mask"],
            code_inputs["attention_mask"],
            self.tokenizer.pad_token_id,
        )

        if self.config.mode == "train":
            target = torch.FloatTensor([item.pct_rank])
            return ids, mask, fts, target
        else:
            return ids, mask, fts

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.df.shape[0]

In [None]:
def get_model(config):
    """Build a ``NotebookArranger`` and load fine-tuned weights when available.

    The backbone comes from ``config.model_path``. Weights are read from
    ``config.working_dir / config.base_model / config.prev_model``; only a
    missing checkpoint file falls back to the plain pretrained backbone. Any
    other failure (corrupt file, mismatching keys) is raised instead of being
    reported as a missing file.

    Args:
        config: object exposing ``model_path``, ``working_dir``,
            ``base_model``, ``prev_model``, ``mode`` and ``device``.

    Returns:
        The model moved to ``config.device``; wrapped in ``DataParallel`` when
        ``config.mode == "train"`` and at least one CUDA device is visible.
    """
    model = NotebookArranger(tx.AutoModel.from_pretrained(config.model_path))

    checkpoint_path = config.working_dir / config.base_model / config.prev_model
    try:
        # Load onto the CPU first so a checkpoint saved on another GPU index
        # can still be read; load_state_dict copies into the model afterwards.
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    except FileNotFoundError:
        print(
            f"There is no {config.prev_model}, use base {config.base_model} instead"
        )
    else:
        # A state dict saved from a DataParallel wrapper prefixes every key
        # with "module."; strip it so it fits the unwrapped model.
        prefix = "module."
        if state_dict and all(key.startswith(prefix) for key in state_dict):
            state_dict = {
                key[len(prefix):]: value for key, value in state_dict.items()
            }
        model.load_state_dict(state_dict)

    if config.mode == "train":
        # Use up to four GPUs, as before, without failing on smaller machines.
        device_ids = list(range(min(4, torch.cuda.device_count())))
        if device_ids:
            model = DataParallel(model, device_ids=device_ids)
    return model.to(config.device)


class NotebookArranger(nn.Module):
    """Regression head on top of a pretrained encoder.

    The hidden state at sequence position 0 is concatenated with one extra
    scalar feature per sample and projected to a single value.

    Args:
        pretrained_model: module called as ``model(ids, mask)`` whose first
            output is indexed as ``[:, 0, :]``. When it exposes
            ``config.hidden_size`` the head is sized from it; otherwise the
            previous fixed width of 768 is used.
    """

    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        backbone_config = getattr(pretrained_model, "config", None)
        hidden_size = getattr(backbone_config, "hidden_size", 768)
        # +1 for the extra scalar feature appended in forward().
        self.fc = nn.Linear(hidden_size + 1, 1)

    def forward(self, ids, mask, fts):
        """Return one prediction per sample, shaped ``(batch, 1)``."""
        x = self.model(ids, mask)[0]
        y = self.fc(torch.cat((x[:, 0, :], fts), 1))
        return y

In [None]:
def test(model, dataloader, config):
    """Run ``model`` over ``dataloader`` and return all predictions.

    Args:
        model: module called as ``model(ids=..., mask=..., fts=...)``.
        dataloader: iterable of batches whose first three entries are
            ``ids``, ``mask`` and ``fts`` tensors.
        config: object exposing ``device``.

    Returns:
        1-D numpy array with one prediction per sample, in loader order;
        empty when the loader yields no batch.
    """
    model.eval()

    tbar = tqdm(dataloader, total=len(dataloader))

    preds = []
    with torch.no_grad():
        for data in tbar:
            # Index rather than unpack so batches carrying extra entries
            # (e.g. a target) are still accepted.
            ids = data[0].to(config.device)
            mask = data[1].to(config.device)
            fts = data[2].to(config.device)

            pred = model(ids=ids, mask=mask, fts=fts).view(-1)

            preds.append(pred.detach().cpu().numpy())

    # np.concatenate raises on an empty list, e.g. when the selected frame
    # has no rows.
    if not preds:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(preds)

In [None]:
print("Loading Data..: Start")
df_test, df_test_md, df_test_py, fts_test = preprocess(config)

# Pick the frame the model predicts on. Fail loudly on an unknown data_type
# instead of leaving ``df_testset`` undefined (or stale from an earlier run
# of this cell in the same kernel).
testsets_by_type = {"all": df_test, "md": df_test_md, "py": df_test_py}
if config.data_type not in testsets_by_type:
    raise ValueError(
        f"config.data_type must be one of {sorted(testsets_by_type)}, "
        f"got {config.data_type!r}"
    )
df_testset = testsets_by_type[config.data_type]

testset = NotebookDataset(df_testset, fts=fts_test, config=config)
# Keep loader order fixed: predictions are written back positionally.
testloader = DataLoader(
    testset,
    batch_size=config.batch_size,
    shuffle=False,
    num_workers=config.num_workers,
    drop_last=False,
)
print("Loading Data..: Done!")

In [None]:
# One frame of predicted cell orders per checkpoint, consumed by the ensemble cell.
dfs, num_models = [], len(MODEL_PATHS)
for i, model_path in enumerate(MODEL_PATHS, start=1):
    # get_model() reads config.working_dir / config.base_model /
    # config.prev_model, so point those fields at this iteration's
    # checkpoint; otherwise ``model_path`` is never used and every iteration
    # would load the same weights.
    checkpoint = Path(model_path)
    config.base_model = str(checkpoint.parent)
    config.prev_model = checkpoint.name

    print(f"Loading Model {i} / {num_models}..: Start")
    # get_model() returns a single module, not a (tokenizer, model) pair.
    model = get_model(config)
    print(f"Loading Model {i} / {num_models}..: Done!")

    print(f"Testing Model {i} / {num_models}..: Start")
    y_test = test(model, testloader, config)

    # Overwrite the initial predictions only for the rows the model scored.
    if config.data_type == "all":
        df_test["pred"] = y_test
    elif config.data_type == "md":
        df_test.loc[df_test["cell_type"] == "markdown", "pred"] = y_test
    elif config.data_type == "py":
        df_test.loc[df_test["cell_type"] == "code", "pred"] = y_test

    # Order the cells of every notebook by predicted position.
    df_pred = (
        df_test.sort_values("pred")
        .groupby("id")["cell_id"]
        .apply(lambda x: " ".join(x))
        .reset_index()
        .rename(columns={"cell_id": "cell_order"})
    )

    dfs.append(df_pred)
    print(f"Testing Model {i} / {num_models}..: Done!")

In [None]:
print("Ensemble..: Start")
df_1 = dfs[0]
num_dfs = len(dfs)
len_df_1 = len(df_1)


def _scaled_positions(cell_order):
    """Map each cell id of a space-separated order to ``position / num_dfs``."""
    return {
        cell: position / num_dfs  # TBD weighted based on performance?
        for position, cell in enumerate(cell_order.split(" "))
    }


# Average each cell's position over all models, row by row, then re-sort.
# Rows are matched by position, so every frame must list notebooks in the
# same order.
ensembled_order = []
for row in range(len_df_1):
    scores = _scaled_positions(df_1.iloc[row]["cell_order"])
    for other in dfs[1:]:
        other_scores = _scaled_positions(other.iloc[row]["cell_order"])
        for cell in scores:
            scores[cell] += other_scores[cell]
    ranked = sorted(scores.items(), key=lambda pair: pair[1])
    ensembled_order.append(" ".join(cell for cell, _ in ranked))

# ``df_1`` is the first model's frame; its ``cell_order`` is replaced in place.
df_1["cell_order"] = ensembled_order

df_1.to_csv("submission.csv", index=False)
print("Ensemble..: Done!")