# Contents

- [`AI4Code - EDA & Baseline.ipynb`](#ai4code---eda--baselineipynb): Task description (_Contains TPU Explanation_) & Baseline before EDA
- [`Getting Started with AI4Code.ipynb`](#getting-started-with-ai4codeipynb): Just an ordinary competition baseline using XGBoost model
- [`AI4Code PyTorch - BERT Large + W&B.ipynb`](#ai4code-pytorch---bert-large--wbipynb): **_Useful_** baseline using Torch-based BERT model
- [`AI4Code Pytorch DistilBert Baseline.ipynb`](#ai4code-pytorch-distilbert-baselineipynb): **_Fully-modularized_** baseline using DistilBert model

### Additional links

- [Huggingface Tutorial](https://www.ohsuz.dev/22f4e8e7-64a3-4789-9dd2-171913883733)


# `AI4Code - EDA & Baseline.ipynb`


## Task description

- Visualization

  ![](https://storage.googleapis.com/kaggle-media/Images/notebook_cell_examples.png)

- Submission format

  ```csv
  id,cell_order
  0009d135ece78d,ddfd239c c6cd22db 1372ae9b ...
  0010483c12ba9b,54c7cab3 fe66203e 7844d5f8 ...
  0010a919d60e4f,aafc3d23 80e077ec b190ebb4 ...
  0028856e09c5b7,012c9d02 d22526d1 3ae7ece3 ...
  etc.
  ```


## Competition evaluation

- [Kendall tau correlation](https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient)

$$
K = 1 - 4 \frac{\sum_i S_{i}}{\sum_i n_i(n_i - 1)}
$$


In [None]:
from bisect import bisect

# Worst case is O(N^2) because of the list insertions, but fast in practice for our data
def count_inversions(a):
    """Return the number of index pairs (p, q) with p < q and a[p] > a[q].

    A sorted copy of the already-visited prefix is maintained; for every new
    value, the elements of that prefix that are strictly greater than it are
    exactly the inversions the value takes part in.
    """
    seen = []  # values visited so far, ascending
    total = 0
    for value in a:
        # bisect == bisect_right: number of earlier values <= value  (O(log N))
        pos = bisect(seen, value)
        # everything to the right of pos is strictly greater than value
        total += len(seen) - pos
        seen.insert(pos, value)  # O(N) shift keeps the prefix sorted
    return total


def calc_kendall_tau(ground_truth, predictions):
    """Compute the competition metric K = 1 - 4 * sum(S_i) / sum(n_i * (n_i - 1)).

    Args:
        ground_truth: iterable of correct orderings, one sequence of hashable
            items per instance.
        predictions: iterable of predicted orderings, aligned with
            ``ground_truth`` (paired with ``zip``, so extra entries on either
            side are ignored).

    Returns:
        The Kendall tau score as a float.

    Raises:
        ValueError: if a predicted item does not occur in its ground truth.
        ZeroDivisionError: if no instance has at least two items.
    """
    total_inversions = 0  # total inversions in predicted ranks across all instances
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        # Position of the first occurrence of every item, i.e. what
        # ``gt.index`` returns, but built once instead of scanning per lookup.
        position = {}
        for idx, item in enumerate(gt):
            position.setdefault(item, idx)
        try:
            # rank predicted order in terms of ground truth
            ranks = [position[x] for x in pred]
        except KeyError as err:
            # keep the exception type that ``list.index`` would have raised
            raise ValueError(f"{err.args[0]!r} is not in list") from None
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max


## File information

- `train/`

  ```json
  {
    "cell_type": {
      "5460dfe1": "code",
      "17d509b0": "code",
      "8861a30e": "code",
      "e2980b77": "markdown",
      "5bcf2a5e": "markdown",
      "5ca5f1a9": "markdown"
    },
    "source": {
      "5460dfe1": "!nvidia-smi",
      "17d509b0": "import numpy as np\n ...",
      "8861a30e": "NUM_CLASSES = 397\n ...",
      "e2980b77": "# Data",
      "5bcf2a5e": "original kernel\uff1ahttps://www.kaggle.com/kneroma/clean-fast-simple-bird-identifier-inferenceoriginal",
      "5ca5f1a9": "I tried efficinet B4 B5, but the effect did not improve compared with the original kernel"
    }
  }
  ```

- `train_orders.csv`: Same format as submission format

- `train_ancestors.csv`: Forking history of notebooks in the training dataset

  ```csv
  id,ancestor_id,parent_id
  00001756c60be8,945aea18,
  00015c83e2717b,aa2da37e,317b65d12af9df
  0001bdd4021779,a7711fde,
  ```

- `test/`: Same format as train dataset


## Accelerator setup on Kaggle notebooks


In [None]:
import tensorflow as tf

# Using TPU
# Pick a tf.distribute strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available"
    TPU = None

if TPU:
    print(f"\n... RUNNING ON TPU - {TPU.master()}...")
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    # NOTE(review): recent TensorFlow releases document the `experimental`
    # alias as deprecated in favour of `tf.distribute.TPUStrategy` -- confirm
    # against the installed version before switching.
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
else:
    print(f"\n... RUNNING ON CPU/GPU ...")
    # Yield the default distribution strategy in Tensorflow
    #   --> Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()


In [None]:
# What Is a Replica?
#    --> A single Cloud TPU device consists of FOUR chips, each of which has TWO TPU cores.
#    --> Therefore, for efficient utilization of Cloud TPU, a program should make use of each of the EIGHT (4x2) cores.
#    --> Each replica is essentially a copy of the training graph that is run on each core and
#        trains a mini-batch containing 1/8th of the overall batch size
# Number of replicas kept in sync by the strategy selected above.
N_REPLICAS = strategy.num_replicas_in_sync


## Data access on Kaggle notebooks


In [None]:
from kaggle_datasets import KaggleDatasets

# Choose where the competition data is read from, depending on the accelerator.
if TPU:
    # Google Cloud Storage path of the "AI4Code" dataset (notebook JSON files and CSVs)
    DATA_DIR = KaggleDatasets().get_gcs_path("AI4Code")
    # SavedModel save/load options that pin file I/O to "/job:localhost"
    save_locally = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")
    load_locally = tf.saved_model.LoadOptions(experimental_io_device="/job:localhost")
else:
    # Local path of the "AI4Code" dataset
    DATA_DIR = "/kaggle/input/AI4Code"
    # No special save/load options are used without a TPU
    load_locally = save_locally = None


## Use XLA opt on Kaggle notebooks

- **XLA** (Accelerated Linear Algebra) is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. The results are improvements in speed and memory usage.
- XLA compiles the TensorFlow graph into a sequence of computation kernels generated specifically for the given model. Because these kernels are unique to the model, they can exploit model-specific information for optimization.
  - Normally, each TensorFlow operation has a precompiled GPU/TPU kernel implementation that the TensorFlow executor dispatches to.


In [None]:
# Enable XLA optimizations globally via the JIT flag of the TensorFlow optimizer
# (10% speedup when using @tf.function calls)
tf.config.optimizer.set_jit(True)


## Baseline codes - Initialization


In [None]:
import os, glob
import pandas as pd
from IPython.display import display

# Get json directory paths
TRAIN_JSON_DIR = os.path.join(DATA_DIR, "train")
TEST_JSON_DIR = os.path.join(DATA_DIR, "test")

# Get all json file paths
# NOTE: `glob` is imported as a module (`import os, glob`), so the function
# has to be reached as `glob.glob`; calling the module itself raises TypeError.
TRAIN_JSON_PATHS = glob.glob(os.path.join(TRAIN_JSON_DIR, "*.json"), recursive=True)
TEST_JSON_PATHS = glob.glob(os.path.join(TEST_JSON_DIR, "*.json"), recursive=True)

# Get number of train and test files
N_TRAIN = len(TRAIN_JSON_PATHS)
N_TEST = len(TEST_JSON_PATHS)

# Get CSV filepaths
TRAIN_ANCESTORS_CSV = os.path.join(DATA_DIR, "train_ancestors.csv")
TRAIN_ORDERS_CSV = os.path.join(DATA_DIR, "train_orders.csv")
SS_CSV = os.path.join(DATA_DIR, "sample_submission.csv")

# Convert CSV into dataframe
train_ancestors_df = pd.read_csv(TRAIN_ANCESTORS_CSV)
train_orders_df = pd.read_csv(TRAIN_ORDERS_CSV)
ss_df = pd.read_csv(SS_CSV)

print("\n... TRAIN ANCESTORS DATAFRAME... \n")
display(train_ancestors_df)

print("\n... TRAIN ORDERS DATAFRAME... \n")
display(train_orders_df)

print("\n\n\n... ORIGINAL SUBMISSION DATAFRAME... \n")
display(ss_df)

# For debugging purposes: a 4-row sample submission is taken to mean that the
# test set hasn't been substituted yet
DEBUG = len(ss_df) == 4


print("\n... BASIC DATA SETUP FINISHED ...\n\n")


In [None]:
import json
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

# True when the pre-built train dataframe CSV is not present as an input file
FIRST_RUN = not os.path.isfile("/kaggle/input/ai4code-train-dataframe/train.csv")


def load_json_to_df(all_fpaths, do_parallel=True, n_jobs=-1):
    """Read notebook JSON files and stack them into one dataframe.

    Args:
        all_fpaths: sequence of paths to notebook ``.json`` files.
        do_parallel: when True, files are read through joblib workers;
            otherwise they are read one by one in this process.
        n_jobs: number of joblib workers used when ``do_parallel`` is True
            (``-1`` = all cores). A bare ``Parallel()`` uses a single job, which
            made the "parallel" path effectively sequential.

    Returns:
        A dataframe with one row per cell, the columns produced by
        ``pd.read_json`` plus ``cell_id`` (the former index) and ``id`` (file
        name without directory and extension), with a fresh integer index.

    Raises:
        ValueError: from ``pd.concat`` when ``all_fpaths`` is empty.
    """

    def _fpath_to_df(fpath):
        # One notebook -> one dataframe; the JSON's index becomes `cell_id`
        tmp_df = (
            pd.read_json(fpath, dtype={"cell_type": "category", "source": "str"})
            .reset_index()
            .rename({"index": "cell_id"}, axis=1)
        )
        # Notebook id = file name stripped of its extension and directory
        tmp_df["id"] = fpath.rsplit(".", 1)[0].rsplit("/", 1)[-1]
        return tmp_df

    progress = tqdm(all_fpaths, total=len(all_fpaths))
    if do_parallel:
        all_example_dfs = Parallel(n_jobs=n_jobs)(
            delayed(_fpath_to_df)(fpath) for fpath in progress
        )
    else:
        all_example_dfs = [_fpath_to_df(fpath) for fpath in progress]
    return pd.concat(all_example_dfs).reset_index(drop=True)


import csv  # csv.QUOTE_NONNUMERIC is used below but `csv` was never imported

# Build the train dataframe from the raw JSON files on the first run (and
# cache it as CSV); otherwise load the cached CSV.
if FIRST_RUN:
    print("\n... CREATING TRAIN AND TEST DATAFRAMES (20-30 MINUTES) ...\n")
    train_df = load_json_to_df(TRAIN_JSON_PATHS, do_parallel=False)
    train_df = train_df[["id", "cell_id", "cell_type", "source"]]
    train_df.to_csv(
        "train.csv", index=False, encoding="utf-8", quoting=csv.QUOTE_NONNUMERIC
    )
else:
    print(
        "\n... LOADING TRAIN DATAFRAME AND CREATING TEST DATAFRAME (1-3 MINUTES) ...\n"
    )
    # keep_default_na=False: cell sources such as "nan"/"NA" must stay strings
    train_df = pd.read_csv(
        "/kaggle/input/ai4code-train-dataframe/train.csv", keep_default_na=False
    )

# The test dataframe is always rebuilt from JSON
test_df = load_json_to_df(TEST_JSON_PATHS)
test_df = test_df[["id", "cell_id", "cell_type", "source"]]

print("\n... ALL TRAIN EXAMPLES AS A DATAFRAME ...\n\n")
display(train_df)

print("\n\n\n\n... ALL TEST EXAMPLES AS A DATAFRAME ...\n\n")
# Show the test dataframe here (the train dataframe was displayed a second time before)
display(test_df)

print(
    "\n\n\n\n... VIEW THE ROWS THAT WERE PREVIOUSLY CAUSING PROBLEMS THAT CONTAIN NAN LIKE STRINGS ...\n\n"
)
# Hard-coded row positions in train_df; each row is compared with the value
# stored in its original JSON file
nan_weirdos = [2076836, 2915099, 3416950, 4260446]
for row_idx in nan_weirdos:
    _row = train_df.iloc[row_idx]
    # First JSON path whose name contains the notebook id
    with open([x for x in TRAIN_JSON_PATHS if _row["id"] in x][0]) as json_file:
        data = json.load(json_file)
        print(
            f"ROW INDEX: {row_idx}\nSOURCE IN OUR DATAFRAME        : ", _row["source"]
        )
        print(
            "SOURCE DIRECTLY FROM JSON FIL E: ", data["source"][_row["cell_id"]], "\n"
        )


## Baseline codes - Utils


In [None]:
from IPython.display import Code, Markdown, Pretty


def flatten_l_o_l(nested_list):
    """Return a single list holding the items of every sub-list, in order."""
    flat = []
    for sublist in nested_list:
        flat.extend(sublist)
    return flat


def display_markdown(markdown_str):
    """Show ``markdown_str`` rendered as Markdown in the output of a code cell."""
    rendered = Markdown(markdown_str)
    display(rendered)


def display_code(code_str):
    """Wrapper function to display a string as a code listing in the output of a code cell"""
    display(Code(code_str))


def get_ex_order(ex_id, orders_df=train_orders_df):
    """Return the ordered list of cell ids stored for notebook ``ex_id``.

    The first matching row of ``orders_df`` is used and its ``cell_order``
    string is split on whitespace. Raises IndexError when no row matches.
    """
    matching = orders_df.loc[orders_df["id"] == ex_id, "cell_order"]
    first_order = matching.values[0]
    return first_order.split()


def display_notebook(
    ex_id=None,
    df=train_df,
    show_ordered=True,
    render_markdown=True,
    order_df=train_orders_df,
):
    """Function to allow for visualization of a complete notebook.

    Args:
        ex_id: notebook id to show; a random id from ``df`` is drawn when None.
        df: dataframe of cells with ``id``, ``cell_id``, ``cell_type`` and
            ``source`` columns. NOTE: the default is bound when the function is
            defined, so a later rebinding of ``train_df`` is not picked up.
        show_ordered: show the cells in the order stored in ``order_df``
            instead of the order they have in ``df``.
        render_markdown: render markdown cells as Markdown; when False every
            cell is shown as a code listing.
        order_df: dataframe holding the correct cell order per notebook. It is
            now actually forwarded to ``get_ex_order`` (it used to be ignored).
    """

    # Get random ex_id if not provided
    if ex_id is None:
        ex_id = df["id"].sample(1).values[0]
    print(
        f"\n\n\n\n... INVESTIGATING AND VISUALIZING EXAMPLE {ex_id} –– CELLS WILL BE {'ORDERED' if show_ordered else 'UNORDERED'} ...\n\n\n\n"
    )

    # Get unordered subset of dataframe
    sub_df = df[df["id"] == ex_id].reset_index(drop=True)

    # Reorder the subset according to the stored cell order
    if show_ordered:
        cell_id_sorter = {
            c_id: i for i, c_id in enumerate(get_ex_order(ex_id, order_df))
        }
        sub_df["sorter"] = sub_df.cell_id.map(cell_id_sorter)
        sub_df = (
            sub_df.sort_values(by="sorter")
            .reset_index(drop=True)
            .drop(columns=["sorter"])
        )

    for i, (_, row) in enumerate(sub_df.iterrows()):
        print("\n\n")
        display_markdown("---")
        print()
        display_markdown(
            f"{'----- '+'CELL '+str(i+1)+' OF TYPE '+str(row.cell_type.upper())+' -----':^120}"
        )
        display_markdown("---")
        print()
        # Only markdown cells are rendered, and only when rendering is enabled
        if render_markdown and row["cell_type"] == "markdown":
            display_markdown(row["source"])
        else:
            display_code(row["source"])
        print()
        display_markdown("---")


## Baseline codes - Feature engineering

- Number of total cells
- Number of code cells
- Number of markdown cells
- Fraction of code cells
- Fraction of markdown cells
- Position of cell in notebook (ground truth if known)


In [None]:
def add_style_specific_counts(df=train_df):
    """Add per-notebook code/markdown cell counts to ``df`` (in place).

    Two columns are written: ``n_code_cells`` and ``n_markdown_cells``. For
    each row they hold how many rows of the same notebook ``id`` have a
    non-null ``source`` and ``cell_type`` "code" / "markdown" (0 if none).

    Args:
        df: dataframe with ``id``, ``cell_type`` and ``source`` columns.
            NOTE: the default is bound when the function is defined.

    Returns:
        The same dataframe object, with the two columns added.
    """
    # "<id>_<cell_type>" key per row; the cast lets a categorical cell_type be
    # concatenated like a plain string column
    type_key = df["id"] + "_" + df["cell_type"].astype(str)
    # The keys are already unique after the group-by, so the count can be
    # turned into a lookup table directly
    id_w_style_to_count = df.groupby(type_key)["source"].count().to_dict()

    for cell_type in ("code", "markdown"):
        # Vectorised lookup; notebooks without cells of this type get 0.
        # This no longer needs `tqdm.pandas()` to have registered
        # `progress_apply`.
        df[f"n_{cell_type}_cells"] = (
            (df["id"] + f"_{cell_type}")
            .map(id_w_style_to_count)
            .fillna(0)
            .astype("int64")
        )
    return df


def add_position_information(df=train_df, orders_df=train_orders_df):
    """Insert a ``cell_pos`` column with each cell's 0-based true position.

    The position is looked up by the pair (notebook ``id``, ``cell_id``).
    Looking up by ``cell_id`` alone lets a cell id that occurs in more than
    one notebook overwrite the position of the others.

    Args:
        df: dataframe of cells with ``id`` and ``cell_id`` columns and at
            least four columns in total (``cell_pos`` is inserted at column
            index 4). NOTE: the default is bound when the function is defined.
        orders_df: dataframe with ``id`` and a whitespace-separated
            ``cell_order`` string per notebook.

    Returns:
        The same dataframe object with ``cell_pos`` added; cells that are
        missing from ``orders_df`` get NaN.

    Raises:
        ValueError: from ``DataFrame.insert`` if ``cell_pos`` already exists.
    """
    pos_lookup = {
        (nb_id, c_id): pos
        for nb_id, cell_order in zip(orders_df["id"], orders_df["cell_order"])
        for pos, c_id in enumerate(cell_order.split())
    }
    cell_pos = pd.Series(
        [
            pos_lookup.get(key, float("nan"))
            for key in zip(df["id"], df["cell_id"])
        ],
        index=df.index,
    )
    df.insert(4, "cell_pos", cell_pos)
    return df


# Columns that describe a whole notebook rather than a single cell
_META_COLS = [
    "id",
    "n_total_cells",
    "n_code_cells",
    "n_markdown_cells",
    "code_fraction",
    "markdown_fraction",
]


def _add_composition_features(frame, with_position=False):
    """Add cell-count and code/markdown fraction columns to ``frame``.

    ``relative_position`` is only added when ``with_position`` is True, which
    requires a ``cell_pos`` column.
    """
    frame["n_total_cells"] = frame.groupby("id")["source"].transform("count")
    if with_position:
        frame["relative_position"] = (frame["cell_pos"] + 1) / frame["n_total_cells"]
    frame = add_style_specific_counts(frame)
    for kind in ("code", "markdown"):
        frame[f"{kind}_fraction"] = frame[f"n_{kind}_cells"] / frame["n_total_cells"]
    return frame


def _notebook_level_view(frame):
    """Return one row per notebook id, restricted to the meta columns."""
    first_rows = frame.drop_duplicates("id").reset_index(drop=True)
    return first_rows[_META_COLS]


# Train cells have a known position; test cells do not
train_df = add_position_information(train_df)
train_df = _add_composition_features(train_df, with_position=True)
test_df = _add_composition_features(test_df)

train_meta_df = _notebook_level_view(train_df)
test_meta_df = _notebook_level_view(test_df)

display(train_df.head())
display(test_df.head())


## Baseline codes - EDA

1. Investigate distribution
   - The lengths of the notebooks as described by total cell count, markdown cell count, and code cell count, all skew heavily towards shorter rather than longer.
     - n_total_cells has a mean of only **45.75** while the maximum value is **over 1000**
     - Similar patterns are seen within n_code_cells and n_markdown_cells
   - There are certain heuristic rules that always exist:
     - There are always at least 2 cells per notebook
     - There is always at least 1 code cell per notebook
     - There is always at least 1 markdown cell per notebook
   - Some observations can be gleaned by contrasting code and markdown cells
     - Code cells are more than twice as common as markdown cells (67% vs. 33%)
       - i.e. the average notebook is comprised of ~30.2 code cells and ~15.5 markdown cells (based on the average total notebook length of ~45.7 cells).
     - Some notebooks are almost entirely code cells (a fraction of 99.7% of cells)
     - Some notebooks are almost entirely markdown cells (a fraction of 98.8% of cells)


In [None]:
import plotly.express as px

# Summary statistics of the notebook-level features
display(train_meta_df.describe().T)

# (columns to plot, figure title, extra keyword arguments) for each histogram
_HISTOGRAM_SPECS = [
    (
        ["n_total_cells"],
        "<b>Number of Total Cells Per Notebook  <sub><i>Log Y-Axis</i></sub></b>",
        {},
    ),
    (
        ["n_code_cells", "n_markdown_cells"],
        "<b>Number of Markdown v. Code Cells Per Notebook  <sub><i>Log Y-Axis</i></sub></b>",
        {"barmode": "overlay"},
    ),
    (
        ["code_fraction", "markdown_fraction"],
        "<b>Distribution of Fractional Composition of Markdown v. Code Cells Per Notebook  <sub><i>Log Y-Axis</i></sub></b>",
        {"barmode": "overlay"},
    ),
]

# All histograms share bin count, violin marginal, log y-axis and hidden legend
for _columns, _title, _extra in _HISTOGRAM_SPECS:
    fig = px.histogram(
        train_meta_df,
        _columns,
        title=_title,
        nbins=200,
        marginal="violin",
        log_y=True,
        **_extra,
    )
    fig.update_layout(showlegend=False)
    fig.show()


2. Investigate lengths of individual cell blocks


In [None]:
# Character count of every cell's source text.
# NOTE(review): `progress_apply` only exists on pandas objects after
# `tqdm.pandas()` has been called; no such call is visible here -- confirm.
train_df["n_cell_chars"] = train_df["source"].progress_apply(len)
# Summary statistics of the numeric columns (shown as the cell output)
train_df.describe().T


3. Fix base64 image blocks


In [None]:
import re


def remove_long_useless_strs(src):
    """Replace long embedded blobs in a cell source with ``"(replaced)"``.

    For each (opening, closing) delimiter pair, every shortest span that
    starts with the opening delimiter and ends with the closing one is
    replaced as a whole, delimiters included. The pairs are applied in order,
    and spans may run across line breaks.

    Args:
        src: source text of one cell.

    Returns:
        The text with all matching spans replaced.
    """
    # Regex fragments, written as raw strings so that `\)` and `\"` are not
    # treated as (invalid) string escape sequences. `\"` matches a double
    # quote and `\)` matches a closing parenthesis.
    delim_pairs = [
        (r";base64,", r'\"'),
        (r";base64,", r"\)"),
        (r"weight = b'", r"'"),
        (r"PARAM = b'", r"'"),
    ]

    for opening, closing in delim_pairs:
        # non-greedy `.*?` + DOTALL: shortest match, newlines included
        src = re.sub(f"{opening}.*?{closing}", "(replaced)", src, flags=re.DOTALL)
    return src


# Strip the embedded blobs from every cell, then recompute the character counts.
# NOTE(review): `progress_apply` only exists on pandas objects after
# `tqdm.pandas()` has been called; no such call is visible here -- confirm.
train_df["source"] = train_df["source"].progress_apply(remove_long_useless_strs)
train_df["n_cell_chars"] = train_df["source"].progress_apply(len)
# Summary statistics after the clean-up (shown as the cell output)
train_df.describe().T


## Modelling

- Leverage `TFIDF` and `XGBRanker`


In [None]:
## CREATE TRAIN/VAL SPLITS
from sklearn.model_selection import GroupShuffleSplit

# Apply merge: attach each notebook's ancestor_id to its cells
train_df = pd.merge(
    train_df, train_ancestors_df[["id", "ancestor_id"]], on="id", how="left"
)

VAL_FRAC = 0.1  # fraction of groups held out for validation
FEAT_COLS = [
    "id",
    "cell_id",
    "cell_type",
    "source",
    "n_total_cells",
    "n_code_cells",
    "n_markdown_cells",
    "code_fraction",
    "markdown_fraction",
    "n_cell_chars",
]
LABEL_COLS = ["cell_pos"]  # or relative position
GROUP_COLS = ["ancestor_id"]
g_splitter = GroupShuffleSplit(n_splits=1, test_size=VAL_FRAC, random_state=0)

# Split, keeping notebooks with a common origin (ancestor_id) together.
# The splitter yields positional row indices into train_df.
train_ids, val_ids = next(
    g_splitter.split(
        train_df[FEAT_COLS], train_df[LABEL_COLS], groups=train_df[GROUP_COLS]
    )
)
# (A stray `ids_train, ids_valid = ids[ids_train], ids[ids_valid]` line was
# removed here: none of those names is defined in this notebook.)

# val_df must be taken before train_df is overwritten with its own subset
val_df = train_df.iloc[val_ids].reset_index(drop=True)
train_df = train_df.iloc[train_ids].reset_index(drop=True)


In [None]:
## CONVERT TO CUDF TO FREE MEMORY AND PREPARE FOR MODELLING
import gc
import cudf, cupy, cuml

# Replace both pandas dataframes with their cudf counterparts
val_df = cudf.from_pandas(val_df)
train_df = cudf.from_pandas(train_df)
# Run the garbage collector (twice) now that the pandas objects are unreferenced
gc.collect()
gc.collect()

# Show the converted train dataframe as the cell output
train_df


# `Getting Started with AI4Code.ipynb`


In [None]:
## Setup
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

# Widen pandas' console output so long cell sources stay readable
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# Root directory of the competition data (relative to the notebook)
data_dir = Path("../input/AI4Code")


In [None]:
## Load data
# Upper bound on the number of training notebooks read from disk
NUM_TRAIN = 10000


def read_notebook(path):
    """Load one notebook JSON file into a dataframe of its cells.

    The frame is indexed by cell id (index named ``cell_id``) and carries an
    extra ``id`` column holding the file name without its extension.
    """
    cells = pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
    cells = cells.assign(id=path.stem)
    return cells.rename_axis("cell_id")


# Read at most NUM_TRAIN notebook files from the train directory
paths_train = list((data_dir / "train").glob("*.json"))[:NUM_TRAIN]
notebooks_train = [read_notebook(path) for path in tqdm(paths_train, desc="Train NBs")]
# Stack all notebooks and index the cells by (id, cell_id); sorting only on the
# `id` level keeps the cells of each notebook in the order they were read
df = (
    pd.concat(notebooks_train)
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
)

# Show the combined dataframe as the cell output
df


In [None]:
# Pick the notebook at position 6 among the unique ids as a running example
nb_id = df.index.unique("id")[6]
print("Notebook:", nb_id)

print("The disordered notebook:")
# All cells of that notebook, in the order stored in df
nb = df.loc[nb_id, :]
display(nb)
print()


In [None]:
## Order the cells
# `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the single
# remaining column after the read yields the same Series indexed by notebook id.
df_orders = (
    pd.read_csv(data_dir / "train_orders.csv", index_col="id")
    .squeeze("columns")
    .str.split()  # Split the string representation of cell_ids into a list
)

# Correct cell order of the example notebook
cell_order = df_orders.loc[nb_id]

# Re-index the example notebook by that order (shown as the cell output)
nb.loc[cell_order, :]


In [None]:
## Order the cells in an alternative way
def get_ranks(base, derived):
    """Return, for every item of ``derived``, its first position in ``base``.

    Raises ValueError if an item of ``derived`` is not contained in ``base``.
    """
    ranks = []
    for item in derived:
        ranks.append(base.index(item))
    return ranks


# Rank of every cell of the example notebook within the correct order
cell_ranks = get_ranks(cell_order, list(nb.index))
# Store the ranks as the first column of the example notebook
nb.insert(0, "rank", cell_ranks)

nb.sort_values("rank")  # sort by rank


In [None]:
# Pair each notebook's correct order with the list of its cell ids as stored in
# df; the right join keeps only the notebooks that are present in df
df_orders_ = df_orders.to_frame().join(
    df.reset_index("cell_id").groupby("id")["cell_id"].apply(list),
    how="right",
)

# Per notebook: the stored cell ids and the rank of each one in the correct order
ranks = {
    id_: {"cell_id": cell_id, "rank": get_ranks(cell_order, cell_id)}
    for id_, cell_order, cell_id in df_orders_.itertuples()
}

# One row per (id, cell_id) holding the cell's rank
df_ranks = (
    pd.DataFrame.from_dict(ranks, orient="index")
    .rename_axis("id")
    .apply(pd.Series.explode)
    .set_index("cell_id", append=True)
)

# Show the ranks as the cell output
df_ranks


In [None]:
## Split
from sklearn.model_selection import GroupShuffleSplit

# Forking history of the notebooks, indexed by notebook id
df_ancestors = pd.read_csv(data_dir / "train_ancestors.csv", index_col="id")

NVALID = 0.1  # size of validation set
splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

# Split, keeping notebooks with a common origin (ancestor_id) together
ids = df.index.unique("id")
ancestors = df_ancestors.loc[ids, "ancestor_id"]
# The splitter returns positions into `ids` ...
ids_train, ids_valid = next(splitter.split(ids, groups=ancestors))
# ... which are converted to the notebook ids themselves
ids_train, ids_valid = ids[ids_train], ids[ids_valid]

# Cell-level frames of the two partitions
df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]


In [None]:
## Feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features of the cell sources, fitted on the training cells only
tfidf = TfidfVectorizer(min_df=0.01)
X_train = tfidf.fit_transform(df_train["source"].astype(str))

# Rank of each cell within the notebook
y_train = df_ranks.loc[ids_train].to_numpy()

# Number of cells in each notebook
groups = df_ranks.loc[ids_train].groupby("id").size().to_numpy()

# Add code cell ordering: code cells get their 1-based running index among the
# code cells of their notebook, every other cell gets 0
X_train = sparse.hstack(
    (
        X_train,
        np.where(
            df_train["cell_type"] == "code",
            df_train.groupby(["id", "cell_type"]).cumcount().to_numpy() + 1,
            0,
        ).reshape(-1, 1),
    )
)
print(X_train.shape)


In [None]:
## Train
from xgboost import XGBRanker

# Ranking model; `group` passes the number of cells of each notebook
model = XGBRanker(
    min_child_weight=10,
    subsample=0.5,
    tree_method="hist",
)
model.fit(X_train, y_train, group=groups)


In [None]:
## Evaluate
# Validation cells are transformed with the vectorizer fitted on the train cells
X_valid = tfidf.transform(df_valid["source"].astype(str))
y_valid = df_orders.loc[ids_valid]  # The metric uses cell ids

# Same extra "code cell ordering" column as for the training features
X_valid = sparse.hstack(
    (
        X_valid,
        np.where(
            df_valid["cell_type"] == "code",
            df_valid.groupby(["id", "cell_type"]).cumcount().to_numpy() + 1,
            0,
        ).reshape(-1, 1),
    )
)

# Predicted rank score per cell, indexed like df_valid
y_pred = pd.DataFrame({"rank": model.predict(X_valid)}, index=df_valid.index)
y_pred = (
    y_pred.sort_values(["id", "rank"])  # Sort the cells in each notebook by their rank.
    # The cell_ids are now in the order the model predicted.
    .reset_index("cell_id")  # Convert the cell_id index into a column.
    .groupby("id")["cell_id"]
    .apply(list)  # Group the cell_ids for each notebook into a list.
)


In [None]:
## Metric
# Cell ids of each validation notebook in the order they are stored in df_valid
y_dummy = df_valid.reset_index("cell_id").groupby("id")["cell_id"].apply(list)
# NOTE(review): this scores the stored (unpredicted) order, not `y_pred` --
# presumably a baseline to compare the model against; confirm.
calc_kendall_tau(y_valid, y_dummy)


In [None]:
## Submission
# Load the test notebooks the same way as the training notebooks
paths_test = list((data_dir / "test").glob("*.json"))
notebooks_test = [read_notebook(path) for path in tqdm(paths_test, desc="Test NBs")]
df_test = (
    pd.concat(notebooks_test)
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
)

# Same features as for training: TF-IDF plus the code-cell running index
X_test = tfidf.transform(df_test["source"].astype(str))
X_test = sparse.hstack(
    (
        X_test,
        np.where(
            df_test["cell_type"] == "code",
            df_test.groupby(["id", "cell_type"]).cumcount().to_numpy() + 1,
            0,
        ).reshape(-1, 1),
    )
)

# Sort the cells of every notebook by predicted rank and collect their ids
y_infer = pd.DataFrame({"rank": model.predict(X_test)}, index=df_test.index)
y_infer = (
    y_infer.sort_values(["id", "rank"])
    .reset_index("cell_id")
    .groupby("id")["cell_id"]
    .apply(list)
)

# `read_csv(squeeze=True)` was removed in pandas 2.0; squeeze after reading
y_sample = pd.read_csv(
    data_dir / "sample_submission.csv", index_col="id"
).squeeze("columns")

y_submit = (
    y_infer.apply(" ".join)  # list of ids -> string of ids
    .rename_axis("id")
    .rename("cell_order")
)
y_submit.to_csv("submission.csv")


# `AI4Code PyTorch - BERT Large + W&B.ipynb`


In [None]:
## Setup model
from pathlib import Path
import transformers
from torch.cuda.amp import GradScaler


class Config:
    # Central settings for the BERT baseline; everything is a class attribute,
    # so the tokenizer and the gradient scaler are created when the class is defined.
    NB_EPOCHS = 2  # number of training epochs
    LR = 3e-4  # learning rate handed to the optimizer
    T_0 = 20  # T_0 of the CosineAnnealingWarmRestarts scheduler
    η_min = 1e-4  # eta_min of the CosineAnnealingWarmRestarts scheduler
    MAX_LEN = 120  # tokenizer max_length (inputs are padded/truncated to it)
    TRAIN_BS = 16  # training batch size
    VALID_BS = 16  # validation batch size
    MODEL_NAME = "bert-large-uncased"  # checkpoint used for tokenizer and model
    data_dir = Path("../input/AI4Code")  # root directory of the competition data
    TOKENIZER = transformers.BertTokenizer.from_pretrained(
        MODEL_NAME, do_lower_case=True
    )
    scaler = GradScaler()  # gradient scaler used in the training loop
    wandb = True  # switch for Weights & Biases logging


In [None]:
## Setup WandB
import wandb

# Run metadata sent to Weights & Biases. Values that exist on Config are read
# from it so that the logged configuration cannot drift from the one used.
WANDB_CONFIG = {
    "TRAIN_BS": Config.TRAIN_BS,
    "VALID_BS": Config.VALID_BS,
    "N_EPOCHS": Config.NB_EPOCHS,
    "ARCH": Config.MODEL_NAME,
    "MAX_LEN": Config.MAX_LEN,
    "LR": Config.LR,
    "NUM_WORKERS": 8,
    "OPTIM": "AdamW",
    "LOSS": "MSELoss",
    "DEVICE": "cuda",
    "T_0": Config.T_0,
    "η_min": Config.η_min,
    "infra": "Kaggle",
    "competition": "ai4code",
    "_wandb_kernel": "tanaym",
}

if Config.wandb:
    from kaggle_secrets import UserSecretsClient

    # The API key is read from the Kaggle secret named "WANDB_API_KEY"
    user_secrets = UserSecretsClient()
    wb_key = user_secrets.get_secret("WANDB_API_KEY")

    wandb.login(key=wb_key)

    run = wandb.init(
        project="pytorch",
        config=WANDB_CONFIG,
        group="nlp",
        job_type="train",
    )


def wandb_log(**kwargs):
    """Log the given key-value pairs to W&B in a single call.

    All pairs are sent together so that they are recorded at the same step;
    logging them one call per key advanced the step for every key. Nothing is
    logged when no pairs are given.
    """
    if kwargs:
        wandb.log(kwargs)


In [None]:
## Preprocessing
NUM_TRAIN = 15000  # upper bound on the number of training notebooks read

paths_train = list((Config.data_dir / "train").glob("*.json"))[:NUM_TRAIN]
notebooks_train = [read_notebook(path) for path in tqdm(paths_train, desc="Train NBs")]
# Cells of all notebooks, indexed by (id, cell_id)
df = (
    pd.concat(notebooks_train)
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
)

# Correct cell order per notebook as a list of cell ids.
# `read_csv(squeeze=True)` was removed in pandas 2.0; squeeze after reading.
df_orders = (
    pd.read_csv(Config.data_dir / "train_orders.csv", index_col="id")
    .squeeze("columns")
    .str.split()
)

# Pair the correct order with the stored cell ids of the loaded notebooks
df_orders_ = df_orders.to_frame().join(
    df.reset_index("cell_id").groupby("id")["cell_id"].apply(list),
    how="right",
)

ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {"cell_id": cell_id, "rank": get_ranks(cell_order, cell_id)}

# One row per (id, cell_id) holding the cell's rank
df_ranks = (
    pd.DataFrame.from_dict(ranks, orient="index")
    .rename_axis("id")
    .apply(pd.Series.explode)
    .set_index("cell_id", append=True)
)

df_ancestors = pd.read_csv(Config.data_dir / "train_ancestors.csv", index_col="id")
# Flatten the index and attach rank and ancestry columns to every cell
df = (
    df.reset_index()
    .merge(df_ranks, on=["id", "cell_id"])
    .merge(df_ancestors, on=["id"])
)
# Regression target: rank divided by the number of cells in the notebook
df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")


In [None]:
NVALID = 0.1  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

# Row-level split that keeps all cells sharing an ancestor_id on the same side
train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))

train_df = df.loc[train_ind].reset_index(drop=True)
val_df = df.loc[val_ind].reset_index(drop=True)

# Markdown cells only; these are the frames the datasets below are built from
train_df_mark = train_df[train_df["cell_type"] == "markdown"].reset_index(drop=True)
val_df_mark = val_df[val_df["cell_type"] == "markdown"].reset_index(drop=True)


In [None]:
import torch.nn as nn


class BERTLargeModel(nn.Module):
    """BERT encoder followed by dropout and a single-output linear head."""

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(Config.MODEL_NAME)
        self.drop = nn.Dropout(0.3)
        # Size the head from the loaded encoder instead of hard-coding 1024,
        # so a different Config.MODEL_NAME does not cause a shape mismatch.
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one value per input sequence.

        Args:
            ids: token ids.
            mask: attention mask.
            token_type_ids: segment ids.
        """
        # With return_dict=False the encoder returns a tuple; its second
        # element is used as the sequence representation.
        _, output = self.bert(
            ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False
        )
        output = self.drop(output)
        output = self.fc(output)
        return output


In [None]:
## Custom Dataset class
import torch
from torch.utils.data import Dataset, DataLoader


class AI4CodeDataset(Dataset):
    """Torch dataset that tokenizes the ``source`` of one dataframe row per item."""

    def __init__(self, df, is_test=False):
        # A fresh 0..n-1 index is built for the stored frame
        self.df = df.reset_index(drop=True)
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _as_long(values):
        """Wrap a list of ints in a ``torch.long`` tensor."""
        return torch.tensor(values, dtype=torch.long)

    def __getitem__(self, idx):
        """Return (ids, mask, token_type_ids) plus the target unless ``is_test``."""
        row = self.df.iloc[idx]

        # Single-sequence encoding, padded/truncated to Config.MAX_LEN
        encoded = Config.TOKENIZER.encode_plus(
            row["source"],
            None,
            add_special_tokens=True,
            max_length=Config.MAX_LEN,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        features = (
            self._as_long(encoded["input_ids"]),
            self._as_long(encoded["attention_mask"]),
            self._as_long(encoded["token_type_ids"]),
        )

        if self.is_test:
            return features

        # Training/validation items also carry the pct_rank target
        target = torch.tensor([row.pct_rank], dtype=torch.float)
        return features + (target,)


In [None]:
## Trainer class and optimizer-returning function
from torch.cuda.amp import autocast
from sklearn.metrics import mean_squared_error


class Trainer:
    """Bundles model, data loaders, losses, optimizer and scheduler for training."""

    def __init__(
        self,
        config,
        dataloaders,
        optimizer,
        model,
        wandb,
        loss_fns,
        scheduler,
        device="cuda:0",
    ):
        """
        Args:
            config: configuration object; stored on the instance.
            dataloaders: pair ``(train_loader, valid_loader)``.
            optimizer: optimizer stepped once per training batch.
            model: module called as ``model(ids=, mask=, token_type_ids=)``.
            wandb: truthy to log losses through ``wandb_log``.
            loss_fns: pair ``(train_loss_fn, valid_loss_fn)``.
            scheduler: LR scheduler stepped once per training batch.
            device: device the batches are moved to.
        """
        self.train_loader, self.valid_loader = dataloaders
        self.train_loss_fn, self.valid_loss_fn = loss_fns
        self.scheduler = scheduler
        self.optimizer = optimizer
        self.model = model
        self.wandb = wandb
        self.device = torch.device(device)
        self.config = config

    def train_one_epoch(self):
        """
        Trains the model for 1 epoch

        Returns:
            ``(train_preds, train_targets)`` as flat Python lists.
        """
        self.model.train()
        train_pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader))
        train_preds, train_targets = [], []
        # Pre-bind the per-batch names so the clean-up after the loop also
        # works when the loader yields no batch at all.
        outputs = targets = ids = mask = ttis = loss_itm = loss = None

        for _, cache in train_pbar:
            ids = self._convert_if_not_tensor(cache[0], dtype=torch.long)
            mask = self._convert_if_not_tensor(cache[1], dtype=torch.long)
            ttis = self._convert_if_not_tensor(cache[2], dtype=torch.long)
            targets = self._convert_if_not_tensor(cache[3], dtype=torch.float)

            # Only the forward pass and the loss run under autocast; the
            # backward pass and the optimizer step are kept outside of it, as
            # the PyTorch AMP documentation recommends.
            with autocast(enabled=True):
                outputs = self.model(ids=ids, mask=mask, token_type_ids=ttis).view(-1)
                loss = self.train_loss_fn(outputs, targets)

            loss_itm = loss.item()
            if self.wandb:
                wandb_log(train_batch_loss=loss_itm)
            train_pbar.set_description("loss: {:.2f}".format(loss_itm))

            # Scaled backward pass, then optimizer/scaler/scheduler updates
            Config.scaler.scale(loss).backward()
            Config.scaler.step(self.optimizer)
            Config.scaler.update()
            self.optimizer.zero_grad()
            self.scheduler.step()

            train_targets.extend(targets.cpu().detach().numpy().tolist())
            train_preds.extend(outputs.cpu().detach().numpy().tolist())

        # Tidy: drop the last batch's tensors before emptying the CUDA cache
        del outputs, targets, ids, mask, ttis, loss_itm, loss
        gc.collect()
        torch.cuda.empty_cache()

        return train_preds, train_targets

    @torch.no_grad()
    def valid_one_epoch(self):
        """
        Validates the model for 1 epoch

        Returns:
            ``(valid_preds, valid_targets)`` as flat Python lists.
        """
        self.model.eval()
        valid_pbar = tqdm(enumerate(self.valid_loader), total=len(self.valid_loader))
        valid_preds, valid_targets = [], []
        # Pre-bind so the clean-up below is safe for an empty loader
        outputs = targets = ids = mask = ttis = valid_loss = None

        for _, cache in valid_pbar:
            ids = self._convert_if_not_tensor(cache[0], dtype=torch.long)
            mask = self._convert_if_not_tensor(cache[1], dtype=torch.long)
            ttis = self._convert_if_not_tensor(cache[2], dtype=torch.long)
            targets = self._convert_if_not_tensor(cache[3], dtype=torch.float)

            outputs = self.model(ids=ids, mask=mask, token_type_ids=ttis).view(-1)
            valid_loss = self.valid_loss_fn(outputs, targets)

            if self.wandb:
                wandb_log(valid_batch_loss=valid_loss.item())

            valid_pbar.set_description(desc=f"val_loss: {valid_loss.item():.4f}")

            valid_targets.extend(targets.cpu().detach().numpy().tolist())
            valid_preds.extend(outputs.cpu().detach().numpy().tolist())

        # Tidy
        del outputs, targets, ids, mask, ttis, valid_loss
        gc.collect()
        torch.cuda.empty_cache()

        return valid_preds, valid_targets

    def fit(
        self,
        epochs: int = 10,
        output_dir: str = "/kaggle/working/",
        custom_name: str = "model.pth",
    ):
        """
        Low-effort alternative for doing the complete training and validation process

        The model is saved every time the validation MSE improves.

        Returns:
            The validation predictions of the best epoch, or None if no epoch
            improved on the initial (infinite) loss.
        """
        best_loss = float("inf")
        best_preds = None
        for epx in range(epochs):
            print(f"{'='*20} Epoch: {epx+1} / {epochs} {'='*20}")

            train_preds, train_targets = self.train_one_epoch()
            train_mse = mean_squared_error(train_targets, train_preds)
            print(f"Training loss: {train_mse:.4f}")

            valid_preds, valid_targets = self.valid_one_epoch()
            valid_mse = mean_squared_error(valid_targets, valid_preds)
            print(f"Validation loss: {valid_mse:.4f}")

            if self.wandb:
                wandb_log(train_mse=train_mse, valid_mse=valid_mse)

            if valid_mse < best_loss:
                best_loss = valid_mse
                best_preds = valid_preds
                self.save_model(output_dir, custom_name)
                print(f"Saved model with val_loss: {best_loss:.4f}")

        return best_preds

    def save_model(self, path, name, verbose=False):
        """
        Saves the model's state dict as ``name`` inside directory ``path``

        The directory is created when missing. A failure to create it is only
        reported; the subsequent save then raises if the directory is unusable.
        """
        try:
            os.makedirs(path, exist_ok=True)
        except OSError:
            print("Errors encountered while making the output directory")

        torch.save(self.model.state_dict(), os.path.join(path, name))
        if verbose:
            print(f"Model Saved at: {os.path.join(path, name)}")

    def _convert_if_not_tensor(self, x, dtype):
        """Return ``x`` as a tensor of ``dtype`` on the trainer's device."""
        if self._tensor_check(x):
            return x.to(self.device, dtype=dtype)
        else:
            return torch.tensor(x, dtype=dtype, device=self.device)

    def _tensor_check(self, x):
        """Tell whether ``x`` already is a ``torch.Tensor``."""
        return isinstance(x, torch.Tensor)


def yield_optimizer(model):
    """Build an AdamW optimizer with two parameter groups.

    Parameters whose name contains "bias", "LayerNorm.bias" or
    "LayerNorm.weight" get a weight decay of 0.0; every other parameter gets
    0.003. The learning rate is read from ``Config.LR``.
    """
    decay_exempt = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    with_decay, without_decay = [], []
    for param_name, param in model.named_parameters():
        is_exempt = any(marker in param_name for marker in decay_exempt)
        (without_decay if is_exempt else with_decay).append(param)

    grouped_parameters = [
        {"params": with_decay, "weight_decay": 0.003},
        {"params": without_decay, "weight_decay": 0.0},
    ]
    return transformers.AdamW(grouped_parameters, lr=Config.LR)


In [None]:
## Training
import platform

# Pick the compute device and report which one is in use.
if torch.cuda.is_available():
    print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
    DEVICE = torch.device("cuda:0")
else:
    print("\n[INFO] GPU not found. Using CPU: {}\n".format(platform.processor()))
    DEVICE = torch.device("cpu")

# Wrap the train/validation frames in datasets.
train_set = AI4CodeDataset(train_df_mark)
valid_set = AI4CodeDataset(val_df_mark)

# Only the training loader shuffles; validation keeps the frame order.
train_loader = DataLoader(
    train_set, batch_size=Config.TRAIN_BS, shuffle=True, num_workers=8
)

valid_loader = DataLoader(
    valid_set, batch_size=Config.VALID_BS, shuffle=False, num_workers=8
)

model = BERTLargeModel().to(DEVICE)
# NOTE(review): nb_train_steps is not read anywhere else in this cell.
nb_train_steps = int(len(train_df_mark) / Config.TRAIN_BS * Config.NB_EPOCHS)
optimizer = yield_optimizer(model)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=Config.T_0, eta_min=Config.η_min
)
# Separate (but identical) MSE criteria for the training and validation passes.
train_loss_fn, valid_loss_fn = nn.MSELoss(), nn.MSELoss()

if Config.wandb:
    wandb.watch(model, criterion=train_loss_fn)

trainer = Trainer(
    config=Config,
    dataloaders=(train_loader, valid_loader),
    loss_fns=(train_loss_fn, valid_loss_fn),
    optimizer=optimizer,
    model=model,
    scheduler=scheduler,
    wandb=Config.wandb,
)

# Runs the whole train/validate loop; the checkpoint goes to fit()'s default
# output_dir under the name given here.
best_pred = trainer.fit(epochs=Config.NB_EPOCHS, custom_name=f"ai4code_bert_large.bin")

# NOTE(review): `run` is not defined in this cell; presumably the W&B run
# object created earlier in the notebook. Confirm.
if Config.wandb:
    run.finish()


# `AI4Code Pytorch DistilBert Baseline.ipynb`


In [None]:
# Setup
import sys, os

import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

# Widen pandas' console output so long cell sources stay readable.
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# Location of the pretrained DistilBERT files; passed to `from_pretrained`
# for both the model and the tokenizer.
BERT_PATH = (
    "../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased"
)

# Root of the competition data (train/ and test/ notebooks plus the CSV files).
data_dir = Path("../input/AI4Code")
# Upper bound on the number of training notebook files that get loaded.
NUM_TRAIN = 10000


# Preprocessing
def read_notebook(path):
    """Load one notebook JSON file into a DataFrame.

    The frame's index is named "cell_id", and an "id" column holding the file
    stem of ``path`` is added to every row.
    """
    column_types = {"cell_type": "category", "source": "str"}
    cells = pd.read_json(path, dtype=column_types)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis("cell_id")


# Load at most NUM_TRAIN training notebooks.
paths_train = list((data_dir / "train").glob("*.json"))[:NUM_TRAIN]
notebooks_train = [read_notebook(path) for path in tqdm(paths_train, desc="Train NBs")]
# Stack the notebooks into one frame indexed by (id, cell_id), sorted on the
# notebook id level only.
df = (
    pd.concat(notebooks_train)
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
)

# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single-column frame afterwards yields the same Series.
df_orders = (
    pd.read_csv(
        data_dir / "train_orders.csv",
        index_col="id",
    )
    .squeeze("columns")
    .str.split()  # Split the string representation of cell_ids into a list
)

# Pair each notebook's ordered cell list with the cell ids as they appear in
# `df`; the right join keeps only the notebooks that were actually loaded.
df_orders_ = df_orders.to_frame().join(
    df.reset_index("cell_id").groupby("id")["cell_id"].apply(list),
    how="right",
)

# NOTE(review): `get_ranks` is defined outside this cell; presumably it returns
# the position of every cell_id within cell_order. Confirm.
ranks = {
    id_: {"cell_id": cell_id, "rank": get_ranks(cell_order, cell_id)}
    for id_, cell_order, cell_id in df_orders_.itertuples()
}

# One row per (id, cell_id) carrying that cell's rank.
df_ranks = (
    pd.DataFrame.from_dict(ranks, orient="index")
    .rename_axis("id")
    .apply(pd.Series.explode)
    .set_index("cell_id", append=True)
)

df_ancestors = pd.read_csv(data_dir / "train_ancestors.csv", index_col="id")

# Attach the rank and ancestor columns to every cell.
df = (
    df.reset_index()
    .merge(df_ranks, on=["id", "cell_id"])
    .merge(df_ancestors, on=["id"])
)

# Rank divided by the number of cells in the same notebook.
df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")

# Splitting
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

# Split by "ancestor_id" so that all rows of one group land on the same side.
train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))

# split() yields positional indices, so rows are selected by position; this
# stays correct even if `df` ever carries a non-default index.
train_df = df.iloc[train_ind].reset_index(drop=True)
val_df = df.iloc[val_ind].reset_index(drop=True)

# Only markdown cells are fed to the model.
train_df_mark = train_df[train_df["cell_type"] == "markdown"].reset_index(drop=True)
val_df_mark = val_df[val_df["cell_type"] == "markdown"].reset_index(drop=True)

# Model
from transformers import DistilBertModel, DistilBertTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F

# Passed as `max_len` to MarkdownDataset, which uses it as the tokenizer's
# `max_length` for padding and truncation.
MAX_LEN = 128


class MarkdownModel(nn.Module):
    """DistilBERT encoder followed by a single-output linear layer."""

    def __init__(self):
        super().__init__()
        self.distill_bert = DistilBertModel.from_pretrained(BERT_PATH)
        self.top = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Apply the linear layer to the encoder output at sequence position 0.

        ``ids`` and ``mask`` are handed to the encoder positionally, in that
        order.
        """
        encoder_outputs = self.distill_bert(ids, mask)
        hidden_states = encoder_outputs[0]
        first_position = hidden_states[:, 0, :]
        return self.top(first_position)


# Dataset
from torch.utils.data import DataLoader, Dataset


class MarkdownDataset(Dataset):
    """Dataset yielding tokenized cell sources with their ``pct_rank`` target.

    Each item is a tuple ``(input ids, attention mask, target)`` built from
    the ``source`` and ``pct_rank`` fields of one row of ``df``.
    """

    def __init__(self, df, max_len):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            BERT_PATH, do_lower_case=True
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]

        # Pad and truncate every source to exactly `max_len` tokens.
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(record.source, None, **encode_options)

        token_ids = torch.LongTensor(encoded["input_ids"])
        attention_mask = torch.LongTensor(encoded["attention_mask"])
        target = torch.FloatTensor([record.pct_rank])

        return token_ids, attention_mask, target


# Datasets over the markdown-only train/validation frames, tokenized to MAX_LEN.
train_ds = MarkdownDataset(train_df_mark, max_len=MAX_LEN)
val_ds = MarkdownDataset(val_df_mark, max_len=MAX_LEN)

# Optimizer
def adjust_lr(optimizer, epoch):
    """Set the learning rate of every parameter group for ``epoch``.

    Schedule: 5e-5 while ``epoch < 1``, 1e-3 while ``epoch < 2``, 1e-4 while
    ``epoch < 5`` and 1e-5 afterwards. Returns the rate that was applied.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((1, 5e-5), (2, 1e-3), (5, 1e-4))
    lr = next((rate for bound, rate in schedule if epoch < bound), 1e-5)

    for group in optimizer.param_groups:
        group["lr"] = lr
    return lr


def get_optimizer(net):
    """Create an Adam optimizer over the parameters of ``net`` that require gradients."""
    trainable = [param for param in net.parameters() if param.requires_grad]
    return torch.optim.Adam(trainable, lr=3e-4, betas=(0.9, 0.999), eps=1e-08)


# DataLoader
BS = 32  # batch size shared by every loader built from these constants
NW = 8  # number of DataLoader worker processes

# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(
    train_ds,
    batch_size=BS,
    shuffle=True,
    num_workers=NW,
    pin_memory=False,
    drop_last=True,
)
# Validation keeps the dataset order and every sample, so predictions can be
# written back row by row.
val_loader = DataLoader(
    val_ds,
    batch_size=BS,
    shuffle=False,
    num_workers=NW,
    pin_memory=False,
    drop_last=False,
)

# Training
def read_data(data):
    """Move a batch to the GPU and split it into ``(inputs, target)``.

    Every element but the last forms the ``inputs`` tuple; the last element
    is the target.
    """
    inputs = tuple(item.cuda() for item in data[:-1])
    target = data[-1].cuda()
    return inputs, target


def validate(model, val_loader):
    """Run ``model`` over ``val_loader`` without gradient tracking.

    Returns:
        A ``(labels, predictions)`` pair of flat NumPy arrays, concatenated
        over all batches in loader order.
    """
    model.eval()

    progress = tqdm(val_loader, file=sys.stdout)
    collected_preds, collected_labels = [], []

    with torch.no_grad():
        for batch in progress:
            features, target = read_data(batch)
            output = model(features[0], features[1])

            collected_preds.append(output.detach().cpu().numpy().ravel())
            collected_labels.append(target.detach().cpu().numpy().ravel())

    all_labels = np.concatenate(collected_labels)
    all_preds = np.concatenate(collected_preds)
    return all_labels, all_preds


def train(model, train_loader, val_loader, epochs):
    """Train ``model`` with MSE loss and validate after every epoch.

    The optimizer comes from ``get_optimizer`` and its learning rate is reset
    at the start of each epoch by ``adjust_lr``. The running mean of the
    training loss is shown on the progress bar, and the validation MSE is
    printed once per epoch.

    Args:
        model: Module called as ``model(ids, mask)``.
        train_loader: Loader yielding training batches.
        val_loader: Loader yielding validation batches.
        epochs: Number of epochs to run.

    Returns:
        ``(model, y_pred)`` where ``y_pred`` holds the validation predictions
        of the last epoch, or ``None`` when ``epochs`` is 0.
    """
    np.random.seed(0)

    optimizer = get_optimizer(model)

    criterion = torch.nn.MSELoss()

    # Stays None when no epoch runs, so the final return never hits an
    # unbound name.
    y_pred = None

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, e)

        # A running sum keeps the per-step average O(1) instead of
        # re-averaging the whole loss history on every batch.
        loss_total = 0.0

        for step, data in enumerate(tbar, start=1):
            inputs, target = read_data(data)

            optimizer.zero_grad()
            pred = model(inputs[0], inputs[1])

            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()

            loss_total += loss.item()
            avg_loss = np.round(loss_total / step, 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")

        y_val, y_pred = validate(model, val_loader)

        # NOTE(review): mean_squared_error is not imported in this cell;
        # presumably sklearn.metrics.mean_squared_error from an earlier one.
        print("Validation MSE:", np.round(mean_squared_error(y_val, y_pred), 4))
        print()
    return model, y_pred


# Build the model on the GPU and train it for a single epoch.
model = MarkdownModel()
model = model.cuda()
model, y_pred = train(model, train_loader, val_loader, epochs=1)

# Baseline position for every cell: the percentile of its true rank within
# its own notebook and cell type.
val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
# Markdown cells get the model's prediction instead; this relies on y_pred
# being in the row order of val_df's markdown rows (val_loader does not shuffle).
val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred

# Predicted cell order per notebook, scored against the true orders.
y_dummy = val_df.sort_values("pred").groupby("id")["cell_id"].apply(list)
calc_kendall_tau(df_orders.loc[y_dummy.index], y_dummy)


In [None]:
# Test & Submission
# Load every test notebook into one flat frame (same steps as for training).
paths_test = list((data_dir / "test").glob("*.json"))
notebooks_test = [read_notebook(path) for path in tqdm(paths_test, desc="Test NBs")]
test_df = (
    pd.concat(notebooks_test)
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
).reset_index()

# Running count of each cell within its notebook and cell type, turned into a
# percentile that serves as the default predicted position.
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

# Placeholder target: MarkdownDataset reads `pct_rank` from every row.
test_df["pct_rank"] = 0
test_ds = MarkdownDataset(
    test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), max_len=MAX_LEN
)
test_loader = DataLoader(
    test_ds,
    batch_size=BS,
    shuffle=False,
    num_workers=NW,
    pin_memory=False,
    drop_last=False,
)

# validate() returns (labels, predictions); the labels are the placeholder zeros.
_, y_test = validate(model, test_loader)
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

# Order the cells of each notebook by predicted position and join their ids
# with spaces.
sub_df = (
    test_df.sort_values("pred")
    .groupby("id")["cell_id"]
    .apply(lambda x: " ".join(x))
    .reset_index()
)
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)

sub_df.to_csv("submission.csv", index=False)
