In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset

from pathlib import Path

In [None]:
# Directory the input CSV is read from
data_dir = Path("../data/interim/")
# Checkpoint name used to load the tokenizer
ckpt = "distilbert-base-uncased"

## Tokenize text

In [None]:
# Load the tokenizer that belongs to the checkpoint, requesting the fast implementation
tokenizer = AutoTokenizer.from_pretrained(ckpt, use_fast=True)

In [None]:
# Read the CSV, normalise column names to lower snake_case and discard
# every row that has a missing value (the original row labels are kept).
df = (
    pd.read_csv(data_dir / "wndp-api-ohe.csv")
    .rename(columns=lambda name: name.lower().replace(" ", "_"))
    .dropna()
)
df.head()

In [None]:
# (rows, columns) left after dropping incomplete rows
df.shape

In [None]:
# Take the first five texts and renumber them 0..4. After `dropna` the frame
# keeps its original row labels, so a label lookup such as `sample[3]` would
# raise KeyError whenever the row labelled 3 had been dropped.
sample = df.text.iloc[:5].reset_index(drop=True)
sample[3]

In [None]:
# Run the full tokenizer call on a single example text
tokenizer(sample[3])

In [None]:
# Break the same example into its token strings
tokens = tokenizer.tokenize(sample[3])
tokens

In [None]:
# Map each token string to its vocabulary id
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

In [None]:
# Round-trip: turn the manually converted ids back into text
tokenizer.decode(ids)

In [None]:
# Decode the ids produced by the full tokenizer call, for comparison with the
# round-trip of the manually converted ids.
# NOTE(review): presumably this also shows the special tokens the tokenizer adds — confirm
tokenizer.decode(tokenizer(sample[3])["input_ids"])

## Build the dataset

In [None]:
# Pick only the columns we need for the model.
# `drop` keeps the remaining columns in their original order (a set difference
# yields an arbitrary order that can change between kernel sessions), and
# errors="ignore" keeps the cell tolerant of a missing column and safe to re-run.
df = df.drop(columns=["patient_id", "terms"], errors="ignore")
df.head()

In [None]:
# Split dataset into train, val & test set.
# A fixed seed makes both shuffled splits reproducible across kernel restarts.
split_seed = 42

# First hold out 15% of all rows as the test set ...
_ds = Dataset.from_pandas(df).train_test_split(
    test_size=0.15,
    shuffle=True,
    seed=split_seed,
)
# ... then carve 20% of the remaining rows out as the validation set.
ds = _ds["train"].train_test_split(test_size=0.2, shuffle=True, seed=split_seed)
ds["val"] = ds.pop("test")
ds["test"] = _ds["test"]

In [None]:
# `Dataset.from_pandas` only materialises the pandas index as an
# "__index_level_0__" column when the index is not a plain RangeIndex (i.e. when
# `dropna` actually removed rows). Drop it only if it is present, so the cell
# fails neither on data without missing values nor on a re-run.
if "__index_level_0__" in ds["train"].column_names:
    ds = ds.remove_columns(column_names=["__index_level_0__"])

In [None]:
# Every column except the raw text is treated as a label column; keep the
# names in sorted order so the label vector layout is stable.
labels = sorted(ds["train"].column_names)
del labels[labels.index("text")]
labels

In [None]:
# Add labels to the dataset as float (pytorch expects float tensors)
def to_label_vector(row):
    """Return the row's label columns, in `labels` order, as a list of floats."""
    return {"labels": [float(row[name]) for name in labels]}

ds = ds.map(to_label_vector)

In [None]:
# Inspect the first training example
ds["train"][0]

Let us look at how long each text description is, measured in whitespace-separated words.

In [None]:
def word_count(text):
    """Number of whitespace-separated words in `text`; 0 when it is empty or None."""
    return len(text.split()) if text else 0

# Word count of every training text
text_stats = np.array([word_count(text) for text in ds["train"]["text"]])

In [None]:
# Distribution of word counts per training text. Title and axis labels let the
# figure stand on its own; the trailing `;` hides the returned object's repr.
ax = sns.histplot(text_stats, bins=20, kde=True)
ax.set(
    title="Words per text (train split)",
    xlabel="Number of words",
    ylabel="Number of texts",
);

In [None]:
# Fraction of training texts with fewer than 3 words
np.mean(text_stats < 3)

In [None]:
# Fraction of training texts with more than 128 words
np.mean(text_stats > 128)

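About 1% of the texts contain more than 128 words. Note that this counts whitespace-separated words, whereas the tokenizer's `max_length` used below counts tokens, so the share of texts that actually get truncated may differ.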

In [None]:
# Print every training text made up of fewer than 3 words
short_texts = (text for text in ds["train"]["text"] if len(text.split()) < 3)
for text in short_texts:
    print(text)

In [None]:
# Show the word count and content of every training text longer than 200 words
for text in ds["train"]["text"]:
    n_words = len(text.split())
    if n_words <= 200:
        continue
    print(n_words, text)
    print("-" * 20)

In [None]:
%%time
def tok_fn(row, max_length=128):
    """Tokenize the ``"text"`` entry, padding/truncating to a fixed length.

    Args:
        row: Mapping with a ``"text"`` entry. When this function is passed to
            ``Dataset.map(..., batched=True)`` the entry holds a batch of
            texts rather than a single one.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        The output of the module-level ``tokenizer`` for ``row["text"]``.
    """
    return tokenizer(row["text"],
                     truncation=True,
                     padding="max_length",
                     max_length=max_length)

# Tokenize every split in batches across 16 worker processes, dropping the raw
# text and the individual label columns from the result.
raw_columns = labels + ["text"]
tok_ds = ds.map(
    tok_fn,
    batched=True,
    num_proc=16,
    remove_columns=raw_columns,
)

In [None]:
# Overview of the tokenized splits
tok_ds

In [None]:
# Inspect the first tokenized training example
tok_ds["train"][0]

In [None]:
# Save the processed data in a parquet file (one file per split).
# Create the output directory first so a fresh checkout does not fail on write.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
for split, split_ds in tok_ds.items():
    split_ds.to_parquet(str(processed_dir / f"wndp-api-data-{split}.parquet"))

In [None]:
# List the contents of the processed-data directory
!ls ../data/processed/

In [None]:
%%time
# Map each split name to its saved parquet file and load them all back
data_files = {
    split: f"../data/processed/wndp-api-data-{split}.parquet"
    for split in ("train", "val", "test")
}

ds = load_dataset("parquet", data_files=data_files)