In [1]:
import json
import os

import polars as pl

In [2]:
# Root folder of the project's data; every other path is derived from it.
data_path = "data"

# Folder holding the per-split CSV files, and the JSON file with the vocabulary.
tabular_data_path, vocabulary_path = (
    os.path.join(data_path, "tabular"),
    os.path.join(data_path, "vocabulary.json"),
)

In [3]:
# Names of the dataset partitions; each is read later from "<split>.csv"
# inside the tabular data folder.
splits = ["train", "validation", "test"]

# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default locale encoding (which differs across systems).
with open(vocabulary_path, encoding="utf-8") as f:
    vocabulary = json.load(f)
print(vocabulary)

['bland', 'crunchy', 'green', 'long', 'orange', 'peel', 'red', 'savory', 'seeds', 'skin', 'soft', 'sour', 'spherical', 'sweet', 'temperate', 'tropical', 'warm', 'yellow']


In [4]:
# Tokenise the free-text "description" column on single spaces; the resulting
# list-of-strings expression is shared by every vocabulary indicator below.
description_tokens = pl.col("description").str.split(" ")

# One expression per vocabulary word: a membership flag cast to Int8 and
# stored in a column named after the word itself.
multi_hot_expression = [
    description_tokens.list.contains(word).cast(pl.Int8).alias(word)
    for word in vocabulary
]

In [5]:
# Turn every split's CSV into its multi-hot encoded counterpart, written next
# to the input as "<split>_processed.csv".
for split in splits:
    dataset_path = os.path.join(tabular_data_path, f"{split}.csv")
    processed_data_path = os.path.join(tabular_data_path, f"{split}_processed.csv")
    print(f"accessing {dataset_path}")
    dataset = pl.read_csv(dataset_path)

    # Append one indicator column per vocabulary word, then discard the raw text.
    processed_dataset = dataset.with_columns(multi_hot_expression).drop("description")
    # Reorder the columns so that "class" ends up last.
    processed_dataset = processed_dataset.select(pl.exclude("class"), pl.col("class"))

    processed_dataset.write_csv(processed_data_path)

accessing data/tabular/train.csv
accessing data/tabular/validation.csv
accessing data/tabular/test.csv
