In [1]:
import json
import os

import polars as pl

In [2]:
# Dataset locations, expressed relative to the notebook's working directory.
data_path = "data"
tabular_data_path, vocabulary_path = (
    os.path.join(data_path, "tabular"),          # folder with the per-split CSV files
    os.path.join(data_path, "vocabulary.json"),  # JSON file holding the vocabulary list
)

In [3]:
# Dataset partitions; each one is processed separately by the loop below.
splits = ["train", "validation", "test"]

# Load the vocabulary (a JSON list of words). The encoding is pinned to UTF-8
# so decoding does not depend on the platform's default locale encoding
# (e.g. a legacy code page on Windows).
with open(vocabulary_path, encoding="utf-8") as f:
    vocabulary = json.load(f)
print("Vocabulary: ", vocabulary)

Vocabulary:  ['bland', 'crunchy', 'green', 'long', 'orange', 'peel', 'red', 'savory', 'seeds', 'skin', 'soft', 'sour', 'spherical', 'sweet', 'temperate', 'tropical', 'warm', 'yellow']


In [4]:
# The "description" column split on single spaces into a list of tokens.
# Polars expressions are lazy, reusable building blocks, so this one
# expression is shared by every per-word indicator below.
description_tokens = pl.col("description").str.split(" ")

# One expression per vocabulary word: an exact-token membership test on the
# description, cast from boolean to Int8 and named after the word itself.
multi_hot_expression = [
    description_tokens.list.contains(word).cast(pl.Int8).alias(word)
    for word in vocabulary
]

In [5]:
# Every split has one CSV per file type, named "<split>_<file_type>.csv".
file_types = ["images", "tabular", "text", "fusion"]

for split in splits:
    print(f"Processing split: {split}")

    for file_type in file_types:
        # Input file and the sibling "_processed" file it is written to.
        source_path = os.path.join(tabular_data_path, f"{split}_{file_type}.csv")
        target_path = os.path.join(tabular_data_path, f"{split}_{file_type}_processed.csv")

        dataset = pl.read_csv(source_path)
        processed = dataset

        # Only files that carry a free-text "description" column need the
        # multi-hot step: add one indicator column per vocabulary word, then
        # drop the raw text.
        if "description" in dataset.columns:
            print("Applying multi-hot encoding...")
            processed = processed.with_columns(multi_hot_expression).drop("description")

        # In every case, reorder the columns so that "class" comes last.
        processed = processed.select(pl.exclude("class"), pl.col("class"))

        processed.write_csv(target_path)
        print(f"Saved → {target_path}")

print("All processed CSV files saved successfully.")


Processing split: train
Saved → data\tabular\train_images_processed.csv
Saved → data\tabular\train_tabular_processed.csv
Applying multi-hot encoding...
Saved → data\tabular\train_text_processed.csv
Applying multi-hot encoding...
Saved → data\tabular\train_fusion_processed.csv
Processing split: validation
Saved → data\tabular\validation_images_processed.csv
Saved → data\tabular\validation_tabular_processed.csv
Applying multi-hot encoding...
Saved → data\tabular\validation_text_processed.csv
Applying multi-hot encoding...
Saved → data\tabular\validation_fusion_processed.csv
Processing split: test
Saved → data\tabular\test_images_processed.csv
Saved → data\tabular\test_tabular_processed.csv
Applying multi-hot encoding...
Saved → data\tabular\test_text_processed.csv
Applying multi-hot encoding...
Saved → data\tabular\test_fusion_processed.csv
All processed CSV files saved successfully.
