In [25]:
import os
from pathlib import Path
import rootutils
from datasets import load_dataset
import pandas as pd

In [26]:
# Locate the project root by searching from the current working directory for
# the ".project-root" indicator; pythonpath=True asks rootutils to register it
# on the Python path.
rootutils.setup_root(Path.cwd(), indicator=".project-root", pythonpath=True)

# Resolve the root from the PROJECT_ROOT environment variable (presumably
# exported by setup_root above -- confirm), falling back to the working
# directory when the variable is not set.
ROOT_DIR = Path(os.environ.get("PROJECT_ROOT", Path.cwd()))

# Make sure both output directories exist before later cells write into them.
for subdir in ("raw", "parallel"):
    (ROOT_DIR / "data" / subdir).mkdir(parents=True, exist_ok=True)

In [27]:
# Fetch the Spivavtor dataset with the train and validation splits combined,
# then convert it to a DataFrame.
ds_spivavtor = load_dataset("grammarly/spivavtor", split="train+validation")
df_spivavtor = ds_spivavtor.to_pandas()

# Expose the "tgt" column under the name "target" and collect its values as a
# plain Python list.
spivavtor_targets = df_spivavtor.rename(columns={"tgt": "target"})["target"]
standard_spivavtor = spivavtor_targets.tolist()

print(f"Loaded {len(standard_spivavtor)} sentences from Spivavtor.")

Loaded 69760 sentences from Spivavtor.


In [28]:
print("Loading hutsul/hutsul-manually-annotated...")
ds = load_dataset("hutsul/hutsul-manually-annotated")

# Keep only the "source" and "target" columns of the train split.
hutsul_train = ds["train"].to_pandas()
df_hutsul = hutsul_train[["source", "target"]]

# The "target" column is what gets merged into the standard corpus later on.
standard_hutsul = df_hutsul["target"].tolist()

print(f"Loaded {len(standard_hutsul)} pairs from Hutsul manually annotated dataset.")

Loading hutsul/hutsul-manually-annotated...
Loaded 9852 pairs from Hutsul manually annotated dataset.
Loaded 9852 pairs from Hutsul manually annotated dataset.


In [29]:
# Pool the target sentences from both sources into a single list.
all_standard_sentences = standard_spivavtor + standard_hutsul

# Drop non-string and blank entries, then de-duplicate while keeping the
# first-seen order. dict.fromkeys preserves insertion order, whereas
# list(set(...)) produces an arbitrary order that changes between kernel runs
# (string hashing is randomised per process), which made the saved CSV differ
# on every re-run. The resulting set of sentences and its size are unchanged.
unique_standard = list(
    dict.fromkeys(
        sentence
        for sentence in all_standard_sentences
        if isinstance(sentence, str) and sentence.strip()
    )
)

print(f"Total unique Standard Ukrainian sentences: {len(unique_standard)}")

# Write the corpus as a single-column ("text") CSV without the index column.
output_path_std = ROOT_DIR / "data" / "raw" / "standard_ukrainian.csv"
df_std = pd.DataFrame({"text": unique_standard})
df_std.to_csv(output_path_std, index=False)
print(f"Saved Standard Ukrainian corpus to {output_path_std}")

Total unique Standard Ukrainian sentences: 77731
Saved Standard Ukrainian corpus to /Users/denys.koval/Labs/projects/surdo-perevodchik/data/raw/standard_ukrainian.csv


In [30]:
# Write the Hutsul DataFrame to data/parallel as CSV, omitting the index column.
output_path_hutsul = ROOT_DIR.joinpath("data", "parallel", "hutsul_parallel.csv")
df_hutsul.to_csv(output_path_hutsul, index=False)

print(f"Saved Hutsul parallel corpus to {output_path_hutsul}")

Saved Hutsul parallel corpus to /Users/denys.koval/Labs/projects/surdo-perevodchik/data/parallel/hutsul_parallel.csv
