In [None]:
!pip install -q wget

In [None]:
import wget
import os
import pandas as pd
import json

In [None]:
# data prep for multilingual tokenizer training
#
# Collects monolingual sentence lists for Bemba (bem), English (en), French (fr)
# and Fon (fon) from four sources:
#   * the BigC train split (JSONL, downloaded once into `temp_dir`),
#   * the FFSTC train CSV (`utterance` column),
#   * two "source || target" parallel text files (bem/en and fon/fr).
bigc_train_json_url = "https://raw.githubusercontent.com/csikasote/bigc/main/data/bem/splits/train.jsonl"
ffstc_path = "/ocean/projects/cis210027p/gichamba/iwslt25/mymy/train.csv"
bem_en_path = "/ocean/projects/cis210027p/gichamba/iwslt25/bem_en.txt"
fon_fr_path = "/ocean/projects/cis210027p/gichamba/iwslt25/fon_fr.txt"

temp_dir = "temp"
os.makedirs(temp_dir, exist_ok=True)
bigc_train_json_path = os.path.join(temp_dir, "train.jsonl")

# Download only when the file is not already cached in temp_dir.
if not os.path.exists(bigc_train_json_path):
    wget.download(bigc_train_json_url, bigc_train_json_path)


def _capitalize_first(text):
    """Return `text` with its first character upper-cased; the rest is untouched.

    Slicing with `[:1]` keeps the function safe for an empty string.
    """
    return text[:1].upper() + text[1:]


def _is_usable(text):
    """Return True unless `text` is empty or consists of a lone "." placeholder."""
    return text not in ("", ".")


def _read_pairs(path, separator=" || "):
    """Read a parallel text file holding one `source<separator>target` pair per line.

    Args:
        path: Path of the UTF-8 text file to read.
        separator: String that splits a line into its two halves.

    Returns:
        A list of `(source, target)` tuples in file order. Blank lines are skipped.

    Raises:
        ValueError: If a non-blank line does not split into exactly two parts;
            the message names the file and the 1-based line number.
    """
    pairs = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            parts = line.split(separator)
            if len(parts) != 2:
                raise ValueError(
                    f"{path}:{line_no}: expected exactly one {separator!r} separator, got {line!r}"
                )
            pairs.append((parts[0], parts[1]))
    return pairs


bem_lines = []
en_lines = []
fr_lines = []
fon_lines = []

# load data
with open(bigc_train_json_path, "r", encoding="utf-8") as f:
    # extract bem_transcription and en_translation
    for raw_line in f:
        raw_line = raw_line.strip()
        if not raw_line:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        record = json.loads(raw_line)
        # `or ""` turns a JSON null into an empty string so it is skipped below
        bem_transcription = (record["bem_transcription"] or "").strip()
        en_translation = (record["en_translation"] or "").strip()

        if _is_usable(bem_transcription) and _is_usable(en_translation):
            # ensure sentence capitalization (the result must be stored: str is immutable)
            bem_lines.append(_capitalize_first(bem_transcription))
            en_lines.append(_capitalize_first(en_translation))
        else:
            print(f"Skipping line with bem_transcription: {bem_transcription} and en_translation: {en_translation}")
print(f"Loaded {len(bem_lines)} lines in bigc train data")
big_c_num = len(bem_lines)

# open ffstc
df = pd.read_csv(ffstc_path)
# dropna(): empty CSV cells are read as NaN (a float), which has no .strip()
for utterance in df["utterance"].dropna().astype(str):
    fr_translation = utterance.strip()

    if _is_usable(fr_translation):
        # ensure sentence capitalization
        fr_lines.append(_capitalize_first(fr_translation))
print(f"Loaded {len(fr_lines)} lines in ffstc data")

# open bem_en
for bem, en in _read_pairs(bem_en_path):
    bem_lines.append(bem)
    en_lines.append(en)
print(f"Loaded {len(bem_lines) - big_c_num} lines in bem_en data")

# open fon_fr
for fon, fr in _read_pairs(fon_fr_path):
    fon_lines.append(fon)
    fr_lines.append(fr)
print(f"Loaded {len(fon_lines)} lines in fon_fr data")

# summary stats
print(f"Total bem lines: {len(bem_lines)}")
print(f"Total en lines: {len(en_lines)}")
print(f"Total fr lines: {len(fr_lines)}")
print(f"Total fon lines: {len(fon_lines)}")


In [None]:
# dump all data to temp file in temp dir
#
# The four language lists have different lengths, so zip() would stop at the
# shortest one and silently drop the remainder. Instead, walk up to the longest
# list and write whatever each language still has at that index: the languages
# stay interleaved (bem, en, fr, fon) and no sentence is lost.
all_data_path = os.path.join(temp_dir, "all_data.txt")
corpora = (bem_lines, en_lines, fr_lines, fon_lines)
# explicit UTF-8 so the output does not depend on the platform's default encoding
with open(all_data_path, "w", encoding="utf-8") as f:
    for idx in range(max(len(corpus) for corpus in corpora)):
        for corpus in corpora:
            if idx < len(corpus):
                f.write(f"{corpus[idx]}\n")

In [None]:
!cat temp/all_data.txt | wc -l

In [None]:
# delete temp files
# Guarded with existence checks so the cell can be re-run (or run after a
# partial failure upstream) without raising FileNotFoundError.
for temp_file in (bigc_train_json_path, all_data_path):
    if os.path.exists(temp_file):
        os.remove(temp_file)
# remove temp dir
# os.rmdir only works on an empty directory; leave it alone (and say so) if
# something else was placed in it.
if os.path.isdir(temp_dir):
    if os.listdir(temp_dir):
        print(f"Leaving {temp_dir!r} in place: it still contains other files")
    else:
        os.rmdir(temp_dir)