In [15]:
import wget
import os
import pandas as pd
import json

In [16]:
# data prep for multilingual tokenizer training
bigc_train_json_url = "https://raw.githubusercontent.com/csikasote/bigc/main/data/bem/splits/train.jsonl"
bemba_speech_url = "https://raw.githubusercontent.com/csikasote/BembaSpeech/refs/heads/master/bem/train.tsv"
alffa_fon_url = "https://raw.githubusercontent.com/besacier/ALFFA_PUBLIC/refs/heads/master/ASR/FONGBE/data/train/text"
ffr_url = "https://raw.githubusercontent.com/bonaventuredossou/ffr-v1/refs/heads/master/FFR-Dataset/FFR Dataset v2/ffr_dataset_v2.txt"
# local paths
ffstc_path = "../../mymy/train.csv"

temp_dir = "temp"
os.makedirs(temp_dir, exist_ok=True)
bigc_train_json_path = os.path.join(temp_dir, "bem_train.jsonl")
bemba_speech_path = os.path.join(temp_dir, "bemba_speech.tsv")
alffa_fon_path = os.path.join(temp_dir, "alffa_fon.txt")
ffr_path = os.path.join(temp_dir, "ffr.txt")

if not os.path.exists(bigc_train_json_path):
    wget.download(bigc_train_json_url, bigc_train_json_path)

if not os.path.exists(bemba_speech_path):
    wget.download(bemba_speech_url, bemba_speech_path)

if not os.path.exists(alffa_fon_path):
    wget.download(alffa_fon_url, alffa_fon_path)

if not os.path.exists(ffr_path):
    wget.download(ffr_url, ffr_path)

In [17]:
fon_sentence = "Àgɔ́! Ǹkɔ̀xɔ̀ wá, wɛ̀tɛ̀ ànù yì bó, ɖò lɛ̃̀, ɔ̃̀, ɛ̃́, ì, ú, ò, ɖɔ́, gbè, kpó, xù, ʋù, zã́. Sɛ́ wɛ́ ɖé ɖé, mí xó wɛ̀, é yà hùn dɔ̀ wɛ̃́. ʋɛ̀, ɖè, ɖɔ̀, mɛ̃̀, yì, gbɔ̀, sɔ̃̀, lɛ́ nɔ̀ ɖó lɛ́. Àɖó lɛ̀ wɛ̃̀ dɔ̀, ɔ̀kpà kpɛ́!"

In [19]:
bem_lines = []
en_lines = []
fra_lines = []
fon_lines = []
# load data
with open(bigc_train_json_path, "r") as f:
    lines = f.readlines()
    # extract bem_transcription and en_translation

    for line in lines:
        line = json.loads(line)
        bem_transcription = line["bem_transcription"].strip()
        en_translation = line["en_translation"].strip()

        if bem_transcription != "." and en_translation != "." and bem_transcription != "" and en_translation != "":
            bem_lines.append(bem_transcription)
            en_lines.append(en_translation)
        else :
            print(f"Skipping line with bem_transcription: {bem_transcription} and en_translation: {en_translation}")
    print(f"Loaded {len(bem_lines)} lines in bigc train data")
big_c_num = len(bem_lines)

# open ffstc
df = pd.read_csv(ffstc_path)
for i, row in df.iterrows():
    fr_translation = row["utterance"].strip()

    if fr_translation != "." and fr_translation != "":
        fra_lines.append(fr_translation)
print(f"Loaded {len(fra_lines)} lines in ffstc data")
ffstc_num = len(fra_lines)

# open bemba_speech
with open(bemba_speech_path, "r", encoding="utf-8") as f:
    bemba_speech_lines = f.readlines()
    for line in bemba_speech_lines[1:]:
        line = line.strip()
        if line != "":
            _, bem = line.split("\t")
            bem = bem.strip()
            # write
            if bem != "." and bem != "":
                bem_lines.append(bem)
print(f"Loaded {len(bem_lines)-big_c_num} lines in bemba_speech data")


# open alffa_fon
with open(alffa_fon_path, "r", encoding="utf-8") as f:
    alffa_fon_lines = f.readlines()
    for line in alffa_fon_lines:
        line = line.strip()
        if line != "":
            _, fon = line.split(' ', 1)  # Split on first space only
            fon = fon.strip()
            # write
            if fon != "." and fon != "":
                fon_lines.append(fon)
print(f"Loaded {len(fon_lines)} lines in alffa_fon data")
alffa_fon_num = len(fon_lines)

# open ffr
with open(ffr_path, "r", encoding="utf-8") as f:
    ffr_lines = f.readlines()
    for line in ffr_lines:
        line = line.strip()
        if line != "":
            parts = line.split('\t')
            if len(parts) == 2:
                fon, fra = parts
                fon = fon.strip()
                fra = fra.strip()
                # write
                if fon != "." and fon != "":
                    fon_lines.append(fon)
                if fra != "." and fra != "":
                    fra_lines.append(fra)
print(f"Loaded {len(fra_lines)-ffstc_num} lines in ffr data")

# add fon_sentence to fon_lines
fon_lines.append(fon_sentence)
# summary stats
print()
print(f"Total bem lines: {len(bem_lines)}")
print(f"Total en lines: {len(en_lines)}")
print(f"Total fr lines: {len(fra_lines)}")
print(f"Total fon lines: {len(fon_lines)}")


Skipping line with bem_transcription: Umulumendo umo nasenda umupila elo umunakwe namukoba mukulu. and en_translation: 
Skipping line with bem_transcription: Mumbali ya ici ici cimashini cipompa amenshi kuli  icikulwa. and en_translation: .
Skipping line with bem_transcription: Kabili umulumendo naikata pa kubeya kwakwe ilyo alekopwa icikope. and en_translation: .
Skipping line with bem_transcription: Elyo kabili naisala na amenso yakwe. and en_translation: .
Skipping line with bem_transcription: Abantu baleteya ubwangalo ubwakushelemuka pa meenshi makasa munshita yakasuba. and en_translation: .
Skipping line with bem_transcription: Umwana naikala elyo alelya. and en_translation: .
Skipping line with bem_transcription: Imbwa na ingombe  fili mwibala. and en_translation: .
Skipping line with bem_transcription: Shitaata ali mumuputule wakulilamo naikala petebulo. and en_translation: .
Skipping line with bem_transcription: Uyu umulumendo wa caice alepeluka ku ntambo iikakilwe ku mutanto u

In [20]:
# dump all data to temp file in temp dir
all_data_path = os.path.join(temp_dir, "all_data.txt")
with open(all_data_path, "w") as f:
    for bem, en, fr, fon in zip(bem_lines, en_lines, fra_lines, fon_lines):
        f.write(f"{bem}\n")
        f.write(f"{en}\n")
        f.write(f"{fr}\n")
        f.write(f"{fon}\n")

In [21]:
!cat temp/all_data.txt | wc -l

251951


In [None]:
# delete temp files
os.remove(bigc_train_json_path)
os.remove(all_data_path)
# remove temp dir
os.rmdir(temp_dir)