In [4]:
import warnings
import os
import pandas as pd
import numpy as np
from utils import preprocess_arabic_text
from tqdm import tqdm

In [5]:
warnings.filterwarnings(action='ignore', category=pd.errors.DtypeWarning)

In [6]:

datasets_base_path = "./datasets/"
arabic_speech_corpus_path = os.path.join(datasets_base_path,"arabic-speech-corpus")
cv_corpus_path = os.path.join(datasets_base_path, "cv-corpus-17.0-2024-03-15/ar")

output_base_folder = os.path.join(datasets_base_path, 'prepared')

In [7]:
dataframes = ["validated", "validated_sentences", "unvalidated_sentences", "train", "test", "other", "dev"]

In [None]:
arabic_text = np.array([])


for dataframe in dataframes:
    print(f"processing {dataframe} file")
    
    df = pd.read_csv(os.path.join(cv_corpus_path, dataframe + ".tsv"), sep='\t')
    sentences = set()
    for sentence in tqdm(df['sentence'], desc='Sentences'):
        sentences.add(preprocess_arabic_text(sentence))
    

    arabic_text = np.append(arabic_text, np.array(list(sentences)))


In [None]:
np.savetxt(os.path.join(output_base_folder, "arabic_text.txt"),arabic_text, fmt='%s')

In [10]:
train_df = pd.read_csv(os.path.join(cv_corpus_path, "train.tsv"), sep='\t')
test_df = pd.read_csv(os.path.join(cv_corpus_path, "test.tsv"), sep='\t')
other_df = pd.read_csv(os.path.join(cv_corpus_path, "other.tsv"), sep='\t')

In [18]:
train_df = train_df[['path', 'sentence']]
test_df = test_df[['path', 'sentence']]


test_df["path"].values

array(['common_voice_ar_24203362.mp3', 'common_voice_ar_28865270.mp3',
       'common_voice_ar_22931432.mp3', ...,
       'common_voice_ar_20835625.mp3', 'common_voice_ar_20835626.mp3',
       'common_voice_ar_20835656.mp3'], dtype=object)

In [42]:
all_file_names = set()
frames = ["train.tsv", "other.tsv", "dev.tsv", "validated.tsv"]

audio_file = []
sentences = []
for frame in frames:
    print(f"processing {frame} file...")
    df = pd.read_csv(os.path.join(cv_corpus_path, frame), sep='\t')
    for filename in tqdm(df["path"].values, ncols=100):
        if not (filename in all_file_names):
            s = df[df["path"] == filename]["sentence"].iloc[0]
            all_file_names.add(filename)
            audio_file.append(filename)
            sentences.append(s)

            
            # all_sentences.add(preprocess_arabic_text(sentence))
    

processing train.tsv file...


100%|████████████████████████████████████████████████████████| 28369/28369 [01:35<00:00, 297.97it/s]


In [50]:
aggregated_df = pd.DataFrame(columns=["audio_file", "sentence"])

print(f"filenames: {len(audio_file)}")
print(f"sentence {len(sentences)}")

aggregated_df["audio_file"] = audio_file
aggregated_df["sentence"] = sentences


test_df = pd.read_csv(os.path.join(cv_corpus_path, "test.tsv"), sep='\t')

test_df[["path", "sentence"]].to_csv(os.path.join(output_base_folder, 'test.csv'), sep=',', index=False)

aggregated_df.to_csv(os.path.join(output_base_folder, 'train.csv'), sep=',', index=False)

filenames: 28369
sentence 28369
