In [1]:
import numpy as np
import pandas as pd
import re
import torch
import os
# NOTE: the `Dataset` imported from torch.utils.data on the next line is
# rebound by `from datasets import Dataset` further down in this cell, so
# from then on `Dataset` refers to the Hugging Face `datasets.Dataset` class.
from torch.utils.data import DataLoader, random_split, Dataset
import requests
from datasets import load_dataset
# Shadows torch.utils.data.Dataset imported above.
from datasets import Dataset

In [2]:
from huggingface_hub import HfApi, HfFolder
from transformers import set_seed

# Read the Hugging Face access token from the environment (a missing HF_TOKEN
# raises KeyError) and pass it to HfFolder.save_token.
HF_TOKEN = os.environ["HF_TOKEN"]
HfFolder.save_token(HF_TOKEN)

# Apply one seed value through torch (CUDA), transformers.set_seed and NumPy,
# in that order.
seed = 1
for seed_fn in (torch.cuda.manual_seed_all, set_seed, np.random.seed):
    seed_fn(seed)

## Quran Dataset

In [3]:
# Load the Quran sentence pairs, then discard the 'Unnamed: 0' column
# that comes along with the CSV.
quran = pd.read_csv("../raw/quran/Finished.csv")
quran = quran.drop('Unnamed: 0', axis="columns")

In [4]:
# Display the Quran frame: English text in 'en', Arabic text in 'ar'.
quran

Unnamed: 0,en,ar
0,"praise be to god, lord of the worlds.",الحَمدُ لِلَّهِ رَبِّ العالَمينَ
1,"the most gracious, the most merciful.",الرَّحمٰنِ الرَّحيمِ
2,master of the day of judgment.,مالِكِ يَومِ الدّينِ
3,"it is you we worship, and upon you we call for...",إِيّاكَ نَعبُدُ وَإِيّاكَ نَستَعينُ
4,guide us to the straight path.,اهدِنَا الصِّراطَ المُستَقيمَ
...,...,...
6117,the king of mankind.,مَلِكِ النّاسِ
6118,the god of mankind.,إِلٰهِ النّاسِ
6119,from the evil of the sneaky whisperer.,مِن شَرِّ الوَسواسِ الخَنّاسِ
6120,who whispers into the hearts of people.,الَّذي يُوَسوِسُ في صُدورِ النّاسِ


## Shamela Books

In [5]:
# TODO: translate the Shamela books with GPT-4o.

In [2]:
# Load the Shamela book sentence pairs, then discard the 'Unnamed: 0' column
# and the 'book' column so only the text columns remain.
books = pd.read_csv("../raw/shamela_books/Finished.csv")
books = books.drop(["Unnamed: 0", 'book'], axis="columns")

In [7]:
# Spot-check one randomly drawn row as a raw array.
# NOTE(review): the original reminder here asks to use GPT-4o for these translations.
books.sample(n=1).to_numpy()

array([['Whoever among you knows something about the matter of Dimna, whether good or bad, let him say so, and let him speak about it in front of the assembly and witnesses, so that the judgment in his matter will be more appropriate, and haste is from desire, and following one’s companions in falsehood is humiliation.',
        'فمن علم منكم شيئاً في أمر دمنة من خيرٍ أو شرٍ، فليقل ذلك، وليتكلم به على رءوس الجمع والأشهاد، ليكون القضاء في أمره أولى، والعجلة من الهوى، ومتابعة الأصحاب على الباطل ذل.']],
      dtype=object)

## Hadith

In [8]:
# Load the hadith sentence pairs, then discard the 'Unnamed: 0' column
# that comes along with the CSV.
hadith = pd.read_csv("../raw/hadith/Finished.csv")
hadith = hadith.drop("Unnamed: 0", axis="columns")

In [9]:
# Spot-check one randomly drawn hadith row as a raw array.
hadith.sample(n=1).to_numpy()

array([['Allah will set right his affairs and raise his status in one night, or in a moment of one night, such that those invested with authority will agree upon his Caliphate.',
        'يُصْلِحُ الله أَمْرَهُ وَيَرْفَعُ قَدْرَهُ فِي لَيْلَةٍ وَاحِدَةٍ أَوْ فِي سَاعَةٍ وَاحِدَةٍ مِنَ اللَّيْلِ حَيْثُ يَتَّفِقُ عَلَى خِلَافَتِهِ أَهْلُ الْحَلِّ وَالْعَقْدِ فِيهَا']],
      dtype=object)

## Combining Datasets

In [13]:
from camel_tools.utils import dediac

# Stack the three source frames into a single frame of sentence pairs.
used_datasets = [quran, books, hadith]
all_df = pd.concat(used_datasets)

# Drop incomplete pairs before any text processing: a missing cell is a float
# NaN, which would make dediac_ar / str.strip raise and would turn the
# str.contains masks below into non-boolean (NaN-containing) masks.
all_df = all_df.dropna(subset=['en', 'ar'])

# Remove Arabic diacritics from every cell.
all_df = all_df.map(dediac.dediac_ar)

# Keep only rows whose 'ar' text has at least one character in U+0600-U+06FF
# and whose 'en' text has at least one ASCII letter.
all_df = all_df[all_df['ar'].str.contains(r"[\u0600-\u06FF]+")]
all_df = all_df[all_df['en'].str.contains(r"[A-Za-z]+")]

# Trim surrounding whitespace and renumber the rows 0..n-1 (the concatenated
# frames carried their own, overlapping indices).
all_df = all_df.map(lambda text: text.strip())
all_df = all_df.reset_index(drop=True)

In [14]:
# Spot-check one randomly drawn cleaned pair as a raw array.
all_df.sample(n=1).to_numpy()

array([['have no doubt that I am a fool to him, and that I am like some who eat his money.',
        'وما أشك أني عنده غمر ، وأني كبعض من يأكل ماله.']], dtype=object)

In [15]:
print("Percentage of each sub_dataset to the whole dataset:")

# Shares are computed from the row counts of the frames in used_datasets
# (not from the filtered all_df) and labelled by their position in that list.
sizes = [len(ds) for ds in used_datasets]
total_rows = sum(sizes)
for idx, n_rows in enumerate(sizes):
    print(f"{idx}: {n_rows / total_rows:.2%}")

Percentage of each sub_dataset to the whole dataset:
0: 23.08%
1: 51.16%
2: 25.76%


In [16]:
# Convert the cleaned frame to a Dataset and shuffle it with the notebook seed.
# `Dataset` here is the one from the `datasets` package: that import comes
# after (and so replaces) the torch.utils.data.Dataset import.
dataset = Dataset.from_pandas(all_df).shuffle(seed=seed)

In [17]:
# Upload the shuffled dataset to the Hub; the call's return value is left as
# the cell's last expression so it is displayed as the cell output.
hub_repo_id = "Abdulmohsena/Classic-Arabic-English-Language-Pairs"
dataset.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Abdulmohsena/Classic-Arabic-English-Language-Pairs/commit/9eceb1fc1246321843f4a0bb4b9231819256366a', commit_message='Upload dataset', commit_description='', oid='9eceb1fc1246321843f4a0bb4b9231819256366a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Abdulmohsena/Classic-Arabic-English-Language-Pairs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Abdulmohsena/Classic-Arabic-English-Language-Pairs'), pr_revision=None, pr_num=None)