In [6]:
import os
import requests
import json
from datasets import Dataset, DatasetDict

In [16]:
# GitHub raw file URLs
base_url = "https://raw.githubusercontent.com/laith85/Transformer_NMT_AD/main/"
data_files = {
    "Gulf": ("Gulf_Dialects.txt", "MSA_For_Gulf_Dialects.txt"),
    "Iraqi": ("Iraqi_Dialects.txt", "MSA_For_Iraqi_Dialects.txt"),
    "Levantine": ("Levantine_Dialects.txt", "MSA_For_Levantine_Dialects.txt"),
    "Nile Basin": ("Nile Basin_ Dialects.txt", "MSA_For_Nile_Basin_Dialects.txt"),
    "North Africa": ("North_Africa _Dialect.txt", "MSA_For_North_Africa_Dialects.txt")
}

# Initialize list to hold each data entry as a dictionary
data_entries = []

# Function to download and read content from GitHub
def fetch_content(file_url):
    response = requests.get(file_url)
    response.raise_for_status()  # Raise error for bad status
    return response.text.splitlines()

# Load data from files in GitHub
for dialect, (dialect_file, msa_file) in data_files.items():
    dialect_url = base_url + dialect_file
    msa_url = base_url + msa_file
    
    dialect_sentences = fetch_content(dialect_url)
    msa_sentences = fetch_content(msa_url)
    
    # Ensure same number of sentences in dialect and MSA files
    assert len(dialect_sentences) == len(msa_sentences), \
        f"Mismatch in lines for {dialect_file} and {msa_file}"
    
    # Combine text, translation, and label into a dictionary for each sentence pair
    for dialect_sentence, msa_sentence in zip(dialect_sentences, msa_sentences):
        data_entries.append({
            "text": dialect_sentence.strip(),
            "translation": msa_sentence.strip(),
            "label": dialect
        })

# Save as JSON file
with open("dataset.json", "w", encoding="utf-8") as json_file:
    json.dump(data_entries, json_file, ensure_ascii=False, indent=4)

print("Data saved to dataset.json")


Data saved to dataset.json


In [5]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting huggingface-hub>=0.23.0 (from datasets)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K

In [7]:
import os
import requests
import json
import random
from datasets import Dataset, DatasetDict

# GitHub raw file URLs
base_url = "https://raw.githubusercontent.com/laith85/Transformer_NMT_AD/main/"
data_files = {
    "Gulf": ("Gulf_Dialects.txt", "MSA_For_Gulf_Dialects.txt"),
    "Iraqi": ("Iraqi_Dialects.txt", "MSA_For_Iraqi_Dialects.txt"),
    "Levantine": ("Levantine_Dialects.txt", "MSA_For_Levantine_Dialects.txt"),
    "Nile Basin": ("Nile Basin_ Dialects.txt", "MSA_For_Nile_Basin_Dialects.txt"),
    "North Africa": ("North_Africa _Dialect.txt", "MSA_For_North_Africa_Dialects.txt")
}

# Initialize list to hold each data entry as a dictionary
data_entries = []

# Function to download and read content from GitHub
def fetch_content(file_url):
    response = requests.get(file_url)
    response.raise_for_status()  # Raise error for bad status
    return response.text.splitlines()

# Load data from files in GitHub
for dialect, (dialect_file, msa_file) in data_files.items():
    dialect_url = base_url + dialect_file
    msa_url = base_url + msa_file
    
    dialect_sentences = fetch_content(dialect_url)
    msa_sentences = fetch_content(msa_url)
    
    # Ensure same number of sentences in dialect and MSA files
    assert len(dialect_sentences) == len(msa_sentences), \
        f"Mismatch in lines for {dialect_file} and {msa_file}"
    
    # Combine text, translation, and label into a dictionary for each sentence pair
    for dialect_sentence, msa_sentence in zip(dialect_sentences, msa_sentences):
        data_entries.append({
            "text": dialect_sentence.strip(),
            "translation": msa_sentence.strip(),
            "label": dialect
        })

# Now, take 5% from each category
sampled_entries = []

# Iterate over each dialect and take 5% of the data entries
for dialect in data_files.keys():
    # Filter data for the current dialect
    dialect_data = [entry for entry in data_entries if entry['label'] == dialect]
    
    # Calculate 5% of the dialect data
    sample_size = int(len(dialect_data) * 0.05)
    
    # Randomly select 5% of the data
    sampled_dialect_data = random.sample(dialect_data, sample_size)
    
    # Append the sampled data to the final list
    sampled_entries.extend(sampled_dialect_data)

# Save the sampled data as a new JSON file
with open("sampled_dataset.json", "w", encoding="utf-8") as json_file:
    json.dump(sampled_entries, json_file, ensure_ascii=False, indent=4)

print("Sampled data saved to sampled_dataset.json")


Sampled data saved to sampled_dataset.json


In [17]:
data_entries

[{'text': 'موجود في نهاية الممر، بجيب لك شوي الحين. اذا احتجت شي ثاني خبرني.',
  'translation': 'إنها في أخر القاعة . سوف آتي لك ببعض منها الآن . إذا أردت أي شيئاً آخر فقط أعلمني .',
  'label': 'Gulf'},
 {'text': 'تسوي تعديلات؟',
  'translation': 'هل تقومون بعمل تعديلات ؟',
  'label': 'Gulf'},
 {'text': 'بغينا طاولة يم الدريشة.',
  'translation': 'نريد مائدة بجانب النافذة .',
  'label': 'Gulf'},
 {'text': 'هو ذاك الصوب، بالضبط جدام استعلامات السياح بالضبط.',
  'translation': 'هناك ، أمام بيانات السائح تماما .',
  'label': 'Gulf'},
 {'text': 'ما قط سمعت بهالعنوان في هالمنطقة من قبل.',
  'translation': 'لم اسمع بهذا العنوان من قبل بالقرب من هنا .',
  'label': 'Gulf'},
 {'text': 'امش سيده لين تشوف صيدلية.',
  'translation': 'استمر في السير في هذا الطريق حتى تجد صيدلية .',
  'label': 'Gulf'},
 {'text': 'شنو اللون اللي هابين فيه هالفترة؟',
  'translation': 'ما هو أحدث لون هذا الموسم .',
  'label': 'Gulf'},
 {'text': 'في حالتي انا دايماً حق شغل، نادراً حق استانس.',
  'translation': 'في حالتي

In [15]:
with open("./dataset.json", "w", encoding="utf-8") as json_file:
    json.dump(data_entries, json_file, ensure_ascii=False, indent=4)

print("Data saved to dataset.json")

Data saved to dataset.json


In [1]:
import zipfile
import os
import json

# Define paths and files for each dialect
data_files = {
    "Gulf": ("Gulf_Dialects.txt", "MSA_For_Gulf_Dialects.txt"),
    "Iraqi": ("Iraqi_Dialects.txt", "MSA_For_Iraqi_Dialects.txt"),
    "Levantine": ("Levantine_Dialects.txt", "MSA_For_Levantine_Dialects.txt"),
    "Egyptian": ("Nile Basin_ Dialects.txt", "MSA_For_Nile_Basin_Dialects.txt"),
    "Maghrebian": ("North_Africa _Dialect.txt", "MSA_For_North_Africa_Dialects.txt")
}

# Unzip dataset
with zipfile.ZipFile("dialects_datasets.zip", "r") as zip_ref:
    zip_ref.extractall("dialects_data")

# Initialize list to hold each data entry as a dictionary
data_entries = []

# Function to read lines from a file
def read_lines(filepath):
    with open(filepath, "r", encoding="utf-8") as file:
        return file.readlines()

# Process each dialect's files
for dialect, (dialect_file, msa_file) in data_files.items():
    dialect_path = os.path.join("dialects_data", dialect_file)
    msa_path = os.path.join("dialects_data", msa_file)
    
    # Read lines from each file
    dialect_sentences = read_lines(dialect_path)
    msa_sentences = read_lines(msa_path)
    
    # Ensure same number of sentences in dialect and MSA files
    assert len(dialect_sentences) == len(msa_sentences), \
        f"Mismatch in lines for {dialect_file} and {msa_file}"
    
    # Combine text, translation, and label into a dictionary for each sentence pair
    for dialect_sentence, msa_sentence in zip(dialect_sentences, msa_sentences):
        data_entries.append({
            "text": dialect_sentence.strip(),
            "translation": msa_sentence.strip(),
            "label": dialect
        })

# Save as JSON file
with open("dataset.json", "w", encoding="utf-8") as json_file:
    json.dump(data_entries, json_file, ensure_ascii=False, indent=4)

print("Data saved to dataset.json")


FileNotFoundError: [Errno 2] No such file or directory: 'dialects_datasets.zip'

In [23]:
!ls './'

dataset.json
