In [None]:
import os
from huggingface_hub import snapshot_download
from datasets import load_dataset, concatenate_datasets, Dataset, get_dataset_config_names
from tqdm.auto import tqdm
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
# Check if the datasets library is installed, and install it if not.
try:
    import datasets
except ImportError:
    print("The 'datasets' library is not installed. Installing now...")
    os.system("pip install datasets")
    from datasets import load_dataset, concatenate_datasets, Dataset, get_dataset_config_names

# --- Dataset Configuration ---
# List of datasets to load from Hugging Face.
HF_DATASETS_TO_LOAD = [
    "AzizBelaweid/Tunisian_Language_Dataset",
    "arbml/Tunisian_Dialect_Corpus",
    "hamzabouajila/Sample_Tunisiya_Dataset",
    "abdouuu/tunisian_chatbot_data",
    "linagora/Tunisian_Derja_Dataset",
]

# The name of the column that contains the text.
TEXT_COLUMN_NAME = "text"
# A list of possible text column names to check for.
POSSIBLE_TEXT_COLUMN_NAMES = ["text", "content", "snippet", "description", "tweets", "sentence", "Expanded Context", "input", "instruction"]

# --- Main Script Functions ---

def find_text_column(dataset_columns):
    """Finds the most suitable text column from a list of column names."""
    for col in POSSIBLE_TEXT_COLUMN_NAMES:
        if col in dataset_columns:
            return col
    # Fallback to a case-insensitive check
    for col in dataset_columns:
        if col.lower() in [s.lower() for s in POSSIBLE_TEXT_COLUMN_NAMES]:
            return col
    return None

def load_and_preprocess_datasets():
    """
    Loads specified datasets, renames their text columns for consistency,
    and returns a list of loaded datasets.
    """
    all_datasets = []
    
    print("Starting to load datasets from Hugging Face...")
    for dataset_name in tqdm(HF_DATASETS_TO_LOAD, desc="Loading datasets"):
        try:
            # Handle the specific case for the uploaded XLSX file
            if dataset_name == "hamzabouajila/Sample_Tunisiya_Dataset":
                print(f"Loading specific CSV file for {dataset_name} using pandas.")
                snapshot_download("hamzabouajila/Sample_Tunisiya_Dataset",local_dir="hamzabouajila/Sample_Tunisiya_Dataset")
                df = pd.read_csv("search_results.xlsx")
                ds = Dataset.from_pandas(df)
      
            
            # Handle the specific case for the linagora dataset with multiple configs
            elif dataset_name == "linagora/Tunisian_Derja_Dataset":
                print(get_dataset_split_names("linagora/Tunisian_Derja_Dataset", config_name="Tunisian_Dialectic_English_Derja"))

                print(f"Loading all configurations for {dataset_name}.")
                configs = get_dataset_config_names(dataset_name)
                for config in configs:
                    print(f"  - Loading config: {config}")
                    ds = load_dataset(dataset_name, config, split="train")
                    
                    if TEXT_COLUMN_NAME not in ds.column_names:
                        found_text_column = find_text_column(ds.column_names)
                        if found_text_column:
                            ds = ds.rename_column(found_text_column, TEXT_COLUMN_NAME)
                        else:
                            print(f"Warning: Could not find a suitable text column in config '{config}'. Skipping.")
                            continue
                    
                    all_datasets.append(ds.select_columns([TEXT_COLUMN_NAME]))
                continue # Move to the next dataset in the main loop

            else:
                # Load other datasets directly.
                ds = load_dataset(dataset_name, split="train")
            
            # Check for the text column and rename it to a standard name.
            if TEXT_COLUMN_NAME not in ds.column_names:
                found_text_column = find_text_column(ds.column_names)
                if found_text_column:
                    print(f"Renaming '{found_text_column}' to '{TEXT_COLUMN_NAME}' in {dataset_name}.")
                    ds = ds.rename_column(found_text_column, TEXT_COLUMN_NAME)
                else:
                    print(f"Warning: Could not find a suitable text column in {dataset_name}. Skipping.")
                    continue
            
            all_datasets.append(ds.select_columns([TEXT_COLUMN_NAME]))

        except Exception as e:
            print(f"Error loading {dataset_name}: {e}. Skipping this dataset.")
    
    return all_datasets

def combine_and_deduplicate(datasets_list):
    """
    Combines all datasets into a single one, removes duplicates, and shuffles.
    """
    if not datasets_list:
        print("No datasets to combine. Exiting.")
        return None
        
    print("\nConcatenating datasets...")
    
    # Filter out streaming datasets for concatenation
    non_streaming_datasets = [ds for ds in datasets_list if not isinstance(ds, datasets.iterable_dataset.IterableDataset)]
    
    if not non_streaming_datasets:
        print("No non-streaming datasets to combine. Exiting.")
        return None

    combined_dataset = concatenate_datasets(non_streaming_datasets)
    
    print(f"Initial combined dataset size: {len(combined_dataset)} rows.")
    
    print("Removing duplicate rows...")
    df = combined_dataset.to_pandas()
    original_size = len(df)
    df.drop_duplicates(subset=[TEXT_COLUMN_NAME], inplace=True)
    deduplicated_size = len(df)
    print(f"Removed {original_size - deduplicated_size} duplicates. Final size: {deduplicated_size} rows.")
    
    # Convert back to a Hugging Face dataset.
    final_dataset = Dataset.from_pandas(df)
    
    print("Shuffling the final dataset...")
    final_dataset = final_dataset.shuffle(seed=42)
    
    return final_dataset

def save_dataset(dataset, output_path="tunbert_distillation_corpus.parquet"):
    """
    Saves the final prepared dataset to a Parquet file.
    """
    if dataset:
        print(f"\nSaving the final dataset to '{output_path}'...")
        dataset.to_parquet(output_path)
        print("Dataset saved successfully!")


In [None]:
import os
from huggingface_hub import snapshot_download
from datasets import load_dataset, concatenate_datasets, Dataset, get_dataset_config_names, get_dataset_split_names
from tqdm.auto import tqdm
import pandas as pd

# Check if the datasets library is installed, and install it if not.
try:
    import datasets
except ImportError:
    print("The 'datasets' library is not installed. Installing now...")
    os.system("pip install datasets")
    from datasets import load_dataset, concatenate_datasets, Dataset, get_dataset_config_names, get_dataset_split_names

# --- Dataset Configuration ---
# List of datasets to load from Hugging Face.
HF_DATASETS_TO_LOAD = [
    "AzizBelaweid/Tunisian_Language_Dataset",
    "arbml/Tunisian_Dialect_Corpus",
    "hamzabouajila/Sample_Tunisiya_Dataset",
    "abdouuu/tunisian_chatbot_data",
    "linagora/Tunisian_Derja_Dataset",
    "khaled123/tuniset"
]

# The name of the column that contains the text.
TEXT_COLUMN_NAME = "text"
# A list of possible text column names to check for.
POSSIBLE_TEXT_COLUMN_NAMES = ["text", "content", "snippet", "description", "tweets", "sentence", "Expanded Context", "input", "instruction"]

# --- Main Script Functions ---

def find_text_column(dataset_columns):
    """Finds the most suitable text column from a list of column names."""
    for col in POSSIBLE_TEXT_COLUMN_NAMES:
        if col in dataset_columns:
            return col
    # Fallback to a case-insensitive check
    for col in dataset_columns:
        if col.lower() in [s.lower() for s in POSSIBLE_TEXT_COLUMN_NAMES]:
            return col
    return None

def ensure_text_column_type(dataset, dataset_name):
    """
    Ensures the text column is of string type to avoid type mismatch during concatenation.
    """
    if TEXT_COLUMN_NAME in dataset.column_names:
        # Convert to pandas to check and convert data type
        df = dataset.to_pandas()
        if df[TEXT_COLUMN_NAME].dtype != "object":  # Check if not string-like
            print(f"Converting '{TEXT_COLUMN_NAME}' column to string in {dataset_name} (original type: {df[TEXT_COLUMN_NAME].dtype}).")
            df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].astype(str).replace("nan", None)
            dataset = Dataset.from_pandas(df)
    return dataset

def load_and_preprocess_datasets():
    """
    Loads specified datasets, renames their text columns for consistency,
    ensures text column is string type, and returns a list of loaded datasets.
    """
    all_datasets = []
    hf_token = os.getenv("HF_TOKEN")  # Get Hugging Face token for authentication
    
    print("Starting to load datasets from Hugging Face...")
    for dataset_name in tqdm(HF_DATASETS_TO_LOAD, desc="Loading datasets"):
        try:
            if dataset_name == "hamzabouajila/Sample_Tunisiya_Dataset":
                print(f"Loading specific CSV file for {dataset_name} using pandas.")
                snapshot_download(repo_id=dataset_name,
                                 repo_type="dataset",
                                 local_dir="hamzabouajila/Sample_Tunisiya_Dataset",
                                 token=hf_token)
                file_path = "hamzabouajila/Sample_Tunisiya_Dataset/search_results.xlsx"
                if not os.path.exists(file_path):
                    print(f"Error: File {file_path} not found. Skipping {dataset_name}.")
                    continue
                df = pd.read_csv(file_path,encoding='utf-8')
                # Ensure the text column exists and is string type
                text_col = find_text_column(df.columns)
                if text_col:
                    print(f"Found text column '{text_col}' in {dataset_name}.")
                    df = df.rename(columns={text_col: TEXT_COLUMN_NAME})
                    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].astype(str).replace("nan", None)
                    ds = Dataset.from_pandas(df)
                else:
                    print(f"Warning: Could not find a suitable text column in {dataset_name}. Skipping.")
                    continue
            
            # Handle the specific case for the linagora dataset with multiple configs
            elif dataset_name == "linagora/Tunisian_Derja_Dataset":
                print(f"Loading all configurations for {dataset_name}.")
                configs = get_dataset_config_names(dataset_name, token=hf_token)
                for config in configs:
                    try:
                        print(f"  - Loading config: {config}")
                        # Check available splits
                        available_splits = get_dataset_split_names(dataset_name, config_name=config, token=hf_token)
                        print(f"    Available splits: {available_splits}")
                        split_to_use = "train" if "train" in available_splits else available_splits[0] if available_splits else None
                        
                        if not split_to_use:
                            print(f"    No valid splits found for config '{config}'. Skipping.")
                            continue
                        
                        ds = load_dataset(dataset_name, config, split=split_to_use, token=hf_token)
                        
                        if TEXT_COLUMN_NAME not in ds.column_names:
                            found_text_column = find_text_column(ds.column_names)
                            if found_text_column:
                                print(f"    Renaming '{found_text_column}' to '{TEXT_COLUMN_NAME}' in config {config}.")
                                ds = ds.rename_column(found_text_column, TEXT_COLUMN_NAME)
                            else:
                                print(f"    Warning: Could not find a suitable text column in config '{config}'. Skipping.")
                                continue
                        
                        # Ensure text column is string type
                        ds = ensure_text_column_type(ds, f"{dataset_name}/{config}")
                        all_datasets.append(ds.select_columns([TEXT_COLUMN_NAME]))
                        print(f"    Successfully loaded config '{config}' with {len(ds)} rows.")
                    except Exception as e:
                        print(f"    An error of type {type(e).__name__} occurred while loading config '{config}' for {dataset_name}. Details: {e}. Skipping this config.")
                continue  # Move to the next dataset in the main loop

            else:
                # Load other datasets directly.
                available_splits = get_dataset_split_names(dataset_name, token=hf_token)
                print(f"Available splits for {dataset_name}: {available_splits}")
                split_to_use = "train" if "train" in available_splits else available_splits[0] if available_splits else None
                
                if not split_to_use:
                    print(f"No valid splits found for {dataset_name}. Skipping.")
                    continue
                
                ds = load_dataset(dataset_name, split=split_to_use, token=hf_token)
            
            # Check for the text column and rename it to a standard name.
            if TEXT_COLUMN_NAME not in ds.column_names:
                found_text_column = find_text_column(ds.column_names)
                if found_text_column:
                    print(f"Renaming '{found_text_column}' to '{TEXT_COLUMN_NAME}' in {dataset_name}.")
                    ds = ds.rename_column(found_text_column, TEXT_COLUMN_NAME)
                else:
                    print(f"Warning: Could not find a suitable text column in {dataset_name}. Skipping.")
                    continue
            
            # Ensure text column is string type
            ds = ensure_text_column_type(ds, dataset_name)
            all_datasets.append(ds.select_columns([TEXT_COLUMN_NAME]))
            print(f"Successfully loaded {dataset_name} with {len(ds)} rows.")

        except Exception as e:
            print(f"An error of type {type(e).__name__} occurred while loading {dataset_name}. Details: {e}. Skipping this dataset.")
    
    return all_datasets

def combine_and_deduplicate(datasets_list):
    """
    Combines all datasets into a single one, removes duplicates, and shuffles.
    """
    if not datasets_list:
        print("No datasets to combine. Exiting.")
        return None
        
    print("\nConcatenating datasets...")
    
    # Filter out streaming datasets for concatenation
    non_streaming_datasets = [ds for ds in datasets_list if not isinstance(ds, datasets.iterable_dataset.IterableDataset)]
    
    if not non_streaming_datasets:
        print("No non-streaming datasets to combine. Exiting.")
        return None

    # Log feature types for debugging
    for i, ds in enumerate(non_streaming_datasets):
        print(f"Dataset {i+1} features: {ds.features}")

    combined_dataset = concatenate_datasets(non_streaming_datasets)
    
    print(f"Initial combined dataset size: {len(combined_dataset)} rows.")
    
    print("Removing duplicate rows...")
    df = combined_dataset.to_pandas()
    original_size = len(df)
    df.drop_duplicates(subset=[TEXT_COLUMN_NAME], inplace=True)
    deduplicated_size = len(df)
    print(f"Removed {original_size - deduplicated_size} duplicates. Final size: {deduplicated_size} rows.")
    
    # Convert back to a Hugging Face dataset.
    final_dataset = Dataset.from_pandas(df)
    
    print("Shuffling the final dataset...")
    final_dataset = final_dataset.shuffle(seed=42)
    
    # Print a sample for debugging
    print(f"Sample from combined dataset: {final_dataset[:5][TEXT_COLUMN_NAME]}")
    
    return final_dataset

def save_dataset(dataset, output_path="tunbert_distillation_corpus.parquet"):
    """
    Saves the final prepared dataset to a Parquet file.
    """
    if dataset:
        print(f"\nSaving the final dataset to '{output_path}'...")
        dataset.to_parquet(output_path)
        print("Dataset saved successfully!")

In [25]:
# The Tunisiya Corpus is not on Hugging Face.
# The user has provided an uploaded version of this.
# We are now using available Hugging Face datasets.

# Load and preprocess all specified datasets.
loaded_datasets = load_and_preprocess_datasets()

# Combine, deduplicate, and shuffle the datasets.
final_corpus = combine_and_deduplicate(loaded_datasets)

# Save the final prepared corpus.
# save_dataset(final_corpus)

Starting to load datasets from Hugging Face...


Loading datasets:   0%|          | 0/5 [00:00<?, ?it/s]

Available splits for AzizBelaweid/Tunisian_Language_Dataset: ['train']


Successfully loaded AzizBelaweid/Tunisian_Language_Dataset with 269773 rows.


Repo card metadata block was not found. Setting CardData to empty.


Available splits for arbml/Tunisian_Dialect_Corpus: ['train']


Repo card metadata block was not found. Setting CardData to empty.


Loading specific CSV file for hamzabouajila/Sample_Tunisiya_Dataset using pandas.


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

An error of type UnicodeDecodeError occurred while loading hamzabouajila/Sample_Tunisiya_Dataset. Details: 'utf-8' codec can't decode byte 0xaf in position 10: invalid start byte. Skipping this dataset.
Available splits for abdouuu/tunisian_chatbot_data: ['train']
Renaming 'input' to 'text' in abdouuu/tunisian_chatbot_data.
Converting 'text' column to string in abdouuu/tunisian_chatbot_data (original type: float64).
Successfully loaded abdouuu/tunisian_chatbot_data with 1426 rows.
Loading all configurations for linagora/Tunisian_Derja_Dataset.
  - Loading config: Derja_tunsi
    Available splits: ['train']
    Successfully loaded config 'Derja_tunsi' with 13037 rows.
  - Loading config: HkayetErwi
    Available splits: ['train']
    Successfully loaded config 'HkayetErwi' with 946 rows.
  - Loading config: Sentiment_Derja
    Available splits: ['train']
    Successfully loaded config 'Sentiment_Derja' with 22890 rows.
  - Loading config: TA_Segmentation
    Available splits: ['train']


Generating train split: 0 examples [00:00, ? examples/s]

    An error of type ExpectedMoreSplitsError occurred while loading config 'Tunisian_Dialectic_English_Derja' for linagora/Tunisian_Derja_Dataset. Details: {'Tunisian_Dialectic_English_Derja'}. Skipping this config.
  - Loading config: Tweet_TN
    Available splits: ['train']
    Successfully loaded config 'Tweet_TN' with 39637 rows.

Concatenating datasets...
Dataset 1 features: {'text': Value('string')}
Dataset 2 features: {'text': Value('null')}
Dataset 3 features: {'text': Value('string')}
Dataset 4 features: {'text': Value('string')}
Dataset 5 features: {'text': Value('string')}
Dataset 6 features: {'text': Value('string')}
Dataset 7 features: {'text': Value('string')}
Dataset 8 features: {'text': Value('string')}
Dataset 9 features: {'text': Value('string')}
Dataset 10 features: {'text': Value('string')}
Dataset 11 features: {'text': Value('string')}
Dataset 12 features: {'text': Value('string')}
Dataset 13 features: {'text': Value('string')}
Initial combined dataset size: 125575

In [26]:
len(final_corpus)

785709

In [28]:
final_corpus.column_names

['text', '__index_level_0__']

In [29]:
final_corpus["text"][:5]

['الأمازيغ يحتفلون اليوم بقدوم سنة 2969 - Tounes 24\nازمة الكورونا .. وزير الصحة : " تونس لم تدخل...\nMar 30, 2020 521\nإستئناف الدروس بالمؤسسات التعليمية أبرز محاور...\nMar 30, 2020 301\nتسجيل 5 إصابات جديدة بفيروس كورونا في ليبيا\nMar 29, 2020 72\nDec 5, 2019 499\nNov 29, 2019 576\nMar 28, 2020 146\nMar 27, 2020 545\nMar 25, 2020 375\nصورة اليوم : شرطي يرتدي قناع كورونا لتخويف أصحاب...\nترامب : إذا تحكمنا في وفايات كورونا في حدود 100...\nهيثم المكي : النائب مبروك كورشيد صاحب مجلة إلكترونية...\nMar 30, 2020 269\nهيثم المكي: باعثي القنوات التلفزية ماعندهم حتى...\nسمير بن عمر لمبروك كورشيد :إذا عدت أيها الفاسد...\nMar 30, 2020 313\nMar 30, 2020 382\nMar 30, 2020 422\nMar 30, 2020 1022\nكيف يقتل الجسم نفسه في محاولة الشفاء من الانفلونزا...\nعلى عكس المتداول... فيروس كورونا يموت بحرارة...\nMar 20, 2020 363\nFeb 28, 2020 794\nFeb 28, 2020 266\nFeb 21, 2020 216\nMar 27, 2020 379\nMar 27, 2020 1311\nMar 26, 2020 715\nJan 12, 2019 617\nيحتفل الأمازيغ، الذين يتركزون بالأساس في شمال أفريقيا، بر

In [30]:
type(final_corpus)

datasets.arrow_dataset.Dataset

In [31]:
from datasets import load_dataset

ds = load_dataset("arbml/TArC", split="train")

# Option: pick Arabizi sentences
arabizi_texts = ds["arabish"]

# Or pair Arabizi + Arabic transliteration
combined_texts = [
    f"{a} ; {c}"
    for a, c in zip(ds["arabish"], ds["CODA"])
]

# Write to a corpus file
with open("tarc_corpus.txt", "w", encoding="utf-8") as f:
    for line in combined_texts:
        line = line.strip()
        if line:
            f.write(line + "\n")


README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


data/train-00000-of-00001-78418db42dbbfa(…):   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/48126 [00:00<?, ? examples/s]

In [32]:
from datasets import load_dataset

dataset = load_dataset("text", data_files={"train": "tarc_corpus.txt"}, split="train")


Generating train split: 0 examples [00:00, ? examples/s]

In [33]:
dataset

Dataset({
    features: ['text'],
    num_rows: 48126
})

In [34]:
from datasets import concatenate_datasets

# Suppose 'arabic_dataset' already exists with column 'text'
combined = concatenate_datasets([final_corpus, dataset])


In [35]:
len(combined)

833835

In [48]:
print("Removing duplicate rows...")
df = combined.to_pandas()
original_size = len(df)
df.drop_duplicates(subset=[TEXT_COLUMN_NAME], inplace=True)
deduplicated_size = len(df)
print(f"Removed {original_size - deduplicated_size} duplicates. Final size: {deduplicated_size} rows.")
# Convert back to a Hugging Face dataset.
final_dataset = Dataset.from_pandas(df.iloc[:,0].to_frame())

print("Shuffling the final dataset...")
final_dataset = final_dataset.shuffle(seed=42)

# Print a sample for debugging
print(f"Sample from combined dataset: {final_dataset[:5][TEXT_COLUMN_NAME]}")

Removing duplicate rows...


Removed 31176 duplicates. Final size: 802659 rows.
Shuffling the final dataset...
Sample from combined dataset: ['و هيا هوما يقولولها أه درا شنيا و هيا تقولله نحلبوه', 'لم أصدق حتى استمعت وبإمعان لخطاب السيد عبو على الجزيرة | tadwinet\nالكاتب نُشرت في 2018-09-28 2018-10-19 التصنيفات تدوينات تونسية\nالسابق المقالة السابقة: لماذا انهار حزب النّداء ؟\nالتالي المقالة التالية: يوم إغتال الباجي الأب السبسي الرئيس', 'قلتلها راهو هكة قلتلها راني توا مشنوقة', "I'm going to copy what I put in another comment:", 'أه لا يضحك الصباح']


In [47]:
df.iloc[:,0].to_frame()

Unnamed: 0,text
0,الأمازيغ يحتفلون اليوم بقدوم سنة 2969 - Tounes...
1,حتي ملحدين الحاسيلو م الاخر اختكم بنت عايلة ال...
2,المظيلة .. تعرض عون بشركة فسفاط قفصة الى حادث ...
3,قلع التيفاف توى الله غالب تعدى على راس خوكم سع...
4,إذاعة قفصة | كاس رابطة ابطال افريقيا المجموعة ...
...,...
833824,ga3ed ; قاعد
833826,zanga ; زنقة
833827,marfoud ; مرفود
833832,met3addi ; متعدّي


In [49]:
from huggingface_hub import login

# Authenticate
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [50]:
final_dataset.push_to_hub("hamzabouajila/tunisian-derja-unified-raw-corpus", private=False)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/402 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 2.63MB /  173MB            

Creating parquet from Arrow format:   0%|          | 0/402 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 2.63MB /  172MB            

CommitInfo(commit_url='https://huggingface.co/datasets/hamzabouajila/tunisian-derja-unified-raw-corpus/commit/6966a3ab422e9c4bceaa9b101f62483206c74ddd', commit_message='Upload dataset', commit_description='', oid='6966a3ab422e9c4bceaa9b101f62483206c74ddd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/hamzabouajila/tunisian-derja-unified-raw-corpus', endpoint='https://huggingface.co', repo_type='dataset', repo_id='hamzabouajila/tunisian-derja-unified-raw-corpus'), pr_revision=None, pr_num=None)