<a href="https://colab.research.google.com/github/Nde-Dilan/no_name_for_now_dataset/blob/main/Fulfulde_Dictionary_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install the dependencies

In [None]:
!pip install pandas deepl

# 2. Import and configure logging

In [None]:
import pandas as pd
import re
import os
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
import logging

# Set up logging: every record goes both to a log file and to the stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so Fulfulde/French characters in log messages do not
        # depend on the platform's default file encoding.
        logging.FileHandler("fulfulde_extraction.log", encoding="utf-8"),
        logging.StreamHandler()
    ],
    # basicConfig() does nothing when the root logger already has handlers
    # (which can happen in hosted notebook environments or when this cell is
    # re-run); force=True (Python 3.8+) replaces them so this setup applies.
    force=True,
)


# 3. Function to download the CSV data from Hugging Face

In [None]:
def download_csv(url, timeout=30):
    """Download CSV data from a URL and return it as a string.

    Args:
        url: Location of the CSV file; anything ``urlopen`` accepts.
        timeout: Seconds to wait on blocking network operations. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        URLError: If the request fails (``HTTPError`` is a subclass).
        OSError: For other network-level failures, such as a socket timeout
            while reading the body.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    logging.info(f"Downloading data from {url}")
    try:
        with urlopen(url, timeout=timeout) as response:
            csv_data = response.read().decode('utf-8')
    except OSError as e:
        # URLError, HTTPError and socket timeouts are all OSError subclasses,
        # so every network failure is logged before being re-raised unchanged.
        logging.error(f"Failed to download data: {e}")
        raise
    logging.info("Download successful")
    return csv_data

# 4. Extraction of the translations

Once we have the CSV data, we can drop the columns we don't need, such as the path and the id.

In [None]:

def extract_translations(csv_data):
    """Parse the raw semicolon-delimited CSV text into a DataFrame.

    Every line is split on ``;``. Fields 0, 1 and 3 are kept as the ``id``,
    ``fulfulde`` and ``filename`` columns; lines with fewer than four fields
    are skipped. The French translation is not extracted here, it is derived
    later from ``filename``.

    Side effects:
        Creates the ``output`` directory if needed and writes the parsed rows
        to ``output/1_raw_parsed_data.xlsx``.

    Args:
        csv_data: The whole CSV file content as one string.

    Returns:
        A DataFrame with the columns ``id``, ``fulfulde`` and ``filename``
        (present even when no line could be parsed).

    Raises:
        Exception: Any parsing or file-writing error is logged and re-raised.
    """
    logging.info("Parsing CSV data")
    columns = ['id', 'fulfulde', 'filename']

    try:
        data = []
        # Split manually instead of using a CSV reader: only three positional
        # fields are needed from each line.
        for line in csv_data.strip().split('\n'):
            # Drop the '\r' left behind by Windows line endings so it does not
            # end up glued to the last field.
            parts = line.rstrip('\r').split(';')
            # The filename is read from index 3, so four fields are required;
            # shorter lines are skipped instead of raising IndexError.
            if len(parts) >= 4:
                data.append({
                    'id': parts[0],
                    'fulfulde': parts[1],
                    'filename': parts[3],
                })

        # Passing the column names keeps the schema stable for an empty result.
        df = pd.DataFrame(data, columns=columns)
        logging.info(f"Successfully parsed CSV with {len(df)} entries")

        # Create output directory if it doesn't exist
        os.makedirs('output', exist_ok=True)

        # Save raw parsed data
        df.to_excel('output/1_raw_parsed_data.xlsx', index=False)
        logging.info("Saved raw parsed data to output/1_raw_parsed_data.xlsx")

        return df
    except Exception as e:
        logging.error(f"Error parsing CSV data: {e}")
        raise

# 5. Cleaning the data and data transformation

Remove the special characters where we don't need them, derive the translation from the name of the audio file, and build the French corpus aligned with the Fulfulde one.

In [None]:

def clean_and_transform(df):
    """Derive the French word for each row from its audio file name.

    A file name is expected to contain ``<digits>-<word>.<audio extension>``;
    the ``<word>`` part becomes the French translation, with underscores
    turned into spaces. Rows whose file name does not match, or that have any
    other missing value, are dropped from the final result.

    Side effects:
        * Adds a ``french`` column to the ``df`` passed in.
        * Writes ``output/2_extracted_translations.xlsx`` (before clean-up),
          ``output/3_final_fulfulde_french_translations.xlsx`` and
          ``output/fulfulde_french_translations.csv`` (after clean-up).

    Args:
        df: DataFrame with at least the ``fulfulde`` and ``filename`` columns.

    Returns:
        A DataFrame with the columns ``fulfulde`` and ``french``.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    logging.info("Cleaning and transforming data")

    # number-FrenchWord.extension
    filename_pattern = re.compile(r'(\d+)-([^.]+)\.(aac|wav|mp3|ogg|flac)')

    def _find(name):
        return filename_pattern.search(name)

    def _word_of(found):
        if found:
            return found.group(2)
        return None

    def _underscores_to_spaces(value):
        # Non-strings (the None left by unmatched file names) pass through.
        if isinstance(value, str):
            return value.replace('_', ' ')
        return value

    try:
        # Step 1: pull the French word out of every file name.
        matches = df['filename'].apply(_find)
        df['french'] = matches.apply(_word_of)

        # Snapshot of the extraction, before any clean-up.
        pairs = df[['fulfulde', 'french']].copy()
        pairs.to_excel('output/2_extracted_translations.xlsx', index=False)
        logging.info("Saved extracted translations to output/2_extracted_translations.xlsx")

        # Step 2: tidy the French words.
        pairs['french'] = pairs['french'].apply(_underscores_to_spaces)

        # Step 3: discard incomplete rows.
        complete = pairs.dropna()
        logging.info(f"After cleaning, {len(complete)} entries remain")

        # Step 4: persist the final table, as a workbook and as CSV.
        complete.to_excel('output/3_final_fulfulde_french_translations.xlsx', index=False)
        logging.info("Saved final cleaned data to output/3_final_fulfulde_french_translations.xlsx")

        complete.to_csv('output/fulfulde_french_translations.csv', index=False)
        logging.info("Saved final data as CSV to output/fulfulde_french_translations.csv")

        return complete
    except Exception as e:
        logging.error(f"Error cleaning and transforming data: {e}")
        raise


# 6. Main Entry Point

Now we put all the pieces together to form the pipeline that will collect the data for us.

In [None]:
def main():
    """Run the whole extraction: download the CSV, parse it, then clean it.

    Returns:
        The cleaned Fulfulde-French DataFrame, or None when any step raised
        (the error is logged instead of propagated).
    """
    source_url = "https://huggingface.co/datasets/BATBAINA/fulfulde-agri-asr/raw/main/Fulfulde-agri-asr.csv"

    try:
        raw_text = download_csv(source_url)
        translations = clean_and_transform(extract_translations(raw_text))

        # Show a short preview of the result in the log.
        logging.info("\nSample of final translations:")
        logging.info(translations.head(10))

        logging.info(f"\nExtraction complete. Total of {len(translations)} Fulfulde-French translations extracted.")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        return None
    return translations

# Calling the main function and kicking off the workflow

In [None]:
# Run the full download -> parse -> clean pipeline; main() returns the cleaned
# DataFrame, or None if any step failed.
main()

# 7. Translating French words into English

In [None]:
import deepl
import time
import pandas as pd

# DeepL API key, read from the DEEPL_API_KEY environment variable so the secret
# is never stored in the notebook. Get a key from https://www.deepl.com/pro-api
# and set it before running this cell, e.g.
#     os.environ["DEEPL_API_KEY"] = "<your key>"
# NOTE: a real key used to be hard-coded on this line; treat it as leaked and
# revoke it in the DeepL account.
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY", "")

# Build the DeepL client
def get_translator(api_key):
    """Create a DeepL translator for ``api_key``.

    Returns:
        The ``deepl.Translator`` instance, or None when it could not be
        created (the error is logged).
    """
    try:
        translator = deepl.Translator(api_key)
    except Exception as exc:
        logging.error(f"Failed to initialize DeepL translator: {exc}")
        return None
    return translator

def translate_french_to_english(french_text, translator, retries=3, retry_delay=1):
    """Translate a French sentence into English using DeepL with retries.

    Args:
        french_text: Text to translate. Empty or non-string values (for
            example the NaN pandas uses for a missing cell) are not sent.
        translator: Object exposing ``translate_text(text, source_lang=...,
            target_lang=...)`` whose result has a ``text`` attribute.
        retries: Maximum number of attempts.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The English text; ``""`` for empty or non-string input; the original
        French text when every attempt failed (best effort, nothing raised).
    """
    if not french_text or not isinstance(french_text, str):
        return ""

    for attempt in range(1, retries + 1):
        try:
            result = translator.translate_text(french_text, source_lang="FR", target_lang="EN-US")
            return result.text  # Extract translated text
        except Exception as e:
            logging.warning(f"Attempt {attempt}: Error translating '{french_text}': {e}")
            # Only pause when another attempt follows; sleeping after the last
            # failure would just slow down the caller.
            if attempt < retries:
                time.sleep(retry_delay)
    return french_text  # Return original if translation fails

def add_english_translations(input_file='output/fulfulde_french_translations.csv',
                            output_file='output/fulfulde_french_english_translations.xlsx',
                            batch_size=25,
                            csv_output_file='output/fulfulde_french_english_translations.csv'):
    """
    Add English translations to the Fulfulde-French dataset.

    Rows are translated one API call at a time, grouped into batches: after
    each batch the progress is saved to ``output_file`` and the function
    pauses for two seconds to respect API rate limits.

    Args:
        input_file: CSV file with a ``french`` column to translate.
        output_file: Excel file receiving the progress and the final result.
        batch_size: Number of rows per batch; must be at least 1.
        csv_output_file: CSV file receiving a copy of the final result.

    Returns:
        The DataFrame with an added ``english`` column, or None when the
        translator could not be created or any error occurred (logged).
    """
    logging.info("Starting translation process from French to English")

    try:
        # Reject a useless batch size before any file or API work is done.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Read the CSV file with Fulfulde and French translations
        df = pd.read_csv(input_file)
        logging.info(f"Loaded {len(df)} entries for translation")

        # Initialize the translator
        translator = get_translator(DEEPL_API_KEY)
        if translator is None:
            logging.error("Cannot proceed with translations: Translator initialization failed")
            return None

        # Create the directories that are actually written to. Doing this up
        # front means a missing folder cannot fail the first save after API
        # calls have already been spent.
        for target in (output_file, csv_output_file):
            if isinstance(target, (str, os.PathLike)):
                parent = os.path.dirname(target)
                if parent:
                    os.makedirs(parent, exist_ok=True)

        # Initialize English column
        df['english'] = ""

        # Process in batches to avoid API rate limits
        total_rows = len(df)
        for start in range(0, total_rows, batch_size):
            batch_end = min(start + batch_size, total_rows)
            batch_number = start // batch_size + 1
            logging.info(f"Processing batch {batch_number}: rows {start} to {batch_end}")

            # read_csv gives a 0..n-1 index, so positions double as labels.
            for idx in range(start, batch_end):
                french_text = df.at[idx, 'french']
                df.at[idx, 'english'] = translate_french_to_english(french_text, translator)

            # Save intermediate results after each batch
            df.to_excel(output_file, index=False)
            logging.info(f"Saved progress after batch {batch_number}")

            # Add a short delay between batches to respect API rate limits
            if batch_end < total_rows:
                time.sleep(2)

        # Save final results (also covers an input file without any rows)
        df.to_excel(output_file, index=False)
        df.to_csv(csv_output_file, index=False)
        logging.info(f"Translation complete. Output saved to {output_file}")

        return df

    except Exception as e:
        logging.error(f"Error during translation process: {e}")
        return None