# English to Arabic Dialect Translation

This notebook translates English comments from a polarization dataset into Arabic. Each comment is assigned one of five Arabic dialects by weighted random selection and is then translated into that dialect.

## 1. Import Required Libraries

In [1]:
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
from dotenv import load_dotenv
import os

## 2. Define Arabic Dialects with Weights

In [2]:
# Catalogue of target dialects keyed by language code. Every entry holds the
# display name and the fraction of rows that dialect should receive.
dialects = {
    code: {'name': name, 'weight': weight}
    for code, name, weight in (
        ('ar', 'Standard Arabic', 0.3),
        ('arz', 'Egyptian Arabic', 0.225),
        ('apc', 'North Levantine Arabic', 0.225),
        ('afb', 'Gulf Arabic', 0.15),
        ('acw', 'Hejazi Arabic', 0.1),
    )
}

# Parallel lists (same order as `dialects`) used for the weighted random draw
dialect_codes = [*dialects]
dialect_weights = [entry['weight'] for entry in dialects.values()]

# Print the configured distribution as a small table
print("Arabic Dialect Distribution:")
print("=" * 50)
for code, info in zip(dialect_codes, dialects.values()):
    print(f"{info['name']:30} ({code}): {info['weight']*100:5.1f}%")
print("=" * 50)
print(f"Total: {sum(dialect_weights)*100:.1f}%")

Arabic Dialect Distribution:
Standard Arabic                (ar):  30.0%
Egyptian Arabic                (arz):  22.5%
North Levantine Arabic         (apc):  22.5%
Gulf Arabic                    (afb):  15.0%
Hejazi Arabic                  (acw):  10.0%
Total: 100.0%


## 3. Load English Data

In [3]:
# Read the English source comments into a DataFrame
df = pd.read_csv('eng.csv')

# Quick sanity summary (row count, column names), followed by a preview
print(
    f"Dataset loaded: {df.shape[0]} rows",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

Dataset loaded: 2676 rows

Columns: ['id', 'text', 'polarization']

First few rows:


Unnamed: 0,id,text,polarization
0,eng_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0
1,eng_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0
2,eng_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0
3,eng_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0
4,eng_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0


## 4. Assign Random Dialects to Each Row

In [4]:
# Draw one dialect code per row, using dialect_weights as the probabilities
np.random.seed(42)  # fixed seed so the assignment is repeatable
df['dialect_code'] = np.random.choice(dialect_codes, size=len(df), p=dialect_weights)
# Look up the human-readable name that belongs to each drawn code
df['dialect_name'] = [dialects[code]['name'] for code in df['dialect_code']]

# Report how many rows each dialect actually received
total_rows = len(df)
print("Assigned Dialect Distribution:")
print("=" * 50)
dialect_counts = df['dialect_name'].value_counts()
for dialect_name, count in dialect_counts.items():
    percentage = (count / total_rows) * 100
    print(f"{dialect_name:30}: {count:5} ({percentage:5.1f}%)")
print("=" * 50)
print(f"Total: {total_rows} rows")

Assigned Dialect Distribution:
Standard Arabic               :   815 ( 30.5%)
North Levantine Arabic        :   608 ( 22.7%)
Egyptian Arabic               :   584 ( 21.8%)
Gulf Arabic                   :   410 ( 15.3%)
Hejazi Arabic                 :   259 (  9.7%)
Total: 2676 rows


## 5. Define Translation Function

In [5]:
# API configuration
# Load variables from a local .env file into the process environment first,
# so the lookup below can see them.
load_dotenv()

API_URL = "https://openl-translate.p.rapidapi.com/translate/bulk"
API_KEY = os.environ.get("API_KEY")  # None when the variable is not set

def translate_batch(texts, target_lang, max_retries=3, retry_delay=2):
    """
    Translate a batch of texts to the specified target language with retry logic.

    A failed attempt is any of: a non-200 status, a body that is not valid
    JSON, a response whose 'translatedTexts' entry is missing, not a list, or
    of the wrong length, a timeout, or any other request error. Each failure
    is logged and, while attempts remain, retried after ``retry_delay`` seconds.

    Args:
        texts: List of text strings to translate
        target_lang: Target language code
        max_retries: Maximum number of attempts
        retry_delay: Delay between attempts in seconds

    Returns:
        List of translated texts (one per input text), an empty list when
        ``texts`` is empty, or None if the API key is missing or every
        attempt failed
    """
    # Nothing to translate: answer locally instead of spending an API call.
    if len(texts) == 0:
        return []

    # Without a key the API is expected to reject every request, so fail fast
    # rather than burning all retries on a configuration problem.
    if not API_KEY:
        print("API_KEY is not set; define it in the environment or in a .env file")
        return None

    payload = {
        "target_lang": target_lang,
        "text": texts
    }

    headers = {
        "x-rapidapi-key": API_KEY,
        "x-rapidapi-host": "openl-translate.p.rapidapi.com",
        "Content-Type": "application/json"
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(API_URL, json=payload, headers=headers, timeout=30)

            if response.status_code == 200:
                try:
                    result = response.json()
                except ValueError as json_err:
                    print(f"JSON decode error: {json_err}")
                    print(f"Response text: {response.text[:200]}")
                else:
                    translated = result.get('translatedTexts') if isinstance(result, dict) else None
                    # Require a real list with exactly one entry per input so
                    # callers can pair results with inputs positionally.
                    if isinstance(translated, list) and len(translated) == len(texts):
                        return translated
                    got = len(translated) if isinstance(translated, list) else 0
                    print(f"Warning: Response missing translations. Got {got}, expected {len(texts)}")
            else:
                print(f"HTTP {response.status_code}: {response.text[:200]}")

        except requests.exceptions.Timeout:
            print(f"Request timeout (attempt {attempt + 1}/{max_retries})")
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
        except Exception as e:
            print(f"Unexpected error: {type(e).__name__}: {e}")

        # Single back-off point shared by every failure path above.
        if attempt < max_retries - 1:
            print(f"Retrying in {retry_delay} seconds... (Attempt {attempt + 2}/{max_retries})")
            time.sleep(retry_delay)

    print(f"‚ùå Failed after {max_retries} attempts")
    return None

# Smoke test: send two short sentences through the API with target code "ar"
test_texts = ["Hello, how are you?", "The weather is nice today."]
test_result = translate_batch(test_texts, "ar")
print("Test translation:", test_result, sep="\n")

Test translation:
['مرحبًا، كيف حالك؟', 'الطقس لطيف اليوم.']


## 6. Translate Texts in Batches

In [6]:
def translate_dataframe(df, batch_size=50, delay=2, checkpoint_file='translation_checkpoint.csv',
                        final_file='final_translated.csv'):
    """
    Translate all texts in the dataframe to their assigned dialects with checkpoint saving.

    Rows are processed dialect by dialect (in the order of the module-level
    ``dialect_codes``) so that every API call carries a single target
    language. Rows that already hold a value in 'translated_text' are skipped,
    so passing a previous result back in only retries the rows that failed.

    Args:
        df: DataFrame with 'text' and 'dialect_code' columns; it is copied,
            not modified. Its index labels are used to write results back.
        batch_size: Number of texts to translate in one API call (must be >= 1)
        delay: Delay in seconds between API calls to avoid rate limiting
        checkpoint_file: File to save progress to whenever a batch fails; the
            per-dialect checkpoints reuse its name with '_<dialect code>'
            inserted before the extension
        final_file: File the complete result is written to at the end

    Returns:
        DataFrame with added 'translated_text' column (missing where the
        translation failed)

    Raises:
        ValueError: If batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    df = df.copy()

    # Initialize translated_text column if it doesn't exist
    if 'translated_text' not in df.columns:
        df['translated_text'] = None

    # Per-dialect checkpoint names are "<stem>_<code><ext>", e.g.
    # translation_checkpoint_ar.csv (not translation_checkpoint.csv_ar.csv).
    checkpoint_stem, checkpoint_ext = os.path.splitext(checkpoint_file)
    checkpoint_ext = checkpoint_ext or '.csv'

    total_translated = 0
    total_failed = 0

    # Group by dialect to minimize API calls
    for dialect_idx, dialect_code in enumerate(dialect_codes):
        dialect_name = dialects[dialect_code]['name']

        # Rows of this dialect that still lack a translation
        pending_mask = (df['dialect_code'] == dialect_code) & df['translated_text'].isna()
        dialect_df = df[pending_mask]

        if len(dialect_df) == 0:
            continue

        print(f"\n[{dialect_idx+1}/{len(dialect_codes)}] Translating {len(dialect_df)} texts to {dialect_name} ({dialect_code})...")

        # Keep the original index labels so results land on the right rows
        indices = dialect_df.index.tolist()
        texts = dialect_df['text'].tolist()

        dialect_translated = 0
        dialect_failed = 0

        # Process in batches
        for i in tqdm(range(0, len(texts), batch_size), desc=f"{dialect_name[:20]}"):
            batch_indices = indices[i:i+batch_size]
            batch_texts = texts[i:i+batch_size]

            translated = translate_batch(batch_texts, dialect_code, max_retries=3, retry_delay=2)

            if translated and len(translated) == len(batch_texts):
                for idx, trans_text in zip(batch_indices, translated):
                    df.at[idx, 'translated_text'] = trans_text
                dialect_translated += len(translated)
                total_translated += len(translated)
            else:
                # The reported range is a position within this dialect's
                # pending rows, not a DataFrame index label.
                print(f"\n‚ö†Ô∏è  Failed to translate batch {i//batch_size + 1} (indices {i} to {i+len(batch_texts)-1})")
                dialect_failed += len(batch_texts)
                total_failed += len(batch_texts)

                # Save checkpoint on failure
                df.to_csv(checkpoint_file, index=False, encoding='utf-8-sig')
                print(f"üíæ Checkpoint saved to {checkpoint_file}")

            # Delay to avoid rate limiting (skipped after the last batch)
            if i + batch_size < len(texts):
                time.sleep(delay)

        # Save checkpoint after completing each dialect
        df.to_csv(f"{checkpoint_stem}_{dialect_code}{checkpoint_ext}", index=False, encoding='utf-8-sig')
        print(f"‚úÖ {dialects[dialect_code]['name']}: {dialect_translated} succeeded, {dialect_failed} failed")
        print(f"üíæ Checkpoint saved ({total_translated} total translated so far)")

    print(f"\n{'='*60}")
    print(f"Overall: {total_translated} succeeded, {total_failed} failed")
    print(f"{'='*60}")

    # Save final checkpoint
    df.to_csv(final_file, index=False, encoding='utf-8-sig')
    print(f"üíæ Final checkpoint saved to {final_file}")

    return df

# Note: the translation itself is started in section 7 below, not in this cell.
# It may take a while depending on the dataset size and API limits.
# Example call: df_translated = translate_dataframe(df, batch_size=50, delay=2)

## 7. Run Translation (Execute this cell to start)

In [8]:
# Kick off the full translation run.
# batch_size and delay should be tuned to the API's rate limits.
df_translated = translate_dataframe(df, batch_size=3, delay=0.5)

# Rows whose translated_text is still missing count as failures
failed_count = df_translated['translated_text'].isna().sum()
total_rows = len(df_translated)
banner = '=' * 50
print(f"\n{banner}")
print("Translation Complete!")
print(f"Total rows: {total_rows}")
print(f"Successfully translated: {total_rows - failed_count}")
print(f"Failed: {failed_count}")
print(banner)


[1/5] Translating 815 texts to Standard Arabic (ar)...


Standard Arabic: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 272/272 [37:48<00:00,  8.34s/it]


‚úÖ Standard Arabic: 815 succeeded, 0 failed
üíæ Checkpoint saved (815 total translated so far)

[2/5] Translating 584 texts to Egyptian Arabic (arz)...


Egyptian Arabic:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 86/195 [11:52<14:40,  8.08s/it]

HTTP 502: {"messages":"The API is unreachable, please contact the API provider", "info": "Your Client (working) ---> Gateway (working) ---> API (not working)"}
Retrying in 2 seconds... (Attempt 2/3)
HTTP 502: {"messages":"The API is unreachable, please contact the API provider", "info": "Your Client (working) ---> Gateway (working) ---> API (not working)"}
Retrying in 2 seconds... (Attempt 3/3)
Request timeout (attempt 3/3)
‚ùå Failed after 3 attempts

‚ö†Ô∏è  Failed to translate batch 87 (indices 258 to 260)
üíæ Checkpoint saved to translation_checkpoint.csv


Egyptian Arabic: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 195/195 [29:27<00:00,  9.06s/it]


‚úÖ Egyptian Arabic: 581 succeeded, 3 failed
üíæ Checkpoint saved (1396 total translated so far)

[3/5] Translating 608 texts to North Levantine Arabic (apc)...


North Levantine Arab: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 203/203 [28:07<00:00,  8.31s/it]


‚úÖ North Levantine Arabic: 608 succeeded, 0 failed
üíæ Checkpoint saved (2004 total translated so far)

[4/5] Translating 410 texts to Gulf Arabic (afb)...


Gulf Arabic: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 137/137 [20:30<00:00,  8.98s/it]


‚úÖ Gulf Arabic: 410 succeeded, 0 failed
üíæ Checkpoint saved (2414 total translated so far)

[5/5] Translating 259 texts to Hejazi Arabic (acw)...


Hejazi Arabic: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 87/87 [12:38<00:00,  8.72s/it]

‚úÖ Hejazi Arabic: 259 succeeded, 0 failed
üíæ Checkpoint saved (2673 total translated so far)

Overall: 2673 succeeded, 3 failed
üíæ Final checkpoint saved to final_translated.csv

Translation Complete!
Total rows: 2676
Successfully translated: 2673
Failed: 3





## 8. Preview Results

In [None]:
# Display sample translations
print("Sample Translations:")
print("=" * 100)

# Two sample rows for each of the first three dialects
for dialect_code in dialect_codes[:3]:
    sample = df_translated.loc[df_translated['dialect_code'] == dialect_code].head(2)
    if sample.empty:
        continue

    print(f"\n{dialects[dialect_code]['name']} ({dialect_code}):")
    print("-" * 100)

    for _, row in sample.iterrows():
        print(f"Original:  {row['text'][:80]}...")
        # Rows whose translation failed have no text to slice
        translation = row['translated_text']
        shown = translation[:80] if pd.notna(translation) else 'N/A'
        print(f"Translated: {shown}...")
        print()

# Show full dataframe structure
print("\nDataFrame columns:")
print(list(df_translated.columns))
print(f"\nShape: {df_translated.shape}")
df_translated.head()

## 9. Save Results to CSV

In [None]:
# Build the export frame in one pass: pick the columns in their final order
# (id, text, polarization, dialect_code, dialect_name) and expose the
# translation under the name 'text' for consistency.
export_columns = ['id', 'translated_text', 'polarization', 'dialect_code', 'dialect_name']
output_df = (
    df_translated[export_columns]
    .rename(columns={'translated_text': 'text'})
    .copy()
)

# Save to CSV
output_filename = 'arb_translated.csv'
output_df.to_csv(output_filename, index=False, encoding='utf-8-sig')

print(f"Results saved to: {output_filename}")
print(f"Total rows saved: {len(output_df)}")
print("\nFinal DataFrame structure:")
output_df.head(10)

## 10. Summary Statistics

In [None]:
# Final statistics over the exported frame
heavy_rule = "=" * 70
light_rule = "-" * 70
total_rows = len(output_df)

print(heavy_rule)
print("TRANSLATION SUMMARY")
print(heavy_rule)

# Rows per dialect name, in alphabetical order
print("\n1. Dialect Distribution in Output:")
print(light_rule)
dialect_dist = output_df['dialect_name'].value_counts().sort_index()
for dialect_name, count in dialect_dist.items():
    percentage = (count / total_rows) * 100
    print(f"{dialect_name:30}: {count:5} rows ({percentage:5.1f}%)")

# Rows per polarization label, ordered by label value
print("\n2. Polarization Distribution:")
print(light_rule)
pol_dist = output_df['polarization'].value_counts().sort_index()
for pol, count in pol_dist.items():
    percentage = (count / total_rows) * 100
    print(f"Polarization {pol:3}: {count:5} rows ({percentage:5.1f}%)")

# Rows whose translated text is missing
print("\n3. Missing Translations:")
print(light_rule)
missing = output_df['text'].isna().sum()
print(f"Missing translations: {missing} ({(missing/total_rows*100):.2f}%)")

# Rows per dialect code, ordered by code
print("\n4. Dialect Code Distribution:")
print(light_rule)
code_dist = output_df['dialect_code'].value_counts().sort_index()
for code, count in code_dist.items():
    print(f"{code}: {count} rows")

print(f"\n{heavy_rule}")
print(f"Total Rows: {total_rows}")
print(heavy_rule)

In [None]:
# Ad-hoc smoke test of the translation API: one single-text request per
# dialect code. `requests`, `os` and `load_dotenv` come from the import cell
# at the top of the notebook.

# API configuration. The key is read from the environment / .env file instead
# of being written into the notebook.
# NOTE(review): a literal RapidAPI key used to be committed in this cell; it
# should be treated as compromised and rotated.
load_dotenv()
API_KEY = os.getenv("API_KEY")
API_URL = "https://openl-translate.p.rapidapi.com/translate/bulk"
if not API_KEY:
    raise RuntimeError("API_KEY is not set; define it in the environment or in a .env file")

# Test texts - one for each dialect
test_data = [
    {"text": "Republicans want to defund public schools AND make public schools polling places? And invite bomb threats on schools? WTF. azmirror.combriefstena...", "dialect": "arz", "dialect_name": "Egyptian Arabic"},
    {"text": "Thats what I dont understand. When did we become the snowflake pussies that the Republicans have been calling us? Where is the spine in our elected members of Congress and the military???", "dialect": "apc", "dialect_name": "North Levantine Arabic"},
    {"text": "The Felon Trump and his chief minion Musk should be dragged out of the white house in handcuffs. A stolen election which Trump admitted to was not enough? Hes now using Putins playbook in making us less safe and less financially stable! trumpmusk treason", "dialect": "afb", "dialect_name": "Gulf Arabic"},
    {"text": "The only thing we get from them is oil and election interference. They have the GDP of Iowa. No loss to us. Buh bye.", "dialect": "ayn", "dialect_name": "Yemeni Arabic"}
]

# The headers are the same for every request, so build them once.
headers = {
    "x-rapidapi-key": API_KEY,
    "x-rapidapi-host": "openl-translate.p.rapidapi.com",
    "Content-Type": "application/json"
}

print(f"Testing translation API with {len(test_data)} different dialects...")

for item in test_data:
    print(f"\nTranslating to {item['dialect_name']} ({item['dialect']}):")
    print(f"Original: {item['text']}")

    # Prepare API request
    payload = {
        "target_lang": item['dialect'],
        "text": [item['text']]
    }

    try:
        # A timeout keeps an unresponsive API from hanging the cell forever.
        response = requests.post(API_URL, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        result = response.json()

        if 'translatedTexts' in result and len(result['translatedTexts']) > 0:
            translated = result['translatedTexts'][0]
            print(f"Translated: {translated}")
            print("‚úì Success")
        else:
            print(f"‚úó No translation returned")
            print(f"Response: {result}")

    except Exception as e:
        print(f"‚úó Error: {e}")
        # requests exceptions raised before any reply arrives carry
        # response=None, so check the value, not just the attribute.
        error_response = getattr(e, 'response', None)
        if error_response is not None:
            print(f"Response: {error_response.text}")

    print("-"*70)