In [None]:
import pandas as pd
from pathlib import Path
from googletrans import Translator
import time
import asyncio
import nest_asyncio
from tqdm.auto import tqdm

# Apply nest_asyncio to allow asyncio.run in Jupyter
nest_asyncio.apply()

# ---------- CONFIG ----------
lexicon_csv = Path.cwd() / 'marathi_lexicon_correct_pos.csv'  # input lexicon (must have a 'marathi_word' column)
output_csv = Path.cwd() / 'marathi_sentiwordnet_google.csv'  # lexicon + 'english_word' column is written here
batch_size = 100  # small batch to avoid API limits
start_from_batch = 0  # zero-based batch index to start from; raise it to resume a previous run

# ---------- LOAD LEXICON ----------
df = pd.read_csv(lexicon_csv)
print(f"Loaded {len(df)} Marathi words")

# ---------- LOAD EXISTING TRANSLATIONS ----------
# If the output file exists, reuse its 'english_word' column so translations
# from earlier runs are preserved. Rows are matched to the lexicon purely by
# position.
english_words = []
if output_csv.exists():
    # keep_default_na=False: with the default NA parsing, translations such as
    # "null", "nan" or "n/a" would be read back as float NaN and lost, and
    # untranslated rows would be NaN instead of the '' placeholder used below.
    saved_df = pd.read_csv(output_csv, keep_default_na=False)
    if 'english_word' in saved_df.columns:
        english_words = saved_df['english_word'].astype(str).tolist()
        print(f"Loaded {len(english_words)} existing translations")

# Ensure english_words has exactly one entry per lexicon row; assigning a list
# of any other length as a DataFrame column raises ValueError.
if len(english_words) < len(df):
    english_words.extend([''] * (len(df) - len(english_words)))
elif len(english_words) > len(df):
    print(
        f"Warning: {output_csv.name} has {len(english_words)} rows but the lexicon "
        f"has {len(df)}; ignoring the extra rows"
    )
    del english_words[len(df):]

# ---------- TRANSLATE ----------
translator = Translator()

# Synchronous wrapper around Translator.translate that also copes with
# googletrans releases where translate() is a coroutine function.
def translate_word(word, src='mr', dest='en', retries=3, retry_delay=1.0):
    """Translate a single word and return the lower-cased translation.

    Args:
        word: Text to translate.
        src: Source language code passed to ``Translator.translate``.
        dest: Destination language code passed to ``Translator.translate``.
        retries: Maximum number of attempts; values below 1 are treated as 1.
        retry_delay: Base pause in seconds between attempts; the pause grows
            linearly with the attempt number.

    Returns:
        The translated text in lower case, or ``word`` unchanged when every
        attempt failed (the last error is printed in that case).
    """
    attempts = max(1, retries)
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            # Create a new translator instance for each call to avoid issues
            # with state carried over from a previous (possibly failed) request
            result = Translator().translate(word, src=src, dest=dest)
            # If translate() handed back a coroutine instead of a result, run
            # it to completion; otherwise `.text` below would fail on the
            # coroutine object and every word would fall through to the fallback.
            if asyncio.iscoroutine(result):
                result = asyncio.run(result)
            return result.text.lower()
        except Exception as e:
            # Best-effort by design: timeouts and SSL hiccups are transient,
            # so remember the error and try again instead of giving up at once.
            last_error = e
            if attempt < attempts:
                time.sleep(retry_delay * attempt)

    print(f"Error translating '{word}': {last_error}")
    return word  # fallback to original word

# Process batches
def _save_progress(frame, destination):
    """Write ``frame`` as CSV to ``destination`` via a temporary sibling file.

    The data is written to ``<name>.tmp`` first and then moved over
    ``destination`` with ``Path.replace``, so interrupting the write damages
    only the temporary file and never the previously saved progress.
    """
    scratch = destination.with_name(destination.name + '.tmp')
    frame.to_csv(scratch, index=False, encoding='utf-8')
    scratch.replace(destination)


def translate_all():
    """Translate the lexicon batch by batch, checkpointing after every batch.

    Reads the module-level ``df``, ``batch_size``, ``start_from_batch`` and
    ``output_csv``; fills the module-level ``english_words`` list in place.
    Processing starts at row ``start_from_batch * batch_size`` and every word
    from there to the end of ``df`` is passed to ``translate_word``.
    After each batch the full lexicon plus the current ``english_word`` column
    is written to ``output_csv``.
    """
    total_batches = (len(df) - 1) // batch_size + 1
    start_index = start_from_batch * batch_size

    print(f"Starting from batch {start_from_batch} (index {start_index}) out of {total_batches} total batches")

    for start in range(start_index, len(df), batch_size):
        end = min(start + batch_size, len(df))
        batch = df['marathi_word'].iloc[start:end].astype(str).tolist()
        batch_num = start // batch_size + 1  # 1-based, for display only

        print(f"Processing batch {batch_num} / {total_batches} (indices {start} to {end-1})")

        # Process words one by one (to maintain order and handle errors individually)
        for position, word in enumerate(batch, start=start):
            english_words[position] = translate_word(word)
            time.sleep(0.1)  # delay to avoid throttling

        # Save progress after each batch without touching df itself
        _save_progress(df.assign(english_word=english_words), output_csv)

        print(f"Completed batch {batch_num} / {total_batches}")

# Kick off the batch translation; translate_all() is a plain blocking call
translate_all()

# Final save: attach the collected translations to the lexicon and write it out
df['english_word'] = english_words
df.to_csv(output_csv, encoding='utf-8', index=False)
saved_message = f"✅ Saved translations to: {output_csv}"
print(saved_message)

Loaded 52325 Marathi words
Loaded 42998 existing translations
Starting from batch 430 (index 43000) out of 524 total batches
Processing batch 431 / 524 (indices 43000 to 43099)
Completed batch 431 / 524
Processing batch 432 / 524 (indices 43100 to 43199)
Completed batch 432 / 524
Processing batch 433 / 524 (indices 43200 to 43299)
Error translating 'जेस्सोर': The read operation timed out
Completed batch 433 / 524
Processing batch 434 / 524 (indices 43300 to 43399)
Completed batch 434 / 524
Processing batch 435 / 524 (indices 43400 to 43499)
Error translating 'जरतार': [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:1016)
Completed batch 435 / 524
Processing batch 436 / 524 (indices 43500 to 43599)
Error translating 'अरण्यसेवन': [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:1016)
Completed batch 436 / 524
Processing batch 437 / 524 (indices 43600 to 43699)
Error translating 'सुवर्णरेतस': [SSL: DECRYPTION_FAIL