In [3]:
import torch
# Report CUDA availability, then the name of device 0 (or "No GPU").
# The availability flag is queried once and reused for both lines.
cuda_ready = torch.cuda.is_available()
print("GPU Available:", cuda_ready)
gpu_label = torch.cuda.get_device_name(0) if cuda_ready else "No GPU"
print("GPU Name:", gpu_label)

GPU Available: True
GPU Name: Tesla T4


In [4]:
import pandas as pd
from transformers import pipeline
import time

In [5]:
# Step 1: load the English -> Spanish translation pipeline.
# Use the first CUDA device when one is available; otherwise fall back to the
# CPU (device=-1) so this cell still runs on a machine without a GPU instead
# of failing on a hard-coded device index.
device_index = 0 if torch.cuda.is_available() else -1
device_label = "GPU" if device_index == 0 else "CPU"
print(f"🔄 Loading Helsinki translation model on {device_label}...")
translator = pipeline("translation",
                     model="Helsinki-NLP/opus-mt-en-es",
                     device=device_index)  # 0 = first GPU, -1 = CPU

🔄 Loading Helsinki translation model on GPU...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [6]:
# Confirm the load and report the device the model weights are actually on,
# rather than asserting "GPU" unconditionally.
model_device = translator.model.device
device_label = "GPU" if model_device.type == "cuda" else model_device.type.upper()
print(f"✅ Model loaded successfully on {device_label}!")

✅ Model loaded successfully on GPU!


In [7]:
# Step 2: Helper that translates a single English value to Spanish
def translate_to_spanish(text):
    """Translate one English text value to Spanish.

    Parameters:
    - text: Value to translate. Non-string values are converted with str().

    Returns:
    - "" when text is null (per pd.isna) or blank after stripping whitespace
    - the 'translation_text' of the first pipeline result on success
    - "[Translation Error]" if anything raises while translating
    """
    try:
        # Null check comes first: str(NaN) would be the literal text "nan".
        if pd.isna(text):
            return ""

        # Convert before calling .strip(); numbers and other non-string values
        # have no .strip() method and the translator only accepts strings.
        source_text = str(text)
        if source_text.strip() == "":
            return ""

        # The module-level `translator` pipeline returns a list with one dict
        # per input; the Spanish text is under 'translation_text'.
        result = translator(source_text)
        return result[0]['translation_text']
    except Exception as e:
        # str(text) keeps this handler from raising on non-string input,
        # which would otherwise mask the original error and crash the caller.
        print(f"Error translating: {str(text)[:50]}... -> {e}")
        return "[Translation Error]"

In [8]:
# Step 3: Process CSV file - translate one column and save only selected columns
def _needs_translation(value):
    """Return True when value is neither null (pd.isna) nor blank text."""
    return not pd.isna(value) and str(value).strip() != ""


def translate_csv(csv_file_path, english_column_name, output_file_name,
                  batch_size=50, columns_to_save=None, pipeline_batch_size=8):
    """
    Translate CSV file from English to Spanish and save only selected columns

    Parameters:
    - csv_file_path: Your CSV file path (string)
    - english_column_name: Column name containing English text (string)
    - output_file_name: Name for translated CSV file (string)
    - batch_size: Rows handed to the translator per progress step (int, default 50)
    - columns_to_save: Columns written to the output file; defaults to
      ['binary', 'multiclass', 'spanish_translation']. Requested columns that
      are missing from the data are skipped with a warning.
    - pipeline_batch_size: batch_size forwarded to the translation pipeline so
      inputs are run through the model in groups rather than one at a time
      (int, default 8)

    Returns:
    - DataFrame holding the saved columns, or None when the CSV cannot be read
      or english_column_name is not one of its columns.

    Raises:
    - ValueError if batch_size or pipeline_batch_size is smaller than 1

    Null or blank cells are not sent to the model; their translation is "".
    """
    if batch_size < 1 or pipeline_batch_size < 1:
        raise ValueError("batch_size and pipeline_batch_size must be >= 1")

    print(f"📖 Reading CSV file: {csv_file_path}")

    # Read CSV
    try:
        df = pd.read_csv(csv_file_path)
        print(f"✅ Successfully loaded {len(df)} rows")
        print(f"📋 Columns found: {list(df.columns)}")
    except Exception as e:
        print(f"❌ Error reading CSV: {e}")
        return

    # Check if English column exists
    if english_column_name not in df.columns:
        print(f"❌ Column '{english_column_name}' not found!")
        print(f"Available columns: {list(df.columns)}")
        return

    # Start translation
    print(f"\n🚀 Starting translation of column '{english_column_name}'...")
    print(f"Total rows to translate: {len(df)}")

    spanish_translations = []
    source_texts = df[english_column_name].tolist()
    total_rows = len(source_texts)

    start_time = time.time()

    for start in range(0, total_rows, batch_size):
        batch_end = min(start + batch_size, total_rows)
        chunk = source_texts[start:batch_end]

        print(f"🔄 Processing rows {start+1} to {batch_end}...")

        # Every row starts as "" so null/blank cells keep an empty translation
        # instead of being sent to the model as the literal text "nan".
        chunk_spanish = [""] * len(chunk)
        pending = [(pos, str(value)) for pos, value in enumerate(chunk)
                   if _needs_translation(value)]

        if pending:
            try:
                # batch_size makes the pipeline run the model on groups of
                # inputs; without it a list is still processed sequentially.
                outputs = translator([text for _, text in pending],
                                     batch_size=pipeline_batch_size)
                for (pos, _), output in zip(pending, outputs):
                    chunk_spanish[pos] = output['translation_text']
            except Exception as e:
                print(f"❌ Batch error: {e}")
                # Fallback to individual translation so a single bad row does
                # not cost the whole chunk.
                for pos, text in pending:
                    chunk_spanish[pos] = translate_to_spanish(text)

        spanish_translations.extend(chunk_spanish)

        # Progress update
        progress = (batch_end / total_rows) * 100
        print(f"✅ Progress: {progress:.1f}% complete")

    # Add Spanish column to dataframe
    df['spanish_translation'] = spanish_translations

    # Select only the requested output columns
    if columns_to_save is None:
        columns_to_save = ['binary', 'multiclass', 'spanish_translation']
    else:
        columns_to_save = list(columns_to_save)  # never mutate the caller's list

    # Check if required columns exist
    missing_columns = [col for col in columns_to_save if col not in df.columns]
    if missing_columns:
        print(f"⚠️ Warning: These columns not found: {missing_columns}")
        # Only keep columns that actually exist
        columns_to_save = [col for col in columns_to_save if col in df.columns]

    print(f"📝 Saving only these columns: {columns_to_save}")

    # Create new dataframe with selected columns only
    final_df = df[columns_to_save].copy()

    # Save translated CSV with selected columns only
    final_df.to_csv(output_file_name, index=False)

    # Summary
    end_time = time.time()
    duration = end_time - start_time
    # Guard the rate against a zero duration (empty input or coarse clock).
    rate = len(df) / duration if duration > 0 else 0.0

    print(f"\n🎉 Translation Complete!")
    print(f"📁 Output saved as: {output_file_name}")
    print(f"📋 Columns saved: {list(final_df.columns)}")
    print(f"⏱️  Total time: {duration:.2f} seconds")
    print(f"🚀 Average speed: {rate:.2f} translations/second")

    # Show sample results (only selected columns)
    print(f"\n📋 Sample Results (Selected Columns Only):")
    for i in range(min(5, len(final_df))):
        print(f"{i+1}. ", end="")
        for col in final_df.columns:
            print(f"{col}: {final_df[col].iloc[i]} | ", end="")
        print()
        print("-" * 60)

    return final_df

In [9]:
# Run the translation and keep the returned DataFrame in a named variable so
# later cells can reuse it without relying on `_` / Out[N].
# translate_csv returns None if the CSV cannot be read or the column is missing.
translated_df = translate_csv(
    csv_file_path='sarcasm_filtered.csv',           # Your input file
    english_column_name='text',                     # Your column name
    output_file_name='sarcasm_english_spanish.csv'  # Your output file
)
translated_df

📖 Reading CSV file: sarcasm_filtered.csv
✅ Successfully loaded 692 rows
📋 Columns found: ['text', 'binary', 'multiclass']

🚀 Starting translation of column 'text'...
Total rows to translate: 692
🔄 Processing rows 1 to 50...
✅ Progress: 7.2% complete
🔄 Processing rows 51 to 100...
✅ Progress: 14.5% complete
🔄 Processing rows 101 to 150...
✅ Progress: 21.7% complete
🔄 Processing rows 151 to 200...
✅ Progress: 28.9% complete
🔄 Processing rows 201 to 250...
✅ Progress: 36.1% complete
🔄 Processing rows 251 to 300...
✅ Progress: 43.4% complete
🔄 Processing rows 301 to 350...
✅ Progress: 50.6% complete
🔄 Processing rows 351 to 400...
✅ Progress: 57.8% complete
🔄 Processing rows 401 to 450...
✅ Progress: 65.0% complete
🔄 Processing rows 451 to 500...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ Progress: 72.3% complete
🔄 Processing rows 501 to 550...
✅ Progress: 79.5% complete
🔄 Processing rows 551 to 600...
✅ Progress: 86.7% complete
🔄 Processing rows 601 to 650...
✅ Progress: 93.9% complete
🔄 Processing rows 651 to 692...
✅ Progress: 100.0% complete
📝 Saving only these columns: ['binary', 'multiclass', 'spanish_translation']

🎉 Translation Complete!
📁 Output saved as: sarcasm_english_spanish.csv
📋 Columns saved: ['binary', 'multiclass', 'spanish_translation']
⏱️  Total time: 344.74 seconds
🚀 Average speed: 2.01 translations/second

📋 Sample Results (Selected Columns Only):
1. binary: Hope | multiclass: Sarcasm | spanish_translation: Sería bueno si missguided tuviera stock por una vez. | 
------------------------------------------------------------
2. binary: Not Hope | multiclass: Sarcasm | spanish_translation: Encuéntrame un tiroteo masivo en Utah donde el transporte oculto en el campus es legal. | 
------------------------------------------------------------
3. binary: 

Unnamed: 0,binary,multiclass,spanish_translation
0,Hope,Sarcasm,Sería bueno si missguided tuviera stock por un...
1,Not Hope,Sarcasm,Encuéntrame un tiroteo masivo en Utah donde el...
2,Not Hope,Sarcasm,"Oh, me encantan los juegos semánticos Bueno, l..."
3,Not Hope,Sarcasm,"¿Entonces tu respuesta a la pregunta es ""Nada""..."
4,Not Hope,Sarcasm,"Claramente no te importa un bledo, porque si l..."
...,...,...,...
687,Not Hope,Sarcasm,¿Y si el estado decide mantener el aborto lega...
688,Not Hope,Sarcasm,"Arrepiéntete, tú que niegas a Cristo, gay que ..."
689,Not Hope,Sarcasm,El verdadero materialismo dialéctico está de a...
690,Not Hope,Sarcasm,Me encanta cuando los cristianos dicen esto......
