# Translate the Data (English to French)

In [2]:
import pandas as pd

# Load the original helpdesk dataset and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, so the CSV appears to
# carry a previously saved index; consider index_col=0 — confirm before changing.
df = pd.read_csv("../original-data/helpdesk_dataset.csv")
df.head()

Unnamed: 0,user_query,intent,solution
0,"My password isn't working, can you help me res...",reset_password,Please follow the steps in the password reset ...
1,I can't log in; I think I forgot my password. ...,reset_password,A password reset link has been sent to your em...
2,I'm unable to access my account because my pas...,reset_password,Try resetting your password using the 'Forgot ...
3,Can you reset my password for me? I’ve been lo...,reset_password,Please check your email for a reset link and f...
4,I can't remember my password and need to reset...,reset_password,A reset link has been sent to your email. Use ...


In [4]:
# Load the augmented helpdesk dataset (same user_query / intent / solution
# columns as the original) and preview its first rows.
df_augmented = pd.read_csv("../Open Router/data/augmented_helpdesk_dataset.csv")
df_augmented.head()

Unnamed: 0,user_query,intent,solution
0,My password isn't working; can you assist me i...,reset_password,Please follow the steps in the password reset ...
1,I’m having trouble with my password; could you...,reset_password,Please follow the steps in the password reset ...
2,Can you help me reset my password?,reset_password,Please follow the steps in the password reset ...
3,It seems my password isn’t functioning; how ca...,reset_password,Please follow the steps in the password reset ...
4,My password has stopped working; can you guide...,reset_password,Please follow the steps in the password reset ...


## Combine Data

In [None]:
# Stack the original and augmented rows into a single frame with a fresh index.
final_df = pd.concat([df, df_augmented], ignore_index=True)

# Drop rows whose user_query text repeats; drop_duplicates keeps the first
# occurrence by default, so original rows win over augmented ones.
final_df = final_df.drop_duplicates(subset=["user_query"])

# Persist the combined English dataset. Later cells re-read this exact path,
# including the capital "L" in "finaL", so the spelling must stay in sync.
final_df.to_csv("data/finaL_helpdesk_dataset_v1.csv", index=False)
print(f"Original dataset size: {len(df)} \n Augmented dataset size: {len(df_augmented)} \nFinal dataset size: {len(final_df)}")

Original dataset size: 1022 
 Augmented dataset size: 13798 
Final dataset size: 13225


In [8]:
# Sanity check: count the distinct user_query values in the augmented data
# alone. The result is only displayed; no cell shown here reuses it.
final_df_augmented = df_augmented.drop_duplicates(subset=["user_query"])
len(final_df_augmented)

12217

In [18]:
# Work on a copy so translation steps never mutate the combined English frame.
final_df_french = final_df.copy()
# Take an independent copy rather than an alias: plain assignment would make
# both names point at the same object, so adding *_fr columns to
# final_df_french in place would also change the frame handed to the batch
# translator.
final_df_french_2 = final_df_french.copy()

## Translation

-----

## Using DeepL API

In [None]:
import pandas as pd
import deepl

import os

# Read the DeepL key from the DEEPL_API_KEY environment variable so the secret
# is never typed into (and committed with) the notebook. Falls back to an empty
# string when the variable is unset, matching the previous placeholder value.
api_key = os.environ.get("DEEPL_API_KEY", "")
# Initialize DeepL translator
translator = deepl.Translator(api_key)

def translate_to_french(text):
    """Return the French rendering of ``text`` produced by the DeepL client.

    This is a best-effort helper: if anything goes wrong while translating,
    the error is printed and the input is handed back unchanged.

    Args:
        text: The value to translate.

    Returns:
        The translated string, or ``text`` itself when translation fails.
    """
    try:
        french_text = translator.translate_text(text, target_lang="FR").text
    except Exception as e:
        # Deliberately broad: one bad row must not abort a whole column run.
        print(f"Translation error: {e}")
        french_text = text
    return french_text

In [17]:
# # Translate the 'user_query' and 'solution' columns
# final_df_french["user_query_fr"] = final_df_french["user_query"].apply(translate_to_french)
# final_df_french["solution_fr"] = final_df_french["solution"].apply(translate_to_french)

# # Save the translated dataset
# final_df_french.to_csv("augmented_data_french.csv", index=False)
# print("Translation complete. Dataset saved as 'augmented_data_french.csv'.")

----------
## Translate Using deep_translator

## Function Extract French Version

In [56]:
import pandas as pd

def extract_french_version(input_file, output_file):
    """Derive a French-only dataset from a bilingual CSV.

    Reads ``input_file``, keeps the ``user_query_fr``, ``intent`` and
    ``solution_fr`` columns, strips the ``_fr`` suffix from the two translated
    columns, and writes the result to ``output_file`` without the index.

    Args:
        input_file: Path of the CSV holding the ``*_fr`` columns.
        output_file: Path where the French-only CSV is written.

    Returns:
        The French-only DataFrame with columns ``user_query``, ``intent``,
        ``solution``.

    Raises:
        KeyError: If any of the three required columns is absent.
    """
    suffix_free_names = {
        'user_query_fr': 'user_query',
        'solution_fr': 'solution',
    }
    wanted_columns = ['user_query_fr', 'intent', 'solution_fr']

    bilingual = pd.read_csv(input_file)
    french_df = bilingual.loc[:, wanted_columns].rename(columns=suffix_free_names)

    french_df.to_csv(output_file, index=False)
    print(f"French version extracted and saved to {output_file}")

    return french_df

## Version 1 (Original Data + GPT-4o mini Data)

In [25]:
from deep_translator import GoogleTranslator
import pandas as pd
import time
from tqdm import tqdm
import logging

def setup_logging():
    """Point the root logger at 'translation_log.txt' at INFO level.

    Records are formatted as "<time> - <level> - <message>". The call goes
    through ``logging.basicConfig``, which per the standard library docs does
    nothing when the root logger already has handlers attached.
    """
    log_settings = {
        'filename': 'translation_log.txt',
        'format': '%(asctime)s - %(levelname)s - %(message)s',
        'level': logging.INFO,
    }
    logging.basicConfig(**log_settings)

def safe_translate(text, max_retries=3, delay=2):
    """Translate English text to French, retrying on failure.

    Args:
        text: Value to translate. Anything that is not a non-blank string
            (None, NaN, numbers, whitespace-only strings) yields ''.
        max_retries: Maximum number of translation attempts.
        delay: Seconds to sleep between failed attempts.

    Returns:
        The translated text, '' for non-string or blank input, or the original
        text prefixed with "TRANSLATION_FAILED: " when no attempt succeeds.
        The flagged value is also returned when ``max_retries`` is zero or
        negative, instead of the implicit ``None`` the loop used to fall
        through to.
    """
    if not isinstance(text, str) or text.strip() == '':
        return ''

    for attempt in range(max_retries):
        try:
            return GoogleTranslator(source="en", target="fr").translate(text.strip())
        except Exception as e:
            # Broad on purpose: any failure from the translation backend is
            # treated as retryable.
            logging.warning(f"Translation failed on attempt {attempt + 1}: {str(e)}\nText: {text[:100]}...")
            # Only wait when another attempt will actually follow.
            if attempt < max_retries - 1:
                time.sleep(delay)

    # Reached when every attempt failed or when no attempt was allowed at all.
    logging.error(f"All translation attempts failed for text: {text[:100]}...")
    return f"TRANSLATION_FAILED: {text}"  # Return original text with error flag

def batch_translate_dataframe(df, columns_to_translate, batch_size=100):
    """Add a French ``<column>_fr`` column for each requested column.

    The input frame is copied, never modified. Rows are processed in slices of
    ``batch_size`` with a tqdm progress bar and a one-second pause after every
    slice. After each column finishes, the whole working frame is written to
    'translation_checkpoint.csv'.

    Args:
        df: Source DataFrame.
        columns_to_translate: Names of the columns to translate.
        batch_size: Number of rows handled between pauses.

    Returns:
        A copy of ``df`` extended with one ``*_fr`` column per translated
        column.
    """
    setup_logging()
    translated = df.copy()
    row_count = len(translated)

    for column in columns_to_translate:
        target_column = f"{column}_fr"
        logging.info(f"Starting translation of column: {column}")

        # Start from an all-empty column and fill it in row by row.
        translated[target_column] = ''

        batch_starts = range(0, row_count, batch_size)
        for start in tqdm(batch_starts, desc=f"Translating {column}"):
            chunk = translated.iloc[start:start + batch_size][column]

            # Rows are addressed by index label, so a non-contiguous index
            # (e.g. after drop_duplicates) still lines up.
            for row_label, source_text in chunk.items():
                translated.at[row_label, target_column] = safe_translate(source_text)

            # Pause between slices to avoid rate limiting.
            time.sleep(1)

        logging.info(f"Completed translation of column: {column}")

        # Checkpoint once per finished column.
        translated.to_csv("translation_checkpoint.csv", index=False)

    return translated

In [26]:
def main():
    """Translate the Version 1 dataset to French and save the bilingual CSV.

    Translates the ``user_query`` and ``solution`` columns of
    ``final_df_french_2`` in batches of 50 rows and writes the result to
    'data/data-translated/augmented_data_english_french_v1.csv'.

    Raises:
        Exception: Any error from directory creation, translation or saving is
            logged with its traceback and re-raised.
    """
    import os  # local import keeps this cell self-contained

    output_path = "data/data-translated/augmented_data_english_french_v1.csv"

    try:
        # Create the target folder before the long translation run, so a
        # missing directory cannot make the final save fail afterwards.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Specify columns to translate
        columns_to_translate = ["user_query", "solution"]

        # Perform translation
        translated_df = batch_translate_dataframe(
            final_df_french_2,
            columns_to_translate,
            batch_size=50  # Adjust batch size as needed
        )

        # Save final results
        translated_df.to_csv(output_path, index=False)
        logging.info("Translation complete. Dataset saved successfully.")
        print(f"Translation complete. Dataset saved as '{output_path}'")

    except Exception as e:
        # logging.exception records the traceback alongside the message.
        logging.exception(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Translating user_query: 100%|██████████| 265/265 [1:57:07<00:00, 26.52s/it]
Translating solution: 100%|██████████| 265/265 [50:25<00:00, 11.42s/it] 


Translation complete. Dataset saved as 'augmented_data_french_2.csv'


In [31]:
# Version 1 paths: the bilingual (English + French) CSV and the French-only CSV
# derived from it. Both names are read again by the row-count check in the next
# cell and are reassigned for later versions, so run these cells in order.
input_file = "data/data-translated/augmented_data_english_french_v1.csv"  
output_file = "data/data-translated/augmented_data_french_v1.csv"  

# Writes the French-only CSV to output_file and keeps the frame in memory.
french_dataset = extract_french_version(input_file, output_file)

French version extracted and saved to data/augmented_data_french.csv


In [38]:
# Re-read the three Version 1 artifacts from disk to confirm that the bilingual,
# English-only and French-only files hold the same number of rows.
df_en_fr = pd.read_csv(input_file)
df_en = pd.read_csv("data/finaL_helpdesk_dataset_v1.csv")
df_fr = pd.read_csv(output_file)

print("Version 1")
# "\n" is the newline escape; the previous "/n" was printed literally.
print(f"En/Fr : {len(df_en_fr)} \n En: {len(df_en)} \n Fr : {len(df_fr)}")

Version 1
En/Fr : 13225 /n En: 13225 /n Fr : 13225


--------------
# Version 2 (version 1 + rogue data)

In [40]:
# Load the extra "rogue_rose" dataset that Version 2 adds on top of Version 1,
# and preview its first rows.
df_rogue = pd.read_csv("../Open Router/data/rogue_rose_data.csv")
df_rogue.head()

Unnamed: 0,user_query,intent,solution
0,"Hey there, I'm having trouble logging in. My p...",reset_password,Please follow the steps in the password reset ...
1,"Hi, I'm having an issue with my account. I'm s...",reset_password,Please follow the steps in the password reset ...
2,"Hi, I'm having some trouble with my password. ...",reset_password,Please follow the steps in the password reset ...
3,"Hello, I'm having a problem with my account lo...",reset_password,Please follow the steps in the password reset ...
4,"Hi, I'm having difficulty logging in to my acc...",reset_password,Please follow the steps in the password reset ...


In [39]:
# Row count of the rogue dataset before it is merged and deduplicated.
len(df_rogue)

961

In [52]:
# Version 2 = Version 1 English dataset (df_en, re-read from disk) + rogue rows.
final_df_2 = pd.concat([df_en, df_rogue], ignore_index=True)

# Remove duplicate queries, keeping the first occurrence.
final_df_2 = final_df_2.drop_duplicates(subset=["user_query"])

final_df_2.to_csv("data/finaL_helpdesk_dataset_v2.csv", index=False)
# df_en is the deduplicated Version 1 set (original + augmented rows), not the
# augmented data alone, so it is labelled "Version 1" in the report.
print(f"Original dataset size: {len(df)} \n Version 1 dataset size: {len(df_en)} \n Rogue dataset size: {len(df_rogue)} \nFinal dataset size: {len(final_df_2)}")

Original dataset size: 1022 
 GPT-4o mini dataset size: 13225 
 Rogue dataset size: 961 
Final dataset size: 14165


In [44]:
def main():
    """Translate the Version 2 dataset to French and save the bilingual CSV.

    Translates the ``user_query`` and ``solution`` columns of ``final_df_2`` in
    batches of 50 rows and writes the result to
    'data/data-translated/augmented_data_english_french_v2.csv'.

    Raises:
        Exception: Any error from directory creation, translation or saving is
            logged with its traceback and re-raised.
    """
    import os  # local import keeps this cell self-contained

    output_path = "data/data-translated/augmented_data_english_french_v2.csv"

    try:
        # Create the target folder before the long translation run, so a
        # missing directory cannot make the final save fail afterwards.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Specify columns to translate
        columns_to_translate = ["user_query", "solution"]

        # Perform translation
        translated_df = batch_translate_dataframe(
            final_df_2,
            columns_to_translate,
            batch_size=50  # Adjust batch size as needed
        )

        # Save final results
        translated_df.to_csv(output_path, index=False)
        logging.info("Translation complete. Dataset saved successfully.")
        print(f"Translation complete. Dataset saved as '{output_path}'")

    except Exception as e:
        # logging.exception records the traceback alongside the message.
        logging.exception(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Translating user_query: 100%|██████████| 284/284 [1:18:52<00:00, 16.66s/it] 
Translating solution: 100%|██████████| 284/284 [50:55<00:00, 10.76s/it]


Translation complete. Dataset saved as 'augmented_data_french_2.csv'


In [46]:
# Version 2 paths. These reassign the same input_file / output_file globals
# used for Version 1; the row-count check in the next cell reads them.
input_file = "data/data-translated/augmented_data_english_french_v2.csv"  
output_file = "data/data-translated/augmented_data_french_v2.csv"  

# Writes the French-only CSV to output_file and keeps the frame in memory.
french_dataset = extract_french_version(input_file, output_file)

French version extracted and saved to data/augmented_data_french_v2.csv


In [47]:
# Re-read the three Version 2 artifacts from disk to confirm that the bilingual,
# English-only and French-only files hold the same number of rows.
df_en_fr_2 = pd.read_csv(input_file)
df_en_2 = pd.read_csv("data/finaL_helpdesk_dataset_v2.csv")
df_fr_2 = pd.read_csv(output_file)

print("Version 2")
# The "En" figure must come from df_en_2; it previously repeated the bilingual
# frame's length, so a mismatch could never show up. "\n" replaces the literal
# "/n".
print(f"En/Fr : {len(df_en_fr_2)} \n En: {len(df_en_2)} \n Fr : {len(df_fr_2)}")

Version 2
En/Fr : 14165 /n En: 14165 /n Fr : 14165


---
## Version 3 (version 2 + back-translated data)

In [48]:
# Load the back-translated dataset that Version 3 adds on top of Version 2,
# and preview its first rows.
df_back_translated = pd.read_csv("../3-Techniques/data/back-translated-100.csv")
df_back_translated.head()

Unnamed: 0,user_query,intent,solution
0,"My password is not working, can you help me re...",reset_password,Please follow the steps in the password reset ...
1,I can't log in. I think I forgot my password. ...,reset_password,A password reset link has been sent to your em...
2,I can't access my account because my password ...,reset_password,Try resetting your password using the 'Forgot ...
3,Can you reset my password for me? My account h...,reset_password,Please check your email for a reset link and f...
4,I don't remember my password and need to reset...,reset_password,A reset link has been sent to your email. Use ...


In [49]:
# Row count of the back-translated dataset before it is merged and deduplicated.
len(df_back_translated)

1022

In [53]:
# Version 3 = Version 2 English dataset (df_en_2, re-read from disk)
# + back-translated rows.
final_df_3 = pd.concat([df_en_2, df_back_translated], ignore_index=True)

# Remove duplicate queries, keeping the first occurrence.
final_df_3 = final_df_3.drop_duplicates(subset=["user_query"])

final_df_3.to_csv("data/finaL_helpdesk_dataset_v3.csv", index=False)
# df_en is the combined Version 1 set, so it is labelled as such. The Version 2
# size is reported as well because df_en_2 is the frame actually concatenated.
print(f"Original dataset size: {len(df)} \n Version 1 dataset size: {len(df_en)} \n Rogue dataset size: {len(df_rogue)} \n Version 2 dataset size: {len(df_en_2)} \n Back Translated dataset size: {len(df_back_translated)} \nFinal dataset size: {len(final_df_3)}")

Original dataset size: 1022 
 GPT-4o mini dataset size: 13225 
 Rogue dataset size: 961 
 Back Translated dataset size: 1022 
Final dataset size: 14952


In [55]:
def main():
    """Translate the Version 3 dataset to French and save the bilingual CSV.

    Translates the ``user_query`` and ``solution`` columns of ``final_df_3`` in
    batches of 50 rows and writes the result to
    'data/data-translated/augmented_data_english_french_v3.csv'.

    Raises:
        Exception: Any error from directory creation, translation or saving is
            logged with its traceback and re-raised.
    """
    import os  # local import keeps this cell self-contained

    output_path = "data/data-translated/augmented_data_english_french_v3.csv"

    try:
        # Create the target folder before the long translation run, so a
        # missing directory cannot make the final save fail afterwards.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Specify columns to translate
        columns_to_translate = ["user_query", "solution"]

        # Perform translation
        translated_df = batch_translate_dataframe(
            final_df_3,
            columns_to_translate,
            batch_size=50  # Adjust batch size as needed
        )

        # Save final results
        translated_df.to_csv(output_path, index=False)
        logging.info("Translation complete. Dataset saved successfully.")
        print(f"Translation complete. Dataset saved as '{output_path}'")

    except Exception as e:
        # logging.exception records the traceback alongside the message.
        logging.exception(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Translating user_query: 100%|██████████| 300/300 [1:07:39<00:00, 13.53s/it]
Translating solution: 100%|██████████| 300/300 [56:23<00:00, 11.28s/it] 


Translation complete. Dataset saved as 'augmented_data_french_2.csv'


In [58]:
# Version 3 paths. These reassign the same input_file / output_file globals
# used for earlier versions; the row-count check in the next cell reads them.
input_file = "data/data-translated/augmented_data_english_french_v3.csv"  
output_file = "data/data-translated/augmented_data_french_v3.csv"  

# Writes the French-only CSV to output_file and keeps the frame in memory.
french_dataset = extract_french_version(input_file, output_file)

French version extracted and saved to data/data-translated/augmented_data_french_v3.csv


In [59]:
# Re-read the three Version 3 artifacts from disk to confirm that the bilingual,
# English-only and French-only files hold the same number of rows.
df_en_fr_3 = pd.read_csv(input_file)
df_en_3 = pd.read_csv("data/finaL_helpdesk_dataset_v3.csv")
df_fr_3 = pd.read_csv(output_file)

print("Version 3")
# The "En" figure must come from df_en_3; it previously repeated the bilingual
# frame's length, so a mismatch could never show up. "\n" replaces the literal
# "/n".
print(f"En/Fr : {len(df_en_fr_3)} \n En: {len(df_en_3)} \n Fr : {len(df_fr_3)}")

Version 3
En/Fr : 14952 /n En: 14952 /n Fr : 14952
