In [1]:
import pandas as pd
from transformers import MBartTokenizer, MBartForConditionalGeneration, XLMRobertaTokenizer, XLMRobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load translation model and tokenizer for Nepali
translation_model_name = "facebook/mbart-large-50-many-to-many-mmt"
translation_tokenizer = MBartTokenizer.from_pretrained(translation_model_name)
translation_model = MBartForConditionalGeneration.from_pretrained(translation_model_name)

# Load XLM-RoBERTa model for embeddings (multilingual including Nepali)
embedding_model_name = "xlm-roberta-base"
embedding_tokenizer = XLMRobertaTokenizer.from_pretrained(embedding_model_name)
embedding_model = XLMRobertaModel.from_pretrained(embedding_model_name)


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [2]:
dataset_path="/kaggle/input/nepali-hs/nepali.csv"

In [3]:
# Function for translation
def translate(text, src_lang, tgt_lang):
    if isinstance(text, str) and text.strip():  # Ensure text is a valid non-empty string
        try:
            translation_tokenizer.src_lang = src_lang
            encoded = translation_tokenizer(text, return_tensors="pt")
            generated_tokens = translation_model.generate(**encoded, forced_bos_token_id=translation_tokenizer.lang_code_to_id[tgt_lang])
            return translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        except Exception as e:
            print(f"Translation error for text '{text}': {e}")
            return None
    else:
        return None  # Skip invalid text

# Back-translation function
def back_translate(text, src_lang="ne_NP", pivot_lang="en_XX"):
    translated_to_pivot = translate(text, src_lang=src_lang, tgt_lang=pivot_lang)
    if translated_to_pivot is None:
        return None  # Skip if translation to pivot fails
    back_translated = translate(translated_to_pivot, src_lang=pivot_lang, tgt_lang=src_lang)
    return back_translated

# Function to get embeddings using XLM-RoBERTa
def get_embedding(text):
    if isinstance(text, str) and text.strip():  # Ensure text is a valid non-empty string
        try:
            inputs = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                outputs = embedding_model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)  # Use the mean of the last hidden states as the embedding
            return embeddings
        except Exception as e:
            print(f"Embedding error for text '{text}': {e}")
            return None
    else:
        return None  # Skip invalid text

In [4]:
df = pd.read_csv(dataset_path)  # Replace with the path to your dataset


In [5]:
# List to store augmented data
augmented_data = []

# Iterate over the dataset and perform back-translation
for index, row in df.iterrows():
    original_text = row['tweet']  # Replace 'tweet' with the column name of your text data
    
    # Skip invalid or non-string entries
    if not isinstance(original_text, str) or not original_text.strip():
        print(f"Skipping row {index}: Invalid text format.")
        continue
    
    # Augment the text via back-translation
    augmented_text = back_translate(original_text, src_lang="ne_NP", pivot_lang="en_XX")
    
    # Skip if augmentation fails
    if augmented_text is None:
        print(f"Skipping row {index}: Back-translation failed.")
        continue
    
    # Get embeddings for original and augmented text
    original_embedding = get_embedding(original_text)
    augmented_embedding = get_embedding(augmented_text)
    
    # Skip if embedding fails
    if original_embedding is None or augmented_embedding is None:
        print(f"Skipping row {index}: Embedding failed.")
        continue
    
    # Calculate cosine similarity
    similarity = cosine_similarity(original_embedding, augmented_embedding)[0][0]
    
    # If similarity is greater than 0.9, save the augmented text
    if similarity > 0.9:
        augmented_data.append({
            'original_text': original_text,
            'augmented_text': augmented_text,
            'similarity': similarity
        })
    else:
        print(f"Skipping row {index}: Similarity below threshold ({similarity}).")

# Convert augmented data to DataFrame and save to a new CSV
augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv("/kaggle/working/augmented_nepali.csv", index=False)

print("Augmented data saved to 'augmented_nepali.csv'")

Augmented data saved to 'augmented_nepali.csv'
