In [2]:
import pandas as pd
import os
import tqdm as tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [3]:
from googletrans import Translator
import random

In [4]:
WORKING_ENV = 'local'  # Change as needed: 'local' or 'cluster'

# NOTE: data_path and cache_dir are stored WITHOUT a trailing slash in every
# environment. The rest of the notebook builds file names as
# `data_path + '/name.csv'`, so a trailing slash here would yield './data//name.csv'.
if WORKING_ENV == 'cluster':
    content_path = '/vol/bitbucket/pvr24/nlp_cw_et1224_pvr24/'
    data_path = f'{content_path}data'
    cache_dir = f'{content_path}huggingface_cache'  # Define a cache directory on Bitbucket

elif WORKING_ENV == 'local':
    content_path = './'
    data_path = './data'
    cache_dir = './huggingface_cache'
    os.makedirs(data_path, exist_ok=True)

else:
    raise NotImplementedError(
        f"Unknown WORKING_ENV {WORKING_ENV!r}; expected 'local' or 'cluster'."
    )

# Ensure cache directory exists
os.makedirs(cache_dir, exist_ok=True)

# Set Hugging Face cache directory (HF_HOME is read from the environment)
os.environ["HF_HOME"] = cache_dir

In [5]:
# Locations of the raw corpus and the pre-split CSV files inside data_path.
tsv_path = data_path + '/dontpatronizeme_pcl.tsv'
train_csv = data_path + '/train.csv'
dev_official_csv = data_path + '/test.csv'


train_df = pd.read_csv(train_csv)
#dev_official_df = pd.read_csv(dev_official_csv)

# Drop rows without text, then renumber the index 0..n-1. Without the reset,
# dropna leaves gaps in the index, and later cells that address rows with
# `.loc[i]` for `i in range(len(train_df))` would hit missing labels or drift
# out of step with checkpoints reloaded from CSV (which always get a fresh
# 0..n-1 index).
train_df = train_df.dropna(subset=['text']).reset_index(drop=True)
#dev_official_df = dev_official_df.dropna(subset=['text'])


In [6]:
import random
import pandas as pd
import os
from googletrans import Translator  # Ensure this is installed and working

# Pivot languages for back-translation; back_translate_google picks one at random per call.
intermediate_langs = ["fr", "es", "de"]
# Single googletrans client shared by every translation request.
translator = Translator()

def back_translate_google(text):
    """Paraphrase ``text`` by translating it to a pivot language and back to English.

    The pivot language is drawn at random from the module-level
    ``intermediate_langs`` list and both translations go through the
    module-level ``translator`` client.

    Args:
        text: The English sentence to augment.

    Returns:
        The back-translated sentence. ``text`` is returned unchanged when it is
        not a non-blank string, when either translation yields ``None``, or
        when any exception is raised along the way (best-effort behaviour; a
        diagnostic line is printed in each of those cases).
    """
    usable = isinstance(text, str) and text.strip() != ""
    if not usable:
        print(f"Skipping invalid text: {text}")  # Debugging
        return text

    try:
        pivot = random.choice(intermediate_langs)

        # English -> pivot language
        forward = translator.translate(text, src='en', dest=pivot).text
        if forward is None:
            print(f"Translation failed for text: {text}")
            return text

        # pivot language -> English
        round_trip = translator.translate(forward, src=pivot, dest='en').text
        if round_trip is None:
            print(f"Back-translation failed for text: {text}")
            return text
    except Exception as e:
        # Any failure falls back to the original sentence.
        print(f"Error translating text: {text}, Error: {e}")
        return text

    return round_trip

# Row to resume from. It must match the numeric suffix of an existing
# checkpoint file; if that file is missing, processing restarts at row 0.
checkpoint_row = 3900
checkpoint_path = data_path + f"/augmented_data_checkpoint_{checkpoint_row}.csv"

# Load from checkpoint if exists, otherwise start fresh
if os.path.exists(checkpoint_path):
    print(f"Loading from checkpoint: {checkpoint_path}")
    augmented_train_df = pd.read_csv(checkpoint_path)
    start_row = checkpoint_row
else:
    print("No checkpoint found, starting from scratch.")
    augmented_train_df = train_df.copy()
    augmented_train_df["translated_text"] = ""
    # Nothing has been translated yet, so every row must be processed;
    # starting at checkpoint_row here would leave rows [0, checkpoint_row) empty.
    start_row = 0

# Rows are matched by POSITION between train_df and augmented_train_df.
# A checkpoint read back from CSV always has a fresh 0..n-1 index, whereas
# train_df may carry index gaps, so give the working frame a positional index
# and refuse to continue if the two frames cannot line up.
if len(augmented_train_df) != len(train_df):
    raise ValueError(
        f"Checkpoint has {len(augmented_train_df)} rows but train_df has "
        f"{len(train_df)}; they cannot be aligned row by row."
    )
augmented_train_df = augmented_train_df.reset_index(drop=True)
source_texts = train_df["text"].tolist()
total_rows = len(source_texts)

# Resume processing from the checkpoint row
for i in range(start_row, total_rows):
    text = source_texts[i]
    augmented_train_df.loc[i, "translated_text"] = back_translate_google(text)

    # Save progress every 100 rows (and once more on the very last row)
    if (i + 1) % 100 == 0 or i == total_rows - 1:
        save_path = data_path + f"/augmented_data_checkpoint_{i+1}.csv"
        augmented_train_df.to_csv(save_path, index=False)
        print(f"Checkpoint saved at row {i+1}: {save_path}")

# Final save
final_save_path = data_path + "/augmented_data.csv"
augmented_train_df.to_csv(final_save_path, index=False)
print(f"Final augmented data saved to {final_save_path}")


Loading from checkpoint: ./data//augmented_data_checkpoint_3900.csv
Checkpoint saved at row 4000: ./data//augmented_data_checkpoint_4000.csv
Checkpoint saved at row 4100: ./data//augmented_data_checkpoint_4100.csv
Checkpoint saved at row 4200: ./data//augmented_data_checkpoint_4200.csv
Checkpoint saved at row 4300: ./data//augmented_data_checkpoint_4300.csv
Checkpoint saved at row 4400: ./data//augmented_data_checkpoint_4400.csv
Checkpoint saved at row 4500: ./data//augmented_data_checkpoint_4500.csv
Checkpoint saved at row 4600: ./data//augmented_data_checkpoint_4600.csv
Checkpoint saved at row 4700: ./data//augmented_data_checkpoint_4700.csv
Checkpoint saved at row 4800: ./data//augmented_data_checkpoint_4800.csv
Checkpoint saved at row 4900: ./data//augmented_data_checkpoint_4900.csv
Checkpoint saved at row 5000: ./data//augmented_data_checkpoint_5000.csv
Checkpoint saved at row 5100: ./data//augmented_data_checkpoint_5100.csv
Checkpoint saved at row 5200: ./data//augmented_data_che

In [10]:
import nltk
from nltk.corpus import wordnet
# Fetch the NLTK resources used for synonym lookup: the WordNet corpus and
# the 'omw-1.4' package.
for _nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list.

    Every lemma of every synset of ``word`` is collected, underscores in
    multi-word lemmas are turned into spaces, and lemmas equal to ``word``
    (ignoring case) are dropped.

    Args:
        word: The token to look up.

    Returns:
        A list of distinct synonym strings in sorted order; empty when WordNet
        has no synsets for ``word`` or only lemmas identical to it. The list is
        sorted (rather than taken in set-iteration order) so that a seeded
        ``random.choice`` over it gives the same pick on every run.
    """
    target = word.lower()
    candidates = {
        lemma.name().replace('_', ' ')
        for syn in wordnet.synsets(word)
        for lemma in syn.lemmas()
    }
    # Avoid identical replacements
    return sorted(candidate for candidate in candidates if candidate.lower() != target)

def synonym_replace(sentence, ratio=0.1):
    """Replace a fraction of the words in ``sentence`` with random synonyms.

    The sentence is split on whitespace. ``max(1, int(len(words) * ratio))``
    word POSITIONS are drawn at random from the positions whose token has at
    least one synonym (fewer if not enough are eligible), and each drawn
    position is replaced by a randomly chosen synonym of its token. Sampling
    positions rather than word strings means a token that occurs several
    times counts once per occurrence, so the number of replacements never
    exceeds the requested amount.

    Args:
        sentence: Text to augment. Non-string values are returned unchanged.
        ratio: Fraction of the sentence's words to replace (at least one word
            is replaced whenever any word is eligible).

    Returns:
        The augmented sentence with words joined by single spaces, or the
        original ``sentence`` object when nothing can be replaced.
    """
    if not isinstance(sentence, str):
        return sentence  # e.g. NaN from a missing cell; nothing to augment

    words = sentence.split()

    # Look each distinct token up once; the lookup is repeated work otherwise.
    synonyms_by_word = {}
    for word in words:
        if word not in synonyms_by_word:
            synonyms_by_word[word] = get_synonyms(word)

    eligible_positions = [
        position for position, word in enumerate(words) if synonyms_by_word[word]
    ]
    if not eligible_positions:
        return sentence  # Return original if no words can be replaced

    n = max(1, int(len(words) * ratio))  # Number of words to replace based on ratio
    chosen_positions = set(
        random.sample(eligible_positions, min(n, len(eligible_positions)))
    )

    return ' '.join(
        random.choice(synonyms_by_word[word]) if position in chosen_positions else word
        for position, word in enumerate(words)
    )



# Build a new frame (train_df itself is left untouched) whose extra
# "augmented_text" column holds the synonym-replaced version of each text,
# using a 5% replacement ratio.
synonym_train_df = train_df.assign(
    augmented_text=train_df["text"].apply(synonym_replace, ratio=0.05)
)

# Persist the augmented frame next to the other data files.
save_path = f"{data_path}/synonym_augmented_data_005.csv"
synonym_train_df.to_csv(save_path, index=False)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paulr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\paulr\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
