<a href="https://colab.research.google.com/github/CoffeeTulip/CS39AA-Project/blob/main/ProjectPart2b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# import all of the python modules/packages you'll need here
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
!pip install datasets
!pip install nltk
!pip install sentencepiece
import datasets
from datasets import load_dataset
from datasets import DatasetDict
import nltk

nltk.download('punkt')
nltk.download('stopwords')
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

#Data Pre-processing

In [3]:
# Convert dataset dict to dataset, seperate the splits from the whole so I can view it and alter it better
df = load_dataset("Amani27/massive_translation_dataset")
df = DatasetDict(df)

df_train = pd.DataFrame.from_dict(df['train'])
df_validation = pd.DataFrame.from_dict(df['validation'])
df_test = pd.DataFrame.from_dict(df['test'])
df = pd.concat([df_train, df_validation, df_test], ignore_index=True)


In [4]:
columns_to_drop = ['id', 'split', 'de_DE', 'hi_IN', 'fr_FR', 'it_IT', 'ar_SA', 'nl_NL', 'ja_JP', 'pt_PT' ]
df = df.drop(columns = columns_to_drop)


# df.to_csv('/content/gdrive/My Drive/Colab Notebooks/CS39AA/En_ES_Datafram.csv', index=False)


In [5]:
df.head()

Unnamed: 0,en_US,es_ES
0,wake me up at nine am on friday,despiértame a las nueve de la mañana el viernes
1,set an alarm for two hours from now,pon una alarma en dos horas desde ahora
2,olly quiet,olly silencio
3,stop,parar
4,olly pause for ten seconds,olly para por un minuto


#Text cleaning
*the text is pretty clean already... but just for practice :D*

In [6]:
english_column = 'en_US'
spanish_column = 'es_ES'

# Tokenize English. The text dataset looked cleaned up already but just to be safe
english_text = ' '.join(df[english_column].astype(str).values)
english_tokens = word_tokenize(english_text.lower())

# Remove stopwords and punctuation
stop_words = set(stopwords.words('english') + list(string.punctuation))
english_filtered_tokens = [word for word in english_tokens if word.isalnum() and word not in stop_words]

# Get the most common words in English
english_common_words = Counter(english_filtered_tokens).most_common(10)

# Tokenize Spanish
spanish_text = ' '.join(df[spanish_column].astype(str).values)
spanish_tokens = word_tokenize(spanish_text.lower())  # Convert to lowercase for consistency

# Remove stopwords and punctuation
stop_words = set(stopwords.words('spanish') + list(string.punctuation))
spanish_filtered_tokens = [word for word in spanish_tokens if word.isalnum() and word not in stop_words]

# Get the most common words in Spanish
spanish_common_words = Counter(spanish_filtered_tokens).most_common(10)


# Display the results
print("Most common words in English:")
print(english_common_words)

print("\nMost common words in Spanish:")
print(spanish_common_words)


Most common words in English:
[('please', 1440), ('play', 1173), ('today', 849), ('list', 770), ('email', 718), ('new', 701), ('tell', 697), ('time', 650), ('set', 563), ('olly', 534)]

Most common words in Spanish:
[('favor', 1284), ('hoy', 865), ('lista', 859), ('correo', 859), ('mañana', 765), ('pon', 728), ('hora', 591), ('electrónico', 554), ('cuál', 542), ('puedes', 511)]


In [7]:
from sklearn.model_selection import train_test_split

X = df['en_US'].values
y = df['es_ES'].values

# Split data into training and validation data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
X_train = X_train.tolist()
X_val = X_val.tolist()
y_train = y_train.tolist()
y_val = y_val.tolist()

#Quick Model
##T5-Small Hugging Face Translator model

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("translation", model="t5-small")

# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [11]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW


In [40]:
class CustomDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer, max_length=128):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]

        # Encode both input and target texts
        encoding = self.tokenizer(
            input_text,
            target_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': encoding['labels'].squeeze()
        }

In [36]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# english_text = df[english_column].tolist()
# english_tokens = tokenizer(english_text, return_tensors='pt', padding=True, truncation=True)

english_tokens = tokenizer(X_train, return_tensors='pt', padding=True, truncation=True)
valid_english_tokens = tokenizer(X_val, return_tensors='pt', padding=True, truncation=True)
# spanish_text = df[spanish_column].tolist()
# spanish_tokens = tokenizer(spanish_text, return_tensors='pt', padding=True, truncation=True)

spanish_tokens = tokenizer(y_train, return_tensors='pt', padding=True, truncation=True)
valid_spanish_tokens = tokenizer(y_val, return_tensors='pt', padding=True, truncation=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [37]:
# dataset = CustomDataset(X_train, y_train, tokenizer)
# valid_dataset = CustomDataset(X_val, y_val, tokenizer)

# dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
# valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=True)

# model = T5ForConditionalGeneration.from_pretrained('t5-small')

# optimizer = AdamW(model.parameters(), lr=.01)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [41]:
learning_rates = [.01, .001, .0001]
batch_sizes = [16, 32, 64]

num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


dataset = CustomDataset(X_train, y_train, tokenizer)
valid_dataset = CustomDataset(X_val, y_val, tokenizer)


for lr in learning_rates:
    for batch_size in batch_sizes:

      dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
      valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

      model = T5ForConditionalGeneration.from_pretrained('t5-small')

      optimizer = AdamW(model.parameters(), lr=lr)
      scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
      #------------------------------------------------------------------------------
      model.to(device)
      model.train()

      for epoch in range(num_epochs):
          # Training phase
          for batch in dataloader:
              input_ids = batch['input_ids'].to(device)
              attention_mask = batch['attention_mask'].to(device)
              labels = batch['labels'].to(device)

              outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
              loss = outputs.loss

              optimizer.zero_grad()
              loss.backward()
              optimizer.step()

          # Validation phase
          model.eval()
          total_validation_loss = 0.0

          with torch.no_grad():
              for batch_val in valid_dataloader:
                  input_ids_val = batch_val['input_ids'].to(device)
                  attention_mask_val = batch_val['attention_mask'].to(device)
                  labels_val = batch_val['labels'].to(device)

                  outputs_val = model(input_ids_val, attention_mask=attention_mask_val, labels=labels_val)
                  loss_val = outputs_val.loss

                  total_validation_loss += loss_val.item()

          average_validation_loss = total_validation_loss / len(valid_dataloader)
          print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {average_validation_loss}')

          # Set the model back to training mode
          model.train()

          # Save the trained model
          model.save_pretrained('/content/gdrive/My Drive/Colab Notebooks/CS39AA/En_ES_Dataframe')
          tokenizer.save_pretrained('/content/gdrive/My Drive/Colab Notebooks/CS39AA/En_ES_Dataframe_tokens')

KeyError: ignored

#Load from saved model

In [21]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the saved model and tokenizer
model_path = ('/content/gdrive/MyDrive/Colab Notebooks/CS39AA/En_ES_Dataframe')
token_path = ('/content/gdrive/MyDrive/Colab Notebooks/CS39AA/En_ES_Dataframe_tokens')

model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(token_path)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
!pip install sacrebleu

In [19]:
test_dataset = CustomDataset(english_text, spanish_text, tokenizer)

In [None]:
from sacrebleu import corpus_bleu
from tqdm import tqdm

# test_dataset = CustomDataset(english_text, spanish_text, tokenizer)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming you have a test dataset named test_dataset
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

references = []
translations = []

model.eval()

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Translating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        model.to(device)

        # Generate translations
        translation_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_beams=4)

        # Decode the generated translation
        translation_text = [tokenizer.decode(ids, skip_special_tokens=True) for ids in translation_ids]

        # Fetch corresponding Spanish translations from DataFrame using indices or keys
        batch_indices = batch['labels']  # Replace with actual batch indices or keys
        batch_references = spanish_column

        # Tokenize references if needed
        batch_references = [word_tokenize(ref) for ref in batch_references]

        # Collect references and translations for BLEU score
        references.extend(batch_references)
        translations.extend(translation_text)

# Calculate BLEU score
bleu = corpus_bleu(translations, [references])
print("BLEU Score:", bleu.score)


In [23]:
input_text = "hows it going!"
input_tokens = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    model.eval()
    input_ids = input_tokens['input_ids'].to(device)
    attention_mask = input_tokens['attention_mask'].to(device)

    model = model.to(device)

    # Generate translations
    translation_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_beams=4)

# Decode the generated translation
translation_text = tokenizer.decode(translation_ids[0], skip_special_tokens=True)
print("Input Text:", input_text)
print("Translated Text:", translation_text)


Input Text: hows it going!
Translated Text: cómo está el tiempo
