<a href="https://colab.research.google.com/github/CoffeeTulip/CS39AA-Project/blob/main/ProjectPart2b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# Mount Google Drive so the notebook can read from / write to it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# import all of the python modules/packages you'll need here
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
!pip install datasets
!pip install nltk
!pip install sentencepiece
import datasets
from datasets import load_dataset
from datasets import DatasetDict
import nltk

nltk.download('punkt')
nltk.download('stopwords')
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Pre-processing

In [3]:
# Load every split of the translation dataset and flatten them into a single
# pandas DataFrame so the rows are easier to view and alter.
# `load_dataset` already returns a DatasetDict, so it is not re-wrapped, and it
# gets its own name instead of temporarily living in `df`.
translation_splits = load_dataset("Amani27/massive_translation_dataset")

# One DataFrame per split (Arrow-backed conversion instead of iterating rows).
df_train = translation_splits['train'].to_pandas()
df_validation = translation_splits['validation'].to_pandas()
df_test = translation_splits['test'].to_pandas()

# `df` is the combined frame: train, validation and test rows, re-indexed 0..n-1.
df = pd.concat([df_train, df_validation, df_test], ignore_index=True)


In [4]:
# Keep only the English/Spanish pair: drop the identifiers and the other
# language columns.
columns_to_drop = ['id', 'split', 'de_DE', 'hi_IN', 'fr_FR', 'it_IT', 'ar_SA', 'nl_NL', 'ja_JP', 'pt_PT']

# errors='ignore' keeps this cell idempotent: re-running it after the columns
# have already been removed no longer raises a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')


# df.to_csv('/content/gdrive/My Drive/Colab Notebooks/CS39AA/En_ES_Datafram.csv', index=False)


In [5]:
# Preview the first rows to confirm only the English and Spanish columns remain.
df.head()

Unnamed: 0,en_US,es_ES
0,wake me up at nine am on friday,despiértame a las nueve de la mañana el viernes
1,set an alarm for two hours from now,pon una alarma en dos horas desde ahora
2,olly quiet,olly silencio
3,stop,parar
4,olly pause for ten seconds,olly para por un minuto


# Tokenizing

In [6]:
english_column = 'en_US'
spanish_column = 'es_ES'
TOP_N_WORDS = 10  # how many of the most frequent words to report per language


def _content_words(tokens, language):
    """Return the alphanumeric tokens that are not stopwords in `language`.

    Args:
        tokens: iterable of (already lower-cased) word tokens.
        language: name of the NLTK stopword list to use, e.g. 'english'.

    Returns:
        list of the tokens that survive filtering, in their original order.
    """
    excluded = set(stopwords.words(language)) | set(string.punctuation)
    return [token for token in tokens if token.isalnum() and token not in excluded]


# Tokenize English. The text dataset looked cleaned up already but just to be safe
english_text = ' '.join(df[english_column].astype(str).values)
english_tokens = word_tokenize(english_text.lower())
english_filtered_tokens = _content_words(english_tokens, 'english')
english_common_words = Counter(english_filtered_tokens).most_common(TOP_N_WORDS)

# Tokenize Spanish (lower-cased for consistency). Passing language='spanish'
# selects the Spanish Punkt model instead of the default English one.
spanish_text = ' '.join(df[spanish_column].astype(str).values)
spanish_tokens = word_tokenize(spanish_text.lower(), language='spanish')
spanish_filtered_tokens = _content_words(spanish_tokens, 'spanish')
spanish_common_words = Counter(spanish_filtered_tokens).most_common(TOP_N_WORDS)


# Display the results
print("Most common words in English:")
print(english_common_words)

print("\nMost common words in Spanish:")
print(spanish_common_words)


Most common words in English:
[('please', 1440), ('play', 1173), ('today', 849), ('list', 770), ('email', 718), ('new', 701), ('tell', 697), ('time', 650), ('set', 563), ('olly', 534)]

Most common words in Spanish:
[('favor', 1284), ('hoy', 865), ('lista', 859), ('correo', 859), ('mañana', 765), ('pon', 728), ('hora', 591), ('electrónico', 554), ('cuál', 542), ('puedes', 511)]


# Quick Model
## T5-small Hugging Face Translator model

In [7]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("translation", model="t5-small")

# # Load model directly
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

#-----------------------------
# It's advisable to create a new python environment and install simplet5
!pip install simplet5



In [8]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW


In [9]:
class CustomDataset(Dataset):
    """Pairs of source/target sentences encoded for seq2seq fine-tuning.

    Each item is a dict with fixed-length 1-D tensors:
      * 'input_ids'      - token ids of the source sentence
      * 'attention_mask' - 1 for real source tokens, 0 for padding
      * 'labels'         - token ids of the target sentence, with padding
                           positions replaced by IGNORE_INDEX

    Args:
        input_texts: sequence of source sentences.
        target_texts: sequence of target sentences, aligned with input_texts.
        tokenizer: callable tokenizer that returns 'input_ids' and
            'attention_mask' tensors and exposes `pad_token_id`.
        max_length: every sequence is padded / truncated to this many tokens.

    Raises:
        ValueError: if the two text sequences have different lengths.
    """

    # Label value that the seq2seq loss skips, so padding does not
    # contribute to the training signal.
    IGNORE_INDEX = -100

    def __init__(self, input_texts, target_texts, tokenizer, max_length=128):
        if len(input_texts) != len(target_texts):
            raise ValueError(
                f"input_texts and target_texts must be the same length "
                f"(got {len(input_texts)} and {len(target_texts)})"
            )
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.input_texts)

    def _encode(self, text):
        """Tokenize one sentence to a padded/truncated batch of size 1."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the encoded source sentence and masked target labels for `idx`."""
        source = self._encode(self.input_texts[idx])
        target = self._encode(self.target_texts[idx])

        # squeeze(0) removes only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse the sequence axis if max_length == 1.
        labels = target['input_ids'].squeeze(0)

        # Mask padding in the labels so the loss is computed on real tokens only.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels
        }

In [11]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Both languages are encoded with the same settings: PyTorch tensors,
# padded to the longest sentence in the batch, truncated if too long.
batch_encoding_options = dict(return_tensors='pt', padding=True, truncation=True)

english_text = df[english_column].tolist()
spanish_text = df[spanish_column].tolist()

english_tokens = tokenizer(english_text, **batch_encoding_options)
spanish_tokens = tokenizer(spanish_text, **batch_encoding_options)

In [31]:
# Manual PyTorch alternative, kept for reference (disabled):
# dataset = CustomDataset(english_text, spanish_text, tokenizer)
# dataloader = DataLoader(dataset, batch_size=34, shuffle=True)

# model = T5ForConditionalGeneration.from_pretrained('t5-small')

# optimizer = AdamW(model.parameters(), lr=.01)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

#-----------------------------

# import
from pathlib import Path

from simplet5 import SimpleT5

SIMPLET5_OUTPUT_DIR = "outputs"  # where SimpleT5 writes its checkpoints


def _as_simplet5_frame(frame):
    """Return `frame` reduced to the two columns SimpleT5 trains on.

    The English column becomes 'source_text' and the Spanish column becomes
    'target_text'; all other columns are dropped.
    """
    return frame[[english_column, spanish_column]].rename(
        columns={english_column: "source_text", spanish_column: "target_text"}
    )


# Previously `train_df` / `eval_df` were never defined, so the train call
# raised a NameError. Build them from the dataset's own train/validation splits.
train_df = _as_simplet5_frame(df_train)
eval_df = _as_simplet5_frame(df_validation)

# instantiate
model = SimpleT5()

# load (supports t5, mt5, byT5 and CodeT5 models)
model.from_pretrained("t5","t5-base")

# train
model.train(train_df=train_df, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=eval_df, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 500,
            target_max_token_len = 500,
            batch_size = 34,
            max_epochs = 3,
            use_gpu = torch.cuda.is_available(),  # fall back to CPU when no GPU is attached
            outputdir = SIMPLET5_OUTPUT_DIR,
            early_stopping_patience_epochs = 0,
            precision = 32
            )

# load trained T5 model
# NOTE(review): assumes SimpleT5 writes one checkpoint sub-directory per epoch
# under `outputdir` -- confirm. The most recently written one is loaded.
checkpoint_dirs = sorted(
    (path for path in Path(SIMPLET5_OUTPUT_DIR).iterdir() if path.is_dir()),
    key=lambda path: path.stat().st_mtime,
)
if not checkpoint_dirs:
    raise FileNotFoundError(f"No SimpleT5 checkpoint found in '{SIMPLET5_OUTPUT_DIR}'")
model.load_model("t5", str(checkpoint_dirs[-1]), use_gpu=False)

# predict
model.predict("input text for prediction")




In [None]:
# Fine-tune t5-small with a plain PyTorch loop.
# Everything the loop needs is created in this cell: previously `dataloader`
# and `optimizer` only existed in commented-out code, and `model` was still
# the SimpleT5 wrapper, which has no `.to()` and no argument-free `.train()`.
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train on the training split only, so the test split stays unseen for evaluation.
dataset = CustomDataset(
    df_train[english_column].astype(str).tolist(),
    df_train[spanish_column].astype(str).tolist(),
    tokenizer,
)
dataloader = DataLoader(dataset, batch_size=34, shuffle=True)

model = T5ForConditionalGeneration.from_pretrained('t5-small')

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

model.to(device)
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean loss over the epoch rather than the last batch's loss.
    mean_loss = running_loss / max(len(dataloader), 1)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss}')

# Save the trained model
# save_pretrained expects a directory; the previous target reused the name of
# the dataset CSV file ("En_ES_Datafram.csv").
MODEL_DIR = '/content/gdrive/My Drive/Colab Notebooks/CS39AA/t5_small_en_es'
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

In [None]:
!pip install sacrebleu


In [None]:
from sacrebleu import corpus_bleu
from tqdm import tqdm

# Score the fine-tuned model on the held-out test split with corpus-level BLEU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# The reference translations come straight from the DataFrame. The loader is
# not shuffled, so translations[i] lines up with references[i].
test_sources = df_test[english_column].astype(str).tolist()
references = df_test[spanish_column].astype(str).tolist()

test_dataset = CustomDataset(test_sources, references, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

translations = []

model.eval()

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Translating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Generate translations
        translation_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_beams=4)

        # `generate` returns one row of ids per input sentence, so decode the
        # whole batch; batch_decode returns a list with one string per row.
        translations.extend(tokenizer.batch_decode(translation_ids, skip_special_tokens=True))

# Calculate BLEU score (one reference stream, aligned with the hypotheses)
bleu = corpus_bleu(translations, [references])
print("BLEU Score:", bleu.score)


In [None]:
# Translate one hand-written sentence with the current model.
input_text = "This is a test sentence to be translated."

# Put the model in evaluation mode before generating.
model.eval()

# Encode the sentence and move the tensors to the model's device.
input_tokens = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)
input_ids = input_tokens['input_ids'].to(device)
attention_mask = input_tokens['attention_mask'].to(device)

# Generate translations (no gradients are needed for inference)
with torch.no_grad():
    translation_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_beams=4)

# Decode the generated translation: only one sentence was passed in, so row 0 is the result.
translation_text = tokenizer.decode(translation_ids[0], skip_special_tokens=True)
print("Input Text:", input_text)
print("Translated Text:", translation_text)


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
import sentencepiece as spm
# Assuming your DataFrame is named 'df'
english_column = 'en_US'
spanish_column = 'es_ES'

class CustomDataset(Dataset):
    """Pairs of source/target sentences encoded for seq2seq fine-tuning.

    Each item is a dict with fixed-length 1-D tensors:
      * 'input_ids'      - token ids of the source sentence
      * 'attention_mask' - 1 for real source tokens, 0 for padding
      * 'labels'         - token ids of the target sentence, with padding
                           positions replaced by IGNORE_INDEX

    Args:
        input_texts: sequence of source sentences.
        target_texts: sequence of target sentences, aligned with input_texts.
        tokenizer: callable tokenizer that returns 'input_ids' and
            'attention_mask' tensors and exposes `pad_token_id`.
        max_length: every sequence is padded / truncated to this many tokens.

    Raises:
        ValueError: if the two text sequences have different lengths.
    """

    # Label value that the seq2seq loss skips, so padding does not
    # contribute to the training signal.
    IGNORE_INDEX = -100

    def __init__(self, input_texts, target_texts, tokenizer, max_length=128):
        if len(input_texts) != len(target_texts):
            raise ValueError(
                f"input_texts and target_texts must be the same length "
                f"(got {len(input_texts)} and {len(target_texts)})"
            )
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.input_texts)

    def _encode(self, text):
        """Tokenize one sentence to a padded/truncated batch of size 1."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the encoded source sentence and masked target labels for `idx`."""
        source = self._encode(self.input_texts[idx])
        target = self._encode(self.target_texts[idx])

        # squeeze(0) removes only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse the sequence axis if max_length == 1.
        labels = target['input_ids'].squeeze(0)

        # Mask padding in the labels so the loss is computed on real tokens only.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels
        }

# Initialize the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Create DataLoader for training
# NOTE(review): `df` holds the train, validation and test rows concatenated,
# so a model trained here has already seen the test split -- confirm this is
# intended before reporting any held-out metric.
dataset = CustomDataset(df[english_column].tolist(), df[spanish_column].tolist(), tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Set up optimizer and scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly; they are presumed to match the old class's
# defaults (TODO confirm) so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, eps=1e-6, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # The scheduler was created but never stepped, so the learning rate never
    # decayed; step it once per epoch (step_size=1, gamma=0.9).
    scheduler.step()

    # Report the mean loss over the epoch rather than the last batch's loss.
    mean_loss = running_loss / max(len(dataloader), 1)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss}')

# Save the trained model
# save_pretrained expects a directory; the previous target reused the name of
# the dataset CSV file ("En_ES_Datafram.csv").
MODEL_DIR = '/content/gdrive/My Drive/Colab Notebooks/CS39AA/t5_small_en_es'
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
