In [None]:
%%capture
!pip install transformer
!pip install pytorch-lighting

In [None]:
import os
import matplotlib.pyplot as plt
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; with force_remount=False an existing
# mount is left as it is instead of being remounted.
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Read the full CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/NLP_PROJECT/data.csv')
# Show the column names so the expected `id` / `en` columns can be confirmed.
print(data.columns)


Index(['id', 'en'], dtype='object')


In [None]:
# Print row count, column dtypes and memory usage of the full frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9268181 entries, 0 to 9268180
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   id      object
 1   en      object
dtypes: object(2)
memory usage: 141.4+ MB


In [None]:
# Draw 50,000 random rows; random_state=42 makes the same rows come back on every run.
downSampled_data = data.sample(n=50000, random_state=42)

In [None]:
# Print row count, non-null counts and dtypes of the 50,000-row sample.
downSampled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 1558706 to 4222544
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      50000 non-null  object
 1   en      50000 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [None]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """Map-style dataset that tokenizes one (source, target) text pair per item.

    Args:
        data: pandas DataFrame with an ``en`` column (source text) and an
            ``id`` column (target text). Rows are addressed by position.
        tokenizer: Hugging Face style tokenizer; it must be callable with
            ``text`` / ``text_target`` and expose ``pad_token_id``.
        max_token_len: Every sequence is padded or truncated to this length.
    """

    def __init__(self, data, tokenizer, max_token_len):
        # Keep the frame supplied by the caller. Binding a notebook-global
        # frame here would make every instance (train and validation alike)
        # serve the same rows regardless of the split passed in.
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows (sentence pairs) in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row.

        Each value is a 1-D tensor of length ``max_token_len``. Padding
        positions in ``labels`` are set to -100.
        """
        row = self.data.iloc[index]
        source_text = str(row['en'])
        target_text = str(row['id'])

        source_encoding = self.tokenizer(
            source_text,
            max_length=self.max_token_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # `text_target` tells the tokenizer this is target-side text, so
        # tokenizers with separate source/target vocabularies (such as the
        # Marian tokenizer) encode the labels with the target one.
        target_encoding = self.tokenizer(
            text_target=target_text,
            max_length=self.max_token_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Drop only the leading batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence axis when
        # max_token_len == 1.
        labels = target_encoding["input_ids"].squeeze(0)
        # Mask padding so it does not contribute to the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": source_encoding["input_ids"].squeeze(0),
            "attention_mask": source_encoding["attention_mask"].squeeze(0),
            "labels": labels,
        }


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub checkpoint name used for both the tokenizer and the model below.
# NOTE(review): this is an English->French checkpoint, while the dataset class
# uses the `id` column as the target text — confirm this base model is intended.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): `model` is rebound to a TranslationModel in the training cell,
# so this instance is replaced before the inference cell runs.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
%%capture
pip install pytorch-lightning

In [None]:
import pytorch_lightning as pl
import torch
import torch.nn as nn

In [None]:
import torch.nn as nn
class TranslationModel(pl.LightningModule):
    """Lightning wrapper for fine-tuning a pretrained seq2seq translation model.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the model and its tokenizer.
        learning_rate: Learning rate handed to the AdamW optimizer.
    """

    def __init__(self, model_name, learning_rate):
        super().__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        # `from_pretrained` returns the network in eval mode and the trainer
        # summary reported it as such, i.e. dropout was inactive while
        # fine-tuning. Put it in train mode explicitly.
        self.model.train()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Not used by the steps below (the loss comes from the wrapped model);
        # kept so the attribute stays available. ignore_index matches the -100
        # value used to mask padded label positions.
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
        self.lr = learning_rate

    def generate(self, input_ids, attention_mask=None, **kwargs):
        """Generate output token ids with the wrapped model.

        The wrapped model is switched to eval mode for the duration of the
        call (so dropout does not perturb decoding) and its previous mode is
        restored afterwards. Extra keyword arguments are forwarded unchanged.
        """
        was_training = self.model.training
        self.model.eval()
        try:
            return self.model.generate(
                input_ids=input_ids, attention_mask=attention_mask, **kwargs
            )
        finally:
            self.model.train(was_training)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``loss`` is whatever the wrapped model reports in ``outputs.loss`` for
        the given ``labels``.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        return loss, logits

    def _batch_loss(self, batch):
        """Compute the model loss for one batch dict from the dataloader."""
        loss, _ = self(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )
        return loss

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the training loss."""
        loss = self._batch_loss(batch)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the validation loss.

        Without this hook the trainer skips the validation loop even when a
        validation dataloader is provided.
        """
        loss = self._batch_loss(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters of this module."""
        return torch.optim.AdamW(self.parameters(), lr=self.lr)


In [None]:
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

class TranslationDataModule(pl.LightningDataModule):
    """Loads the translation CSV, samples it and serves train/val dataloaders.

    Args:
        dataset_path: Path of the CSV file to read; it must contain the
            ``en`` and ``id`` text columns.
        tokenizer: Tokenizer handed to each ``TranslationDataset``.
        batch_size: Batch size used by both dataloaders.
        max_token_len: Padding/truncation length handed to the datasets.
        val_size: Fraction of the sampled rows held out for validation.
        random_state: Seed for both the row sampling and the train/val split.
        sample_size: Maximum number of rows to keep before splitting; ``None``
            keeps every row.
    """

    def __init__(self, dataset_path, tokenizer, batch_size, max_token_len, val_size=0.2, random_state=42,
                 sample_size=50000):
        super().__init__()
        # Honour the path given by the caller instead of a hard-coded location.
        self.dataset_path = dataset_path
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.val_size = val_size
        self.random_state = random_state
        self.sample_size = sample_size

    def setup(self, stage=None):
        """Read the CSV, down-sample it and build the train/val datasets."""
        data = pd.read_csv(self.dataset_path)
        # Rows with a missing side would otherwise be stringified to "nan"
        # by the dataset and trained on as if they were real sentences.
        data = data.dropna(subset=['en', 'id'])

        # Only sample when there are more rows than requested; asking
        # `sample` for more rows than exist raises a ValueError.
        if self.sample_size is not None and self.sample_size < len(data):
            data = data.sample(n=self.sample_size, random_state=self.random_state)

        train_data, val_data = train_test_split(
            data, test_size=self.val_size, random_state=self.random_state
        )

        self.train_dataset = TranslationDataset(train_data, self.tokenizer, self.max_token_len)
        self.val_dataset = TranslationDataset(val_data, self.tokenizer, self.max_token_len)

    def train_dataloader(self):
        """Return the shuffled dataloader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the in-order dataloader over the validation split."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

# Seed Python, NumPy and torch so batch shuffling is repeatable across runs.
pl.seed_everything(42, workers=True)

data_module = TranslationDataModule(
    # Real location of the CSV on the mounted Drive (the same file read earlier).
    dataset_path='/content/drive/MyDrive/NLP_PROJECT/data.csv',
    tokenizer=tokenizer,
    batch_size=16,
    max_token_len=128,
    val_size=0.2
)

# Reuse `model_name` so the model matches the checkpoint `tokenizer` was loaded from.
model = TranslationModel(model_name=model_name, learning_rate=5e-5)
trainer = pl.Trainer(max_epochs=3, accelerator="gpu", devices=1)
trainer.fit(model, data_module)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | MarianMTModel    | 75.1 M | eval 
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
74.6 M    Trainable params
524 K     Non-trainable params
75.1 M    Total params
300.536   Total estimated model params size (MB)
1         Modules in train mode
178       Modules in eval mode

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
# Translate one example sentence with the fine-tuned model.
source_sentences = ["Hello my name is aaron and i am a student taking computer science as a major"]
inputs = tokenizer(source_sentences, return_tensors="pt", padding=True, truncation=True)
# Place the input tensors on the same device as the model's weights; a
# CPU/GPU mismatch would make generation fail.
inputs = inputs.to(model.device)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
# Turn the generated token ids back into text, dropping special tokens.
translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(translations)


['Halo nama saya adalah Aaron dan saya seorang mahasiswa mengambil komputer sebagai masalah']
