In [1]:
pip install hazm

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl (892 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m892.6/892.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64

In [1]:
!pip install gdown
import gdown



In [None]:
from google.colab import drive
drive.mount('/content/drive/MyDrive/ANN_Assignments/5')

In [None]:
import os
data_dir = '/content/drive/MyDrive/ANN_Assignments/5'
filenames = [f for f in os.listdir(data_dir) if f.endswith('.txt')]


In [None]:
filenames[0]

'Persian-WikiText-1.txt'

In [None]:
import os
from hazm import Normalizer, word_tokenize
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

normalizer = Normalizer()

def load_and_preprocess_data(data_dir):
    corpus = ""
    filenames = [f for f in os.listdir(data_dir) if f.endswith('.txt')]
    for filename in tqdm(filenames, desc="Processing files"):
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            text = normalizer.normalize(text)
            text = ' '.join(word_tokenize(text))
            corpus += text + " "
    return corpus

data_dir = '/content/drive/MyDrive/ANN_Assignments/5'
text = load_and_preprocess_data(data_dir)


In [None]:
save_folder = '/content/drive/MyDrive/ANN_Assignments/5'
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder, 'preprocessed_text_one.txt')

with open(save_path, 'w', encoding='utf-8') as f:
    f.write(text)

print(f"Preprocessed text has been saved to {save_path}")

Preprocessed text has been saved to /content/drive/MyDrive/ANN_Assignments/5/preprocessed_text_one.txt


In [2]:
def load_preprocessed_text_from_drive(save_path):
    with open(save_path, 'r', encoding='utf-8') as f:
        return f.read()

text = load_preprocessed_text_from_drive('/content/drive/MyDrive/ANN_Assignments/5/preprocessed_text.txt')

In [None]:
import torch.nn as nn

seq_length = 10

class PersianWikipediaDataset(Dataset):
    def __init__(self, text, seq_length):
        self.text = text
        self.seq_length = seq_length
        self.words = text.split()
        self.vocab = sorted(list(set(self.words)))
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx_to_word = {idx: word for idx, word in enumerate(self.vocab)}

    def __len__(self):
        return len(self.words) - self.seq_length

    def __getitem__(self, idx):
        seq = self.words[idx:idx + self.seq_length]
        label = self.words[idx + self.seq_length]
        x = torch.tensor([self.word_to_idx[word] for word in seq])
        y = torch.tensor(self.word_to_idx[label])
        return x, y

dataset = PersianWikipediaDataset(text, seq_length)

In [None]:
len(dataset)

11889408

In [None]:
torch.save(dataset, '/content/drive/MyDrive/ANN_Assignments/5/persian_wikipedia_dataset.pth')

In [None]:
len(dataset)

99535978

In [None]:
dataset = torch.load('/content/drive/MyDrive/ANN_Assignments/5/persian_wikipedia_dataset.pth')

In [None]:
import torch.nn as nn

class TextGenerationModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(TextGenerationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                  weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        return hidden

vocab_size = len(dataset.vocab)
embed_size = 128
hidden_size = 256
num_layers = 2
batch_size = 64
num_epochs = 20
seq_length = 100


In [None]:
model = torch.load('/content/drive/MyDrive/ANN_Assignments/5/model.pth')

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader

model = TextGenerationModel(vocab_size, embed_size, hidden_size, num_layers).to('cuda')
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

scaler = GradScaler()

model.train()
for epoch in range(num_epochs):
    hidden = model.init_hidden(batch_size)
    for i, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to('cuda'), targets.to('cuda')

        optimizer.zero_grad()
        hidden = tuple([each.data for each in hidden])

        with autocast():
            outputs, hidden = model(inputs, hidden)
            loss = criterion(outputs, targets)

        scaler.scale(loss).backward()

        scaler.step(optimizer)

        scaler.update()

        if i % 100 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Step {i}/{len(dataloader)}, Loss: {loss.item()}')

torch.save(model.state_dict(), '/content/drive/MyDrive/ANN_Assignments/5/model.pth')

In [None]:
torch.save(model, '/content/drive/MyDrive/ANN_Assignments/5/model.pth')

In [None]:
import math

def calculate_perplexity(model, dataset):
    model.eval()
    total_loss = 0
    total_words = 0
    with torch.no_grad():
        hidden = model.init_hidden(batch_size)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
        criterion = nn.CrossEntropyLoss(reduction='sum')
        for inputs, targets in tqdm(dataloader):
            outputs, hidden = model(inputs, hidden)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            total_words += targets.size(0)
    perplexity = math.exp(total_loss / total_words)
    return perplexity

perplexity = calculate_perplexity(model, dataset)
print(f'Perplexity: {perplexity}')


In [None]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge
import torch

def generate_text(model, start_seq, length):
    model.eval()
    generated = start_seq
    hidden = model.init_hidden(1)
    input_seq = torch.tensor([dataset.word_to_idx[word] for word in start_seq]).unsqueeze(0)

    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(input_seq, hidden)
            _, top_idx = output.topk(1)
            next_word = dataset.idx_to_word[top_idx.item()]
            generated.append(next_word)
            input_seq = torch.tensor([[top_idx.item()]])

    return ' '.join(generated)

def calculate_rouge(model, dataset, num_samples=100, generated_seq_length=50):
    rouge = Rouge()
    references = []
    hypotheses = []

    for i in tqdm(range(num_samples)):
        start_idx = torch.randint(0, len(dataset) - generated_seq_length - 1, (1,)).item()
        start_seq = dataset.words[start_idx:start_idx + seq_length]

        reference_seq = dataset.words[start_idx + seq_length:start_idx + seq_length + generated_seq_length]
        reference_text = ' '.join(reference_seq)
        references.append(reference_text)

        generated_text = generate_text(model, start_seq, generated_seq_length)
        hypotheses.append(generated_text)

    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores

num_samples = 100
generated_seq_length = 50
rouge_scores = calculate_rouge(model, dataset, num_samples, generated_seq_length)

print("ROUGE Scores:", rouge_scores)


100%|██████████| 100/100 [02:15<00:00,  1.36s/it]


ROUGE Scores: {'rouge-1': {'r': 0.3075437336161513, 'p': 0.17546876434234693, 'f': 0.22287045169731864}, 'rouge-2': {'r': 0.04610718420398572, 'p': 0.022814711833006415, 'f': 0.030503725244865613}, 'rouge-l': {'r': 0.26575825280050447, 'p': 0.15159458209273416, 'f': 0.19256247510868557}}


In [None]:
pretraining_data_dir = '/content/drive/MyDrive/ANN_Assignments/5/Pretrain-data'
pretraining_text = load_and_preprocess_data(pretraining_data_dir)

Processing files: 100%|██████████| 1/1 [08:31<00:00, 511.47s/it]


In [3]:
file_id = '1qW_FkyUGoTFCloHgzAzV9H2UfobmPxI2'

gdown_url = f'https://drive.google.com/uc?id={file_id}'

output_file = 'preprocessed_text_one.txt'
gdown.download(gdown_url, output_file, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1qW_FkyUGoTFCloHgzAzV9H2UfobmPxI2
To: /content/preprocessed_text_one.txt
100%|██████████| 103M/103M [00:00<00:00, 123MB/s] 


'preprocessed_text_one.txt'

In [4]:
import random
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, AdamW
from tqdm import tqdm

# pretraining_text = load_preprocessed_text_from_drive('/content/drive/MyDrive/ANN_Assignments/5/preprocessed_text_one.txt')
pretraining_text = load_preprocessed_text_from_drive(output_file)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class MaskedLanguageModelDataset(Dataset):
    def __init__(self, text, tokenizer, seq_length=128, mask_probability=0.15):
        self.text = text
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.mask_probability = mask_probability
        self.token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(self.text))
        self.num_examples = (len(self.token_ids) - 1) // self.seq_length

    def __len__(self):
        return self.num_examples

    def __getitem__(self, idx):
        start_idx = idx * self.seq_length
        end_idx = start_idx + self.seq_length
        input_ids = self.token_ids[start_idx:end_idx]
        labels = input_ids.copy()

        for j in range(self.seq_length):
            if random.random() < self.mask_probability:
                input_ids[j] = self.tokenizer.mask_token_id

        return torch.tensor(input_ids).to(device), torch.tensor(labels).to(device)

pretraining_dataset = MaskedLanguageModelDataset(pretraining_text, tokenizer)
pretraining_dataloader = DataLoader(pretraining_dataset, batch_size=16, shuffle=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]


saving pretraining_dataset

In [5]:
dataset_save_path = 'pretraining_dataset_distilbert.pth'
torch.save(pretraining_dataset, dataset_save_path)
print(f'Dataset saved to {dataset_save_path}')

Dataset saved to pretraining_dataset_distilbert.pth


loading pretraining_dataset

In [None]:
file_id = '1CGqAjYtFwAGtxjXbfuMC8t2qGhUdBCIb'

gdown_url = f'https://drive.google.com/uc?id={file_id}'

output_file = 'pretraining_dataset_distilbert.pth'
gdown.download(gdown_url, output_file, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1CGqAjYtFwAGtxjXbfuMC8t2qGhUdBCIb
From (redirected): https://drive.google.com/uc?id=1CGqAjYtFwAGtxjXbfuMC8t2qGhUdBCIb&confirm=t&uuid=5a6a0ac6-3a63-40ad-9bc6-10f9716f5262
To: /content/pretraining_dataset_distilbert.pth

  0%|          | 0.00/233M [00:00<?, ?B/s][A
  5%|▌         | 12.1M/233M [00:00<00:01, 120MB/s][A
 11%|█         | 25.7M/233M [00:00<00:01, 115MB/s][A
 17%|█▋        | 40.4M/233M [00:00<00:01, 127MB/s][A
 25%|██▍       | 58.2M/233M [00:00<00:01, 146MB/s][A
 34%|███▍      | 78.6M/233M [00:00<00:00, 164MB/s][A
 41%|████      | 95.4M/233M [00:00<00:00, 152MB/s][A
 48%|████▊     | 111M/233M [00:00<00:00, 139MB/s] [A
 54%|█████▍    | 126M/233M [00:00<00:00, 127MB/s][A
 61%|██████    | 141M/233M [00:01<00:00, 133MB/s][A
 69%|██████▊   | 160M/233M [00:01<00:00, 137MB/s][A
 79%|███████▉  | 185M/233M [00:01<00:00, 165MB/s][A
 88%|████████▊ | 205M/233M [00:01<00:00, 176MB/s][A
100%|██████████| 233M/233M [0

'pretraining_dataset_distilbert.pth'

In [None]:
dataset_load_path = '/content/drive/MyDrive/ANN_Assignments/5/pretraining_dataset_distilbert.pth'
loaded_dataset = torch.load(dataset_load_path)
print(f'Dataset loaded from {dataset_load_path}')
pretraining_dataloader = DataLoader(loaded_dataset, batch_size=32, shuffle=True)

Dataset loaded from /content/drive/MyDrive/ANN_Assignments/5/pretraining_dataset_distilbert.pth


In [None]:
pretraining_dataloader = DataLoader(output_file, batch_size=32, shuffle=True)

In [7]:
from torch.cuda.amp import GradScaler, autocast

model_finetuned = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')

for param in model_finetuned.distilbert.transformer.layer[:4].parameters():
    param.requires_grad = False

optimizer = AdamW(filter(lambda p: p.requires_grad, model_finetuned.parameters()), lr=5e-5)

scaler = GradScaler()

num_epochs = 1
model_finetuned.train()
for epoch in range(num_epochs):
    loop = tqdm(pretraining_dataloader, leave=True)
    for batch in loop:
        input_ids, labels = batch

        input_ids = input_ids.to('cuda')
        labels = labels.to('cuda')
        model_finetuned.to('cuda')

        optimizer.zero_grad()

        with autocast():
            outputs = model_finetuned(input_ids=input_ids, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

model_finetuned.save_pretrained('finetuned_distilbert_model')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 0: 100%|██████████| 21056/21056 [30:29<00:00, 11.51it/s, loss=0.221]


In [9]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=ec0611fdde4cce2424b3e81bc2f2aab77140483c94bb79c8fd276c20a5ec0f8c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [10]:
import torch
from rouge_score import rouge_scorer
from torch.utils.data import DataLoader

def evaluate_model(model, dataloader, tokenizer):
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, labels = batch
            input_ids, labels = input_ids.to('cuda'), labels.to('cuda')

            outputs = model(input_ids=input_ids)
            preds = torch.argmax(outputs.logits, dim=-1)

            predictions.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
            references.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
    avg_scores = {key: sum(score[key].fmeasure for score in scores) / len(scores) for key in scores[0]}
    return avg_scores

test_text = pretraining_text[:len(pretraining_text) // 10]
evaluation_dataset = MaskedLanguageModelDataset(test_text, tokenizer)
evaluation_dataloader = DataLoader(evaluation_dataset, batch_size=16)

# model_finetuned = DistilBertForMaskedLM.from_pretrained('/content/drive/MyDrive/ANN_Assignments/5/finetuned_distilbert_model').to('cuda')

rouge_scores = evaluate_model(model_finetuned, evaluation_dataloader, tokenizer)
print("ROUGE scores:", rouge_scores)


ROUGE scores: {'rouge1': 0.05171400419687792, 'rouge2': 0.024174201130257224, 'rougeL': 0.05170793018344105}
