In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision
import torch
from tqdm import tqdm 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd MyDrive/DL/HW4

[Errno 2] No such file or directory: 'MyDrive/DL/HW4'
/content


In [5]:
!ls

drive  sample_data


In [6]:
DEVICE = torch.device('cpu')
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')

In [7]:
DATA_PATH = '/content/drive/MyDrive/DL/HW4/data/'
BATCH_SIZE = 64
POETS_LIST = ['khayyam_norm.txt','naserkhosro_norm.txt',
              'hafez_norm.txt','obeyd_norm.txt',
              'parvin_norm.txt', 'moulavi_norm.txt', 
              'roodaki_norm.txt', 'saadi_norm.txt', 
              'sanaee_norm.txt','shahriar_norm.txt']
EPOCHS = 1
LEARNING_RATE = 0.01

In [8]:
import random

random.seed(10)

##Data

In [9]:
import os
if not os.path.exists(DATA_PATH):
   os.makedirs(DATA_PATH)

In [10]:
%cd /content/drive/MyDrive/DL/HW4/data/

/content/drive/.shortcut-targets-by-id/1Fbn6Jg0Awtfoo0LH2k0RSeOQqVNpjrYl/HW4/data


In [None]:
# !git clone https://github.com/amnghd/Persian_poems_corpus.git

In [11]:
!ls /content/drive/MyDrive/DL/HW4/data/Persian_poems_corpus/normalized

abusaeed_norm.txt   hafez_norm.txt	  orfi_norm.txt
amir_norm.txt	    hatef_norm.txt	  ouhadi_norm.txt
anvari_norm.txt     helali_norm.txt	  parvin_norm.txt
asadi_norm.txt	    iqbal_norm.txt	  rahi_norm.txt
asad_norm.txt	    jami_norm.txt	  razi_norm.txt
attar_norm.txt	    kamal_norm.txt	  roodaki_norm.txt
babaafzal_norm.txt  khaghani_norm.txt	  saadi_norm.txt
bahaee_norm.txt     khajoo_norm.txt	  saeb_norm.txt
bahar_norm.txt	    khayyam_norm.txt	  salman_norm.txt
bidel_norm.txt	    khosro_norm.txt	  sanaee_norm.txt
eraghi_norm.txt     manoochehri_norm.txt  seyf_norm.txt
farrokhi_norm.txt   moulavi_norm.txt	  shabestari_norm.txt
ferdousi_norm.txt   naserkhosro_norm.txt  shahnematollah_norm.txt
feyz_norm.txt	    nezari_norm.txt	  shahriar_norm.txt
ghaani_norm.txt     obeyd_norm.txt	  vahshi_norm.txt
gilani_norm.txt     onsori_norm.txt	  zahir_norm.txt


In [12]:
poems_list = []
for i,poet in enumerate(POETS_LIST):
  poet_file = DATA_PATH+'Persian_poems_corpus/normalized/'+poet
  with open(poet_file) as file:
    for line in file:
      if line.rstrip() != '':
        poems_list.append((line.rstrip(),i))
random.shuffle(poems_list)      

In [13]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class PoetDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained(
        "HooshvareLab/bert-fa-base-uncased-clf-digimag"
    )
        
    def tokenize(self, text):
      tokenized_text =  self.tokenizer(text, truncation=True, max_length=32, padding = 'max_length', return_tensors = 'pt')
      for key in ['input_ids', 'attention_mask']:
            tokenized_text[key] = tokenized_text[key][0]
      return tokenized_text

    def __getitem__(self, index):
        raw_text = self.data[index][0]
        x = self.tokenize(raw_text)
        
        y = self.data[index][1]
        return x, y
    
    def __len__(self):
        return len(self.data)

In [14]:
total_data_size = len(poems_list)
train_size = int(0.8 * total_data_size)
eval_size = int(0.1 * total_data_size)
test_size = total_data_size-(train_size+eval_size)

poet_dataset = PoetDataset(poems_list)

train_dataset, eval_dataset, test_dataset = torch.utils.data.random_split(poet_dataset, [train_size, eval_size, test_size])

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [17]:
print(train_dataset[0])

({'input_ids': tensor([   2, 6094, 5355, 4643, 2861, 9002, 2885,    4,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])}, 5)


In [15]:
from transformers import BertModel

class BertPoet(torch.nn.Module):
    def __init__(self,train_bert = False):
        super(BertPoet, self).__init__()
        self.bert = BertModel.from_pretrained(
            "HooshvareLab/bert-fa-base-uncased-clf-digimag")
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 10)
        self.relu = nn.ReLU()
        if not train_bert:
          for name, param in self.bert.named_parameters():
           if 'classifier' not in name:
             param.requires_grad = False


    def forward(self, input_batch):
        input_ids = input_batch["input_ids"]
        mask = input_batch["attention_mask"]
        _, pooled_output = self.bert(input_ids= input_ids, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        output = self.relu(linear_output)
        return output

In [17]:
def calculate_accuracy(logits, labels):
  predictions = torch.argmax(logits, dim=1)
  true = torch.sum(predictions==labels)
  return true

def evaluate(dataloader, model):
  loss_function = nn.CrossEntropyLoss()
  eval_loss = 0
  total_true = 0
  for x,y in tqdm(dataloader):
    with torch.no_grad():
      x = x.to(DEVICE)
      y = y.to(DEVICE)
      logits = model(x)
      loss = loss_function(logits, y)
      eval_loss+= loss.item()
      total_true+= calculate_accuracy(logits, y)
  eval_acuuracy = total_true/len(dataloader.dataset)
  return eval_loss, eval_acuuracy

In [18]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
def test_model(model, test_dataloader):
  y_true = torch.tensor([])
  y_pred = torch.tensor([])
  total_true = 0
  for x,y in tqdm(test_dataloader):
    x = x.to(DEVICE)
    y = y.to(DEVICE)
    logits = model(x)
    total_true+= calculate_accuracy(logits, y)
    predictions = torch.argmax(logits, dim=1)
    y_pred = torch.cat((y_pred, torch.ravel(predictions).cpu()))
    y_true = torch.cat((y_true, torch.ravel(y).cpu()))
  test_acuuracy = total_true/len(test_dataloader.dataset)
  metrics = precision_recall_fscore_support(y_true, y_pred, average='micro')
  print()
  print('Accuracy/test', test_acuuracy)
  print('F1Score/test', metrics[2])
  print(confusion_matrix(y_true, y_pred))

In [19]:
from torch.optim import SGD

def train_loop(model, train_dataloader, val_dataloader, epochs, optimizer):
    # optimizer = SGD(model.parameters(), lr=LEARNING_RATE)
    loss_function = nn.CrossEntropyLoss(reduction='mean')
    for epoch_num in range(epochs):
        print("-"*10,epoch_num,"-"*10)
        total_loss_train = 0
        for input_batch, batch_labels in tqdm(train_dataloader):
            logits = model(input_batch.to(DEVICE))
            loss = loss_function(logits, batch_labels.to(DEVICE))
            total_loss_train += loss.item()
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_value_(model.parameters(), 6)
            optimizer.step()
        eval_loss, eval_acuuracy = evaluate(val_dataloader, model)
        print('train loss ', total_loss_train)
        print('eval loss ', eval_loss)
        print('accuracy ', eval_acuuracy)
    return model

In [20]:
%cd ..

/content/drive/.shortcut-targets-by-id/1Fbn6Jg0Awtfoo0LH2k0RSeOQqVNpjrYl/HW4


In [None]:
partial_model = BertPoet().to(DEVICE)
optimizer = SGD(partial_model.parameters(), lr=LEARNING_RATE)
model_partial_train = train_loop(partial_model, train_dataloader, eval_dataloader, 1, optimizer)
torch.save(model_partial_train, 'partial_train_model.pt')
test_model(model_partial_train, test_dataloader)

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased-clf-digimag were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


---------- 0 ----------


100%|██████████| 2390/2390 [04:51<00:00,  8.19it/s]
100%|██████████| 299/299 [00:35<00:00,  8.42it/s]


train loss  4367.026273488998
eval loss  537.5387711524963
accuracy  tensor(0.3134, device='cuda:0')


100%|██████████| 299/299 [00:35<00:00,  8.37it/s]


Accuracy/test tensor(0.3155, device='cuda:0')
F1Score/test 0.31548117154811717
[[   0    0    0    0    0   83    0    2   43    0]
 [   0    7    0    0    1 1281    0   61  813    2]
 [   0    0    0    0    1  520    0   58  351    2]
 [   0    0    0    0    0  314    0   15  186    1]
 [   0    3    0    0    2  728    0   29  368    1]
 [   0    8    0    0    1 3801    0  120 1338    6]
 [   0    0    0    0    0  129    0    5   70    0]
 [   0    4    2    1    2 1888    0  151  973    2]
 [   0   12    3    1    1 3171    0  113 2069    0]
 [   0    0    2    0    0  212    0   25  136    2]]





In [None]:
complete_model = BertPoet(train_bert = True).to(DEVICE)
optimizer = SGD(complete_model.parameters(), lr=LEARNING_RATE)
complete_model_train_SGD = train_loop(complete_model, train_dataloader, eval_dataloader,1,optimizer)
torch.save(complete_model_train_SGD, 'complete_model_train_SGD.pt')
test_model(complete_model_train_SGD, test_dataloader)

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased-clf-digimag were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


---------- 0 ----------


100%|██████████| 2390/2390 [12:35<00:00,  3.16it/s]
100%|██████████| 299/299 [00:35<00:00,  8.42it/s]


train loss  3769.652772605419
eval loss  445.78026354312897
accuracy  tensor(0.4382, device='cuda:0')


100%|██████████| 299/299 [00:35<00:00,  8.39it/s]


Accuracy/test tensor(0.4416, device='cuda:0')
F1Score/test 0.4415794979079498
[[   0   22    3    0    5    7    0   10   80    1]
 [   0  975    7    0   36   31    0   61 1055    0]
 [   0   53  229    4   21   25    0  145  440   15]
 [   0   29   25   10   32   12    0   71  328    9]
 [   0  155   10    4  199   73    0   89  600    1]
 [   0  201   28    0  102 1265    0  123 3546    9]
 [   0   32    1    0    4   11    0    7  148    1]
 [   0  307   98    2   95  128    0  988 1384   21]
 [   0  321   33    5   48  145    0  117 4694    7]
 [   0   14   43    1   14    4    0   50  168   83]]





In [None]:
complete_model = BertPoet(train_bert = True).to(DEVICE)
optimizer = torch.optim.Adam(complete_model.parameters(),lr=LEARNING_RATE,amsgrad=False)
complete_model_train_ADAM = train_loop(complete_model, train_dataloader, eval_dataloader,1,optimizer)
torch.save(complete_model_train_ADAM, 'complete_model_train_ADAM.pt')
test_model(complete_model_train_ADAM, test_dataloader)

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased-clf-digimag were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


---------- 0 ----------


100%|██████████| 2390/2390 [13:19<00:00,  2.99it/s]
100%|██████████| 299/299 [00:32<00:00,  9.07it/s]


train loss  5508.063760995865
eval loss  688.4728801250458
accuracy  tensor(0.0062, device='cuda:0')


100%|██████████| 299/299 [00:33<00:00,  8.89it/s]



Accuracy/test tensor(0.0067, device='cuda:0')
F1Score/test 0.0066945606694560665
[[ 128    0    0    0    0    0    0    0    0    0]
 [2165    0    0    0    0    0    0    0    0    0]
 [ 932    0    0    0    0    0    0    0    0    0]
 [ 516    0    0    0    0    0    0    0    0    0]
 [1131    0    0    0    0    0    0    0    0    0]
 [5274    0    0    0    0    0    0    0    0    0]
 [ 204    0    0    0    0    0    0    0    0    0]
 [3023    0    0    0    0    0    0    0    0    0]
 [5370    0    0    0    0    0    0    0    0    0]
 [ 377    0    0    0    0    0    0    0    0    0]]


In [26]:
complete_model_train_SGD=torch.load('complete_model_train_SGD.pt')
# complete_model_train_SGD=torch.load('complete_model_train_SGD.pt', map_location=torch.device('cpu'))
# complete_model_train_ADAM = torch.load('complete_model_train_ADAM.pt')
model_partial_train = torch.load('partial_train_model.pt')
# model_partial_train = torch.load('partial_train_model.pt', map_location=torch.device('cpu'))

In [31]:
def get_perplexity(model, unbiased_dataloader):
    loss_function = nn.CrossEntropyLoss(reduction='mean')
    total_loss = 0
    for input, labels in unbiased_dataloader:
        input = input.to(DEVICE)
        labels = labels.to(DEVICE)
        logits = model(input)
        loss = loss_function(logits, labels)
        total_loss += loss.item()
    perplexity  = torch.exp(torch.tensor(total_loss)) 
    return perplexity
def get_unbiased_poem():
  poems_list = []
  for i,poet in enumerate(POETS_LIST):
    poet_file = DATA_PATH+'Persian_poems_corpus/normalized/'+poet
    with open(poet_file) as file:
      for j,line in enumerate(file):
        if line.rstrip() != '' and j<100:
          poems_list.append((line.rstrip(),i))
  random.shuffle(poems_list)
  return poems_list

In [32]:
import torch.nn.functional as F
unbiased_list = get_unbiased_poem()
unbiased_dataset = PoetDataset(unbiased_list)
unbiased_dataloader = torch.utils.data.DataLoader(unbiased_dataset, batch_size=BATCH_SIZE, shuffle=True)

with torch.no_grad():
    partial_perplexity = get_perplexity(model_partial_train, unbiased_dataloader)
    complate_perplexity = get_perplexity(complete_model_train_SGD, unbiased_dataloader)


In [33]:
print(partial_perplexity)
print(complate_perplexity)

tensor(6.1594e+18)
tensor(1.9575e+15)
