In [1]:
# Configs
dataset_dir = r'C:\Users\fardin\Projects\EnhanceSEO\datasets\extractedURLs\url_classes.csv'

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
import time

In [3]:
name = "microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank"
bert_tokenizer = BertTokenizer.from_pretrained(name)

In [4]:
df = pd.read_csv(dataset_dir)
class_list = df.Topic.unique()
class_id = {t:i for i, t in enumerate(class_list)}
id_class = {i:t for i, t in enumerate(class_list)}
df_train, df_test = train_test_split(df, test_size=0.1, shuffle=True)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 1024
max_token_length = 64
device

device(type='cuda')

In [9]:
vocab_dict = {t: i for i, t in enumerate(bert_tokenizer.vocab)}
vocab_size = len(bert_tokenizer.vocab)

In [10]:
class CustomDataset(Dataset):
    
    def __init__(self, X, y, num_classes, dictionary, tokenizer, doc_length=256) -> None:
        super().__init__()
        self.doc_length = doc_length
        y = torch.from_numpy(np.array([class_id[c] for c in y], dtype=np.longlong))
        self.y = torch.nn.functional.one_hot(y, num_classes=num_classes).float()
        self.dictionary = dictionary
        self.tokenizer = tokenizer
        self.vocab_size = len(self.dictionary)
        
        self.X = torch.zeros((len(X), doc_length), dtype=torch.int)
        for i, doc in enumerate(X):
            indices = torch.from_numpy(np.array(self.tokenizer(doc)['input_ids'], dtype=np.longlong))
            pad_size = max(self.doc_length - len(indices), 0)
            self.X[i] = torch.nn.functional.pad(indices[:self.doc_length], (0,pad_size))
        
    def __getitem__(self, index):
        return self.X[index], self.y[index]
        
    def __len__(self):
        return len(self.y)
        

In [11]:
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
train_dataset = CustomDataset(df_train.Address.values, df_train.Topic.values, len(class_id), vocab_dict, bert_tokenizer, doc_length=max_token_length)
test_dataset = CustomDataset(df_test.Address.values, df_test.Topic.values, len(class_id), vocab_dict, bert_tokenizer, doc_length=max_token_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True, shuffle=False)

In [12]:
for val in df_train.Address.values[:1]:
    print(train_dataset.tokenizer(val).keys())
for val in df_train.Address.values[:2]:
    print(train_dataset.tokenizer(val)['input_ids'])

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[101, 14120, 131, 120, 120, 10494, 119, 46484, 19094, 119, 10212, 120, 41163, 118, 11131, 120, 10751, 118, 32784, 118, 43045, 120, 72812, 10681, 118, 14609, 118, 10635, 12953, 118, 10155, 118, 16118, 102]
[101, 14120, 131, 120, 120, 17045, 14752, 57192, 119, 41181, 119, 10212, 120, 13617, 120, 68257, 69486, 10133, 118, 10143, 118, 38973, 10598, 118, 11303, 118, 187, 118, 66626, 10107, 120, 143, 10874, 12022, 10237, 11779, 10246, 10884, 10418, 19282, 36237, 10874, 136, 53264, 134, 10110, 102]


In [13]:
st = time.time()
X, y = next(iter(test_dataloader))
print(f'dataset time = {time.time() - st}')

dataset time = 0.0240631103515625


In [52]:
class CNN_for_Text(nn.Module):
    
    def __init__(self, num_embedding, max_token_count, embedding_dim=64, dropout=0.3, num_out_features=4, *args, **kwargs) -> None:
        super(CNN_for_Text, self).__init__(*args, **kwargs)
        self.max_token_count = max_token_count
        
        self.embedding = nn.Embedding(num_embedding, embedding_dim)
        self.conv1 = nn.Conv1d(embedding_dim, 64, kernel_size=5, padding=2)
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(64, 64, kernel_size=3, padding=1)
        self.globalpool = nn.AdaptiveMaxPool1d(32)
        self.fc1 = nn.Linear(64 * max_token_count//2, 32)
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(32, num_out_features)
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.globalpool(x)
        x = F.relu(self.fc1(x.view(x.shape[0], -1)))
        x = self.dropout(x)
        x = self.fc_out(x)
        return x

In [53]:
model = CNN_for_Text(num_embedding=vocab_size, max_token_count=max_token_length, embedding_dim=96, num_out_features=len(class_id))

In [54]:
model(X).shape

torch.Size([1024, 12])

In [55]:
import numpy as np

In [56]:
import torchmetrics
import lightning as L
# from abc import abstractmethod

In [57]:

class ClassifierLightningModel(L.LightningModule):
    def __init__(
        self,
        model,
        num_classes,
        optimizer=None,
        loss_func=None,
        learning_rate=0.01,
        batch_size=64,
        lr_scheduler=None,
        user_lr_scheduler=False,
        min_lr=0.0,
    ):
        super(ClassifierLightningModel, self).__init__()
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.model = model
        self.min_lr = min_lr
        # self.save_hyperparameters(ignore=["model"])
        self.save_hyperparameters("model", logger=False)
        self.optimizer = self._get_optimizer(optimizer)
        self.lr_scheduler = (
            self._get_lr_scheduler(lr_scheduler) if user_lr_scheduler else None
        )
        self.loss_func = loss_func
        self.train_losses = []
        self.val_losses = []
        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)

    def forward(self, x, *args, **kwargs):
        return self.model(x)

    def on_train_epoch_start(self) -> None:
        param_groups = next(iter(self.optimizer.param_groups))
        if "lr" in param_groups and param_groups["lr"] is not None:
            current_learning_rate = float(param_groups["lr"])
            self.log(
                "lr",
                current_learning_rate,
                batch_size=self.batch_size,
                on_epoch=True,
                on_step=False,
            )

    def training_step(self, batch, *args, **kwargs):
        X, y = batch
        X.to(self.device)
        y.to(self.device)
        
        self.model.train()
        y_out = self(X)

        loss = self.loss_func(y_out.view(y.shape), y )
        self.train_losses.append(loss.detach().item())
        self.log(
            "train_loss",
            loss,
            prog_bar=True,
            batch_size=self.batch_size,
            on_epoch=True,
            on_step=True,
        )
        
        self.train_acc(torch.argmax(y_out, dim=1), torch.argmax(y, dim=1))
        self.log('train_acc', self.train_acc, prog_bar=True, on_epoch=True, on_step=True, batch_size=self.batch_size)
        
        return loss

    def validation_step(self, batch, *args, **kwargs):
        X, y = batch
        X.to(self.device)
        y.to(self.device)
        
        self.model.eval()
        y_out = self(X)
        loss = self.loss_func(y_out.view(y.shape), y )
        self.val_losses.append(loss.detach().item())

        self.log(
            "val_loss",
            loss,
            prog_bar=True,
            batch_size=self.batch_size,
            on_epoch=True,
            on_step=True,
        )
        
        
        self.val_acc(torch.argmax(y_out, dim=1), torch.argmax(y, dim=1))
        self.log('val_acc', self.val_acc, prog_bar=True, on_epoch=True, on_step=True, batch_size=self.batch_size)

    def configure_optimizers(self):
        if self.lr_scheduler is None:
            return self.optimizer

        return {
            "optimizer": self.optimizer,
            "lr_scheduler": {
                "scheduler": self.lr_scheduler,
                "monitor": "train_loss",
                "interval": "epoch",
                "frequency": 1,
            },
        }

    def update_learning_rate(self, learning_rate: float):
        self.learning_rate = learning_rate
        for g in self.optimizer.param_groups:
            g["lr"] = learning_rate

    def _get_optimizer(self, optimizer):
        return (
            optimizer
            if optimizer is not None
            else torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        )

    def _get_lr_scheduler(self, lr_scheduler):
        return (
            lr_scheduler
            if lr_scheduler is not None
            else torch.optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, patience=5, factor=0.5, mode="min", min_lr=self.min_lr
            )
        )


In [58]:

lr=  0.001380384264602885
# 0.00010631317724117211
output_size = 128

In [59]:
len(bert_tokenizer.vocab)

119547

In [60]:
hidden_dim = 64
embedding_dim = 64
label_size = 1

classifier_torch_model = CNN_for_Text(num_embedding=vocab_size, max_token_count=max_token_length, num_out_features=len(class_id)).to(device)
optimizer = torch.optim.Adam(classifier_torch_model.parameters(), lr=lr, weight_decay=0.00012)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60, 90, 120],gamma=0.5)
loss_func = torch.nn.BCEWithLogitsLoss()
classfier_lightning_model = ClassifierLightningModel(classifier_torch_model, 
                                                     num_classes=len(class_id),
                                            learning_rate=lr,
                                            batch_size=batch_size,
                                            optimizer=optimizer,
                                            loss_func=loss_func,
                                            lr_scheduler=lr_scheduler,
                                            user_lr_scheduler=True
                                            ).to(device)

In [61]:
# from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
import lightning as L

# from scripts.utils.CustomCallbacks.ModelCheckpoint import CustomModelCheckpoint

# callbacks = [
#         CustomModelCheckpoint(dirpath=r'models\model2_word_embedding-256-2', filename='str_embedding', every_n_epochs=1, mode='min', monitor='train_loss', save_on_train_epoch_end=True),
#         ModelCheckpoint(save_top_k=5, mode='min', monitor='train_loss', save_last=True)
#         ]
trainer = L.Trainer(
            # callbacks=callbacks,
            max_epochs=400,
            accelerator= 'gpu' if device==torch.device("cuda") else 'cpu',
            logger=CSVLogger(save_dir='logs/', name='log2'), 
            num_sanity_val_steps=0,
        #     default_root_dir='models\model2_word_embedding-256-2'
        )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [62]:
# from lightning.pytorch.tuner import Tuner
# tuner = Tuner(trainer)
# tuning_result = tuner.lr_find(classfier_lightning_model, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader, min_lr=0.00001,max_lr=0.01, num_training=100)

# fig = tuning_result.plot(suggest=True)
# fig.show()

In [1]:
# max_epochs = 1000
# trainer.fit_loop.max_epochs = max_epochs
trainer.fit(classfier_lightning_model, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader)

NameError: name 'trainer' is not defined

In [64]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from torchmetrics import ConfusionMatrix

def calculate_metrics(cl_model):
    cm = ConfusionMatrix(task="multiclass", num_classes=len(class_id))

    y_pred = []
    y_true = []

    cl_model = cl_model.eval()
    cl_model.to(device)
    for X, y in tqdm(test_dataloader):
        X = X.to(device)
        with torch.no_grad():
            y_p = cl_model(X)
            y_p = y_p.cpu()
        y_pred.append(y_p)
        y_true.append(y)
    y_pred = torch.cat(y_pred, dim=0)
    y_true = torch.cat(y_true, dim=0)
    y_pred2 = torch.argmax(y_pred, dim=1)
    y_true2 = torch.argmax(y_true, dim=1)
    print(f'classification report: \n {classification_report(y_true2, y_pred2, digits=4)}')
    print(f'confusion matrix:\n {cm(y_pred2, y_true2)}')
    print('================================')


In [65]:
classfier_lightning_model.model = classfier_lightning_model.model.eval()
classfier_lightning_model = classfier_lightning_model.eval()
calculate_metrics(classfier_lightning_model)


  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 23.26it/s][A

classification report: 
               precision    recall  f1-score   support

           0     0.6679    0.7469    0.7052       245
           1     0.8045    0.7985    0.8015       268
           2     0.9218    0.8421    0.8802       266
           3     0.7888    0.7529    0.7704       263
           4     0.5966    0.7184    0.6519       245
           5     0.6884    0.6227    0.6539       220
           6     0.7925    0.7095    0.7487       296
           7     0.8161    0.8486    0.8320       251
           8     0.6830    0.6990    0.6909       299
           9     0.6382    0.6875    0.6619       272
          10     0.7787    0.7379    0.7578       248
          11     0.8913    0.8241    0.8564       199

    accuracy                         0.7480      3072
   macro avg     0.7557    0.7490    0.7509      3072
weighted avg     0.7543    0.7480    0.7498      3072

confusion matrix:
 tensor([[183,   5,   5,   2,   6,   1,   7,   1,  29,   4,   1,   1],
        [  9, 214, 




In [35]:
my_str = "Short story.~"
{c:ord(c) for c in my_str}

{'S': 83,
 'h': 104,
 'o': 111,
 'r': 114,
 't': 116,
 ' ': 32,
 's': 115,
 'y': 121,
 '.': 46,
 '~': 126}