In [None]:
%load_ext autoreload
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
import torch
import torchtext
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torchtext.data.functional import to_map_style_dataset

# Use the GPU when one is present; otherwise fall back to CPU so the notebook
# still runs end-to-end on a machine without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fixed seed so the train/validation split is identical after every kernel restart.
SEED = 42

train_dataset, test_dataset = torchtext.datasets.AG_NEWS()

# Convert the datasets to map-style so they support len() and can be split below.
train_dataset = to_map_style_dataset(train_dataset)
test_dataset = to_map_style_dataset(test_dataset)

# Hold out 10% of the training samples for validation.
train_dataset, val_dataset = train_test_split(
    train_dataset,
    test_size=0.1,
    random_state=SEED,
)

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

In [None]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")


def build_vocabulary(datasets):
    """Yield one token list per text, across every dataset in `datasets`.

    Each dataset is iterated as two-element items; the first element is
    ignored and the second is passed to the module-level `tokenizer`.
    """
    for dataset in datasets:
        yield from (tokenizer(text) for _, text in dataset)

# NOTE(review): validation and test texts contribute tokens to the vocabulary
# here, not only the training texts — confirm this is intended.
token_stream = build_vocabulary([train_dataset, val_dataset, test_dataset])
vocab = build_vocab_from_iterator(token_stream, min_freq=1, specials=["<UNK>"])

# Tokens that are not in the vocabulary resolve to the "<UNK>" index.
unk_index = vocab["<UNK>"]
vocab.set_default_index(unk_index)
len(vocab)

In [None]:

# Human-readable class names — presumably listed in label order, so that
# position i names class index i after the `- 1` shift in vectorize_text (TODO confirm).
target_classes = ["World", "Sports", "Business", "Sci/Tech"]

# Fixed sequence length: vectorize_text pads or truncates every sample to this many tokens.
max_tokens = 50

def vectorize_text(batch):
    """Collate (label, text) pairs into fixed-length tensors placed on DEVICE.

    Args:
        batch: iterable of (label, text) pairs as handed over by the DataLoader.

    Returns:
        (xx, yy): `xx` is an int32 tensor with `max_tokens` token ids per sample,
        `yy` is the label tensor shifted down by one.
    """
    labels, texts = list(zip(*batch))

    padded_ids = []
    for text in texts:
        token_ids = vocab(tokenizer(text))
        # Cut long samples to max_tokens, then right-pad short ones with index 0.
        token_ids = token_ids[:max_tokens]
        token_ids = token_ids + [0] * (max_tokens - len(token_ids))
        padded_ids.append(token_ids)

    xx = torch.tensor(padded_ids, dtype=torch.int32).to(DEVICE)
    # Target ids arrive as [1,2,3,4]; subtracting 1 maps them onto [0,1,2,3].
    yy = (torch.tensor(labels) - 1).to(DEVICE)
    return xx, yy

# Settings shared by all three loaders; only the training loader shuffles.
loader_options = dict(batch_size=1024, collate_fn=vectorize_text)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

In [None]:
# Sanity check: show the tensor shapes of the first training batch only.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    X, Y = first_batch
    print(X.shape, Y.shape)

In [None]:
# Raw (un-shifted) label of every training sample, in dataset order.
cats = [label for label, _ in train_dataset]

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.utils import class_weight
import numpy as np

# "balanced" class weights computed from the training labels, one weight per
# distinct label in sorted order.
distinct_labels = np.unique(cats)
class_weights = class_weight.compute_class_weight(
    "balanced",
    classes=distinct_labels,
    y=cats,
)
class_weights

In [None]:
import numpy as np
from torch.utils.data.sampler import WeightedRandomSampler

cats_array = np.asarray(cats)
counts = np.bincount(cats_array)

# Raw labels start at 1, so bin 0 of the bincount is empty. Take the reciprocal
# only where a class actually occurs to avoid a divide-by-zero (inf + RuntimeWarning).
labels_weights = np.divide(
    1.0,
    counts,
    out=np.zeros(len(counts), dtype=float),
    where=counts > 0,
)

# Per-sample weight = inverse frequency of that sample's class.
weights = labels_weights[cats_array]

# NOTE(review): this sampler is not passed to any DataLoader in the visible cells.
train_sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

In [None]:
# %%export aymurai.models.decision.conv1d

import pytorch_lightning as pl
import torch.nn.functional as F
import torchmetrics
from torch import nn
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau


class Conv1dTextClassifier(pl.LightningModule):
    """Embedding + two 1-D convolutions + linear head for text classification.

    The forward pass returns per-class log-probabilities (LogSoftmax), which
    are trained with a class-weighted NLL loss.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_len: embedding dimension; also the channel count fed to conv1.
        nfeatures: stored as a hyperparameter; not used by the current layers.
        num_classes: number of output classes.
        lr_scheduler_patience: `patience` handed to ReduceLROnPlateau.

    NOTE(review): the loss weights are read from the module-level
    `class_weights` array, so that name must exist before instantiation.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_len: int = 64,
        nfeatures: int = 64,
        num_classes: int = 4,
        lr_scheduler_patience: int = 2,
    ):
        # Initialise the nn.Module machinery before assigning any attribute.
        super().__init__()
        self.save_hyperparameters()

        self.vocab_size = vocab_size
        self.embed_len = embed_len
        self.nfeatures = nfeatures
        self.num_classes = num_classes
        self.lr = 1e-3
        self.lr_scheduler_patience = lr_scheduler_patience

        # layers
        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embed_len,
        )
        self.conv1 = nn.Conv1d(self.embed_len, 64, kernel_size=7, padding="same")
        self.conv2 = nn.Conv1d(64, 32, kernel_size=7, padding="same")
        self.pooling = nn.MaxPool1d(2)

        # linear1 is not used in forward(); it stays defined so that existing
        # checkpoints (which contain its weights) still load.
        self.linear1 = nn.Linear(32, 32)
        self.linear2 = nn.Linear(32, self.num_classes)

        self.class_weights = torch.tensor(class_weights, dtype=torch.float32)

        # LogSoftmax followed by NLLLoss.
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss(weight=self.class_weights)

        # metrics
        self.accuracy = torchmetrics.Accuracy(
            task="multiclass",
            num_classes=self.num_classes,
        )
        self.f1score = torchmetrics.F1Score(
            task="multiclass",
            num_classes=self.num_classes,
        )

    def forward(self, X_batch):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            X_batch: integer token ids of shape (batch, seq_len).
        """
        x = self.embedding_layer(X_batch)  # (batch, seq_len, embed_len)
        # Conv1d wants channels first. Swap the axes with transpose; a reshape
        # would keep the memory order and mix values from different tokens
        # into each channel.
        x = x.transpose(1, 2)  # (batch, embed_len, seq_len)
        x = F.relu(self.conv1(x))
        x = self.pooling(x)
        # Functional dropout defaults to training=True, so tie it to the
        # module's mode to make eval()/test runs deterministic.
        x = F.dropout(x, 0.5, training=self.training)
        x = F.relu(self.conv2(x))
        # Global max over the sequence axis -> (batch, 32).
        x, _ = x.max(dim=-1)

        x = self.linear2(x)
        x = self.logsoftmax(x)

        return x

    def _shared_step(self, batch, prefix: str):
        """Run one batch, log loss/acc/f1score under `prefix`, return the loss."""
        x, y = batch

        y_pred = self(x)

        loss = self.loss(y_pred, y)
        acc = self.accuracy(y_pred, y)
        f1score = self.f1score(y_pred, y)

        self.log(f"{prefix}loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{prefix}acc", acc, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{prefix}f1score", f1score, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Training loop body: logs `loss`, `acc`, `f1score` and returns the loss."""
        return self._shared_step(batch, prefix="")

    def validation_step(self, batch, batch_idx):
        """Validation loop body: logs `val_loss`, `val_acc`, `val_f1score`."""
        self._shared_step(batch, prefix="val_")

    def test_step(self, batch, batch_idx):
        """Test loop body: logs `test_loss`, `test_acc`, `test_f1score`."""
        self._shared_step(batch, prefix="test_")

    def configure_optimizers(self):
        """AdamW at `self.lr`, with ReduceLROnPlateau monitoring `val_loss`."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": ReduceLROnPlateau(
                    optimizer,
                    patience=self.lr_scheduler_patience,
                ),
                "monitor": "val_loss",
                "frequency": 1
                # If "monitor" references validation metrics, then "frequency" should be set to a
                # multiple of "trainer.check_val_every_n_epoch".
            },
        }


In [None]:
# from aymurai.models.decision.conv1d import Conv1dTextClassifier

In [None]:

# Throw-away instance used only for the forward-pass smoke test in the next cell.
model = Conv1dTextClassifier(vocab_size=len(vocab), embed_len=128, num_classes=4)
# Use the shared DEVICE constant so the model sits where the collate function
# places the batches.
model = model.to(DEVICE)

In [None]:
# Smoke test: push a single batch through the untrained model and inspect the output.
# no_grad avoids building an autograd graph that is never used; calling the
# module (rather than .forward) keeps nn.Module hooks in play.
with torch.no_grad():
    for batch in train_loader:
        x, y = batch

        print(x.shape)
        b = model(x)
        print(b)
        break

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs

In [None]:
# Model instance that is actually trained below; `.to` returns the module itself.
ltmodel = Conv1dTextClassifier(
    vocab_size=len(vocab),
    embed_len=128,
    num_classes=4,
).to(DEVICE)


In [None]:
import os
import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
    EarlyStopping,
    RichProgressBar,
    RichModelSummary,
    LearningRateFinder,
    LearningRateMonitor,
    StochasticWeightAveraging,
    ModelCheckpoint,
)


CHECKPOINT_PATH = "checkpoints/pl-emb-conv/"

# Keep only the single checkpoint with the lowest val_loss.
checkpoint_callback = ModelCheckpoint(
    dirpath=CHECKPOINT_PATH,
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)

# Early stopping on val_loss with patience=10.
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=0.00,
    patience=10,
    verbose=False,
)

training_callbacks = [
    early_stopping,
    checkpoint_callback,
    LearningRateFinder(),
    LearningRateMonitor(),
    RichModelSummary(),
]

# train model
trainer = pl.Trainer(
    accelerator=DEVICE,
    devices=1,
    callbacks=training_callbacks,
    max_epochs=50,
    min_epochs=5,
)


In [None]:

# Train on the training loader and validate on the held-out split.
trainer.fit(ltmodel, train_dataloaders=train_loader, val_dataloaders=val_loader)



In [None]:

# Location and monitored score of the best checkpoint recorded by the callback.
path = checkpoint_callback.best_model_path
print(path)
print(checkpoint_callback.best_model_score)


In [None]:
# NOTE(review): this evaluates the weights currently held in memory by `ltmodel`,
# not the checkpoint at `path`; `best_model` is just another name for `ltmodel`.
# Loading from the checkpoint was previously sketched here but left disabled.
ltmodel.eval()
best_model = ltmodel

trainer.test(ltmodel, dataloaders=test_loader)


In [None]:
# Take a one-element slice of `x` (left over from an earlier cell) and display it.
xx = x[:1]
xx

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

# NOTE(review): this cell is repeated almost verbatim for the VAL and TEST
# splits; consider extracting a shared helper.
print('TRAIN')

reference = []
hypothesis = []

# Evaluation mode + no_grad: this loop only collects predictions, so no
# autograd graph is needed.
ltmodel = ltmodel.to(DEVICE).eval()
with torch.no_grad():
    for batch in train_loader:
        x, y = batch
        x = x.to(DEVICE)

        # The model returns log-probabilities; argmax over them already gives the
        # predicted class (exp() is monotonic and can underflow, so it is skipped).
        y_pred = ltmodel(x).argmax(dim=1)

        hypothesis.append(y_pred.cpu())
        reference.append(y.cpu())

reference = np.concatenate(reference)
hypothesis = np.concatenate(hypothesis)

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Fix the label set so the matrix always has one row/column per entry of
# target_classes and lines up with the tick labels.
class_indices = list(range(len(target_classes)))
confusion = confusion_matrix(reference, hypothesis, labels=class_indices)
print(confusion)
sns.heatmap(
    confusion,
    annot=True,
    fmt='d',
    ax=ax,
    xticklabels=target_classes,
    yticklabels=target_classes,
)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_title('TRAIN')

plt.tight_layout()

report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

# NOTE(review): this cell is repeated almost verbatim for the TRAIN and TEST
# splits; consider extracting a shared helper.
print('VAL')

reference = []
hypothesis = []

# Evaluation mode + no_grad: this loop only collects predictions, so no
# autograd graph is needed.
ltmodel = ltmodel.to(DEVICE).eval()
with torch.no_grad():
    for batch in val_loader:
        x, y = batch
        x = x.to(DEVICE)

        # The model returns log-probabilities; argmax over them already gives the
        # predicted class (exp() is monotonic and can underflow, so it is skipped).
        y_pred = ltmodel(x).argmax(dim=1)

        hypothesis.append(y_pred.cpu())
        reference.append(y.cpu())

reference = np.concatenate(reference)
hypothesis = np.concatenate(hypothesis)

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Fix the label set so the matrix always has one row/column per entry of
# target_classes and lines up with the tick labels.
class_indices = list(range(len(target_classes)))
confusion = confusion_matrix(reference, hypothesis, labels=class_indices)
print(confusion)
sns.heatmap(
    confusion,
    annot=True,
    fmt='d',
    ax=ax,
    xticklabels=target_classes,
    yticklabels=target_classes,
)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_title('VAL')

plt.tight_layout()

report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

# NOTE(review): this cell is repeated almost verbatim for the TRAIN and VAL
# splits; consider extracting a shared helper.
print('TEST')

reference = []
hypothesis = []

# Evaluation mode + no_grad: this loop only collects predictions, so no
# autograd graph is needed.
ltmodel = ltmodel.to(DEVICE).eval()
with torch.no_grad():
    for batch in test_loader:
        x, y = batch
        x = x.to(DEVICE)

        # The model returns log-probabilities; argmax over them already gives the
        # predicted class (exp() is monotonic and can underflow, so it is skipped).
        y_pred = ltmodel(x).argmax(dim=1)

        hypothesis.append(y_pred.cpu())
        reference.append(y.cpu())

reference = np.concatenate(reference)
hypothesis = np.concatenate(hypothesis)

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Fix the label set so the matrix always has one row/column per entry of
# target_classes and lines up with the tick labels.
class_indices = list(range(len(target_classes)))
confusion = confusion_matrix(reference, hypothesis, labels=class_indices)
print(confusion)
sns.heatmap(
    confusion,
    annot=True,
    fmt='d',
    ax=ax,
    xticklabels=target_classes,
    yticklabels=target_classes,
)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_title('TEST')

plt.tight_layout()

report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
# Number of predictions collected by the most recently executed evaluation cell.
len(hypothesis)

In [None]:
import pandas as pd

# Raw text of every test sample, in dataset order.
sentences = [text for _, text in test_dataset]


def cat2label(cat):
    """Translate a category code into a pair of booleans.

    0 -> (False, False), 1 -> (True, False), 2 -> (True, True).
    Any other value falls through and yields None.

    NOTE(review): the only call site visible in this notebook is commented out.
    """
    code_to_flags = (
        (0, (False, False)),
        (1, (True, False)),
        (2, (True, True)),
    )
    for code, flags in code_to_flags:
        if cat == code:
            return flags
    return None


# One row per test sentence: true class (`cat`) next to the predicted class (`pred_cat`).
# NOTE(review): `reference`/`hypothesis` come from whichever evaluation cell ran
# last; the row alignment with `sentences` assumes that was the TEST cell.
columns = dict(sentence=sentences, cat=reference, pred_cat=hypothesis)
df = pd.DataFrame(columns)

df


In [None]:
# Widen pandas' display limits; a max_colwidth of None removes the per-cell
# truncation so full sentences are shown.
display_options = {
    "display.max_columns": 1000,
    "display.width": 1000,
    "display.max_colwidth": None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


In [None]:
# Flag the rows whose predicted class matches the reference class.
df["pred_ok"] = df["cat"].eq(df["pred_cat"])

In [None]:
# Show only the misclassified test sentences.
df.loc[~df["pred_ok"]]