In [None]:
%load_ext autoreload
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
DEVICE = 'cuda'

In [None]:
# Load the labelled sentences from CSV.
# NOTE(review): a later cell re-assigns `data` from "sentences-decision-manual.csv",
# so whatever is computed from this frame is replaced there — confirm this cell is still needed.
data = pd.read_csv('sentences-decision.csv')
# Names for the three categories; the order appears to match the ids 0, 1, 2
# returned by `get_category` below.
target_classes = ["none", "decision:no_hace_lugar", "decision:hace_lugar"]

def get_category(pair):
    """Map a ``(decision, hace_lugar)`` pair to a category id.

    Args:
        pair: two-element iterable ``(decision, hace_lugar)``.

    Returns:
        0 when ``decision`` is falsy,
        1 when ``decision`` is truthy and ``hace_lugar`` is falsy,
        2 when ``decision`` is truthy and ``hace_lugar == 1``.

    Raises:
        ValueError: when ``decision`` is truthy and ``hace_lugar`` is truthy
            but not equal to 1 (for instance 2 or NaN).
    """
    decision, hace_lugar = pair
    if not decision:
        return 0
    if not hace_lugar:
        return 1
    if hace_lugar == 1:
        return 2
    # A bare string cannot be raised in Python 3 (it triggers a TypeError),
    # so report the invalid pair with a real exception instead.
    raise ValueError(
        f"not valid: decision={decision!r}, hace_lugar={hace_lugar!r}"
    )

# Label every row with its category id, computed row-wise from the two flag columns.
data['category'] = data[['decision', 'hace_lugar']].apply(get_category, axis=1) 

# Keep a single row per distinct sentence text (mutates `data` in place).
data.drop_duplicates(subset='sentence', inplace=True)
print(len(data))
# Histogram of space-separated word counts per sentence, with bin edges 0, 32, ..., 288.
data['sentence'].apply(lambda x: len(x.split(' '))).hist(bins=[32*i for i in range(10)])

In [None]:
from ast import literal_eval
# Load the manually labelled sentences, keeping only the listed columns.
# This re-binds `data`, replacing any frame loaded by an earlier cell.
data = pd.read_csv("sentences-decision-manual.csv", usecols=['path', 'nro_registro', 'tomo', 'sentence', 'decision', 'hace_lugar'])


In [None]:

# Drop, in place, every row that has a missing value in any loaded column.
data.dropna(inplace=True)
# target_classes = ["none", "decision:no_hace_lugar", "decision:hace_lugar"]


def force_bool(value):
    """Return True only when `value` is one of the accepted truthy markers.

    The accepted markers are the string 'True', the boolean True, the number 1
    and the string "1"; anything else yields False.
    """
    truthy_markers = ['True', True, 1, "1"]
    return value in truthy_markers


def get_category(pair):
    """Map a ``(decision, hace_lugar)`` pair of flags to a category id.

    Args:
        pair: two-element iterable ``(decision, hace_lugar)``; only the
            truthiness of each element is used.

    Returns:
        0 when ``decision`` is falsy (``hace_lugar`` is ignored),
        1 when ``decision`` is truthy and ``hace_lugar`` is falsy,
        2 when both are truthy.
    """
    decision, hace_lugar = pair
    if not decision:
        return 0
    # Once `decision` is truthy, `hace_lugar` is either truthy or falsy, so no
    # further case exists. The previous trailing `else: raise "not valid"` was
    # unreachable, and a string is not a raisable exception anyway.
    return 2 if hace_lugar else 1


# # data[['decision', 'hace_lugar']] = data[['decision', 'hace_lugar']].apply(lambda x: literal_eval(x), axis=1).astype(bool) 
# Normalise both flag columns to booleans: only 'True', True, 1 and "1" count as True.
data['decision'] = data['decision'].apply(force_bool).astype(bool) 
data['hace_lugar'] = data['hace_lugar'].apply(force_bool).astype(bool) 
# Derive the category id (0, 1 or 2) row by row from the two flags.
data["category"] = data[["decision", "hace_lugar"]].apply(get_category, axis=1)
# get_category returns an int for every row, so this should not remove anything.
data.dropna(subset=['category'], inplace=True)

# Keep a single row per distinct sentence text (mutates `data` in place).
data.drop_duplicates(subset="sentence", inplace=True)
print(len(data))
# Histogram of space-separated word counts per sentence, with bin edges 0, 32, ..., 288.
data["sentence"].apply(lambda x: len(x.split(" "))).hist(
    bins=[32 * i for i in range(10)]
)


# Build train dataset

In [None]:
import numpy as np

# First split: 80 % train, 20 % hold-out, preserving the category proportions.
train, holdout = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["category"],
)
# Second split: halve the hold-out into test and validation. It is stratified
# like the first split so that both evaluation sets keep the same class
# proportions (the classes are imbalanced, so an unstratified split can leave
# the rare classes unevenly represented).
test, val = train_test_split(
    holdout,
    test_size=0.5,
    random_state=42,
    stratify=holdout["category"],
)


print("train:", len(train))
print("test:", len(test))
print("val:", len(val))


## class weights

In [None]:
# Report how many training sentences fall into each of the three categories.
n_train = len(train)
for cat_id in (0, 1, 2):
    n_in_cat = len(train.query(f"category == {cat_id}"))
    print(f"cat {cat_id}: {n_in_cat} from {n_train} sentences")

# manual train balance

In [None]:
# class_0 = train.query("decision == 0")
# class_1 = train.query("decision == 1")
# train = pd.concat(
#     [
#         class_0.sample(len(class_1), random_state=42),
#         class_1,
#     ]
# )

# print(f"decisiones: {len(train.query('decision'))} from {len(train)} sentences")
# print(f"hace lugar: {len(train.query('decision and hace_lugar'))} from {len(train.query('decision'))} decisiones")

In [None]:
from sklearn.utils import class_weight

# Per-class weights from scikit-learn's "balanced" mode, computed on the training labels.
# `class_weights` is read as a notebook-level global by Conv1dTextClassifier.__init__.
class_weights = class_weight.compute_class_weight(
    "balanced", classes=np.unique(train['category']), y=train['category']
)
# class_weights = {k: v for k, v in enumerate(class_weights)}
class_weights

In [None]:
import numpy as np
from torch.utils.data.sampler import WeightedRandomSampler

def _make_balanced_sampler(categories):
    """Build a sampler that draws every category with equal total probability.

    Each row is weighted by ``1 / (number of rows in its category)`` and rows
    are drawn with replacement, ``len(categories)`` draws per pass.

    Args:
        categories: non-negative integer category id of every row.

    Returns:
        A ``WeightedRandomSampler`` over the row positions.
    """
    per_class_weight = 1. / np.bincount(categories)
    per_row_weight = per_class_weight[categories]
    return WeightedRandomSampler(per_row_weight, len(per_row_weight), replacement=True)


# The same construction was previously written out twice; both samplers now
# come from the single helper above.
train_sampler = _make_balanced_sampler(train['category'])
val_sampler = _make_balanced_sampler(val['category'])

# build vocab

In [None]:
import spacy
from torchtext.vocab import build_vocab_from_iterator

# Blank Spanish spaCy pipeline; only its tokenizer is used by `encode_text` below.
nlp = spacy.blank("es")


def encode_text(text):
    """Lower-case `text` and return the list of its token strings.

    Tokenization is delegated to the tokenizer of the notebook-level `nlp`.
    """
    lowered = text.lower()
    token_texts = []
    for token in nlp.tokenizer(lowered):
        token_texts.append(token.text)
    return token_texts



def yield_tokens(data_iter):
    """Lazily yield the token list of every text in `data_iter`, in order."""
    yield from (encode_text(sentence) for sentence in data_iter)


# Build the vocabulary from the training sentences only, with "<unk>" as a special token.
vocab = build_vocab_from_iterator(yield_tokens(train["sentence"]), specials=["<unk>"])
# Tokens missing from the vocabulary are mapped to the index of "<unk>".
vocab.set_default_index(vocab["<unk>"])
len(vocab)

In [None]:
%%export aymurai.models.decision.torch.tokenizer

import spacy
import torch

class Tokenizer:
    """Turn raw text into fixed-length tensors of vocabulary indices.

    Text is lower-cased, split with a blank Spanish spaCy tokenizer, mapped to
    indices through `vocab`, truncated to `max_len` tokens and right-padded
    with zeros.

    Args:
        vocab: callable that maps a list of token strings to a list of
            integer indices.
        max_len: length of every encoded sequence (defaults to 128).
    """

    def __init__(self, vocab, max_len: int = 128):
        self.max_len = max_len
        self.nlp = spacy.blank('es')
        self.vocab = vocab

    def save(self, path: str):
        """Serialize the vocabulary to `path`; `max_len` is not stored."""
        torch.save(self.vocab, path)

    @classmethod
    def load(cls, path: str, max_len: int = 128):
        """Rebuild a tokenizer from a vocabulary written by `save`.

        Args:
            path: file previously produced by `save`.
            max_len: sequence length for the new tokenizer, since it is not
                part of the saved file.
        """
        # NOTE(review): torch.load unpickles arbitrary objects; only load
        # files from a trusted source.
        vocab = torch.load(path)
        return cls(vocab=vocab, max_len=max_len)

    def tokenize(self, text: str):
        """Return the token strings of the lower-cased `text`."""
        return [t.text for t in self.nlp.tokenizer(text.lower())]

    def encode(self, text: str):
        """Encode one text as an int64 tensor of shape ``(max_len,)``."""
        tokens = self.tokenize(text)[:self.max_len]
        indices = torch.tensor(self.vocab(tokens), dtype=torch.int64)
        # Right-pad with 0 up to max_len.
        # NOTE(review): presumably index 0 is also a real vocabulary entry, so
        # padding is indistinguishable from that token — confirm this is intended.
        return torch.nn.functional.pad(indices, (0, self.max_len - len(indices)))

    def encode_batch(self, texts):
        """Encode every text and stack them into ``(len(texts), max_len)``."""
        return torch.stack([self.encode(text) for text in texts])
    
    

In [None]:
import torch
from torch.utils.data import Dataset

class DFtoDataset(Dataset):
    """Map-style dataset that pairs each text with its target by position."""

    def __init__(self, texts: list[str], targets: list[int]):
        self.x_, self.y_ = texts, targets

    def __len__(self):
        # The dataset size is taken from the targets container.
        return len(self.y_)

    def __getitem__(self, idx):
        text = self.x_[idx]
        target = self.y_[idx]
        return text, target


In [None]:
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset
from aymurai.models.decision.torch.tokenizer import Tokenizer

# Wrap the sentence and category columns of each split as (text, label) datasets.
train_dataset = DFtoDataset(train["sentence"].values, train["category"].values)
val_dataset = DFtoDataset(val["sentence"].values, val["category"].values)
test_dataset = DFtoDataset(test["sentence"].values, test["category"].values)

# NOTE(review): DFtoDataset already defines __len__ and __getitem__, so this
# conversion looks redundant — confirm before removing.
train_dataset = to_map_style_dataset(train_dataset)
val_dataset = to_map_style_dataset(val_dataset)
test_dataset = to_map_style_dataset(test_dataset)

# `vectorize_batch` below looks this tokenizer up as a notebook-level global.
tokenizer = Tokenizer(vocab)


def vectorize_batch(batch):
    """Collate (sentence, category) pairs into two tensors placed on DEVICE.

    Returns the encoded sentences (via the notebook-level `tokenizer`) and a
    tensor holding the categories.
    """
    sentences, categories = zip(*batch)

    token_ids = tokenizer.encode_batch(sentences).to(DEVICE)
    targets = torch.tensor(categories, device=DEVICE)
    return token_ids, targets


# Training batches are drawn through `train_sampler` (class-balanced, with
# replacement), which is why `shuffle` is left disabled here.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    collate_fn=vectorize_batch,
    # shuffle=True,
    sampler=train_sampler,
)
# Validation iterates the split in order; the balanced sampler is not used.
val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    collate_fn=vectorize_batch,
    # sampler=val_sampler,
    # shuffle=True,
)
# Test iterates the split in order.
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    collate_fn=vectorize_batch,
)


In [None]:
# Persist the tokenizer's vocabulary to the working directory.
tokenizer.save('tokenizer.pth')

In [None]:
# Round-trip check: reload the tokenizer from the file written above.
# This re-binds the global `tokenizer` that `vectorize_batch` reads.
tokenizer = Tokenizer.load('tokenizer.pth')

In [None]:
# Peek at the first validation batch as an (inputs, targets) pair.
next(iter(val_loader))

In [None]:
# Show the targets of one sampled training batch to eyeball the class balance.
next(iter(train_loader))[1]

In [None]:
# %%export aymurai.models.decision.torch.conv1d

# import pytorch_lightning as pl
# import torch.nn.functional as F
# import torchmetrics
# from torch import nn
# import torch


# class Conv1dTextClassifier(pl.LightningModule):
#     def __init__(
#         self,
#         vocab_size: int,
#         embed_len: int = 64,
#         nfeatures: int = 64,
#         num_classes: int = 3,
#     ):
#         self.vocab_size = vocab_size
#         self.embed_len = embed_len
#         self.nfeatures = nfeatures
#         self.num_classes = num_classes
#         self.lr = 1e-3

#         super().__init__()
#         self.save_hyperparameters()

#         # layers
#         self.embedding_layer = nn.Embedding(
#             num_embeddings=self.vocab_size,
#             embedding_dim=self.embed_len,
#         )
#         self.conv1 = nn.Conv1d(self.embed_len, self.nfeatures, kernel_size=7, padding="same")
#         self.linear1 = nn.Linear(self.nfeatures, 32)
#         self.linear2 = nn.Linear(32, self.num_classes)
#         # self.linear = nn.Linear(self.nfeatures, self.num_classes)

#         self.class_weights = torch.tensor(class_weights, dtype=torch.float32)
#         self.loss = nn.CrossEntropyLoss(weight=self.class_weights)

#         # metrics
#         self.accuracy = torchmetrics.Accuracy(
#             task="multiclass",
#             num_classes=self.num_classes,
#         )
#         self.f1score = torchmetrics.F1Score(
#             task="multiclass",
#             num_classes=self.num_classes,
#         )

#     def forward(self, X_batch):
#         x = self.embedding_layer(X_batch)
#         x = x.reshape(len(x), self.embed_len, 128) ## Embedding Length needs to be treated as channel dimension
#         x = F.relu(self.conv1(x))
#         x, _ = x.max(dim=-1)

#         x = self.linear1(x)
#         x = self.linear2(x)
#         # x = self.linear(x)
#         # x = F.linear(x, torch.tensor([self.nfeatures, 32]))
#         # x = F.linear(x, torch.tensor([32, self.num_classes]))
#         y_hat = F.log_softmax(x)

#         return y_hat

#     def training_step(self, batch, batch_idx):
#         # training_step defines the train loop.
#         x, y = batch

#         y_pred = self.forward(x)

#         # loss = F.cross_entropy(y_pred, y, weight=self.class_weights)
#         loss = self.loss(y_pred, y)
#         acc = self.accuracy(y_pred, y)
#         f1score = self.f1score(y_pred, y)

#         self.log("loss", loss, on_epoch=True, prog_bar=True, logger=True)
#         self.log("acc", acc, on_epoch=True, prog_bar=True, logger=True)
#         self.log("f1score", f1score, on_epoch=True, prog_bar=True, logger=True)
#         return loss

#     def validation_step(self, batch, batch_idx):
#         x, y = batch

#         y_pred = self.forward(x)

#         # loss = F.cross_entropy(y_pred, y)
#         loss = self.loss(y_pred, y)
#         acc = self.accuracy(y_pred, y)
#         f1score = self.f1score(y_pred, y)

#         self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
#         self.log("val_acc", acc, on_epoch=True, prog_bar=True, logger=True)
#         self.log("val_f1score", f1score, on_epoch=True, prog_bar=True, logger=True)

#     def test_step(self, batch, batch_idx):
#         x, y = batch

#         y_pred = self.forward(x)

#         # loss = F.cross_entropy(y_pred, y)
#         loss = self.loss(y_pred, y)
#         acc = self.accuracy(y_pred, y)
#         f1score = self.f1score(y_pred, y)

#         self.log("test_loss", loss, on_epoch=True, prog_bar=True, logger=True)
#         self.log("test_acc", acc, on_epoch=True, prog_bar=True, logger=True)
#         self.log("test_f1score", f1score, on_epoch=True, prog_bar=True, logger=True)

#     def configure_optimizers(self):
#         # optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
#         optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
#         return optimizer


In [None]:
# %%export aymurai.models.decision.torch.conv1d

import pytorch_lightning as pl
import torch.nn.functional as F
import torchmetrics
from torch import nn
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau


class Conv1dTextClassifier(pl.LightningModule):
    """Embedding + two 1-D convolutions + linear head for sentence classification.

    The forward pass returns log-probabilities over `num_classes`; the loss is
    a class-weighted negative log-likelihood.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_len: embedding dimension, used as the input channels of the
            first convolution.
        nfeatures: stored as an attribute and hyperparameter; not used by the
            layers defined here.
        num_classes: number of output classes.
        lr_scheduler_patience: `patience` passed to ReduceLROnPlateau.

    Note:
        The loss weights are read from the notebook-level `class_weights`
        variable, which must exist when the model is instantiated.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_len: int = 64,
        nfeatures: int = 64,
        num_classes: int = 3,
        lr_scheduler_patience: int = 2,
    ):
        # Initialise the module machinery before assigning any attribute.
        super().__init__()
        self.save_hyperparameters()

        self.vocab_size = vocab_size
        self.embed_len = embed_len
        self.nfeatures = nfeatures
        self.num_classes = num_classes
        self.lr = 1e-3
        self.lr_scheduler_patience = lr_scheduler_patience

        # layers
        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embed_len,
        )
        self.conv1 = nn.Conv1d(self.embed_len, 64, kernel_size=7, padding="same")
        self.conv2 = nn.Conv1d(64, 32, kernel_size=7, padding="same")
        self.pooling = nn.MaxPool1d(2)

        # linear1 is not used in forward(); it is kept so the parameter set
        # (and therefore saved checkpoints) stays unchanged.
        self.linear1 = nn.Linear(32, 32)
        self.linear2 = nn.Linear(32, self.num_classes)

        self.class_weights = torch.tensor(class_weights, dtype=torch.float32)

        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss(weight=self.class_weights)

        # metrics
        self.accuracy = torchmetrics.Accuracy(
            task="multiclass",
            num_classes=self.num_classes,
        )
        self.f1score = torchmetrics.F1Score(
            task="multiclass",
            num_classes=self.num_classes,
        )

    def forward(self, X_batch):
        """Return log-probabilities of shape ``(batch, num_classes)``.

        Args:
            X_batch: integer token indices of shape ``(batch, seq_len)``.
        """
        x = self.embedding_layer(X_batch)  # (batch, seq_len, embed_len)
        # Conv1d expects channels first. The axes must be swapped with permute:
        # a reshape to (batch, embed_len, seq_len) keeps the memory order and
        # would mix values of different tokens inside each channel. Permuting
        # also removes the dependency on a fixed sequence length of 128.
        x = x.permute(0, 2, 1)  # (batch, embed_len, seq_len)
        x = F.relu(self.conv1(x))
        x = self.pooling(x)
        # F.dropout is active by default; tie it to the module mode so that
        # predictions are deterministic after .eval().
        x = F.dropout(x, 0.5, training=self.training)
        x = F.relu(self.conv2(x))
        # Global max over the sequence axis.
        x, _ = x.max(dim=-1)

        x = self.linear2(x)
        x = self.logsoftmax(x)

        return x

    def _evaluate_batch(self, batch, prefix: str):
        """Run one batch, log loss/accuracy/F1 under `prefix`, return the loss."""
        x, y = batch

        y_pred = self.forward(x)

        loss = self.loss(y_pred, y)
        acc = self.accuracy(y_pred, y)
        f1score = self.f1score(y_pred, y)

        self.log(f"{prefix}loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{prefix}acc", acc, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{prefix}f1score", f1score, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Log "loss", "acc", "f1score" and return the loss to optimise."""
        return self._evaluate_batch(batch, prefix="")

    def validation_step(self, batch, batch_idx):
        """Log "val_loss", "val_acc" and "val_f1score"."""
        self._evaluate_batch(batch, prefix="val_")

    def test_step(self, batch, batch_idx):
        """Log "test_loss", "test_acc" and "test_f1score"."""
        self._evaluate_batch(batch, prefix="test_")

    def configure_optimizers(self):
        """AdamW with a ReduceLROnPlateau scheduler driven by "val_loss"."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": ReduceLROnPlateau(
                    optimizer,
                    patience=self.lr_scheduler_patience,
                ),
                "monitor": "val_loss",
                "frequency": 1
                # If "monitor" references validation metrics, then "frequency" should be set to a
                # multiple of "trainer.check_val_every_n_epoch".
            },
        }


In [None]:
# from aymurai.models.decision.torch.conv1d import Conv1dTextClassifier

In [None]:

# Throwaway instance used for the forward-pass smoke test in the next cell.
model = Conv1dTextClassifier(vocab_size=len(vocab))
# Use the shared DEVICE constant so the model lands on the same device as the
# batches produced by `vectorize_batch`.
model = model.to(DEVICE)

In [None]:
# Smoke test: push a single training batch through the untrained model and
# print the input shape and the model output.
# `x` and `y` remain defined at notebook scope afterwards; a later cell slices `x`.
for batch in train_loader:
    x, y = batch
    
    print(x.shape)
    b = model.forward(x)
    print(b)
    break

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs

In [None]:
# Model that is actually trained; the number of classes is taken from the
# distinct categories present in the training split.
ltmodel = Conv1dTextClassifier(vocab_size=len(vocab), num_classes=len(np.unique(train['category'])))
ltmodel = ltmodel.to(DEVICE)


In [None]:
import os
import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
    EarlyStopping,
    RichProgressBar,
    RichModelSummary,
    LearningRateFinder,
    LearningRateMonitor,
    StochasticWeightAveraging,
    ModelCheckpoint,
)


# Directory where the checkpoint callback writes its files.
CHECKPOINT_PATH = "checkpoints/pl-emb-conv/"
# Keep only the single checkpoint with the lowest "val_loss".
checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    monitor="val_loss",
    mode="min",
    dirpath=CHECKPOINT_PATH,
    # filename="{epoch}-{val_loss:.2f}-{other_metric:.2f}",
)

# Configure the trainer; training itself is started by `trainer.fit` in the next cell.
trainer = pl.Trainer(
    accelerator=DEVICE,
    devices=1,
    callbacks=[
        # Stop when "val_loss" has not improved for 10 validation checks.
        EarlyStopping(
            monitor="val_loss",
            mode="min",
            min_delta=0.00,
            patience=10,
            verbose=False,
        ),
        checkpoint_callback,
        # StochasticWeightAveraging(swa_lrs=1e-2),
        LearningRateFinder(),
        LearningRateMonitor(),
        RichModelSummary(),
        # RichProgressBar(),
    ],
    max_epochs=50,
    min_epochs=5,
)


In [None]:

# Train `ltmodel` on the balanced training loader, validating on `val_loader`.
trainer.fit(
    model=ltmodel,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)



In [None]:

print(checkpoint_callback.best_model_path)  # prints path to the best model's checkpoint
print(checkpoint_callback.best_model_score)  # and prints its score
# Keep the best checkpoint path for the (currently commented-out) reload in the next cell.
path = checkpoint_callback.best_model_path


In [None]:
# NOTE(review): the seed is fixed here, after training has already run — confirm
# whether it was meant to be set before `trainer.fit`.
pl.seed_everything(42)
# path = '/workspace/notebooks/experiments/decision/test/conv/model.ckpt'
# path = '/workspace/notebooks/experiments/decision/checkpoints/pl-emb-conv/epoch=38-step=6981.ckpt'
# best_model = ltmodel.load_from_checkpoint(path, map_location='cpu')
# NOTE(review): despite its name, `best_model` is the in-memory `ltmodel` as left
# by training, not the checkpoint stored at `path` — confirm this is intended.
best_model = ltmodel.eval()
best_model.eval()

# Evaluate on the held-out test loader.
trainer.test(ltmodel, dataloaders=test_loader)


In [None]:
# Take the first row of the most recently assigned `x` as a one-example batch.
xx = x[:1]
xx

In [None]:
# Predict the category of the single example. The input is first moved to the
# device the model currently lives on, because `xx` was sliced from a batch
# created on DEVICE while the model may have been moved by the trainer.
with torch.no_grad():
    a = best_model(xx.to(best_model.device)).exp().argmax(axis=1)
    print(a)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

print('TRAIN')

# Encode the whole split and place it on the model's current device.
x = tokenizer.encode_batch(train['sentence'].iloc[:]).to(best_model.device)
# Inference only: skip autograd bookkeeping, then hand plain NumPy labels to
# scikit-learn instead of a torch tensor.
with torch.no_grad():
    hypothesis = best_model(x).argmax(axis=1).cpu().numpy()
reference = train['category']

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Rows are the reference categories, columns the predicted ones.
confusion =  confusion_matrix(reference, hypothesis)
print(confusion)
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
# ax.set_xticklabels(["None", "desicion/no_hace_lugar", "descion/hace_lugar"])
# ax.set_yticklabels(["None", "desicion/no_hace_lugar", "descion/hace_lugar"])
ax.set_title('TRAIN')

plt.tight_layout()

# Per-class precision/recall/F1 as a table.
report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

print('VAL')

# Encode the whole split and place it on the model's current device.
x = tokenizer.encode_batch(val['sentence'].iloc[:]).to(best_model.device)
# Inference only: skip autograd bookkeeping, then hand plain NumPy labels to
# scikit-learn instead of a torch tensor.
with torch.no_grad():
    hypothesis = best_model(x).argmax(axis=1).cpu().numpy()
reference = val['category']

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Rows are the reference categories, columns the predicted ones.
confusion =  confusion_matrix(reference, hypothesis)
print(confusion)
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
# ax.set_xticklabels(["None", "desicion/no_hace_lugar", "descion/hace_lugar"])
# ax.set_yticklabels(["None", "desicion/no_hace_lugar", "descion/hace_lugar"])
ax.set_title('VAL')

plt.tight_layout()

# Per-class precision/recall/F1 as a table.
report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

print('TEST')

# Encode the whole split and place it on the model's current device.
x = tokenizer.encode_batch(test['sentence'].iloc[:]).to(best_model.device)
# Inference only: skip autograd bookkeeping, then convert to plain NumPy labels
# so that scikit-learn and the DataFrame column built from `hypothesis` in the
# next cell receive ordinary integers rather than a torch tensor.
with torch.no_grad():
    hypothesis = best_model(x).argmax(axis=1).cpu().numpy()
reference = test['category']

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Rows are the reference categories, columns the predicted ones.
confusion =  confusion_matrix(reference, hypothesis)
print(confusion)
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
# ax.set_xticklabels(["None", "desicion/no_hace_lugar", "descion/hace_lugar"])
# ax.set_yticklabels(["None", "desicion/no_hace_lugar", "descion/hace_lugar"])
ax.set_title('TEST')

plt.tight_layout()

# Per-class precision/recall/F1 as a table.
report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
# Work on a copy of the test split so the original frame stays untouched.
test_ = test.copy()
# `hypothesis` holds the predictions computed for the test split in the previous cell.
test_["pred_cat"] = hypothesis

def cat2label(cat):
    """Translate a category id into its ``(decision, hace_lugar)`` pair.

    0 -> (False, False), 1 -> (True, False), 2 -> (True, True).
    Any other value yields None.
    """
    known_labels = (
        (0, (False, False)),
        (1, (True, False)),
        (2, (True, True)),
    )
    # Compare with == (rather than a dict lookup) so array and tensor scalars
    # match the same way plain ints do.
    for code, label in known_labels:
        if cat == code:
            return label
    return None

# Expand every predicted category into its two boolean flag columns.
test_[['pred_decision', 'pred_hace_lugar']] = [cat2label(d) for d in test_['pred_cat']]
test_

In [None]:
# Widen the pandas display so the following tables show full sentences:
# up to 1000 columns, 1000 characters per line, and untruncated cell contents.
pd.set_option(
    "display.max_columns",
    1000,
    "display.width",
    1000,
    "display.max_colwidth",
    None,
)


In [None]:
# Flag the rows whose predicted category equals the labelled one.
test_['pred_ok'] = test_['category'] == test_['pred_cat']
# test_

In [None]:
# test_.query('decision == 0 and pred_decision and not pred_hace_lugar').sample(1)
# Show the correctly predicted rows that are labelled as decisions.
test_.query('pred_ok == 1 and decision')