In [None]:
import torch
import torchtext

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Device used for tensors, the model and the Lightning accelerator.
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the labelled sentences; later cells read the 'sentence', 'decision' and
# 'hace_lugar' columns of this frame.
data = pd.read_csv('sentences-decision.csv')
# Human-readable names for category ids 0, 1 and 2 (list position == category id).
target_classes = ["none", "decision:no_hace_lugar", "decision:hace_lugar"]

def get_category(pair):
    """Map a ``(decision, hace_lugar)`` pair to an integer category id.

    Args:
        pair: Two-item iterable ``(decision, hace_lugar)``, e.g. one row of
            ``data[['decision', 'hace_lugar']]``.

    Returns:
        ``0`` when ``decision`` is falsy; ``1`` when ``decision`` is truthy and
        ``hace_lugar`` is either falsy or equal to 1.

    Raises:
        ValueError: If ``decision`` is truthy and ``hace_lugar`` is truthy but
            not equal to 1 (this includes NaN).
    """
    decision, hace_lugar = pair
    if not decision:
        return 0
    if not hace_lugar or hace_lugar == 1:
        # NOTE(review): both decision outcomes map to category 1, so category 2
        # is never produced. The classifier further down is binary, so this
        # collapse looks deliberate -- confirm before switching to 3 classes.
        return 1
    # A bare string cannot be raised in Python 3; use a real exception type.
    raise ValueError(
        f"not valid: decision={decision!r}, hace_lugar={hace_lugar!r}"
    )

# Derive the integer category of every row from its (decision, hace_lugar) pair.
data['category'] = data[['decision', 'hace_lugar']].apply(get_category, axis=1)

# Keep a single row per distinct sentence text.
data = data.drop_duplicates(subset='sentence')
print(len(data))

# Histogram of sentence lengths (space-separated words), in 32-word wide bins.
words_per_sentence = data['sentence'].apply(lambda x: len(x.split(' ')))
words_per_sentence.hist(bins=list(range(0, 320, 32)))

# Build train dataset

In [None]:
import numpy as np

# Step 1: keep 80% for training, stratified so the category mix is preserved.
train, holdout = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["category"],
)
# Step 2: halve the held-out 20% into test and validation.
# The split must be taken from the holdout, not from `train`; otherwise test and
# val would be subsets of the training data and every metric would be inflated.
test, val = train_test_split(
    holdout,
    test_size=0.5,
    random_state=42,
)

# Sanity checks: the three splits partition `data` and training rows never leak.
assert len(train) + len(test) + len(val) == len(data)
assert train.index.intersection(holdout.index).empty

print("train:", len(train))
print("test:", len(test))
print("val:", len(val))


## class weights

In [None]:
# Report how many training sentences carry each category id (0, 1, 2).
n_train = len(train)
for cat in range(3):
    n_cat = len(train[train['category'] == cat])
    print(f"cat {cat}: {n_cat} from {n_train} sentences")

In [None]:
from sklearn.utils import class_weight

# "balanced" weights for the categories that actually occur in the training split;
# the array is ordered like np.unique(train['category']).
train_categories = train['category']
class_weights = class_weight.compute_class_weight(
    "balanced",
    classes=np.unique(train_categories),
    y=train_categories,
)
class_weights

In [None]:
import numpy as np
from torch.utils.data.sampler import WeightedRandomSampler

# Give every training row a sampling weight of 1 / (size of its category), then
# draw len(train) rows per epoch with replacement using those weights.
train_labels = train['category'].to_numpy()
counts = np.bincount(train_labels)
labels_weights = 1. / counts
weights = labels_weights[train_labels]
sampler = WeightedRandomSampler(
    weights=weights,
    num_samples=len(weights),
    replacement=True,
)
sampler

# build vocab

In [None]:
import spacy
from torchtext.vocab import build_vocab_from_iterator

# Blank Spanish spaCy pipeline; only its tokenizer is used below (see encode_text).
nlp = spacy.blank("es")


def encode_text(text):
    """Split ``text`` into a list of token strings with the ``nlp`` tokenizer."""
    token_texts = []
    for token in nlp.tokenizer(text):
        token_texts.append(token.text)
    return token_texts



def yield_tokens(data_iter):
    """Lazily yield the token list of each text in ``data_iter``, in order."""
    yield from map(encode_text, data_iter)


# Build the vocabulary from the training sentences only, with "<unk>" as the
# single special token.
vocab = build_vocab_from_iterator(yield_tokens(train["sentence"]), specials=["<unk>"])
# Tokens missing from the vocabulary are looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])


In [None]:
import torch

class Tokenizer(object):
    """Turn raw text into fixed-length tensors of vocabulary indices.

    Text is split with a blank Spanish spaCy tokenizer, truncated to
    ``max_len`` tokens, mapped through ``vocab`` and right-padded with index 0.

    Args:
        vocab: Callable mapping a list of token strings to a list of integer
            indices (a torchtext ``Vocab`` in this notebook).
        max_len: Length of every encoded sequence.
    """

    def __init__(self, vocab, max_len: int = 128):
        self.nlp = spacy.blank('es')
        self.vocab = vocab
        self.max_len = max_len

    def save(self, path: str):
        """Serialize the vocabulary to ``path`` (``max_len`` is not stored)."""
        # torch.save needs the object as its first argument and the path second.
        torch.save(self.vocab, path)

    @classmethod
    def load(cls, path: str, max_len: int = 128):
        """Build a tokenizer from a vocabulary written by :meth:`save`.

        Args:
            path: File previously produced by :meth:`save`.
            max_len: Sequence length of the new tokenizer. It is not part of
                the saved file, so pass it again if it was not the default.
        """
        # NOTE: torch.load unpickles the file; only load files you trust.
        vocab = torch.load(path)
        return cls(vocab=vocab, max_len=max_len)

    def tokenize(self, text: str):
        """Return the token strings of ``text``."""
        return [t.text for t in self.nlp.tokenizer(text)]

    def encode(self, text: str):
        """Encode one text as an int64 tensor of shape ``(max_len,)``."""
        tokens = self.tokenize(text)[:self.max_len]
        indices = torch.tensor(self.vocab(tokens), dtype=torch.int64)
        # Right-pad with zeros up to max_len.
        # NOTE(review): index 0 presumably is "<unk>" for the vocab built in
        # this notebook, so padding and unknown tokens share an id -- confirm.
        return torch.nn.functional.pad(indices, (0, self.max_len - len(indices)))

    def encode_batch(self, texts):
        """Encode an iterable of texts as a ``(len(texts), max_len)`` tensor."""
        encoded = [self.encode(text) for text in texts]
        if not encoded:
            # torch.stack rejects an empty list; return an empty batch instead.
            return torch.empty((0, self.max_len), dtype=torch.int64)
        return torch.stack(encoded)
    
    

In [None]:
# Tokenizer over the training vocabulary with the default max_len of 128.
tokenizer = Tokenizer(vocab)

In [None]:
# Quick check: encoding pads every text to max_len, so this shows the fixed length.
tokenizer.encode('hola como estas').shape

In [None]:
import torch
from torch.utils.data import Dataset

class DFtoDataset(Dataset):
    """Map-style dataset that pairs raw texts with their targets.

    Items are returned untouched as ``(text, target)``; no tokenization happens
    here. ``max_tokens`` is stored on the instance and ``device`` is accepted
    but not used by this class.
    """

    def __init__(self, texts: list[str], targets: list[int], max_tokens=128, device = 'cuda'):
        self.x_, self.y_ = texts, targets
        self.max_tokens = max_tokens

    def __len__(self):
        """Number of samples, taken from the targets."""
        return len(self.y_)

    def __getitem__(self, idx):
        """Return the ``(text, target)`` pair stored at position ``idx``."""
        sample = (self.x_[idx], self.y_[idx])
        return sample


In [None]:
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

def _split_to_dataset(split):
    """Wrap the sentence/category columns of ``split`` as a map-style dataset."""
    pairs = DFtoDataset(split["sentence"].values, split["category"].values)
    return to_map_style_dataset(pairs)


train_dataset = _split_to_dataset(train)
val_dataset = _split_to_dataset(val)
test_dataset = _split_to_dataset(test)

# Tokenizer used by the collate function to turn sentences into index tensors.
tokenizer = Tokenizer(vocab, max_len=128)


def vectorize_batch(batch):
    """Collate ``(text, label)`` samples into ``(inputs, targets)`` tensors on DEVICE.

    Texts are encoded with the module-level ``tokenizer``; labels become a
    tensor created directly on ``DEVICE``.
    """
    texts, labels = zip(*batch)

    inputs = tokenizer.encode_batch(texts).to(DEVICE)
    targets = torch.tensor(labels, device=DEVICE)
    return inputs, targets


# One batch size shared by all three loaders.
BATCH_SIZE = 24

# Training batches are drawn by the class-balancing weighted sampler.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=vectorize_batch,
    sampler=sampler,
)
# Evaluation loaders iterate in a fixed order: shuffling validation data has no
# benefit and Lightning warns against it.
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=vectorize_batch,
    shuffle=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=vectorize_batch,
)


In [None]:
# Pull one collated batch from the training loader to inspect its (x, y) tensors.
next(iter(train_loader))

# torch lightning

In [None]:
# Labels of one sampled training batch (element 1 of the (x, y) pair).
next(iter(train_loader))[1]

In [None]:
import pytorch_lightning as pl
import torch.nn.functional as F
import torchmetrics
from torch import nn


class TextClassifier(pl.LightningModule):
    """Embedding -> LSTM -> linear binary sentence classifier.

    The network emits one logit per sentence and is trained with
    ``BCEWithLogitsLoss``. The vocabulary size and the positive-class weight
    are read from the notebook-level ``vocab`` and ``class_weights``.

    Args:
        embed_len: Dimension of the token embeddings.
        hidden_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        num_classes: Stored on the instance only; the output layer always has a
            single unit.
    """

    def __init__(
        self,
        embed_len: int = 50,
        hidden_dim: int = 50,
        n_layers: int = 1,
        num_classes: int = 3,
    ):
        # Initialise the module machinery before assigning any attribute.
        super().__init__()

        self.embed_len = embed_len
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.num_classes = num_classes
        self.lr = 1e-3  # learning rate read by configure_optimizers

        # layers
        self.embedding_layer = nn.Embedding(
            num_embeddings=len(vocab),
            embedding_dim=self.embed_len,
        )
        self.lstm = nn.LSTM(
            input_size=self.embed_len,
            hidden_size=self.hidden_dim,
            num_layers=self.n_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(self.hidden_dim, 1)

        # Weight of the positive class (category 1) in the loss.
        # NOTE(review): the training loader already rebalances classes with a
        # weighted sampler, so this may over-correct -- confirm it is intended.
        self.class_weights = torch.tensor(class_weights[1], dtype=torch.float32)
        self.loss = nn.BCEWithLogitsLoss(pos_weight=self.class_weights)

        # metrics
        self.accuracy = torchmetrics.Accuracy(task="binary")
        self.f1score = torchmetrics.F1Score(task="binary", average='macro')

    def forward(self, X_batch):
        """Return one logit per sequence, shape ``(batch, 1)``.

        Args:
            X_batch: Integer token indices of shape ``(batch, seq_len)``.
        """
        embeddings = self.embedding_layer(X_batch)
        # No explicit initial state: nn.LSTM then starts from zeros on the
        # input's device. Random initial states made every prediction noisy,
        # even in eval mode, and tied the model to one hard-coded device.
        output, _ = self.lstm(embeddings)
        # Classify from the LSTM output at the last time step.
        # NOTE(review): inputs are right-padded by the tokenizer, so for short
        # sentences this step is reached after a run of padding tokens.
        return self.linear(output[:, -1])

    def _shared_step(self, batch, prefix: str):
        """Compute and log loss, accuracy and F1 for one batch.

        Args:
            batch: ``(x, y)`` pair as produced by the data loaders.
            prefix: String prepended to the logged metric names ("" for
                training, "val_" / "test_" otherwise).

        Returns:
            The scalar loss tensor.
        """
        x, y = batch
        y = y.type(torch.float32)  # BCEWithLogitsLoss expects float targets

        logits = self(x).reshape(-1)  # (batch, 1) -> (batch,)

        loss = self.loss(logits, y)
        acc = self.accuracy(logits, y)
        f1score = self.f1score(logits, y)

        self.log(f"{prefix}loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{prefix}acc", acc, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{prefix}f1score", f1score, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Run one training batch; logs "loss", "acc", "f1score"."""
        return self._shared_step(batch, "")

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; logs "val_loss", "val_acc", "val_f1score"."""
        self._shared_step(batch, "val_")

    def test_step(self, batch, batch_idx):
        """Run one test batch; logs "test_loss", "test_acc", "test_f1score"."""
        self._shared_step(batch, "test_")

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters with rate ``self.lr``."""
        return torch.optim.AdamW(self.parameters(), lr=self.lr)


In [None]:
# Instantiate the classifier with its default hyper-parameters and move it to DEVICE.
ltmodel = TextClassifier()
ltmodel = ltmodel.to(DEVICE)


In [None]:
# Smoke test: push a single training batch through the model and the loss to
# check that tensor shapes line up. The loop stops after the first batch.
for batch in train_loader:
    x, y = batch

    print(x.shape)
    b = ltmodel.forward(x)

    # Flatten the model output to one value per sample and cast the labels to
    # float before calling the loss.
    bb = b.reshape((len(b)))
    yy = y.type(torch.float32)
    ltmodel.loss(bb, yy)
    print(b.shape)
    break


In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs

In [None]:
import os

from pytorch_lightning.callbacks import (
    EarlyStopping,
    RichProgressBar,
    RichModelSummary,
    LearningRateFinder,
    LearningRateMonitor,
    StochasticWeightAveraging,
    ModelCheckpoint,
)


# Keep only the checkpoint with the lowest validation loss.
CHECKPOINT_PATH = "checkpoints/pl-emb-lstm/"
checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    monitor="val_loss",
    mode="min",
    dirpath=CHECKPOINT_PATH,
)

# train model
trainer = pl.Trainer(
    accelerator=DEVICE,
    devices=1,
    callbacks=[
        # Stop once val_loss has not improved for 5 consecutive validation runs.
        EarlyStopping(
            monitor="val_loss",
            mode="min",
            min_delta=0.00,
            patience=5,
            verbose=False,
        ),
        checkpoint_callback,
        StochasticWeightAveraging(swa_lrs=1e-2),
        LearningRateFinder(),
        LearningRateMonitor(),
        RichModelSummary(),
    ],
    max_epochs=50,
)

trainer.fit(
    model=ltmodel,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

print(checkpoint_callback.best_model_path)  # path to the best model's checkpoint
print(checkpoint_callback.best_model_score)  # and its monitored score

# load_from_checkpoint is a classmethod: call it on the class, not on the
# trained instance, and keep the returned model.
best_model = TextClassifier.load_from_checkpoint(checkpoint_callback.best_model_path)

# Report test metrics for the best checkpoint rather than the last-epoch weights.
trainer.test(best_model, dataloaders=test_loader)


In [None]:
# Switch the trained module to evaluation mode; `model` is the name the
# evaluation cells below use for it once it is on DEVICE.
ltmodel.eval()
model = ltmodel.to(DEVICE)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

print('TRAIN')

# Axis labels, one per category id 0..2 (strings kept as originally displayed).
class_names = ["None", "desicion/no_hace_lugar", "descion/hace_lugar"]
eval_batch_size = 256

# Encode on the CPU and score in mini-batches so the whole split never has to
# sit on the device at once; no_grad avoids building the autograd graph.
x = tokenizer.encode_batch(train['sentence'].iloc[:])
with torch.no_grad():
    logits = torch.cat([
        model(x[i:i + eval_batch_size].to(DEVICE)).reshape(-1).cpu()
        for i in range(0, len(x), eval_batch_size)
    ])
# The model emits a single logit per sentence, so argmax over axis 1 is always 0.
# Threshold instead: logit > 0 is the same as sigmoid(logit) > 0.5.
hypothesis = (logits > 0).long().numpy()
reference = train['category']

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Fix the label set so the matrix is always 3x3 and matches the tick labels.
confusion = confusion_matrix(reference, hypothesis, labels=list(range(len(class_names))))
print(confusion)
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)
ax.set_title('TRAIN')

plt.tight_layout()

report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

print('VAL')

# Axis labels, one per category id 0..2 (strings kept as originally displayed).
class_names = ["None", "desicion/no_hace_lugar", "descion/hace_lugar"]
eval_batch_size = 256

# Encode on the CPU and score in mini-batches instead of one forward pass per
# sentence; no_grad avoids building the autograd graph.
x = tokenizer.encode_batch(val['sentence'].iloc[:])
with torch.no_grad():
    logits = torch.cat([
        model(x[i:i + eval_batch_size].to(DEVICE)).reshape(-1).cpu()
        for i in range(0, len(x), eval_batch_size)
    ])
# The model emits a single logit per sentence, so argmax over axis 1 is always 0.
# Threshold instead: logit > 0 is the same as sigmoid(logit) > 0.5.
hypothesis = (logits > 0).long().numpy()
reference = val['category']

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Fix the label set so the matrix is always 3x3 and matches the tick labels.
confusion = confusion_matrix(reference, hypothesis, labels=list(range(len(class_names))))
print(confusion)
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)
ax.set_title('VAL')

plt.tight_layout()

report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

print('TEST')

# Axis labels, one per category id 0..2 (strings kept as originally displayed).
class_names = ["None", "desicion/no_hace_lugar", "descion/hace_lugar"]
eval_batch_size = 256

# Encode on the CPU, then move each mini-batch to DEVICE before scoring (the
# inputs were previously left on the CPU while the model lives on DEVICE).
x = tokenizer.encode_batch(test['sentence'].iloc[:])
with torch.no_grad():
    logits = torch.cat([
        model(x[i:i + eval_batch_size].to(DEVICE)).reshape(-1).cpu()
        for i in range(0, len(x), eval_batch_size)
    ])
# The model emits a single logit per sentence, so argmax over axis 1 is always 0.
# Threshold instead: logit > 0 is the same as sigmoid(logit) > 0.5.
hypothesis = (logits > 0).long().numpy()
reference = test['category']

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# Fix the label set so the matrix is always 3x3 and matches the tick labels.
confusion = confusion_matrix(reference, hypothesis, labels=list(range(len(class_names))))
print(confusion)
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)
# This figure shows the test split (the title used to say 'TRAIN').
ax.set_title('TEST')

plt.tight_layout()

report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
# Work on a copy of the test split and attach the predictions left in
# `hypothesis` by the previous (TEST) evaluation cell.
test_ = test.copy()
test_["pred_cat"] = hypothesis

def cat2label(cat):
    """Translate a category id into its ``(decision, hace_lugar)`` boolean pair.

    Returns ``(False, False)`` for 0, ``(True, False)`` for 1, ``(True, True)``
    for 2 and ``None`` for any other value.
    """
    known_labels = (
        (0, (False, False)),
        (1, (True, False)),
        (2, (True, True)),
    )
    for category, label in known_labels:
        if cat == category:
            return label
    return None

# Expand every predicted category into its two boolean label columns.
test_[['pred_decision', 'pred_hace_lugar']] = [cat2label(d) for d in test_['pred_cat']]
test_