In [None]:
%load_ext autoreload
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
# !sudo python -m spacy download es

In [None]:
import torch
from torch.utils.data import Dataset

class DFtoDataset(Dataset):
    """Map-style dataset over parallel collections of texts and targets.

    Items come back as ``(target, text)`` -- target first -- to mimic the
    AR_NEWS dataset format.
    """

    def __init__(self, texts: list[str], targets: list[int]):
        """Store texts and targets for positional lookup.

        Args:
            texts: the input sentences; either a plain sequence or an object
                exposing a ``.values`` array (e.g. a pandas Series).
            targets: the labels, aligned one-to-one with ``texts``.

        Raises:
            ValueError: if ``texts`` and ``targets`` differ in length.
        """
        self.x_ = self._positional(texts)
        self.y_ = self._positional(targets)
        if len(self.x_) != len(self.y_):
            raise ValueError(
                f"texts and targets must have the same length, "
                f"got {len(self.x_)} and {len(self.y_)}"
            )

    @staticmethod
    def _positional(values):
        """Return a container that can be indexed by integer position."""
        # Objects with a ``.values`` attribute are unwrapped so that ``[idx]``
        # is positional; plain sequences have no such attribute (and a dict's
        # ``values`` is a method), so those are used as they are.
        raw = getattr(values, "values", None)
        if raw is None or callable(raw):
            return values
        return raw

    def __len__(self):
        """Number of samples."""
        return len(self.y_)

    def __getitem__(self, idx):
        """Return the ``(target, text)`` pair stored at position ``idx``."""
        # Note y and x are inverted to mimic AR_NEWS dataset format
        return self.y_[idx], self.x_[idx]


In [None]:
import pandas as pd

# Only these columns are read from the CSV.
CSV_COLUMNS = ["path", "nro_registro", "tomo", "sentence", "decision", "hace_lugar"]

data = pd.read_csv("sentences-decision-manual.csv", usecols=CSV_COLUMNS)


In [None]:
# Drop, in place, every row that has a missing value in any loaded column.
data.dropna(inplace=True)
# target_classes = ["none", "decision:no_hace_lugar", "decision:hace_lugar"]


def force_bool(value):
    """Coerce a loosely-typed truth flag to a real ``bool``.

    Strings are compared after stripping whitespace and lowercasing, so
    ``"True"``, ``"true"``, ``" TRUE "`` and ``"1"`` all count as true.
    Any other value is true only when it equals ``True``/``1``.
    Everything else (including ``None``) is false.
    """
    if isinstance(value, str):
        return value.strip().lower() in ("true", "1")
    return value in (True, 1)


def get_category(pair):
    """Map a ``(decision, hace_lugar)`` pair to a binary category id.

    Args:
        pair: a two-element iterable ``(decision, hace_lugar)``.

    Returns:
        ``0`` when ``decision`` is falsy, ``1`` otherwise. ``hace_lugar`` is
        unpacked but does not influence the result: both outcomes of a
        decision map to category 1.
    """
    # Unpacking still validates that exactly two values were supplied.
    decision, _hace_lugar = pair
    return 1 if decision else 0


# Coerce both flag columns to real booleans before deriving the category.
for flag_column in ("decision", "hace_lugar"):
    data[flag_column] = data[flag_column].apply(force_bool).astype(bool)

flag_pairs = data[["decision", "hace_lugar"]]
data["category"] = flag_pairs.apply(get_category, axis=1)
data.dropna(subset=["category"], inplace=True)

# Keep a single row per distinct sentence text.
data.drop_duplicates(subset="sentence", inplace=True)
print(len(data))

# Distribution of space-separated token counts, in 32-wide bins (0..288).
tokens_per_sentence = data["sentence"].apply(lambda x: len(x.split(" ")))
tokens_per_sentence.hist(bins=list(range(0, 320, 32)))


In [None]:
from sklearn.model_selection import train_test_split

# Full dataset, kept for ad-hoc inspection.
dataset = DFtoDataset(data["sentence"], data["category"])

# Hold out 20% of the rows, then cut that hold-out in half for test / val.
train, holdout = train_test_split(data, test_size=0.2, random_state=42)
test, val = train_test_split(holdout, test_size=0.5, random_state=42)

# Keep DataFrame copies of each split before the names are rebound below.
train_df, val_df, test_df = train.copy(), val.copy(), test.copy()

train, val, test = [
    DFtoDataset(frame["sentence"], frame["category"]) for frame in (train, val, test)
]

for split in (train, val, test):
    print(len(split))

In [None]:

# Second element of the first training item; DFtoDataset items are (target, text).
train[0][1]

In [None]:
import torch
import torchtext
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

# Device name passed to `.to(...)` for tensors and the model in later cells.
DEVICE = 'cpu'


In [None]:
# Inspect one arbitrary item of the full dataset.
dataset[59]

In [None]:
from aymurai.models.decision.binregex import DecisionConv1dBinRegex

# Build the model wrapper from a tokenizer and a checkpoint referenced by URL.
# NOTE(review): presumably both files are downloaded on first use -- confirm
# against DecisionConv1dBinRegex before running offline.
model = DecisionConv1dBinRegex(
    tokenizer_path="https://drive.google.com/uc?id=1eljQOinpObdfBREIKxVnC5Y2g_sbhPHT&confirm=true",
    model_checkpoint="https://drive.google.com/uc?id=19_YmBJnO06iS0qW8ak0zl0EIsJYin8kQ&confirm=true",
    device="cpu",
)


In [None]:
# Sample sentence used to smoke-test the tokenizer and the network.
text = "1. DECLARAR EXTINGUIDA LA ACCIÓN PENAL en este caso por cumplimiento de la suspensión del proceso a prueba, y SOBRESEER a EZEQUIEL CAMILO MARCONNI, DNI 11.222.333, en orden a los delitos de lesiones leves agravadas, amenazas simples y agravadas por el uso de armas."

# Encode a one-element batch and show the shape of the encoded ids.
input_ids = model.tokenizer.encode_batch([text])
input_ids.shape

In [None]:
# Run the wrapped network on the sample and exponentiate its outputs.
# NOTE(review): the .exp() suggests the network emits log-probabilities -- confirm.
model.model(input_ids).exp().detach().numpy()

In [None]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from aymurai.models.decision.tokenizer import Tokenizer
from unidecode import unidecode

# Reuse the tokenizer bundled with the model; the collate function below reads
# the module-level `tokenizer`.
tokenizer = model.tokenizer 
vocab = tokenizer.vocab

# Vocabulary size.
len(vocab)

In [None]:

# NOTE(review): only referenced by the commented-out padding code inside
# vectorize_text; no live code in the visible cells reads it.
max_tokens = 128

def vectorize_text(batch):
    """Collate ``(target, text)`` samples into a pair of tensors on ``DEVICE``.

    Args:
        batch: a non-empty sequence of ``(target, text)`` pairs, target first
            (the order DFtoDataset yields).

    Returns:
        ``(xx, yy)``: the texts encoded by the module-level
        ``tokenizer.encode_batch`` as an int32 tensor, and the targets as a
        tensor. Targets are passed through unchanged (no offset is applied).
    """
    Y, X = zip(*batch)
    X = tokenizer.encode_batch(X)

    # as_tensor accepts lists, arrays and tensors alike, and avoids the
    # copy-construct warning torch.tensor() emits when given a tensor.
    xx = torch.as_tensor(X, dtype=torch.int32).to(DEVICE)
    yy = torch.tensor(Y).to(DEVICE)
    return xx, yy

# Settings shared by all three loaders; only the training loader shuffles.
loader_options = dict(batch_size=1024, collate_fn=vectorize_text)

train_loader = DataLoader(train, shuffle=True, **loader_options)
val_loader = DataLoader(val, **loader_options)
test_loader = DataLoader(test, **loader_options)

In [None]:
# Collate a hand-written one-sample batch ([target, text]) as a sanity check.
vectorize_text([[1, 'En función de tales motivos, dispondré la']])

In [None]:
# Pull a single batch to check the tensor shapes the collate function produces.
for X, Y in train_loader:
    print(X.shape, Y.shape)
    break

In [None]:
# Targets of every training item (items are (target, text) pairs).
cats = [target for target, _ in train]

In [None]:
# Smoke test: run the wrapped network on the first training batch only.
for batch in train_loader:
    x, y = batch
    # x = x.to('cuda')
    
    print(x.shape)
    b = model.model.forward(x)
    print(b)
    break

In [None]:
# Display the model wrapper's repr.
model

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

print("TRAIN")

reference = []
hypothesis = []
ltmodel = model.model.to(DEVICE)
# Inference only: switch to eval mode and skip autograd bookkeeping.
ltmodel.eval()
with torch.no_grad():
    for x, y in train_loader:
        y_pred = ltmodel(x.to(DEVICE))
        hypothesis.append(y_pred.cpu().numpy())
        reference.append(y.cpu().numpy())

# `hypothesis` keeps the raw per-class outputs; later cells exponentiate them.
reference = np.concatenate(reference)
hypothesis = np.concatenate(hypothesis)
predicted = hypothesis.argmax(axis=1)

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

confusion = confusion_matrix(reference, predicted)
print(confusion)
sns.heatmap(confusion, annot=True, fmt="d", ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_title("TRAIN")

plt.tight_layout()

report = classification_report(reference, predicted, output_dict=True)
pd.DataFrame(report).T

# threshold

In [None]:
# Exponentiate the raw outputs once; column 0 is used as the class-0 score.
probabilities = np.exp(hypothesis)
score_class0 = probabilities[:, 0]

# 'score' is the complement of the class-0 score, paired with the true label.
df = pd.DataFrame({'score': 1 - score_class0, 'true_class': reference})

In [None]:
# Display the score / true-class frame built in the previous cell.
df

In [None]:
# Histogram of each numeric column of the score frame.
df.hist()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One panel is enough: both classes are overlaid and separated by hue.
# Passing `ax` explicitly makes the plot land on the axes created here
# instead of whichever axes happens to be current.
fig, ax = plt.subplots(1, 1, figsize=(14, 4))

sns.histplot(data=df, x="score", hue="true_class", ax=ax)
ax.set_yscale("log")


In [None]:
# Sweep 100 cutoffs over [0, 1] and record the class-1 metrics at each one.
y_true = df['true_class']
records = []
for cutoff in np.linspace(0, 1, 100):
    # Cast to int so predictions share the label type of y_true.
    y_pred = (df['score'] > cutoff).astype(int)

    # labels=[0, 1] guarantees the '1' entry exists even when a cutoff yields
    # no positive prediction; zero_division=0 reports such undefined metrics
    # as 0 without emitting a warning at every step.
    report = classification_report(
        y_true, y_pred, labels=[0, 1], output_dict=True, zero_division=0
    )
    positive = report['1']
    records.append(
        {
            'threshold': cutoff,
            'precision': positive['precision'],
            'recall': positive['recall'],
            'f1-score': positive['f1-score'],
        }
    )

# Long format (threshold, variable, value) for plotting one line per metric.
scores = pd.DataFrame(
    records, columns=['threshold', 'precision', 'recall', 'f1-score']
).melt(['threshold'], value_vars=['precision', 'recall', 'f1-score'])
scores

In [None]:
# Metric curves against the cutoff, one line per metric.
sns.lineplot(data=scores, x='threshold', y='value', hue='variable')

# Operating point, marked on the plot; the TEST evaluation cell reads THRESHOLD.
THRESHOLD = 0.90
plt.axvline(THRESHOLD)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

print('VAL')

reference = []
hypothesis = []
ltmodel = ltmodel.to(DEVICE)
# Inference only: switch to eval mode and skip autograd bookkeeping.
ltmodel.eval()
with torch.no_grad():
    for x, y in val_loader:
        y_pred = ltmodel(x.to(DEVICE))
        hypothesis.append(y_pred.cpu().numpy())
        reference.append(y.cpu().numpy())

# `hypothesis` keeps the raw per-class outputs; the next cell exponentiates them.
reference = np.concatenate(reference)
hypothesis = np.concatenate(hypothesis)
predicted = hypothesis.argmax(axis=1)

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

confusion = confusion_matrix(reference, predicted)
print(confusion)
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_title('VAL')

plt.tight_layout()

report = classification_report(reference, predicted, output_dict=True)
pd.DataFrame(report).T

In [None]:
# Class-0 score: exponential of the first output column.
score_class0 = np.exp(hypothesis)[:, 0]

# Pair the complement of the class-0 score with the true label.
score_columns = {
    'score': 1 - score_class0,
    'true_class': reference,
}
df = pd.DataFrame(score_columns)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

fig, subplot = plt.subplots(1, 2, figsize=(14, 4), sharey=True)

# One panel per true class: (row filter, colour, opacity, label).
panels = [
    ('true_class == 0', 'r', 0.8, 'class 0'),
    ('true_class == 1', 'b', 0.5, 'class 1'),
]
for axis, (row_filter, color, alpha, label) in zip(subplot, panels):
    sns.histplot(
        df.query(row_filter),
        x='score',
        color=color,
        alpha=alpha,
        label=label,
        ax=axis,
        stat='probability',
    )

# Log-scale the y axis of both panels once everything is drawn.
for axis in subplot:
    axis.set_yscale('log')


In [None]:
# NOTE(review): `scores` is not recomputed after `df` was rebuilt in the cells
# just above, so this plot appears to show the earlier sweep again -- confirm
# whether a sweep over the current `df` was intended.
sns.lineplot(data=scores, x='threshold', y='value', hue='variable')

# Operating point, marked on the plot; the TEST evaluation cell reads THRESHOLD.
THRESHOLD = 0.90
plt.axvline(THRESHOLD)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

print('TEST')

reference = []
hypothesis = []
ltmodel = ltmodel.to(DEVICE)
# Inference only: switch to eval mode and skip autograd bookkeeping.
ltmodel.eval()
with torch.no_grad():
    for x, y in test_loader:
        # Single forward pass per batch; a sample is predicted positive when
        # the exponentiated column-1 output exceeds THRESHOLD.
        # NOTE(review): the cutoff sweep scored samples with 1 - exp(column 0);
        # the two agree only if the network has exactly two outputs whose
        # exponentials sum to 1 -- confirm.
        y_pred = ltmodel(x.to(DEVICE)).exp()[:, 1] > THRESHOLD

        hypothesis.append(y_pred.cpu().numpy())
        reference.append(y.cpu().numpy())

# `hypothesis` is a boolean array here (thresholded), not raw outputs.
reference = np.concatenate(reference)
hypothesis = np.concatenate(hypothesis)

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

confusion = confusion_matrix(reference, hypothesis)
print(confusion)
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_title('TEST')

plt.tight_layout()

report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd

# Sentence texts in dataset order (each item is a (target, text) pair).
sentences = [sentence for _target, sentence in test]

# Ground-truth flags come from the DataFrame copy of the test split; labels and
# predictions come from the arrays produced by the TEST evaluation cell.
prediction_columns = {
    "sentence": sentences,
    "decision": test_df['decision'],
    "hace_lugar": test_df['hace_lugar'],
    "cat": reference,
    "pred_cat": hypothesis.astype(int),
}
df = pd.DataFrame(prediction_columns)

df


In [None]:
# Widen pandas' display limits so full sentences are shown.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", None)


In [None]:
# Flag the rows where the predicted category equals the labelled one.
df['pred_ok'] = df['cat'] == df['pred_cat']
# test_

In [None]:
# test_.query('decision == 0 and pred_decision and not pred_hace_lugar').sample(1)
# Show only the misclassified rows.
df.query('pred_ok == 0')

# regex classification

In [None]:
import regex

# Case-insensitive pattern for denial / rejection phrasings such as
# "no hace(r) lugar", "rechaza(r)", "no admitir" or "declarar inadmisible".
pattern = regex.compile(r"(?i)(no hacer? lugar|rechaz[ao]r?|no admitir|no convalidar|no autorizar|declarar inadmisible)")

def recategorize(row, cat_col='pred_cat'):
    # print(type(row))
    # print(len(row))
    # # i, row = row
    # print(row)
    # print()

    decision = row['decision']
    hace_lugar = row['hace_lugar']
    match (decision, hace_lugar):
        case (1, 0):
            row['cat'] = 1
        case (1, 1):
            row['cat'] = 2
        case _:
            row['cat'] = 0

    decision_pred = row[cat_col]
    if not decision_pred:
        return row
    
    match = pattern.findall(row['sentence'])
    if not match:
        row[cat_col] = 2
    return row
    
# NOTE(review): `a` is not referenced again in the visible cells.
a = train_df.query('decision and not hace_lugar')
# Re-label every row of the test predictions frame, one row at a time.
df_recat = df.apply(recategorize, axis=1)

In [None]:
# Predicted categories after recategorization.
df_recat['pred_cat']

In [None]:

# Compare recategorized labels against recategorized predictions.
reference, hypothesis = df_recat['cat'], df_recat['pred_cat']

confusion = confusion_matrix(reference, hypothesis)
print(confusion)

fig, ax = plt.subplots(1, 1, figsize=(8, 4))
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set(xlabel="hypothesis", ylabel="reference", title='TEST')

plt.tight_layout()

# Per-class metrics, one row per class / average.
report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
# Raw confusion matrix (rows: labelled category, columns: predicted category).
confusion_matrix(df_recat['cat'], df_recat['pred_cat'])

In [None]:
# Rows whose prediction ended up as category 2 after recategorization.
df_recat.query('pred_cat == 2')