# Procesamiento de datos

In [79]:
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from gensim.models import Word2Vec, KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
import torch
import nltk
import numpy as np

# Descargar datasets y combinarlos

In [83]:

# Diccionario para mapear tipos de falacia del segundo dataset a las clases del primero
fallacy_mapping = {
    # ad hominem
    "Ad Hominem": "ad hominem",
    "Circumstantial Ad Hominem": "ad hominem",
    "Tu Quoque": "ad hominem",
    "Abusive Ad Hominem": "ad hominem",
    "Guilt By Association": "ad hominem",
    "Argument From Commitment": "ad hominem",
    "Precedent Ad Hominem": "ad hominem",
    "Behavioral Ad Hominem": "ad hominem",
    "Ad Hominem Against a Witness at Trial": "ad hominem",

    # false dilemma
    "False Dichotomy": "false dilemma",
    "False Dilemma/Dichotomy": "false dilemma",
    "False dilemma": "false dilemma",

    # ad populum
    "Appeal to Popularity": "ad populum",
    "Bandwagon Fallacy": "ad populum",
    "Common Belief Fallacy": "ad populum",

    # equivocation
    "Equivocation": "equivocation",

    # fallacy of credibility
    "Argument from Authority": "fallacy of credibility",
    "Appeal to Authority": "fallacy of credibility",
    "Appeal to False Authority": "fallacy of credibility",
    "Argument from False Authority": "fallacy of credibility",
    "Appealing to an irrelevant authority": "fallacy of credibility",

    # false causality
    "Correlation does not imply causation": "false causality",
    "False cause": "false causality",
    "Post hoc ergo propter hoc": "false causality",
    "Cum hoc ergo propter hoc": "false causality",

    # intentional
    "Intentional Fallacy": "intentional",
    "Authorial Intent as Constraint": "intentional",

    # fallacy of logic / circular reasoning
    "Circular Reasoning": "circular reasoning",
    "Circular reasoning": "circular reasoning",
    "Fallacy of Logic": "fallacy of logic",
    "Begging the question": "fallacy of logic",
    "Begging the Question": "fallacy of logic",

    # appeal to emotion
    "Appeal to Emotion": "appeal to emotion",
    "Appeal to emotion": "appeal to emotion",
    "Appeal to Pity": "appeal to emotion",
    "Appeal to fear": "appeal to emotion",
    "Appeal to consequences": "appeal to emotion",

    # fallacy of relevance / extension
    "Fallacy of Extension": "fallacy of extension",
    "Fallacy of Relevance": "fallacy of relevance",
    "Red Herring": "fallacy of relevance",
    "Straw Man": "fallacy of relevance",
    "Straw man": "fallacy of relevance",
    "Strawman": "fallacy of relevance",

    # faulty generalization
    "Hasty Generalization": "faulty generalization",
    "Faulty Generalization ": "faulty generalization",
    "Hasty generalization": "faulty generalization",
    "Accident": "faulty generalization",
    "Generalization": "faulty generalization",
}

# Cargar datasets
dataset1 = load_dataset("tasksource/logical-fallacy")
dataset2 = load_dataset("MrOvkill/fallacies-fallacy-base")

train1, test1, dev1 = dataset1["train"], dataset1["test"], dataset1["dev"]

# Mapear clases del dataset2 a las del dataset1
def map_fallacy(example):
    mapped = fallacy_mapping.get(example["name"])
    return {"logical_fallacies": mapped}

dataset2_mapped = dataset2["train"].map(map_fallacy)
dataset2_mapped = dataset2_mapped.filter(lambda x: x["logical_fallacies"] is not None)

# Mantener solo columnas necesarias
dataset2_mapped = dataset2_mapped.remove_columns(
    [c for c in dataset2_mapped.column_names if c not in ["logical_fallacies", "example"]]
)

# Dividir dataset2 en train/dev/test (80/10/10)
data_array = dataset2_mapped["example"]
labels_array = dataset2_mapped["logical_fallacies"]

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    data_array, labels_array, test_size=0.2, stratify=labels_array, random_state=42
)
dev_texts, test_texts, dev_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42
)

train2 = Dataset.from_dict({"example": train_texts, "logical_fallacies": train_labels})
dev2   = Dataset.from_dict({"example": dev_texts, "logical_fallacies": dev_labels})
test2  = Dataset.from_dict({"example": test_texts, "logical_fallacies": test_labels})

train2 = train2.rename_column("example", "source_article")
dev2   = dev2.rename_column("example", "source_article")
test2  = test2.rename_column("example", "source_article")

# Combinar datasets
train_combined = concatenate_datasets([train1, train2])
dev_combined   = concatenate_datasets([dev1, dev2])
test_combined  = concatenate_datasets([test1, test2])

dataset = DatasetDict({
    "train": train_combined,
    "dev": dev_combined,
    "test": test_combined
})

print(dataset)
print(f"Train examples: {len(dataset['train'])}")
print(f"Dev examples:   {len(dataset['dev'])}")
print(f"Test examples:  {len(dataset['test'])}")


DatasetDict({
    train: Dataset({
        features: ['config', 'source_article', 'logical_fallacies'],
        num_rows: 2901
    })
    dev: Dataset({
        features: ['config', 'source_article', 'logical_fallacies'],
        num_rows: 598
    })
    test: Dataset({
        features: ['config', 'source_article', 'logical_fallacies'],
        num_rows: 539
    })
})
Train examples: 2901
Dev examples:   598
Test examples:  539


In [86]:

# Descargar modelos de tonekizacion
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/david/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Tokenizacion + case-folding

In [87]:

def tokenize_flat(text):
    text = text.lower()
    sentences = sent_tokenize(text)
    tokens = []
    for sentence in sentences:
        tokens.extend(word_tokenize(sentence))
    return tokens

# NOTE: Esta funcion tokeniza cada frase por separado. Podria ser interesante.
def preprocess_sentences(text):
    text = text.lower()
    sentences = sent_tokenize(text)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    return tokenized_sentences

train = train.map(lambda x: {"tokenized": tokenize_flat(x["source_article"])})
test  = test.map(lambda x: {"tokenized": tokenize_flat(x["source_article"])})
dev   = dev.map(lambda x: {"tokenized": tokenize_flat(x["source_article"])})

print(train[1]["source_article"])
print(train[1]["tokenized"])


Map: 100%|██████████| 2680/2680 [00:00<00:00, 6853.85 examples/s]
Map: 100%|██████████| 511/511 [00:00<00:00, 7522.64 examples/s]
Map: 100%|██████████| 570/570 [00:00<00:00, 7298.90 examples/s]

The bigger a child's shoe size, the better the child's handwriting
['the', 'bigger', 'a', 'child', "'s", 'shoe', 'size', ',', 'the', 'better', 'the', 'child', "'s", 'handwriting']





# Bag of Words

In [88]:

train_corpus = [" ".join(tokens) for tokens in train["tokenized"]]

bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(train_corpus)

print("==== Bag-of-Words ====")
print(f"Dimensiones de la matriz BoW: {bow_matrix.shape}")
print(f"Ejemplo de vocabulario:       {list(bow_vectorizer.get_feature_names_out()[1000:1016])}")

# Calculamos similitud entre algunos documentos
similarity_bow = cosine_similarity(bow_matrix[:5])
print("\nSimilitud coseno entre primeros documentos (BoW):")
print(np.round(similarity_bow, 3))


==== Bag-of-Words ====
Dimensiones de la matriz BoW: (2680, 7952)
Ejemplo de vocabulario:       ['borges', 'boring', 'boris', 'born', 'borrow', 'borrowed', 'borrowing', 'boss', 'both', 'bother', 'bottle', 'bottled', 'bottles', 'bottom', 'bought', 'bouncing']

Similitud coseno entre primeros documentos (BoW):
[[1.    0.    0.    0.    0.   ]
 [0.    1.    0.    0.    0.221]
 [0.    0.    1.    0.053 0.   ]
 [0.    0.    0.053 1.    0.26 ]
 [0.    0.221 0.    0.26  1.   ]]


# TF-IDF

In [89]:

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_corpus)

print("\n==== TF-IDF ====")
print(f"Dimensiones de la matriz TF-IDF: {tfidf_matrix.shape}")
print(f"Ejemplo de vocabulario:          {list(tfidf_vectorizer.get_feature_names_out()[1000:1016])}")

# Similitud coseno entre los algunos documentos
similarity_tfidf = cosine_similarity(tfidf_matrix[:5])
print("\nSimilitud coseno entre primeros documentos (TF-IDF):")
print(np.round(similarity_tfidf, 3))



==== TF-IDF ====
Dimensiones de la matriz TF-IDF: (2680, 7952)
Ejemplo de vocabulario:          ['borges', 'boring', 'boris', 'born', 'borrow', 'borrowed', 'borrowing', 'boss', 'both', 'bother', 'bottle', 'bottled', 'bottles', 'bottom', 'bought', 'bouncing']

Similitud coseno entre primeros documentos (TF-IDF):
[[1.    0.    0.    0.    0.   ]
 [0.    1.    0.    0.    0.021]
 [0.    0.    1.    0.033 0.   ]
 [0.    0.    0.033 1.    0.167]
 [0.    0.021 0.    0.167 1.   ]]


In [90]:

# Palabras mas relevantes por documento
tfidf_array = tfidf_matrix.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()

def get_top_words(tfidf_vector, feature_names, top_n=10):
    sorted_nzs = np.argsort(tfidf_vector)[-top_n:][::-1]
    return [(feature_names[i], tfidf_vector[i]) for i in sorted_nzs]

for idx in range(3):
    print(f"\nDocumento {idx} - Palabras más relevantes (TF-IDF):")
    top_words = get_top_words(tfidf_array[idx], feature_names)
    for word, score in top_words:
        print(f"{word}: {score:.4f}")


Documento 0 - Palabras más relevantes (TF-IDF):
slogan: 0.5407
expect: 0.4204
pay: 0.4204
company: 0.3994
less: 0.3577
more: 0.2583
existed: 0.0000
existence: 0.0000
existential: 0.0000
exist: 0.0000

Documento 1 - Palabras más relevantes (TF-IDF):
child: 0.5770
handwriting: 0.4045
bigger: 0.3845
shoe: 0.3593
size: 0.3427
better: 0.2457
the: 0.2210
exhausted: 0.0000
exhibit: 0.0000
exist: 0.0000

Documento 2 - Palabras más relevantes (TF-IDF):
true: 0.4081
many: 0.3721
since: 0.3642
believe: 0.3527
then: 0.3412
must: 0.3048
people: 0.2821
this: 0.2425
be: 0.2186
it: 0.2060


# Embedding no contextual con Word2Vec (fine-tuneado)

In [91]:

class EpochLogger(CallbackAny2Vec):

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

# Cargar modelo base preentrenado de Word2Vec
base_model = KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=400000)

train_tokens = list(train["tokenized"])
test_tokens  = list(test["tokenized"])
dev_tokens   = list(dev["tokenized"])
all_sentences = train_tokens + test_tokens + dev_tokens

epoch_logger = EpochLogger()

# Crear un nuevo modelo Word2Vec para fine-tuning
model = Word2Vec(
    vector_size=base_model.vector_size,
    window=5,
    min_count=1,
    compute_loss=True,
    sg=1,
    negative=5,
    workers=8
)

# Construir vocabulario a partir de nuestro dataset
model.build_vocab(all_sentences)
print(f"Vocabulario del dataset: {len(model.wv)} palabras.")

# Inicializar embeddings desde el modelo preentrenado
overlap = 0
for word in model.wv.key_to_index:
    if word in base_model.key_to_index:
        model.wv[word] = base_model[word]
        overlap += 1
print(f"Embeddings inicializados desde el modelo base: {overlap}/{len(model.wv)}")

# Fine-tuning con nuestro dataset
model.train(
    all_sentences,
    total_examples=len(all_sentences),
    epochs=10,
    callbacks=[epoch_logger]
)

# Guardar el modelo fine-tuneado
model.save("./models/word2vec_finetuned_fallacies.model")
print("Modelo fine-tuneado guardado correctamente.")

def compute_coverage(dataset, model):

    total = 0
    covered = 0

    # Si es KeyedVectors, usamos model.key_to_index
    vocab = model.key_to_index if isinstance(model, KeyedVectors) else model.wv.key_to_index

    for tokens in dataset:
        for t in tokens:
            total += 1
            if t in vocab:
                covered += 1
    return covered / total if total > 0 else 0

cov_before = compute_coverage(train_tokens, base_model)
cov_after  = compute_coverage(train_tokens, model)
print(f"Cobertura antes del fine-tuning: {cov_before:.2%}")
print(f"Cobertura después del fine-tuning: {cov_after:.2%}")


Vocabulario del dataset: 9977 palabras.
Embeddings inicializados desde el modelo base: 8084/9977
Loss after epoch 0: 0.0
Loss after epoch 1: 0.0
Loss after epoch 2: 0.0
Loss after epoch 3: 0.0
Loss after epoch 4: 0.0
Loss after epoch 5: 0.0
Loss after epoch 6: 0.0
Loss after epoch 7: 0.0
Loss after epoch 8: 0.0
Loss after epoch 9: 0.0
Modelo fine-tuneado guardado correctamente.
Cobertura antes del fine-tuning: 75.84%
Cobertura después del fine-tuning: 100.00%


# Embedding contextual con BERT

In [92]:

# Cargar modelo y tokenizer BERT preentrenado
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

def get_bert_avg_embeddings(sentences, batch_size=16, max_length=128, device='cpu'):

    model.to(device)
    avg_embeddings = []

    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        batch_texts = [" ".join(s) for s in batch]
        encoded = tokenizer(
            batch_texts,
            return_tensors='pt',
            padding=True, # Paddear cada batch al tamaño del batch más largo
            truncation=True,
            max_length=max_length
        )
        encoded = {k: v.to(device) for k, v in encoded.items()}

        with torch.no_grad():
            outputs = model(**encoded)
            for j in range(len(batch)):
                # Promediar solo sobre tokens reales (ignorar padding)
                attention_mask = encoded['attention_mask'][j].unsqueeze(-1)
                sum_emb = (outputs.last_hidden_state[j] * attention_mask).sum(dim=0)
                length = attention_mask.sum()
                avg_embeddings.append(sum_emb / length)

    return avg_embeddings

# Generar embeddings para train/test/dev
train_sentences = list(train["tokenized"])
test_sentences  = list(test["tokenized"])
dev_sentences   = list(dev["tokenized"])

device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_embeddings = get_bert_avg_embeddings(train_sentences, device=device)
test_embeddings  = get_bert_avg_embeddings(test_sentences, device=device)
dev_embeddings   = get_bert_avg_embeddings(dev_sentences, device=device)

print(f"Ejemplo: embedding promedio train[0] shape: {train_embeddings[0].shape}")  # 768-d

# Ejemplo de similitud coseno
sim = F.cosine_similarity(train_embeddings[0], train_embeddings[1], dim=0)
print(f"Similitud coseno entre train[0] y train[1]: {sim.item():.4f}")


Ejemplo: embedding promedio train[0] shape: torch.Size([768])
Similitud coseno entre train[0] y train[1]: 0.5843
