In [3]:
try:
    import torch
except:
    print("[INFO] Couldn't find torch... installing it.")
    !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
    import torch

try:
    import torchmetrics
except:
    print("[INFO] Couldn't find torchmetrics... installing it.")
    !pip install torchmetrics
    import torchmetrics

try:
    import torchinfo
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install torchinfo
    import torchinfo

try:
    from torch.utils.tensorboard import SummaryWriter
except:
    print("[INFO] Couldn't find tensorboard... installing it.")
    !pip install -q tensorboard
    from torch.utils.tensorboard import SummaryWriter

try: 
    import wandb
except: 
    print("[INFO] Couldn't find tensorboard... installing it.")
    !pip install wandb


import sys
import os
import random
from typing import Dict, Tuple, List, Set, Union, Type, Literal
from itertools import product
from dataclasses import dataclass
from collections import Counter
import re
from pathlib import Path

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics import Accuracy
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm
from pathlib import Path

In [2]:
# --- Importing Formula Class ---
# Go two levels up: from ICTCS_notebooks → theorem_prover_core → project root
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from theorem_prover_core.formula import (Formula, Letter, Falsity, Conjunction, Disjunction, Implication,
                                         Negation, BinaryConnectiveFormula, UnaryConnectiveFormula, bottom)

In [1]:
from logic_utils import Normalizer, Metavariable, CustomTokenizer, FormulaTreeNode, assign_embedding_indices
from data_setup import (generate_normalized_dataset, add_new_tautologies_to_dataset, parse_dimacs_files, 
                        prepare_formula_dataset, FormulaDataset, FormulaTreeNode)
from train_utils import set_seeds, compute_vocab_size, train, save_results, save_model 
from models import AsymmetricFocalLoss

---
### **1 Generate Datasets**

In [47]:
# --- Generating normalized dataset ---
SIZE = 10000
MAX_DEPTH = 5
NUM_LETTERS = 7
SEED = 42

random.seed(SEED)
data_set = generate_normalized_dataset(num_formulas=SIZE, 
                                      max_depth=MAX_DEPTH,
                                      num_letters=NUM_LETTERS)

datapath = "datasets/normalized_formulas_dataset.csv"
data_set.to_csv(datapath, index=False)
print(f"[INFO] Saving Dataset to {os.path.abspath(datapath)}")

[INFO] Saving Dataset to /home/labeconomia/nbalestra/theorem_prover/theorem_prover_core/ICTCS_notebooks/datasets/normalized_formulas_dataset.csv


In [48]:
data_set.head()

Unnamed: 0,formula,is_tautology
0,A0,False
1,(A0 ∧ ¬(A1 ∧ A2 ∧ A3)) ∨ (A4 → A5),False
2,(¬(A0 → A1) → A2) ∨ A3,False
3,⊥ ∨ A0,False
4,¬A0,False


In [49]:
print(data_set.count())
count = data_set.is_tautology.value_counts()
print(f"\nNumber of True and False formulas: \n{count}\n")

total = len(data_set)
tautologies = data_set["is_tautology"].sum()
percentage = (tautologies / total) * 100
print(f"Percentage of tautologies in the dataset: {percentage:.2f}%")

formula         10000
is_tautology    10000
dtype: int64

Number of True and False formulas: 
is_tautology
False    9576
True      424
Name: count, dtype: int64

Percentage of tautologies in the dataset: 4.24%


#### **1.1 Data Augmentation With Common Tautologies Instantiation**

In [50]:
# --- Creating Common Tautologies --- 
A = Metavariable("A")
B = Metavariable("B")
C = Metavariable("C")

# List of common tautologies
tautologies = [

    Disjunction(A, Negation(A)),

    Negation(Conjunction(A, Negation(A))),

    Conjunction(
        Implication(Negation(Conjunction(A, B)), Disjunction(Negation(A), Negation(B))),
        Implication(Disjunction(Negation(A), Negation(B)), Negation(Conjunction(A, B)))
    ),

    Conjunction(
        Implication(Negation(Disjunction(A, B)), Conjunction(Negation(A), Negation(B))),
        Implication(Conjunction(Negation(A), Negation(B)), Negation(Disjunction(A, B)))
    ),

    Conjunction(
        Implication(Conjunction(A, Disjunction(B, C)), Disjunction(Conjunction(A, B), Conjunction(A, C))),
        Implication(Disjunction(Conjunction(A, B), Conjunction(A, C)), Conjunction(A, Disjunction(B, C)))
    ),

    Conjunction(
        Implication(Disjunction(A, Conjunction(B, C)), Conjunction(Disjunction(A, B), Disjunction(A, C))),
        Implication(Conjunction(Disjunction(A, B), Disjunction(A, C)), Disjunction(A, Conjunction(B, C)))
    )
]

for tautology in tautologies:
    print(tautology)

A ∨ ¬A
¬(A ∧ ¬A)
(¬(A ∧ B) → ¬A ∨ ¬B) ∧ (¬A ∨ ¬B → ¬(A ∧ B))
(¬(A ∨ B) → ¬A ∧ ¬B) ∧ (¬A ∧ ¬B → ¬(A ∨ B))
(A ∧ (B ∨ C) → (A ∧ B) ∨ (A ∧ C)) ∧ ((A ∧ B) ∨ (A ∧ C) → A ∧ (B ∨ C))
(A ∨ (B ∧ C) → (A ∨ B) ∧ (A ∨ C)) ∧ ((A ∨ B) ∧ (A ∨ C) → A ∨ (B ∧ C))


In [51]:
# --- Adding 3,000 (30% of dataaset) new tatologies to the dataset ---
num_samples = 3000
seed_value = 42

dataset = add_new_tautologies_to_dataset(dataset=data_set,
                                         tautologies=tautologies,
                                         num_samples=num_samples,
                                         max_depth=MAX_DEPTH,
                                         num_letters=NUM_LETTERS,
                                         seed=seed_value)

datapath = "datasets/extended_dataset_with_tautologies.csv"
dataset.to_csv(datapath, index=False)
print(f"[INFO] Saving Dataset to {os.path.abspath(datapath)}")

[INFO] Saving Dataset to /home/labeconomia/nbalestra/theorem_prover/theorem_prover_core/ICTCS_notebooks/datasets/extended_dataset_with_tautologies.csv


In [52]:
# --- Get Dataset Info ---
print(dataset.count())
count = dataset.is_tautology.value_counts()
print(f"\nNumber of True and False formulas: \n{count}\n")

total = len(dataset)
tautologies = dataset["is_tautology"].sum()
percentage = (tautologies / total) * 100
print(f"Percentage of tautologies in the dataset: {percentage:.2f}%")


formula         13000
is_tautology    13000
dtype: int64

Number of True and False formulas: 
is_tautology
False    9576
True     3424
Name: count, dtype: int64

Percentage of tautologies in the dataset: 26.34%


#### **1.2 Data Augmentation With Common Tautologies Instantiation and DIMACS format formulas**

Extending Dataset with [SATLIB - Benchmark Problems](https://www.cs.ubc.ca/~hoos/SATLIB/benchm.html), using propositional formulas in Dimacs format.

Formulas Downloaded from SATLIB: 
- uf20-91: 20 variables, 91 clauses - 1000 instances, all satisfiable
- uf50-218 / uuf50-218: 50 variables, 218 clauses - 1000 instances, all sat/unsat

In [53]:
base_dir = "dimacs_formulas_datasets/"
dataset_satlib = parse_dimacs_files(base_dir=base_dir)

In [54]:
dataset_satlib.is_tautology.value_counts()

is_tautology
True     2000
False    1000
Name: count, dtype: int64

In [55]:
# --- Normalizing new formulas ---
dataset_satlib['formula'] = dataset_satlib['formula'].apply(lambda f: str(Normalizer().normalize(f)))

In [56]:
# --- Concateating datasets and shuffling ---
dataset_composed = pd.concat([dataset, dataset_satlib], ignore_index=True)
dataset_composed = dataset_composed.sample(frac=1, random_state=42).reset_index(drop=True) # frac=1 means shuffle all rows
                                                                                           # reset_index(drop=True) removes the old index

In [57]:
# --- Adding a column 'source' to combined dataset to distinguish between synthetic formulas and dimacs formulas ---
dataset['source'] = 'synthetic'
dataset_satlib['source'] = 'satlib'

# --- Concateating datasets and shuffling ---
dataset_composed = pd.concat([dataset, dataset_satlib], ignore_index=True)
dataset_composed = dataset_composed.sample(frac=1, random_state=42).reset_index(drop=True) # frac=1 means shuffle all rows
                                                                                           # reset_index(drop=True) removes the old index
# --- Adding indices to dataset---
dataset_composed['index'] = dataset_composed.index

datapath = "datasets/extended_dataset_with_dimacs_formulas.csv"
dataset_composed.to_csv(datapath, index=False)
print(f"[INFO] Saving Dataset to {os.path.abspath(datapath)}")

[INFO] Saving Dataset to /home/labeconomia/nbalestra/theorem_prover/theorem_prover_core/ICTCS_notebooks/datasets/extended_dataset_with_dimacs_formulas.csv


In [58]:
dataset_composed.head()

Unnamed: 0,formula,is_tautology,source,index
0,((A0 ∧ A1) ∨ ((A0 ∨ ⊥) ∧ ((A0 ∧ A1) ∨ (((A2 ∧ ...,True,synthetic,0
1,(¬((A0 → A1 ∨ A2 ∨ A3) ∨ A0) → ¬(A0 → A1 ∨ A2 ...,True,synthetic,1
2,¬A0 ∧ ((⊥ ∧ A1) ∨ (¬A2 ∧ A3)) ∧ A4,False,synthetic,2
3,(A0 ∨ (A1 ∧ ¬A2 ∧ A3)) ∧ (A4 ∨ A5 ∨ ¬(A6 ∧ A7 ...,False,synthetic,3
4,A0 ∨ (¬(A1 ∨ A2) → ¬(A3 ∨ A4)),False,synthetic,4


In [59]:
# --- Get Dataset Info ---
print(dataset_composed.count())
count = dataset_composed.is_tautology.value_counts()
print(f"\nNumber of True and False formulas: \n{count}\n")

total = len(dataset)
tautologies = dataset_composed["is_tautology"].sum()
percentage = (tautologies / total) * 100
print(f"Percentage of tautologies in the composed dataset: {percentage:.2f}%")


formula         16000
is_tautology    16000
source          16000
index           16000
dtype: int64

Number of True and False formulas: 
is_tautology
False    10576
True      5424
Name: count, dtype: int64

Percentage of tautologies in the composed dataset: 41.72%


---
### **2 Preparing the Data**

In [60]:
TEST_SIZE = 0.2
BATCH_SIZE = 16
SEED = 42

(train_dataloader, test_dataloader, 
 X_train, X_test, 
 y_train, y_test, idx_test)  = prepare_formula_dataset(dataset = dataset,
                                                                 test_size=TEST_SIZE,
                                                                 batch_size=BATCH_SIZE,
                                                                 seed=SEED)

In [61]:
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

Training set: 10400 samples
Test set: 2600 samples


In [62]:
tokenizer = CustomTokenizer()
tokenizer.fit(X_train)

num_letters = sum(1 for formula in tokenizer.formula_to_token if isinstance(formula, Letter))
num_connectives = len(tokenizer.connective_map)
num_parenthesis = sum(1 for formula in tokenizer.special_map)

print(f"Number of unique letters: {num_letters}")
print(f"Number of unique connectives: {num_connectives}")
print(f"Number of spacial tokens: {num_parenthesis}")

Number of unique letters: 8
Number of unique connectives: 4
Number of spacial tokens: 2


In [63]:
import collections

train_class_counts = collections.Counter(y_train)
test_class_counts = collections.Counter(y_test)

print(f"Train class counts: {train_class_counts}")
print(f"Test class counts: {test_class_counts}")

Train class counts: Counter({False: 7662, True: 2738})
Test class counts: Counter({False: 1914, True: 686})


In [64]:
print(f"Train set (10400 samples): False = {(7662/10400)*100:.2f} % and True = {(2738/10400)* 100:.2f} %")
print(f"Test set (2600samples): False = {(1914/2600)*100:.2f} % and True = {(686/2600)*100:.2f} %")

Train set (10400 samples): False = 73.67 % and True = 26.33 %
Test set (2600samples): False = 73.62 % and True = 26.38 %


In [65]:
print(f"Dataloaders: {train_dataloader, test_dataloader}") 
print(f"Length of train dataloader: {len(train_dataloader)} batches of {BATCH_SIZE}")
print(f"Length of test dataloader: {len(test_dataloader)} batches of {BATCH_SIZE}")

Dataloaders: (<torch.utils.data.dataloader.DataLoader object at 0x7f669783f8e0>, <torch.utils.data.dataloader.DataLoader object at 0x7f6697840610>)
Length of train dataloader: 650 batches of 16
Length of test dataloader: 163 batches of 16


In [66]:
# Check out what's inside the training dataloader
train_features_batch, train_labels_batch = next(iter(train_dataloader)) # next() grabs the first batch from the iterator
print(f"{train_features_batch.shape, train_labels_batch.shape} -> [batch_size, num_of_tokens_per_formula], [bach_size]")

(torch.Size([16, 603]), torch.Size([16])) -> [batch_size, num_of_tokens_per_formula], [bach_size]


----
### **3 Set up device-agnostic code**

In [67]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

---
### **4 Build, Train and Test Models on Synthetic Dataset**

In [68]:
# --- Hyperparameters --- 
# Determine the vocabulary size for the embedding layer and add 1 for padding index (0)
VOCAB_SIZE = compute_vocab_size(tokenizer)
print(f"Vocabulary size (including padding token): {VOCAB_SIZE}")
print("Max index in batch:", train_features_batch.max().item())
assert train_features_batch.max().item() < VOCAB_SIZE, "Some token indices exceed the embedding size!"

EMBEDDING_DIM = 32
LR = 0.0005
EPOCHS = 5

Vocabulary size (including padding token): 108
Max index in batch: 107


#### **4.1 Model 1**

In [26]:
class RNN_V1(nn.Module):
    def __init__(self, vocab_size :int, embedding_dim :int, hidden_units: int, output_size :int):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_units, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_units, out_features=output_size)
    
    def forward(self, x: torch.Tensor):
        x = self.embedding(x)               # [batch_size, seq_len, embed_dim]
        _, h_n = self.rnn(x)                # [num_layers, batch_size, hidden_dim]
        last_hidden = h_n.squeeze(0)        # remove the first dimension, which is num_layers=1 
        output = self.linear(last_hidden)   # [batch_size, output_size] == [32, 1]
        output = output.squeeze(1)          # Reshape output to match label shape [32]
        return output

In [27]:
# --- Model 1 ---
model_1 = RNN_V1(vocab_size=VOCAB_SIZE, 
                 embedding_dim=EMBEDDING_DIM,
                 hidden_units=64,
                 output_size=1
).to(device) 

print(f"Model_1 is on the model device: {next(model_1.parameters()).device}")
model_1

Model_1 is on the model device: cuda:0


RNN_V1(
  (embedding): Embedding(108, 32)
  (rnn): RNN(32, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=1, bias=True)
)

In [28]:
# --- Loss and Optimizer Functions ---
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model_1.parameters(), 
                            lr=LR)

In [29]:
# Get a summary of Model_1 
summary(model_1, 
         input_size=train_features_batch.shape,
         dtypes=[torch.long],
         verbose=0,
         col_names=["input_size", "output_size", "num_params", "trainable"],
         col_width=20,
         row_settings=["var_names"],
         device=device
)

Layer (type (var_name))                  Input Shape          Output Shape         Param #              Trainable
RNN_V1 (RNN_V1)                          [16, 603]            [16, 1]              --                   True
├─Embedding (embedding)                  [16, 603]            [16, 603, 32]        3,456                True
├─RNN (rnn)                              [16, 603, 32]        [16, 603, 64]        6,272                True
├─Linear (linear)                        [16, 64]             [16, 1]              65                   True
Total params: 9,793
Trainable params: 9,793
Non-trainable params: 0
Total mult-adds (M): 60.57
Input size (MB): 0.08
Forward/backward pass size (MB): 7.41
Params size (MB): 0.04
Estimated Total Size (MB): 7.53

In [30]:
# --- Train and Test Model_1 ---
set_seeds()
model_1_results = train(model=model_1,
                        train_dataloader=train_dataloader,
                        test_dataloader=test_dataloader,
                        optimizer=optimizer,
                        loss_fn=loss_fn,
                        epochs=5,
                        device=device)

Training Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.5785 | train_acc: 0.7367 | test_loss: 0.5768 | test_acc: 0.7362
Epoch: 2 | train_loss: 0.5777 | train_acc: 0.7367 | test_loss: 0.5773 | test_acc: 0.7362
Epoch: 3 | train_loss: 0.5772 | train_acc: 0.7367 | test_loss: 0.5767 | test_acc: 0.7362
Epoch: 4 | train_loss: 0.5773 | train_acc: 0.7367 | test_loss: 0.5768 | test_acc: 0.7370
Epoch: 5 | train_loss: 0.5770 | train_acc: 0.7369 | test_loss: 0.5765 | test_acc: 0.7370


In [31]:
save_results(model_1_results, target_dir="models_results", filename="Model_1_vanilla_rnn_results.csv")

[INFO] Results saved to: models_results/Model_1_vanilla_rnn_results.csv


In [33]:
print(f"\nGiven the data distribution and Model performances, model_1 predicts False every time — and that would still be right ~74% of the time.")


Given the data distribution and Model performances, model_1 predicts False every time — and that would still be right ~74% of the time.


#### **4.2 Model 2**

In [34]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim=EMBEDDING_DIM):
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # First bidirectional GRU layer
        self.gru1 = nn.GRU(
            input_size=embedding_dim,
            hidden_size=128,
            batch_first=True,
            bidirectional=True
        )

        # Second bidirectional GRU layer
        self.gru2 = nn.GRU(
            input_size=128 * 2,  # Because bidirectional doubles output size
            hidden_size=64,
            batch_first=True,
            bidirectional=True
        )

        # Fully connected layers
        self.fc1 = nn.Linear(64 * 2, 32)  # Because bidirectional doubles output size
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 1)  

    def forward(self, x):
        # x shape: [batch_size, seq_len]
        x = self.embedding(x)              # [batch_size, seq_len, embed_dim]

        out1, _ = self.gru1(x)             # [batch_size, seq_len, 256]
        out2, _ = self.gru2(out1)          # [batch_size, seq_len, 128]

        out2_last = out2[:, -1, :]         # Use the last timestep's features
        x = self.relu(self.fc1(out2_last)) # [batch_size, 32]
        output = self.fc2(x)               # [batch_size, 1]

        return output 

In [35]:
model_2 = GRU(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM).to(device)

print(f"Model_2 is on the model device: {next(model_2.parameters()).device}")
model_2

Model_2 is on the model device: cuda:0


GRU(
  (embedding): Embedding(108, 32, padding_idx=0)
  (gru1): GRU(32, 128, batch_first=True, bidirectional=True)
  (gru2): GRU(256, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=32, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=1, bias=True)
)

In [36]:
# --- Loss and Optimizer Functions ---
loss_fn = AsymmetricFocalLoss(
    alpha_pos=0.3,  # minority (tautology)
    alpha_neg=0.7,  # majority
    gamma_pos=3.0,
    gamma_neg=1.5
)
optimizer = torch.optim.Adam(params=model_2.parameters(), 
                            lr=LR)

In [37]:
# Get a summary of Model_2 
summary(model_2, 
         input_size=train_features_batch.shape,
         dtypes=[torch.long],
         verbose=0,
         col_names=["input_size", "output_size", "num_params", "trainable"],
         col_width=20,
         row_settings=["var_names"],
         device=device
)

Layer (type (var_name))                  Input Shape          Output Shape         Param #              Trainable
GRU (GRU)                                [16, 603]            [16, 1]              --                   True
├─Embedding (embedding)                  [16, 603]            [16, 603, 32]        3,456                True
├─GRU (gru1)                             [16, 603, 32]        [16, 603, 256]       124,416              True
├─GRU (gru2)                             [16, 603, 256]       [16, 603, 128]       123,648              True
├─Linear (fc1)                           [16, 128]            [16, 32]             4,128                True
├─ReLU (relu)                            [16, 32]             [16, 32]             --                   --
├─Linear (fc2)                           [16, 32]             [16, 1]              33                   True
Total params: 255,681
Trainable params: 255,681
Non-trainable params: 0
Total mult-adds (G): 2.39
Input size (MB): 0.08
Forwa

In [38]:
# --- Train and Test Model_2 ---
set_seeds()
model_2_results = train(model=model_2,
                        train_dataloader=train_dataloader,
                        test_dataloader=test_dataloader,
                        optimizer=optimizer,
                        loss_fn=loss_fn,
                        epochs=5,
                        device=device)

Training Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.0429 | train_acc: 0.8403 | test_loss: 0.0213 | test_acc: 0.9463
Epoch: 2 | train_loss: 0.0214 | train_acc: 0.9428 | test_loss: 0.0197 | test_acc: 0.9548
Epoch: 3 | train_loss: 0.0188 | train_acc: 0.9512 | test_loss: 0.0157 | test_acc: 0.9594
Epoch: 4 | train_loss: 0.0153 | train_acc: 0.9587 | test_loss: 0.0153 | test_acc: 0.9486
Epoch: 5 | train_loss: 0.0133 | train_acc: 0.9608 | test_loss: 0.0129 | test_acc: 0.9640


In [39]:
save_results(model_2_results, target_dir="models_results", filename="Model_2_stacked_bidirectional_gru_results.csv")

[INFO] Results saved to: models_results/Model_2_stacked_bidirectional_gru_results.csv


In [40]:
save_model(model=model_2,
           target_dir="models",
           model_name="Bidirectional_GRU_trained_on_synth_formulas.pth")

[INFO] Saving model to: models/Bidirectional_GRU_trained_on_synth_formulas.pth


---

### **5 LSTM Tree**

Partiamo con il primo passo: la rappresentazione delle formule logiche come alberi. Questo ci servirà come struttura ricorsiva su cui il TreeLSTM opererà.

#### Passo 1: Creazione della struttura ad albero FormulaTreeNode
Ogni nodo dell’albero corrisponderà a un oggetto FormulaTreeNode, che rappresenta un nodo della formula logica.

In [69]:
class FormulaTreeNode:
    """
    Rappresenta un nodo nell'albero sintattico di una formula logica.
    """
    def __init__(self, formula):
        self.formula = formula
        self.children = []
        self.embedding_index = None  # sarà assegnato più avanti dal tokenizer

        # Costruzione ricorsiva dei figli
        if isinstance(formula, UnaryConnectiveFormula):
            self.children.append(FormulaTreeNode(formula.formula))

        elif isinstance(formula, BinaryConnectiveFormula):
            self.children.append(FormulaTreeNode(formula.left))
            self.children.append(FormulaTreeNode(formula.right))

    def __repr__(self):
        return f"Node({self.formula}, children={len(self.children)})"

Creazione di un FormulaTreeNode a partire da essa per verificare che la costruzione dell’albero funzioni correttamente.

In [70]:
# Example
example_formula = X_train[0]
print("Formula:", example_formula)

root_node = FormulaTreeNode(example_formula)
print("Radice:", root_node)
print("Figli:", root_node.children)

Formula: ((A0 → A1) ∨ (A2 ∧ A3) ∨ A4 ∨ A5 → A6) ∨ ¬(A7 → A0)
Radice: Node(((A0 → A1) ∨ (A2 ∧ A3) ∨ A4 ∨ A5 → A6) ∨ ¬(A7 → A0), children=2)
Figli: [Node((A0 → A1) ∨ (A2 ∧ A3) ∨ A4 ∨ A5 → A6, children=2), Node(¬(A7 → A0), children=1)]


Fase 2: implementazione del TreeLSTM cell, che è il cuore del modello. Useremo una versione binaria del TreeLSTM, visto che le formule logiche usano solo connectivi unari e binari.

#### Passo 2: Implementazione di BinaryTreeLSTMCell

Questa classe sarà un modulo PyTorch che combina l'input x di un nodo con gli hidden state dei suoi figli per produrre un nuovo stato.

In [79]:
class BinaryTreeLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Linear transformations for i (input), o (output), and u (update) gates
        self.W_iou = nn.Linear(input_size, 3 * hidden_size)
        self.U_iou = nn.Linear(2 * hidden_size, 3 * hidden_size)

        # Linear transformations for forget gates (left and right children) 
        self.W_f = nn.Linear(input_size, 2 * hidden_size)
        self.U_f = nn.Linear(2 * hidden_size, 2 * hidden_size)

    def forward(self, x, left_state :Tuple[Tensor, Tensor], right_state :Tuple[Tensor, Tensor]):
        h_l, c_l = left_state  # left hidden states and cell states,  [1, hidd_size], [1, hidd_size]
        h_r, c_r = right_state # right hidden states and cell states, [1, hidd_size], [1, hidd_size]

        h_cat = torch.cat([h_l, h_r], dim=1)  # [1, 2xhidd_size]

        # Input, Output, Update gates
        iou = self.W_iou(x) + self.U_iou(h_cat)
        i, o, u = torch.chunk(torch.sigmoid(iou), 3, dim=1)
        u = torch.tanh(u)

        # Forget gates
        f = self.W_f(x) + self.U_f(h_cat)
        f_l, f_r = torch.chunk(torch.sigmoid(f), 2, dim=1)

        # Cell state
        c = i * u + f_l * c_l + f_r * c_r
        h = o * torch.tanh(c)

        return h, c

Ora passiamo al TreeLSTM encoder, che applicherà ricorsivamente la cella BinaryTreeLSTMCell a ogni nodo dell’albero logico.

### Passo 3: Costruzione di TreeLSTMEncoder
Questo modulo:

- trasforma un FormulaTreeNode in una rappresentazione vettoriale,
- ricorsivamente applica il TreeLSTM a tutti i figli,
- restituisce lo stato nascosto (h) e lo stato di memoria (c) della radice, che potremo usare per la classificazione.

In [80]:
class TreeLSTMEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()

        # Nota: Questo encoder assume di ricevere un solo nodo (radice) per volta.
        # L'embedding è calcolato su un singolo indice: [1] → [1, E]
        # Non esiste gestione nativa del batch in questa architettura.

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.cell = BinaryTreeLSTMCell(input_size=embedding_dim, hidden_size=hidden_size)

    def forward(self, node: FormulaTreeNode):
        # Ottieni embedding del nodo corrente
        x = self.embedding(torch.tensor([node.embedding_index], device=self.embedding.weight.device))

        # Caso foglia: nessun figlio
        if len(node.children) == 0:
            zero_state = (
                torch.zeros(1, self.cell.hidden_size, device=x.device), # zero_state = (h_zero, c_zero)
                torch.zeros(1, self.cell.hidden_size, device=x.device)  # where h_zero: initial hidden state filled whit zeros
                                                                        # c_zero: initial cell state filled with zeros
            )
            h, c = self.cell(x, zero_state, zero_state)
            return h, c

        # Caso unary: un figlio (es. Negazione)
        elif len(node.children) == 1:
            child_state = self.forward(node.children[0])
            h, c = self.cell(x, child_state, child_state)  # The only child is duplicated, so that the binary cell can also be used for unary nodes.
            return h, c

        # Caso binary: due figli
        elif len(node.children) == 2:
            left_state = self.forward(node.children[0])
            right_state = self.forward(node.children[1])
            h, c = self.cell(x, left_state, right_state)
            return h, c

        else:
            raise ValueError(f"Unexpected number of children: {len(node.children)}")


Riepilogo:
- Ogni nodo dell’albero riceve il suo embedding.
- I figli vengono elaborati prima, poi i loro stati sono usati per calcolare lo stato del genitore.
- I nodi con 0, 1 o 2 figli sono trattati separatamente.

Ora ci occupiamo di associare a ogni nodo dell'albero il suo indice di embedding, usando il CustomTokenizer che hai già definito.

#### Passo 4: Assegnazione degli indici di embedding ai nodi
Il tokenizer già mappa ogni oggetto Formula in un intero. Ci serve un helper che:

- Visita ricorsivamente l’albero FormulaTreeNode
- Assegna a ogni nodo il corrispondente embedding_index usando tokenizer.formula_to_token.

In [71]:
def assign_embedding_indices(node: FormulaTreeNode, tokenizer: CustomTokenizer):
    """
    Ricorsivamente assegna a ciascun nodo dell'albero l'indice di embedding
    usando il tokenizer già fittato.
    """
    if node.formula in tokenizer.formula_to_token:
        node.embedding_index = tokenizer.formula_to_token[node.formula]
    else:
        # fallback: prova ad assegnare sulla base del tipo (per connettivi)
        if isinstance(node.formula, Falsity):
            node.embedding_index = tokenizer.falsity_token
        elif isinstance(node.formula, Letter):
            # If it is a letter but it is not in the tokenizer, it raises an error
            raise ValueError(f"Using an unknown letter is not accepted: {node.formula}")
        elif isinstance(node.formula, UnaryConnectiveFormula):
            node.embedding_index = tokenizer.connective_map[type(node.formula).__name__]
        elif isinstance(node.formula, BinaryConnectiveFormula):
            node.embedding_index = tokenizer.connective_map[type(node.formula).__name__]
        else:
            raise ValueError(f"Unknown formula: {node.formula}")

    for child in node.children:
        assign_embedding_indices(child, tokenizer)

In [72]:
# Example 

# Function: Print tree with embedding indexes
def print_tree_with_embeddings(node: FormulaTreeNode, prefix: str = "", is_last: bool = True):
    # Preparazione linea da stampare
    connector = "└── " if is_last else "├── "
    formula_str = str(node.formula)
    formula_type = type(node.formula).__name__
    print(f"{prefix}{connector}[{formula_type}] {formula_str} (embedding_index={node.embedding_index})")

    # Preparazione del prefisso per i figli
    new_prefix = prefix + ("    " if is_last else "│   ")
    child_count = len(node.children)
    for i, child in enumerate(node.children):
        is_child_last = i == (child_count - 1)
        print_tree_with_embeddings(child, new_prefix, is_child_last)

# Tokenizer fit
tokenizer = CustomTokenizer()
tokenizer.fit(X_train)

# Tree construction for a formula
example_formula = X_train[0]
root_node = FormulaTreeNode(example_formula)

# Embedding index assignment
assign_embedding_indices(root_node, tokenizer)

# Tree print with indexes
print_tree_with_embeddings(root_node)

└── [Disjunction] ((A0 → A1) ∨ (A2 ∧ A3) ∨ A4 ∨ A5 → A6) ∨ ¬(A7 → A0) (embedding_index=101)
    ├── [Implication] (A0 → A1) ∨ (A2 ∧ A3) ∨ A4 ∨ A5 → A6 (embedding_index=103)
    │   ├── [Disjunction] (A0 → A1) ∨ (A2 ∧ A3) ∨ A4 ∨ A5 (embedding_index=101)
    │   │   ├── [Disjunction] (A0 → A1) ∨ (A2 ∧ A3) ∨ A4 (embedding_index=101)
    │   │   │   ├── [Disjunction] (A0 → A1) ∨ (A2 ∧ A3) (embedding_index=101)
    │   │   │   │   ├── [Implication] A0 → A1 (embedding_index=103)
    │   │   │   │   │   ├── [Letter] A0 (embedding_index=1)
    │   │   │   │   │   └── [Letter] A1 (embedding_index=2)
    │   │   │   │   └── [Conjunction] A2 ∧ A3 (embedding_index=100)
    │   │   │   │       ├── [Letter] A2 (embedding_index=3)
    │   │   │   │       └── [Letter] A3 (embedding_index=4)
    │   │   │   └── [Letter] A4 (embedding_index=5)
    │   │   └── [Letter] A5 (embedding_index=6)
    │   └── [Letter] A6 (embedding_index=7)
    └── [Negation] ¬(A7 → A0) (embedding_index=102)
        └── [Impli

 Passiamo ora al classificatore finale, che prende il vettore h (output del nodo radice del TreeLSTM) e predice se la formula è una tautologia (1) o non-tautologia (0).

#### Passo 5: Classificatore TreeLSTM completo

In [83]:
class TreeLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, fc_size):
        super().__init__()

        # Nota importante: questo modello lavora su UN singolo albero per volta.
        # Non supporta input batchificati come nei modelli sequenziali (es. GRU).
        # La dimensione batch non viene mai usata nei tensori in input/output: ogni forward
        # riceve una root `FormulaTreeNode`, non un tensore [B, ...].
        #
        # I tensori all’interno del modello (es. hidden state, embeddings) sono sempre di forma [1, H]
        # e vengono creati dinamicamente nel forward ricorsivo. Questo è compatibile con DataLoader,
        # ma impedisce il parallelismo tipico del batch training.

        self.encoder = TreeLSTMEncoder(vocab_size, embedding_dim, hidden_size)
        self.fc1 = nn.Linear(hidden_size, fc_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_size, 1)  # Output binario (logit)

    def forward(self, root_node: FormulaTreeNode):
        h, _ = self.encoder(root_node)  # root's hidden state (ignore the cell state)
        x = self.relu(self.fc1(h))
        output = self.fc2(x)
        output = output.squeeze(1)  # shape: [batch_size] (senza dimensione extra)
        return output


In [84]:
# Esempio d’uso su una formula
model = TreeLSTMClassifier(vocab_size=VOCAB_SIZE, embedding_dim=32, hidden_size=64, fc_size=32).to(device)

# Prepara l'albero
example_formula = X_train[0]
root = FormulaTreeNode(example_formula)
assign_embedding_indices(root, tokenizer)

# Inference
model.eval()
with torch.inference_mode():
    logits = model(root)
    prob = torch.sigmoid(logits).item()
    prediction = round(prob)
    print(f"Formula: {example_formula}, Tautology status: {y_train[0]}")
    print(f"Model's tautology probabiity: {prob:.4f} -> Prediction: {prediction}")


Formula: ((A0 → A1) ∨ (A2 ∧ A3) ∨ A4 ∨ A5 → A6) ∨ ¬(A7 → A0), Tautology status: False
Model's tautology probabiity: 0.4796 -> Prediction: 0


Ora adattiamo il sistema per addestrare un modello TreeLSTM su interi alberi, invece che su sequenze di token.

#### Passo 6: Dataset e collate_fn per strutture ad albero
Obiettivo - Costruire:

- Un Dataset che restituisce (FormulaTreeNode, label)
- Un collate_fn che crea un batch (in realtà una lista) di alberi
- Un DataLoader che usa collate_fn
- Un adattamento di train_step() e test_step() per TreeLSTM

6.1. Dataset basato su alberi

In [85]:
class TreeFormulaDataset(torch.utils.data.Dataset):
    def __init__(self, formulas: List[Formula], labels: List[float], tokenizer: CustomTokenizer):
        self.formulas = formulas
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.formulas)

    def __getitem__(self, idx):
        formula = self.formulas[idx]
        root = FormulaTreeNode(formula)
        assign_embedding_indices(root, self.tokenizer)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return root, label


6.2. Collate function
Non dobbiamo fare padding: basta restituire le liste.

In [None]:
def tree_collate_fn(batch: List[Tuple[FormulaTreeNode, torch.Tensor]]) -> Tuple[List[FormulaTreeNode], torch.Tensor]:
    """
    Collate function personalizzata per TreeLSTM:
    - Riceve il batch del DataLoader: una lista di tuple (root, label)
    - Restituisce:
        * una lista di root (non tensorizzabile)
        * un tensore impilato delle label

    Args:
        batch: Lista di tuple (FormulaTreeNode, label)

    Returns:
        roots: lista di FormulaTreeNode
        labels: tensori scalari (float), impilati in un batch tensoriale di shape [batch_size]
    """
    # Estraiamo separatamente le radici degli alberi e le etichette:
    roots, labels = zip(*batch)
    
    return list(roots), torch.stack(labels)


6.3. Creazione dei dataloader - Rappresentative Subset for experiments

Funzione chiamata create_balanced_subset() che:

- Prende in input: tutte le formule e le etichette (0 = non tautologia, 1 = tautologia)
- Seleziona un sottoinsieme bilanciato con la stessa proporzione (~74% / 26%)
- Ritorna i DataLoader già pronti per l’addestramento

In [87]:
def create_balanced_subset_from_dataset(dataset: pd.DataFrame,
                                        tokenizer: CustomTokenizer,
                                        train_size: int = 1000,
                                        test_size: int = 200,
                                        positive_ratio: float = 0.26,
                                        batch_size: int = 2,
                                        seed: int = 42):
    assert 0.0 < positive_ratio < 1.0 
    assert "formula" in dataset.columns and "is_tautology" in dataset.columns 

    # Suddividi in base alla classe
    taut = dataset[dataset["is_tautology"] == 1]
    nontaut = dataset[dataset["is_tautology"] == 0]

    # Campiona quantità bilanciate
    train_taut = int(train_size * positive_ratio)
    train_nontaut = train_size - train_taut
    test_taut = int(test_size * positive_ratio)
    test_nontaut = test_size - test_taut

    train_taut_subset = taut.sample(train_taut, random_state=seed)
    test_taut_subset = taut.drop(train_taut_subset.index).sample(test_taut, random_state=seed)
    train_nontaut_subset = nontaut.sample(train_nontaut, random_state=seed)
    test_nontaut_subset = nontaut.drop(train_nontaut_subset.index).sample(test_nontaut, random_state=seed)

    # Costruzione set finali
    train_subset = pd.concat([train_taut_subset, train_nontaut_subset]).sample(frac=1, random_state=seed)
    test_subset = pd.concat([test_taut_subset, test_nontaut_subset]).sample(frac=1, random_state=seed)

    # Log distribuzione classi
    train_counts = Counter(train_subset["is_tautology"])
    test_counts = Counter(test_subset["is_tautology"])
    train_ratio = train_counts[1] / len(train_subset) * 100
    test_ratio = test_counts[1] / len(test_subset) * 100

    print(f"Train set: {train_counts} (tautologies: {train_ratio:.1f}%)")
    print(f"Test set:  {test_counts} (tautologies: {test_ratio:.1f}%)")

    # Dataset PyTorch
    train_dataset = TreeFormulaDataset(train_subset["formula"].tolist(), train_subset["is_tautology"].tolist(), tokenizer)
    test_dataset = TreeFormulaDataset(test_subset["formula"].tolist(), test_subset["is_tautology"].tolist(), tokenizer)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=tree_collate_fn
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, collate_fn=tree_collate_fn
    )

    # Nota: anche se viene specificato un batch_size > 1,
    # il TreeLSTM non elabora veri batch paralleli. 
    # Il DataLoader restituisce una lista di alberi (`roots`) e un batch di etichette,
    # ma ogni albero viene processato individualmente nel training loop, 
    # perché le strutture ad albero non sono compatibili con operazioni vettoriali batched.

    return train_loader, test_loader

In [88]:
tree_train_loader, tree_test_loader = create_balanced_subset_from_dataset(dataset,       
                                                                          tokenizer,
                                                                          train_size=1000,
                                                                          test_size=200,
                                                                          positive_ratio=0.26,
                                                                          batch_size=2
                                                                         )

Train set: Counter({False: 740, True: 260}) (tautologies: 26.0%)
Test set:  Counter({False: 148, True: 52}) (tautologies: 26.0%)


6.4. Adattamento di train_step e test_step per TreeLSTM

In [97]:
def tree_train_step(model, dataloader, loss_fn, optimizer, device):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_examples = 0

    # Nota_1 Batch: anche se batch_size > 1, ogni albero viene processato individualmente.
    # Questo approccio non è "batchificato" nel senso stretto (non sfrutta il parallelismo GPU).
    # PyTorch DataLoader restituisce un batch di radici, ma il modello è invocato una volta per albero.   
    # Questo perché gli alberi hanno struttura variabile e non possono essere impilati in un singolo tensore.

    # Nota_2 Device: non mandiamo le radici (root) su device, perché non sono tensori PyTorch ma oggetti FormulaTreeNode.
    # Il passaggio su device avviene all'interno del modello (es. embedding), quando servono veri tensori.
    # root.to(device) darebbe errore, mentre x = embedding(torch.tensor(..., device=...)) è sicuro.
    for roots, labels in dataloader:
        labels = labels.to(device)

        batch_logits = []
        for root in roots:
            root_logits = model(root)                    # shape [1]
            batch_logits.append(root_logits)

        logits = torch.stack(batch_logits).squeeze(1)    # (after stach:) [batch_size, 1] -> (after squezze(1):) [batch_size]
        preds = torch.round(torch.sigmoid(logits))

        loss = loss_fn(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += (preds == labels).sum().item()
        total_examples += len(labels)

    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_examples
    return avg_loss, accuracy

def tree_test_step(model, dataloader, loss_fn, device):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_examples = 0

    with torch.inference_mode():
        # Nota_1 Batch: anche se batch_size > 1, ogni albero viene processato individualmente.
        # Questo approccio non è "batchificato" nel senso stretto (non sfrutta il parallelismo GPU).
        # PyTorch DataLoader restituisce un batch di radici, ma il modello è invocato una volta per albero.
        # Questo perché gli alberi hanno struttura variabile e non possono essere impilati in un singolo tensore.

        # Nota_2 Device: non mandiamo le radici (root) su device, perché non sono tensori PyTorch ma oggetti FormulaTreeNode.
        # Il passaggio su device avviene all'interno del modello (es. embedding), quando servono veri tensori.
        # root.to(device) darebbe errore, mentre x = embedding(torch.tensor(..., device=...)) è sicuro.
        for roots, labels in dataloader:
            labels = labels.to(device)

            batch_logits = []
            for root in roots:
                root_logits = model(root)                 # shape [1]
                batch_logits.append(root_logits)

            logits = torch.stack(batch_logits).squeeze(1) # (after stach:) [batch_size, 1] -> (after squezze(1):) [batch_size]
            preds = torch.round(torch.sigmoid(logits))

            loss = loss_fn(logits, labels)

            total_loss += loss.item()
            total_correct += (preds == labels).sum().item()
            total_examples += len(labels)

    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_examples
    return avg_loss, accuracy

6.5. Ciclo di training

In [98]:
def train_tree_lstm(model, train_loader, test_loader, loss_fn, optimizer, epochs, device):
    results = {"train_loss": [], "train_acc": [], "test_loss": [], "test_acc": []}

    for epoch in tqdm(range(epochs), desc="Training Epochs"):
        train_loss, train_acc = tree_train_step(model, train_loader, loss_fn, optimizer, device)
        test_loss, test_acc = tree_test_step(model, test_loader, loss_fn, device)

        print(f"Epoch {epoch+1} |"
              f"train_loss: {train_loss:.4f} |"
              f"train_acc={train_acc:.4f} |"
              f"test_loss={test_loss:.4f} |"
              f"test_acc={test_acc:.4f}"
        )

        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

    return results

---
### **6 WandB Sweep**

Strategia modulare consigliata (3 step)

STEP 1 — Ottimizzazione mirata del core (modello)
Testa la combinazione:

hidden_size ∈ [32, 64, 128]
fc_size ∈ [16, 32, 64]
embedding_dim = 32 (fisso, per ora)
loss e lr = fissi (usa quelli del tuo modello base)
Obiettivo: vedere se c’è un “collo di bottiglia” nella rete.

STEP 2 — Ottimizzazione focal loss
Fissa il modello (usa i migliori parametri del passaggio 1) e varia:

alpha_pos ∈ [0.2, 0.4, 0.5]
alpha_neg ∈ [0.6, 0.7, 0.8]
gamma_pos ∈ [2.0, 3.0, 4.0]
gamma_neg ∈ [1.0, 1.5, 2.0]
Obiettivo: trovare il giusto equilibrio per classi sbilanciate.

STEP 3 — Learning rate finetuning
Blocca modello e loss ottimizzati, e varia:

learning_rate ∈ [1e-4, 3e-4, 5e-4, 1e-3]
Obiettivo: capire la sensibilità alla velocità di apprendimento.


Dopo i 3 step

Quando avrai:

best_model_config
best_loss_config
best_lr
puoi fare un mini-sweep combinato solo su queste 2–3 migliori combinazioni per confermare stabilità.

 **Sweep config — Step 1: modello (hidden e fc)**

In [13]:
# First Sweep results
sweep_1_dataset = pd.read_csv("datasets/wandb_Sweep_1_TreeLSTM_hidden_states_fc_size.csv")

sweep_1_dataset

Unnamed: 0,Name,num_epochs,learning_rate,embedding_dim,hidden_size,fc_size,alpha_pos,alpha_neg,gamma_pos,gamma_neg,train_loss,train_acc,test_loss,test_acc
0,TreeLSTM_h128_fc32,5,0.0005,32,128,32,0.3,0.7,3,1.5,0.022615,0.921,0.024835,0.92
1,TreeLSTM_h128_fc64,5,0.0005,32,128,64,0.3,0.7,3,1.5,0.023098,0.929,0.025563,0.92
2,TreeLSTM_h128_fc16,5,0.0005,32,128,16,0.3,0.7,3,1.5,0.023888,0.928,0.026621,0.92
3,TreeLSTM_h64_fc64,5,0.0005,32,64,64,0.3,0.7,3,1.5,0.025826,0.915,0.024889,0.915
4,TreeLSTM_h32_fc64,5,0.0005,32,32,64,0.3,0.7,3,1.5,0.037759,0.789,0.033304,0.895
5,TreeLSTM_h64_fc16,5,0.0005,32,64,16,0.3,0.7,3,1.5,0.037697,0.744,0.033604,0.84
6,TreeLSTM_h64_fc32,5,0.0005,32,64,32,0.3,0.7,3,1.5,0.03674,0.74,0.032706,0.76
7,TreeLSTM_h32_fc16,5,0.0005,32,32,16,0.3,0.7,3,1.5,0.048183,0.74,0.044698,0.74
8,TreeLSTM_h32_fc32,5,0.0005,32,32,32,0.3,0.7,3,1.5,0.048044,0.74,0.04532,0.74


In [7]:
# Best Configurations
best_sweep_1_dataset = sweep_1_dataset.sort_values("test_acc", ascending=False)
best_sweep_1_dataset[["hidden_size", "fc_size", "test_acc", "train_acc", "train_loss", "test_loss"]].head(1)

Unnamed: 0,hidden_size,fc_size,test_acc,train_acc,train_loss,test_loss
0,128,32,0.92,0.921,0.022615,0.024835


In [8]:
best_config_sweep_1 = best_sweep_1_dataset.iloc[0]  # first row, best run
best_hidden_sweep_1 = best_config_sweep_1["hidden_size"]
best_fc_sweep_1 = best_config_sweep_1["fc_size"]

print(f"Best conf. hidden states: {best_hidden_sweep_1}"
      f"\nBest conf. fully conn. layers: {best_fc_sweep_1}")

Best conf. hidden states: 128
Best conf. fully conn. layers: 32


**Sweep config — Step 2: alpha_pos, alpha_neg, gamma_pos, gamma_neg**

In [2]:
# Second Sweep results
sweep_2_dataset = pd.read_csv("datasets/wandb_Sweep_2_TreeLSTM_alpha_gamma.csv")

sweep_2_dataset

NameError: name 'pd' is not defined

In [19]:
# Ordering by test_acc in decreasing order and, if test_acc is the same, for test_loss in increasing order
best_sweep_2_dataset = sweep_2_dataset.sort_values(by=["test_acc", "test_loss"], ascending=[False, True])
best_sweep_2_dataset[["alpha_pos", "alpha_neg", "gamma_pos", "gamma_neg", "train_loss", "train_acc", "test_loss", "test_acc"]].head(3)

Unnamed: 0,alpha_pos,alpha_neg,gamma_pos,gamma_neg,train_loss,train_acc,test_loss,test_acc
4,0.25,0.7,3.0,2.5,0.011988,0.93,0.01543,0.93
5,0.35,0.65,3.0,2.5,0.014446,0.938,0.016611,0.93
0,0.3,0.65,3.5,1.5,0.019334,0.929,0.023019,0.93


In [20]:
best_config_sweep_2 = best_sweep_2_dataset.iloc[0]  # first row, best run

best_alpha_pos_sweep_2 = best_config_sweep_2["alpha_pos"]
best_alpha_neg_sweep_2 = best_config_sweep_2["alpha_neg"]

best_gamma_pos_sweep_2 = best_config_sweep_2["gamma_pos"]
best_gamma_neg_sweep_2 = best_config_sweep_2["gamma_neg"]


print(f"Best conf. alpha pos: {best_alpha_pos_sweep_2}"
      f"\nBest conf. alpha neg: {best_alpha_neg_sweep_2}"
      f"\n\nBest conf. gamma pos: {best_gamma_pos_sweep_2}"
      f"\nBest conf. gamma neg: {best_gamma_neg_sweep_2}")

Best conf. alpha pos: 0.25
Best conf. alpha neg: 0.7

Best conf. gamma pos: 3.0
Best conf. gamma neg: 2.5


**Sweep config — Step 3: learning_rate**

In [6]:
# Third Sweep results
sweep_3_dataset = pd.read_csv("datasets/wandb_Sweep_3_TreeLSTM_lr.csv")

sweep_3_dataset

Unnamed: 0,Name,num_epochs,learning_rate,embedding_dim,hidden_size,fc_size,alpha_pos,alpha_neg,gamma_pos,gamma_neg,train_loss,train_acc,test_loss,test_acc
0,TreeLSTM_h128_fc32,5,0.0009,32,128,32,0.25,0.7,3,2.5,0.009835,0.943,0.01123,0.93
1,TreeLSTM_h128_fc32,5,0.0007,32,128,32,0.25,0.7,3,2.5,0.010163,0.941,0.012574,0.93
2,TreeLSTM_h128_fc32,5,0.0004,32,128,32,0.25,0.7,3,2.5,0.011897,0.933,0.014662,0.925
3,TreeLSTM_h128_fc32,5,0.0006,32,128,32,0.25,0.7,3,2.5,0.010608,0.937,0.01256,0.925
4,TreeLSTM_h128_fc32,5,0.0005,32,128,32,0.25,0.7,3,2.5,0.013062,0.93,0.015351,0.92
5,TreeLSTM_h128_fc32,5,0.0008,32,128,32,0.25,0.7,3,2.5,0.011169,0.934,0.012204,0.92
6,TreeLSTM_h128_fc32,5,0.0003,32,128,32,0.25,0.7,3,2.5,0.017525,0.908,0.01679,0.915
7,TreeLSTM_h128_fc32,5,0.0002,32,128,32,0.25,0.7,3,2.5,0.022405,0.74,0.020852,0.785
8,TreeLSTM_h128_fc32,5,0.0001,32,128,32,0.25,0.7,3,2.5,0.028553,0.74,0.027474,0.74


In [9]:
# Ordering by test_acc in decreasing order and, if test_acc is the same, for test_loss in increasing order
best_sweep_3_dataset = sweep_3_dataset.sort_values(by=["test_acc", "test_loss"], ascending=[False, True])
best_sweep_3_dataset[["learning_rate", "alpha_pos", "alpha_neg", "gamma_pos", "gamma_neg", "train_loss", "train_acc", "test_loss", "test_acc"]].head(3)

Unnamed: 0,learning_rate,alpha_pos,alpha_neg,gamma_pos,gamma_neg,train_loss,train_acc,test_loss,test_acc
0,0.0009,0.25,0.7,3,2.5,0.009835,0.943,0.01123,0.93
1,0.0007,0.25,0.7,3,2.5,0.010163,0.941,0.012574,0.93
3,0.0006,0.25,0.7,3,2.5,0.010608,0.937,0.01256,0.925


In [13]:
best_config_sweep_3 = best_sweep_3_dataset.iloc[0]  # first row, best run

best_lr_sweep_3 = best_config_sweep_3["learning_rate"]

best_alpha_pos_sweep_3 = best_config_sweep_3["alpha_pos"]
best_alpha_neg_sweep_3 = best_config_sweep_3["alpha_neg"]

best_gamma_pos_sweep_3 = best_config_sweep_3["gamma_pos"]
best_gamma_neg_sweep_3 = best_config_sweep_3["gamma_neg"]

print("Final best hyperparameters congigration: \n")
print(f"- Best conf. learning rate: {best_lr_sweep_3}"
      f"\n\n- Best conf. alpha pos: {best_alpha_pos_sweep_3}"
      f"\n- Best conf. alpha neg: {best_alpha_neg_sweep_3}"
      f"\n\n- Best conf. gamma pos: {best_gamma_pos_sweep_3}"
      f"\n- Best conf. gamma neg: {best_gamma_neg_sweep_3}")

Final best hyperparameters congigration: 

- Best conf. learning rate: 0.0009

- Best conf. alpha pos: 0.25
- Best conf. alpha neg: 0.7

- Best conf. gamma pos: 3
- Best conf. gamma neg: 2.5


----

In [None]:
# Allenamento e Test su tutto il datset con best Hyperparameters 
def prepare_tree_dataset(dataset: pd.DataFrame,
                         test_size: float,
                         batch_size: int,
                         tokenizer: CustomTokenizer,
                         seed: int = 42):
    """
    Prepara dataloader per TreeLSTM usando tutto il dataset, con eventuale split train/test.

    Args:
        dataset (pd.DataFrame): contiene colonne 'formula' e 'is_tautology'
        test_size (float): proporzione di dati da usare per il test
        batch_size (int): dimensione dei batch
        tokenizer (CustomTokenizer): tokenizer già fittato
        seed (int): seme per riproducibilità

    Returns:
        train_loader, test_loader: DataLoader PyTorch
    """
    assert "formula" in dataset.columns and "is_tautology" in dataset.columns

    # Parsing delle formule
    formulas = [parse_formula_string(f) for f in dataset["formula"]]
    labels = dataset["is_tautology"].tolist()

    # Split train/test
    train_formulas, test_formulas, train_labels, test_labels = train_test_split(
        formulas, labels, test_size=test_size, random_state=seed
    )

    # Dataset PyTorch
    train_dataset = TreeFormulaDataset(train_formulas, train_labels, tokenizer)
    test_dataset = TreeFormulaDataset(test_formulas, test_labels, tokenizer)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=tree_collate_fn
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, collate_fn=tree_collate_fn
    )

    return train_loader, test_loader





# --- Tokenizer ---
tokenizer = CustomTokenizer()
tokenizer.fit([parse_formula_string(f) for f in dataset["formula"]])

# --- Tree train and Tree test sets ---
tree_train_loader, tree_test_loader = prepare_tree_dataset(
    dataset=dataset,
    test_size=0.2,
    batch_size=2,
    tokenizer=tokenizer,
    seed=42
)

---

Definizione del modello TreeLSTMClassifier

In [40]:
EMBEDDING_DIM = 32
HIDDEN_SIZE = 64

tree_model = TreeLSTMClassifier(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_size=HIDDEN_SIZE
).to(device)

tree_model

TreeLSTMClassifier(
  (encoder): TreeLSTMEncoder(
    (embedding): Embedding(108, 32, padding_idx=0)
    (cell): BinaryTreeLSTMCell(
      (W_iou): Linear(in_features=32, out_features=192, bias=True)
      (U_iou): Linear(in_features=128, out_features=192, bias=True)
      (W_f): Linear(in_features=32, out_features=128, bias=True)
      (U_f): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=1, bias=True)
)

Loss e Ottimizzatore

In [41]:
loss_fn = AsymmetricFocalLoss(
    alpha_pos=0.3,  # minority class (tautology)
    alpha_neg=0.7,  # majority class
    gamma_pos=3.0,
    gamma_neg=1.5
)

optimizer = torch.optim.Adam(tree_model.parameters(), lr=0.0005)

In [42]:
set_seeds()
tree_lstm_results = train_tree_lstm(
    model=tree_model,
    train_loader=tree_train_loader,
    test_loader=tree_test_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    epochs=5,
    device=device
)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1: train_loss=0.0195, train_acc=0.9271, test_loss=0.0141, test_acc=0.9423
Epoch 2: train_loss=0.0106, train_acc=0.9613, test_loss=0.0122, test_acc=0.9708
Epoch 3: train_loss=0.0087, train_acc=0.9673, test_loss=0.0085, test_acc=0.9742
Epoch 4: train_loss=0.0066, train_acc=0.9727, test_loss=0.0092, test_acc=0.9608
Epoch 5: train_loss=0.0047, train_acc=0.9807, test_loss=0.0057, test_acc=0.9804


In [43]:
save_results(tree_lstm_results, target_dir="models_results", filename="Tree_lstm_results.csv")

[INFO] Results saved to: models_results/Tree_lstm_results.csv


In [46]:
save_model(model=tree_model,
           target_dir="models",
           model_name="Tree_lstm.pth")

[INFO] Saving model to: models/Tree_lstm.pth


---

---