**TabNet**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabNetModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

# Load the dataset
FILENAME = "train.csv"
df = pd.read_csv(FILENAME)

# Separate the input features (X) from the regression target (y) by column
# name, so the preprocessing below never touches "Year" regardless of where
# that column sits in the file.
X = df.drop("Year", axis=1)
y = df["Year"]

# Split into training (80%), validation (10%) and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Min-max scale the features. The scaler is fitted on the training split only
# and then applied unchanged to validation and test: fitting it on the full
# dataset would leak the min/max of the held-out rows into training.
# NOTE(review): the original comment says this runs after outlier imputation;
# no imputation is visible in this notebook, so it is presumably done upstream.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X.columns, index=X_val.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Training and validation frames that carry both the features and the target
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# --- TabNet regression setup ---

# Every column of X is declared as a continuous feature; "Year" is the target.
feature_names = X.columns.tolist()
data_config = DataConfig(target=["Year"], continuous_cols=feature_names)

# Training loop: at most 100 epochs, reduced batch size of 256,
# early-stopping patience of 10.
trainer_config = TrainerConfig(
    max_epochs=100,
    batch_size=256,
    early_stopping_patience=10,
)

# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# TabNet hyperparameters for the regression task.
model_config = TabNetModelConfig(
    task="regression",
    learning_rate=1e-3,
    n_d=32,
    n_a=32,
    n_steps=5,
    gamma=1.3,
)

# Assemble the model from the four configuration objects.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Train the model on the training frame, monitoring the validation frame
tabular_model.fit(train=train_data, validation=val_data)

# predict() returns a DataFrame rather than a plain vector, so select the
# prediction column for the "Year" target explicitly before scoring. Passing
# the whole frame to the sklearn metrics only works if it happens to contain
# exactly one column.
PRED_COL = "Year_prediction"
y_train_pred = tabular_model.predict(X_train)[PRED_COL]
y_val_pred = tabular_model.predict(X_val)[PRED_COL]
y_test_pred = tabular_model.predict(X_test)[PRED_COL]

# Score each split with mean squared error and R-squared
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Report the results
print("Performance sul set di addestramento:")
print(f"MSE: {mse_train}")
print(f"R-squared: {r2_train}")
print("\nPerformance sul set di validazione:")
print(f"MSE: {mse_val}")
print(f"R-squared: {r2_val}")
print("\nPerformance sul set di test:")
print(f"MSE: {mse_test}")
print(f"R-squared: {r2_test}")

**TabTransformer**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabTransformerConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

# Read the dataset from disk
FILENAME = "train.csv"
df = pd.read_csv(FILENAME)

# "Year" is the regression target; every other column is an input feature
y = df["Year"]
X = df.drop(columns="Year")

# Hold out 20% of the rows, then halve that hold-out into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Training and validation frames with features and target side by side.
# The splits are already DataFrames carrying X's columns, so they are
# concatenated with the target directly.
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Configuration of the TabTransformer model for regression
data_config = DataConfig(
    target=["Year"],
    continuous_cols=X.columns.tolist(),
)

optimizer_config = OptimizerConfig()

model_config = TabTransformerConfig(
    task="regression",
    learning_rate=0.001,
)

# The learning-rate finder is intentionally not enabled here: with
# auto_lr_find=True the finder's suggestion replaces the learning_rate
# configured above, so the explicit 0.001 would never be used.
trainer_config = TrainerConfig(
    max_epochs=100,
    early_stopping_patience=10,
)

# Build the TabTransformer model from the configuration objects
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Train the model on the training frame, monitoring the validation frame
tabular_model.fit(train=train_data, validation=val_data)

# predict() returns a DataFrame rather than a plain vector, so select the
# prediction column for the "Year" target explicitly before scoring. The
# splits are already DataFrames, so they are passed to predict() as they are.
PRED_COL = "Year_prediction"
y_train_pred = tabular_model.predict(X_train)[PRED_COL]
y_val_pred = tabular_model.predict(X_val)[PRED_COL]
y_test_pred = tabular_model.predict(X_test)[PRED_COL]

# Score each split with mean squared error and R-squared
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Report the results
print("Performance sul set di addestramento:")
print(f"MSE: {mse_train}")
print(f"R-squared: {r2_train}")
print("\nPerformance sul set di validazione:")
print(f"MSE: {mse_val}")
print(f"R-squared: {r2_val}")
print("\nPerformance sul set di test:")
print(f"MSE: {mse_test}")
print(f"R-squared: {r2_test}")


# Prove


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # Importa lo StandardScaler
from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabNetModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

# Read the dataset from disk
FILENAME = "train.csv"
df = pd.read_csv(FILENAME)

# "Year" is the regression target; every other column is an input feature
y = df["Year"]
X = df.drop(columns="Year")

# Hold out 20% of the rows, then halve that hold-out into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# --- TabNet regression setup (trial run) ---

# Every column of X is declared as a continuous feature; "Year" is the target.
feature_names = X.columns.tolist()
data_config = DataConfig(target=["Year"], continuous_cols=feature_names)

# Training loop: learning-rate finder enabled, at most 100 epochs,
# early-stopping patience of 10.
# NOTE(review): the output recorded for this cell shows the finder setting the
# learning rate to about 0.83 - confirm that such a large value is intended.
trainer_config = TrainerConfig(auto_lr_find=True, max_epochs=100, early_stopping_patience=10)

# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# TabNet with its default hyperparameters, configured for regression.
model_config = TabNetModelConfig(task="regression")

# Assemble the model from the four configuration objects.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Train on the training split, monitoring the validation split. The features
# are used as loaded; no extra preprocessing is applied in this cell.
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
tabular_model.fit(train=train_data, validation=val_data)

# evaluate() scores the model's predictions against the target, so the frame
# it receives must contain the "Year" column. X_test alone holds only the
# features, which leaves no ground truth to compare against.
test_data = pd.concat([X_test, y_test], axis=1)
result = tabular_model.evaluate(test_data)

# Predictions only need the feature columns
pred_df = tabular_model.predict(X_test)
print(pred_df.shape)




Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/Users/alessandrotocco/Library/Python/3.9/lib/python/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /Users/alessandrotocco/Desktop/Universita/Data Analysis/DataAnalyticsProject/Training_Module/saved_models exists and is not empty.
/Users/alessandrotocco/Library/Python/3.9/lib/python/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/alessandrotocco/Library/Python/3.9/lib/python/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.8317637711026709
Restoring states from the checkpoint path at /Users/alessandrotocco/Desktop/Universita/Data Analysis/DataAnalyticsProject/Training_Module/.lr_find_69aab239-3156-44b3-9a30-516a47019309.ckpt
Restored all states from the checkpoint at /Users/alessandrotocco/Desktop/Universita/Data Analysis/DataAnalyticsProject/Training_Module/.lr_find_69aab239-3156-44b3-9a30-516a47019309.ckpt


Output()

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabTransformerConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

# Read the dataset from disk
FILENAME = "train.csv"
df = pd.read_csv(FILENAME)

# "Year" is the regression target; every other column is an input feature
y = df["Year"]
X = df.drop(columns="Year")

# Hold out 20% of the rows, then halve that hold-out into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Training and validation frames with features and target side by side
val_data = pd.concat([X_val, y_val], axis=1)
train_data = pd.concat([X_train, y_train], axis=1)

# --- TabTransformer regression setup (trial run) ---

# Every column of X is declared as a continuous feature; "Year" is the target.
feature_names = X.columns.tolist()
data_config = DataConfig(target=["Year"], continuous_cols=feature_names)

# Training loop: learning-rate finder enabled, at most 100 epochs,
# early-stopping patience of 10.
# NOTE(review): the output recorded for this cell shows the finder setting the
# learning rate to about 0.58 - confirm that such a large value is intended.
trainer_config = TrainerConfig(auto_lr_find=True, max_epochs=100, early_stopping_patience=10)

# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# TabTransformer with its default hyperparameters, configured for regression.
model_config = TabTransformerConfig(task="regression")

# Assemble the model from the four configuration objects.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Train the model on the training frame, monitoring the validation frame
tabular_model.fit(train=train_data, validation=val_data)

# predict() returns a DataFrame rather than a plain vector, so select the
# prediction column for the "Year" target explicitly before scoring. Passing
# the whole frame to the sklearn metrics only works if it happens to contain
# exactly one column.
PRED_COL = "Year_prediction"
y_train_pred = tabular_model.predict(X_train)[PRED_COL]
y_val_pred = tabular_model.predict(X_val)[PRED_COL]
y_test_pred = tabular_model.predict(X_test)[PRED_COL]

# Score each split with mean squared error and R-squared
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Report the results
print("Performance sul set di addestramento:")
print(f"MSE: {mse_train}")
print(f"R-squared: {r2_train}")
print("\nPerformance sul set di validazione:")
print(f"MSE: {mse_val}")
print(f"R-squared: {r2_val}")
print("\nPerformance sul set di test:")
print(f"MSE: {mse_test}")
print(f"R-squared: {r2_test}")

Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/Users/alessandrotocco/Library/Python/3.9/lib/python/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /Users/alessandrotocco/Desktop/Universita/Data Analysis/DataAnalyticsProject/Training_Module/saved_models exists and is not empty.
/Users/alessandrotocco/Library/Python/3.9/lib/python/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/alessandrotocco/Library/Python/3.9/lib/python/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.5754399373371567
Restoring states from the checkpoint path at /Users/alessandrotocco/Desktop/Universita/Data Analysis/DataAnalyticsProject/Training_Module/.lr_find_138a7f69-cb3b-4b7a-b6d5-2720a6340d65.ckpt
Restored all states from the checkpoint at /Users/alessandrotocco/Desktop/Universita/Data Analysis/DataAnalyticsProject/Training_Module/.lr_find_138a7f69-cb3b-4b7a-b6d5-2720a6340d65.ckpt


Output()