In [1]:
from pytorch_tabular.utils import load_covertype_dataset

# Only the DataFrame (first returned value) is kept at this point; the
# remaining values returned by the loader are discarded.
data, *_ = load_covertype_dataset()

In [2]:
from rich import print

# Quick first pass at spotting categorical features: keep only the columns
# that pandas stores with the `object` dtype and show their names.
categorical_features = data.select_dtypes(include="object")
print(categorical_features.columns)

In [3]:
# Second heuristic: print every column next to its number of distinct values
# (a missing value counts as one distinct value, as it does with .unique()).
for column in data.columns:
    distinct_values = data[column].nunique(dropna=False)
    print(column, distinct_values)

In [4]:
# The loader already returns this categorical/numerical separation, so unpack
# all four values: the frame, the categorical column names, the numerical
# column names and the target column.
data, cat_col_names, num_col_names, target_col = load_covertype_dataset()

# Summarise what was loaded. The [bold ...] tags are markup for the `print`
# imported from rich earlier in the notebook.
summary_lines = (
    f"Data Shape: {data.shape} | # of cat cols: {len(cat_col_names)} | # of num cols: {len(num_col_names)}",
    f"[bold dodger_blue2] Features: {num_col_names + cat_col_names}[/bold dodger_blue2]",
    f"[bold purple4]Target: {target_col}[/bold purple4]",
)
for line in summary_lines:
    print(line)

In [23]:
import os
from pathlib import Path

import pandas as pd

# Location of the processed UHI CSV. The default is the original local path;
# set the UHI_DATA_PATH environment variable to run the notebook on another
# machine without editing this cell.
UHI_DATA_PATH = Path(
    os.environ.get(
        "UHI_DATA_PATH",
        "/Users/beas28/Desktop/info Challenge/uhi_data_processed.csv",
    )
)

# Load the file and drop every row that has a missing value in any column.
uhi_data = pd.read_csv(UHI_DATA_PATH).dropna()

# Columns used for modelling: the predictors followed by the target
# ('UHI Index', last entry).
features_columns = [
    'B01', 'B02', 'B03', 'B04', 'B05', 'B12',
    'NDVI', 'NDMI', 'NDBI', 'NDWI', 'UI', 'STI',
    'B11_B12_ratio', 'veg_fraction', 'impervious_estimate', 'UHI Index'
]
model_data = uhi_data[features_columns]

In [24]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: first hold out 20% of the rows as the
# test set, then take 20% of what remains as the validation set.
train_and_val, test = train_test_split(model_data, test_size=0.2, random_state=42)
train, val = train_test_split(train_and_val, test_size=0.2, random_state=42)
print(f"Train Shape: {train.shape} | Val Shape: {val.shape} | Test Shape: {test.shape}")

In [25]:
# Display the training split (features plus the 'UHI Index' column) for a visual check.
train

Unnamed: 0,B01,B02,B03,B04,B05,B12,NDVI,NDMI,NDBI,NDWI,UI,STI,B11_B12_ratio,veg_fraction,impervious_estimate,UHI Index
21546,1898.0,1870.0,1915.0,1970.0,2205.0,2439.0,0.061010,-0.118590,0.118590,-0.075103,0.203408,1.158262,1.158262,0.061010,0.938990,1.009010
7947,1447.0,1502.0,1586.0,1478.0,1693.0,1815.0,0.395254,0.157698,-0.157698,-0.365092,0.245795,1.366942,1.366942,0.395254,0.604746,1.016729
751,1242.0,1421.0,1559.0,1368.0,1698.0,1735.0,0.490313,0.233236,-0.233236,-0.439108,0.272774,1.433429,1.433429,0.490313,0.509687,1.002393
387,1176.0,1310.0,1495.0,1341.0,1725.0,1840.0,0.507800,0.195750,-0.195750,-0.466357,0.356740,1.501630,1.501630,0.507800,0.492200,0.967105
2778,1345.0,1545.0,1687.0,1534.0,1929.0,1970.0,0.440146,0.159906,-0.159906,-0.401030,0.298206,1.450761,1.450761,0.440146,0.559854,1.000188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4677,1258.0,1317.0,1504.0,1398.0,1984.0,2063.0,0.377560,0.085233,-0.085233,-0.345803,0.328917,1.264178,1.264178,0.377560,0.622440,1.002393
23038,1352.0,1570.0,1752.0,1570.0,1935.0,2032.0,0.491745,0.234065,-0.234065,-0.449057,0.291196,1.407480,1.407480,0.491745,0.508255,0.997982
3447,1447.0,1874.0,2180.0,2154.0,2161.0,2206.0,0.287934,0.103058,-0.103058,-0.282423,0.256644,1.436083,1.436083,0.287934,0.712066,0.996879
22679,1609.0,1652.0,1850.0,1828.0,1924.0,1955.0,0.324963,0.179681,-0.179681,-0.319603,0.203279,1.276215,1.276215,0.324963,0.675037,1.021140


In [26]:
# Column the model is trained to predict. This rebinds the `target_col`
# that was set earlier when the covertype dataset was loaded.
target_col = 'UHI Index'

In [27]:
# Continuous inputs: every numeric column of model_data except the target.
# "number" matches all numeric dtypes (not only int64/float64), so a column
# stored as e.g. float32 is not silently dropped. The target is excluded via
# `target_col` rather than a repeated literal, so renaming the target cannot
# leak it into the feature list.
num_col_names = [
    col
    for col in model_data.select_dtypes(include="number").columns
    if col != target_col
]
num_col_names

['B01',
 'B02',
 'B03',
 'B04',
 'B05',
 'B12',
 'NDVI',
 'NDMI',
 'NDBI',
 'NDWI',
 'UI',
 'STI',
 'B11_B12_ratio',
 'veg_fraction',
 'impervious_estimate']

In [28]:
# Categorical inputs: columns of model_data stored with an object or category dtype.
# NOTE(review): the training sample displayed above shows only numeric columns,
# so this list is presumably empty for this dataset — confirm.
cat_col_names = model_data.select_dtypes(include=['object', 'category']).columns.tolist()

In [135]:
from pytorch_tabular.models import GANDALFConfig, TabTransformerConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
)

# Column roles: the target is passed as a one-element list, and the feature
# names come from the continuous / categorical lists built in earlier cells.
data_config = DataConfig(
    target=[target_col],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# Training settings.
# NOTE(review): max_epochs is very large; this presumably relies on the
# library stopping training early — confirm.
trainer_config = TrainerConfig(
    accelerator="auto",
    devices=-1,
    batch_size=512,
    max_epochs=10000,
)

# No overrides: the optimizer configuration is created with its defaults.
optimizer_config = OptimizerConfig()

# TabTransformer set up for regression, with larger-than-default capacity
# according to the original author's notes (embedding size, attention depth,
# feed-forward multiplier).
# NOTE(review): the attention-related settings presumably only matter when
# categorical columns are present; verify they have any effect if
# cat_col_names is empty.
model_config = TabTransformerConfig(
    task="regression",
    learning_rate=5e-3,
    # Embedding and attention stack
    input_embed_dim=16,
    num_heads=8,
    num_attn_blocks=8,
    transformer_head_dim=16,
    transformer_activation="LeakyReLU",
    # Feed-forward width inside each block
    ff_hidden_multiplier=8,
    # Regularisation
    attn_dropout=0.2,
    add_norm_dropout=0.1,
    ff_dropout=0.1,
)

In [136]:
from pytorch_tabular import TabularModel

# Assemble the model wrapper from the four configuration objects; verbose
# output is switched on.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
    verbose=True,
)

In [137]:

# Train on the training split, passing the validation split as `validation`.
# The call's return value is the cell output (a pytorch_lightning Trainer in the captured run).
tabular_model.fit(train=train, validation=val)


Seed set to 42


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


/Users/beas28/miniconda3/envs/infoChallenge/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /Users/beas28/Desktop/info Challenge/saved_models exists and is not empty.


Output()

<pytorch_lightning.trainer.trainer.Trainer at 0x34d2c5e10>

In [138]:
from sklearn.metrics import r2_score

# Predict on the held-out test split.
y_pred = tabular_model.predict(test=test)

# r2_score takes (y_true, y_pred) and is NOT symmetric, so the observed target
# must come first; swapping the arguments reports a different, wrong score.
# The prediction column is selected explicitly: pytorch_tabular names the
# regression output "<target>_prediction".
score = r2_score(test[target_col], y_pred[f"{target_col}_prediction"])
print(score)