# Imports

In [10]:
import numpy as np
import warnings
from sklearn.model_selection import train_test_split

import pandas as pd
from pytorch_tabular import TabularModel

from pytorch_tabular.models import GANDALFConfig, CategoryEmbeddingModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig, 
    ExperimentConfig
)
import torch
warnings.filterwarnings("ignore")
import wandb

# %load_ext autoreload
# %autoreload 2

In [11]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mddima19981998[0m ([33mdegt[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

### Exctract Colnames

In [12]:
dataset_colnames= np.load('dataset_colnames.npy')

In [13]:
dataset_colnames = np.append(dataset_colnames, "target")

len(dataset_colnames)

4296

In [14]:
categorical_names = [str(name) for name in dataset_colnames if ("topo_vect" or "line_status") in name and name != "target"]
continous_names = [str(name) for name in dataset_colnames if ("topo_vect" and "line_status") not in name and name != "target"]

In [15]:
print(continous_names)
print(categorical_names)

['year_0', 'month_0', 'day_0', 'hour_of_day_0', 'minute_of_hour_0', 'day_of_week_0', 'gen_p_0', 'gen_p_1', 'gen_p_2', 'gen_p_3', 'gen_p_4', 'gen_p_5', 'gen_p_6', 'gen_p_7', 'gen_p_8', 'gen_p_9', 'gen_p_10', 'gen_p_11', 'gen_p_12', 'gen_p_13', 'gen_p_14', 'gen_p_15', 'gen_p_16', 'gen_p_17', 'gen_p_18', 'gen_p_19', 'gen_p_20', 'gen_p_21', 'gen_p_22', 'gen_p_23', 'gen_p_24', 'gen_p_25', 'gen_p_26', 'gen_p_27', 'gen_p_28', 'gen_p_29', 'gen_p_30', 'gen_p_31', 'gen_p_32', 'gen_p_33', 'gen_p_34', 'gen_p_35', 'gen_p_36', 'gen_p_37', 'gen_p_38', 'gen_p_39', 'gen_p_40', 'gen_p_41', 'gen_p_42', 'gen_p_43', 'gen_p_44', 'gen_p_45', 'gen_p_46', 'gen_p_47', 'gen_p_48', 'gen_p_49', 'gen_p_50', 'gen_p_51', 'gen_p_52', 'gen_p_53', 'gen_p_54', 'gen_p_55', 'gen_p_56', 'gen_p_57', 'gen_p_58', 'gen_p_59', 'gen_p_60', 'gen_p_61', 'gen_q_0', 'gen_q_1', 'gen_q_2', 'gen_q_3', 'gen_q_4', 'gen_q_5', 'gen_q_6', 'gen_q_7', 'gen_q_8', 'gen_q_9', 'gen_q_10', 'gen_q_11', 'gen_q_12', 'gen_q_13', 'gen_q_14', 'gen_q_15',

#### Read the data

In [16]:
# read_dataset
my_df = np.load("/share/data1/GYM/full_obs_data.npz")
topo_ag_data=my_df["topo"]
dn_ag_data=my_df["dn"]
senior_ag_data=my_df["senior"]
print(topo_ag_data.shape, len(topo_ag_data))

(55772, 4296) 55772


In [17]:
topo_ag_df = pd.DataFrame(topo_ag_data, columns=dataset_colnames)
for col in categorical_names:
    topo_ag_df[col] = topo_ag_df[col].astype(int)
topo_ag_df["target"] = topo_ag_df["target"].astype(int)

In [18]:
dn_ag_df = pd.DataFrame(dn_ag_data, columns=dataset_colnames)
for col in categorical_names:
    dn_ag_df[col] = dn_ag_df[col].astype(int)
dn_ag_df["target"] = dn_ag_df["target"].astype(int)

In [19]:
senior_ag_df = pd.DataFrame(senior_ag_data, columns=dataset_colnames)
for col in categorical_names:
    senior_ag_df[col] = senior_ag_df[col].astype(int)
senior_ag_df["target"] = senior_ag_df["target"].astype(int)

In [20]:
all_data = pd.DataFrame(np.concatenate([dn_ag_data,topo_ag_data,senior_ag_data]), columns=dataset_colnames)
for col in categorical_names:
    all_data[col] = all_data[col].astype(int)
all_data["target"] = all_data["target"].astype(int)


In [21]:
topo_ag_data.shape

(55772, 4296)

### Classification multiclass

In [22]:
print(topo_ag_df.shape, topo_ag_df.shape)
print(topo_ag_df.head())

(55772, 4296) (55772, 4296)
   year_0  month_0  day_0  hour_of_day_0  minute_of_hour_0  day_of_week_0  \
0  2050.0      1.0    3.0            0.0               0.0            0.0   
1  2050.0      1.0    3.0            0.0               0.0            0.0   
2  2050.0      1.0    3.0            0.0               0.0            0.0   
3  2050.0      1.0    3.0            0.0               0.0            0.0   
4  2050.0      1.0    3.0            0.0               0.0            0.0   

      gen_p_0     gen_p_1     gen_p_2    gen_p_3  ...  gen_margin_down_53  \
0   21.900000   26.900000  360.000000  50.000000  ...                11.2   
1   30.900000   31.500000  359.100006  27.400000  ...                11.2   
2   42.500000   47.299999  353.899994  56.099998  ...                11.2   
3   88.300003   88.900002  359.899994  81.400002  ...                11.2   
4  105.199997  102.900002  359.899994  90.599998  ...                11.2   

   gen_margin_down_54  gen_margin_down_55  gen

In [23]:
#from pytorch_tabular.utils import load_covertype_dataset
#data, _, _, _ = load_covertype_dataset()

#### Split the data

In [24]:
# Split the data
tr_val, test = train_test_split(all_data, random_state=42, test_size=0.1, shuffle=True, stratify=all_data["target"])
train, val = train_test_split(tr_val, random_state=42, test_size=0.1, shuffle=True, stratify=tr_val["target"])

In [25]:
print(train.shape, val.shape, test.shape)

print(train.head)

(128788, 4296) (22728, 4296) (37879, 4296)
<bound method NDFrame.head of         year_0  month_0  day_0  hour_of_day_0  minute_of_hour_0  \
81784   2050.0      2.0   14.0            2.0              35.0   
3875    2050.0      2.0   28.0            2.0              20.0   
138050  2050.0      2.0    8.0           20.0               0.0   
173175  2050.0      9.0    5.0           14.0              30.0   
131203  2050.0      1.0    5.0           10.0              55.0   
...        ...      ...    ...            ...               ...   
15349   2050.0      4.0   25.0           11.0              30.0   
8644    2050.0      4.0    4.0            4.0              10.0   
158786  2050.0      5.0   30.0           10.0               5.0   
30950   2050.0      6.0   23.0           19.0              15.0   
45456   2050.0      8.0   16.0           17.0              50.0   

        day_of_week_0     gen_p_0    gen_p_1     gen_p_2    gen_p_3  ...  \
81784             0.0   50.500000  48.900002  

In [26]:
data_config = DataConfig(
    target=[
        "target"
    ], 
    continuous_cols=dataset_colnames.tolist()[:-1],
    categorical_cols=[],# target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
)
trainer_config = TrainerConfig(
    devices = -1,
    batch_size=1024,
    max_epochs=150,
    min_epochs=50,
    early_stopping='valid_loss',
    early_stopping_patience=10,
    early_stopping_mode='min'
)

optimizer_config = OptimizerConfig(
    optimizer="Adamax",
    lr_scheduler="ReduceLROnPlateau",
    lr_scheduler_params={"mode": "min", "patience": 5, "factor": 0.5}
)
experiment_config = ExperimentConfig(project_name="GANDALF Multiclass", run_name="run", log_target="wandb")
model_config = GANDALFConfig(
    task="classification",
    gflu_stages=6,
    gflu_feature_init_sparsity=0.5,
    gflu_dropout=0.05,
    learning_rate=1e-5,
)


"""model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1000-1000-1000-128",
    use_batch_norm=True,
    dropout=0.2,
"""

'model_config = CategoryEmbeddingModelConfig(\n    task="classification",\n    layers="1000-1000-1000-128",\n    use_batch_norm=True,\n    dropout=0.2,\n'

In [27]:
torch.cuda.empty_cache()
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    experiment_config=experiment_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=True
)

In [28]:
model = tabular_model.fit(train=train, validation=val)

Seed set to 42


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

Trainer was signaled to stop but the required `min_epochs=50` or `min_steps=None` has not been met. Training will continue...


In [29]:
tabular_model.evaluate(train)
tabular_model.evaluate(val)
tabular_model.evaluate(test)
#y_pred_proba = tabular_model.predict_proba(test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

[{'test_loss': 0.7154021263122559, 'test_accuracy': 0.7421526312828064}]

In [30]:
y_pred_val= tabular_model.predict(val)
y_pred_test= tabular_model.predict(test)



In [31]:
y_pred_val

Unnamed: 0,0_probability,1_probability,2_probability,3_probability,prediction
135456,0.053141,0.838222,0.103996,0.004641,1
153539,0.026704,0.870266,0.098530,0.004500,1
138185,0.001649,0.702937,0.294919,0.000495,1
110103,0.653951,0.090113,0.070606,0.185330,0
56254,0.441785,0.016690,0.171985,0.369541,0
...,...,...,...,...,...
158831,0.879303,0.020888,0.037857,0.061951,0
174295,0.250419,0.267723,0.171853,0.310005,3
69653,0.986058,0.000469,0.001889,0.011583,0
64376,0.898574,0.003691,0.019582,0.078152,0


In [32]:
from sklearn.metrics import accuracy_score, f1_score


In [33]:
print("Val Acc", accuracy_score(val["target"], y_pred_val["prediction"]))
print("Test Acc", accuracy_score(test["target"], y_pred_test["prediction"]))
print("Val F1", f1_score(val["target"], y_pred_val["prediction"], average="macro"))
print("Test F1", f1_score(test["target"], y_pred_test["prediction"], average="macro"))

Val Acc 0.7364484336501232
Test Acc 0.7421526439451939
Val F1 0.6765518083233737
Test F1 0.6829331351668431


### Classification dichotom

In [34]:
wandb.login()



True

#### Split the data

In [35]:
# Split the data
all_data_dichotom = all_data.copy()
all_data_dichotom["target"] = all_data_dichotom["target"].apply(lambda x: 1 if x != 0 else 0)
tr_val, test = train_test_split(all_data_dichotom, random_state=42, test_size=0.2, shuffle=True, stratify=all_data_dichotom["target"])
train, val = train_test_split(tr_val, random_state=42, test_size=0.15, shuffle=True, stratify=tr_val["target"])

In [36]:
print(train.shape, val.shape, test.shape)

print(train.head)

(128788, 4296) (22728, 4296) (37879, 4296)
<bound method NDFrame.head of         year_0  month_0  day_0  hour_of_day_0  minute_of_hour_0  \
96891   2050.0      5.0    6.0           10.0              15.0   
96337   2050.0      5.0    3.0           17.0              45.0   
27609   2050.0      6.0   13.0            8.0              35.0   
47012   2050.0      8.0   22.0           11.0               0.0   
122726  2050.0     11.0    8.0           11.0              30.0   
...        ...      ...    ...            ...               ...   
93326   2050.0      4.0   18.0           21.0              30.0   
52426   2050.0      9.0   13.0            8.0              50.0   
29820   2050.0      6.0   20.0           10.0              20.0   
168516  2050.0      8.0    2.0            9.0               0.0   
140614  2050.0      2.0   22.0            3.0              45.0   

        day_of_week_0    gen_p_0    gen_p_1     gen_p_2    gen_p_3  ...  \
96891             4.0  29.400000  27.299999  27

In [37]:
data_config = DataConfig(
    target=[
        "target"
    ], 
    continuous_cols=dataset_colnames.tolist()[:-1],
    categorical_cols=[],# target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
)
trainer_config = TrainerConfig(
    devices = -1,
    batch_size=1024,
    max_epochs=50,
    min_epochs=2,
    early_stopping='valid_loss',
    early_stopping_patience=10,
    early_stopping_mode='min'
)

optimizer_config = OptimizerConfig(
    optimizer="Adam",
    lr_scheduler="ReduceLROnPlateau",
    lr_scheduler_params={"mode": "min", "patience": 5, "factor": 0.5}
)
experiment_config = ExperimentConfig(project_name="GANDALF Dichotom", run_name="run", log_target="wandb")
model_config = GANDALFConfig(
    task="classification",
    gflu_stages=10,
    gflu_feature_init_sparsity=0.5,
    gflu_dropout=0.1,
    learning_rate=1e-2,
)


"""model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1000-1000-1000-128",
    use_batch_norm=True,
    dropout=0.2,
"""

'model_config = CategoryEmbeddingModelConfig(\n    task="classification",\n    layers="1000-1000-1000-128",\n    use_batch_norm=True,\n    dropout=0.2,\n'

In [38]:
torch.cuda.empty_cache()
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    experiment_config=experiment_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=True
)

In [None]:
model = tabular_model.fit(train=train, validation=val)

Seed set to 42


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

In [None]:
tabular_model.evaluate(train)
tabular_model.evaluate(val)
tabular_model.evaluate(test)
#y_pred_proba = tabular_model.predict_proba(test)

In [None]:
y_pred_val= tabular_model.predict(val)
y_pred_test= tabular_model.predict(test)



In [None]:
y_pred_val

In [None]:
from sklearn.metrics import accuracy_score, f1_score


In [None]:
print("Val Acc", accuracy_score(val["target"], y_pred_val["prediction"]))
print("Test Acc", accuracy_score(test["target"], y_pred_test["prediction"]))
print("Val F1", f1_score(val["target"], y_pred_val["prediction"], average="macro"))
print("Test F1", f1_score(test["target"], y_pred_test["prediction"], average="macro"))