In [1]:
import torch.nn as nn
import torch
import math
from torch.nn import functional as F

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader

import math
import copy

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
!pip install wandb
import wandb
wandb.login()

In [None]:
!pip install pymfe
from pymfe.mfe import MFE

def get_meta_features(data):
    """Run pymfe meta-feature extraction on one regression table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a ``reg_id`` column, a target column ``y`` and the
        feature columns (everything else).

    Returns
    -------
    The second element of the result of ``MFE.extract()``
    (presumably the list of meta-feature values -- confirm against pymfe).
    """
    # NOTE(review): "skewness" appears twice in this list; kept as in the original.
    feature_names = "inst_to_attr, nr_class, nr_attr, attr_to_inst, skewness, kurtosis, cor, cov, attr_conc, class_conc, sparsity, gravity, skewness, class_ent, attr_ent, mut_inf, eq_num_attr, ns_ratio, tree_depth, leaves_branch, nodes_per_attr, leaves_per_class".split(", ")

    predictors = data.drop(['reg_id', 'y'], axis=1).to_numpy()
    targets = data['y'].to_numpy()

    extractor = MFE(features=list(feature_names))
    extractor.fit(predictors, targets)
    return extractor.extract()[1]

In [3]:
# Seed the three RNGs this notebook draws from: torch, NumPy and the stdlib.
torch.manual_seed(0)
np.random.seed(0)
import random
random.seed(0)
# cuDNN reproducibility flags: deterministic algorithms on, autotuner off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def worker_init_fn(worker_id):
    """Seed NumPy's and the stdlib's RNG inside a DataLoader worker.

    Each worker receives the seed ``0 + worker_id`` so that workers do not
    share an identical random stream.

    Parameters
    ----------
    worker_id : int
        Index of the worker, as passed by ``DataLoader``.
    """
    # Import under a private alias: a later cell runs
    # `from random import randint, choice, random`, which rebinds the global
    # name `random` to a function and would make `random.seed` fail here.
    import random as _random

    np.random.seed(0 + worker_id)
    _random.seed(0 + worker_id)


In [None]:
!pip install gdown
!gdown 17OsETGZ4gGD7pb7PMUJD0PWPdedwSsIm
!gdown 1Al_vQTkQwrKwo_-Le5fft21I8ffjn1bK
!yes | unzip dataset_all_regs.zip

In [5]:
from random import randint, choice, random

# Registry of activation layers that Net.make_activation_layer samples from.
# Each entry maps a display name to:
#   "layer": the nn.Module class to instantiate
#   "args":  keyword arguments for its constructor; a callable value is
#            invoked at construction time to produce the actual argument.
act = {
    "ReLU": {
        "layer": nn.ReLU,
        "args": {},
    },
    "Tanh": {
        "layer": nn.Tanh,
        "args": {},
    },
    "Sigmoid": {
        "layer": nn.Sigmoid,
        "args": {},
    },
}

class Net(nn.Module):
    """Randomly structured multi-layer perceptron.

    The network is 1-3 hidden blocks (a Linear layer followed by an activation
    drawn from ``act``) and a final Linear layer of width ``output_size``.
    Hidden widths are drawn from [4, 8] and every Linear layer randomly has or
    lacks a bias. Normalisation layers are available through
    ``make_normalization_layer`` but are not inserted by the constructor.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        self.layers = nn.Sequential()
        # Output width of the most recently built layer; make_linear reads it
        # as the next layer's input width.
        self.current_size = input_size

        self.count_layers = randint(1, 3)

        for _ in range(self.count_layers):
            hidden = self.make_linear()
            self.layers.append(hidden)
            activation = self.make_activation_layer()
            self.layers.append(activation)

        head = self.make_linear(output_size)
        self.layers.append(head)

    def forward(self, x):
        """Apply all layers in order."""
        return self.layers(x)

    def make_linear(self, output_size=None) -> nn.Linear:
        """Build the next Linear layer and advance ``current_size``.

        When ``output_size`` is None the width is drawn uniformly from [4, 8].
        """
        in_width = self.current_size
        out_width = randint(4, 8) if output_size is None else output_size
        self.current_size = out_width

        use_bias = choice([True, False])
        return nn.Linear(in_features=in_width,
                         out_features=out_width,
                         bias=use_bias)

    def make_activation_layer(self) -> nn.Module:
        """Instantiate a randomly chosen activation from the ``act`` registry."""
        spec = choice(list(act.values()))
        kwargs = {
            name: (value() if callable(value) else value)
            for name, value in spec["args"].items()
        }
        return spec["layer"](**kwargs)

    def make_normalization_layer(self) -> nn.Module:
        """Return either a BatchNorm1d or a LayerNorm sized to ``current_size``."""
        width = self.current_size
        candidates = [
            nn.BatchNorm1d(width),
            nn.LayerNorm(width),
        ]
        return choice(candidates)

In [6]:
# Integer code written into the matrix to mark an activation layer.
act_map = {nn.ReLU: 2,
           nn.Tanh: 3,
           nn.Sigmoid: 1}

def converter(net: nn.Module) -> torch.Tensor:
    """Encode a network's layers as a single 64x64 matrix.

    Layers are laid out along the diagonal in order. A Linear layer
    contributes its weight matrix (``out_features`` rows by ``in_features``
    columns) as a block and advances the cursor by that block's shape. Any
    other layer contributes one cell holding its ``act_map`` code and advances
    the cursor by one in each direction. Biases are not encoded.

    Parameters
    ----------
    net : object with an iterable ``layers`` attribute

    Returns
    -------
    torch.Tensor
        A (64, 64) float tensor detached from the autograd graph.

    Raises
    ------
    KeyError
        If a non-Linear layer's class is missing from ``act_map``.
    """
    matrix = torch.zeros((64, 64))
    x_p, y_p = 0, 0
    for layer in net.layers:
        if isinstance(layer, nn.Linear):
            rows, cols = layer.weight.shape
            # Copy a detached view: assigning the Parameter itself would tie
            # `matrix` to the autograd graph and keep the source net alive.
            matrix[x_p:x_p + rows, y_p:y_p + cols] = layer.weight.detach()
            x_p += rows
            y_p += cols
        else:
            matrix[x_p, y_p] = act_map[layer.__class__]
            x_p += 1
            y_p += 1
    return matrix

def reverse_converter(weights: torch.Tensor, net: nn.Module) -> nn.Module:
    """Rebuild a network from a weight matrix, using ``net`` as the template.

    A deep copy of ``net`` is walked with the same diagonal cursor that
    ``converter`` uses: every Linear layer gets its weight replaced by the
    matching block of ``weights``, every other layer only advances the cursor
    by one. Biases come from the copied template unchanged.

    Returns
    -------
    RNet
        A new network wrapping the updated copies of the template's layers.
    """
    row, col = 0, 0
    rebuilt = []
    for layer in copy.deepcopy(net).layers:
        if not isinstance(layer, nn.Linear):
            row, col = row + 1, col + 1
        else:
            n_out, n_in = layer.weight.shape
            block = weights[row:row + n_out, col:col + n_in]
            layer.weight = nn.Parameter(block)
            row, col = row + n_out, col + n_in
        rebuilt.append(layer)
    return RNet(rebuilt)

In [7]:
def get_weights_mask(weights: torch.Tensor) -> torch.Tensor:
    """Return a float tensor with 1.0 where ``weights`` is non-zero, else 0.0."""
    is_nonzero = weights.ne(0)
    return is_nonzero.float()

In [8]:
class RNet(nn.Module):
    """Sequential network that remembers its penultimate activation.

    After each forward pass ``self.emb`` holds the output of every layer
    except the last one.
    """

    def __init__(self, in_layers):
        super().__init__()
        self.layers = nn.Sequential()
        for module in in_layers:
            self.layers.append(module)

    def forward(self, x):
        """Run all layers and store the input of the last layer in ``self.emb``."""
        hidden = self.layers[:-1](x)
        self.emb = hidden
        return self.layers[-1](hidden)

In [9]:
# Number of regression tables for which train/valid loaders are built.
N_REGS = 20
# NOTE(review): N_NETS is not referenced anywhere in the visible notebook.
N_NETS = 100
# Batch size of the per-regression DataLoaders.
REGS_BATCHSIZE = 1
# Batch size of the network-matrix DataLoaders; the evaluation loops also use
# it as the per-batch sample count.
NETS_BATCHSIZE = 100

In [10]:
# Load all regression tables from one CSV and split them by `reg_id`.
regs = pd.read_csv('/kaggle/working/random_regressions.csv')
uniq_regs = regs["reg_id"].unique()
# One DataFrame per regression, in order of first appearance of its reg_id.
reg_data = [regs[regs["reg_id"] == i] for i in uniq_regs]

In [11]:
class RegDataset(Dataset):
    """Tensor-backed dataset for one regression table.

    Every column other than ``reg_id`` and ``y`` becomes a float32 feature;
    ``y`` becomes the float32 target.
    """

    def __init__(self, df):
        super().__init__()
        feature_frame = df.drop(columns=['reg_id', 'y'])
        target_column = df['y']
        self.features = torch.tensor(feature_frame.values, dtype=torch.float32)
        self.targets = torch.tensor(target_column.values, dtype=torch.float32)

    def __len__(self):
        return self.targets.shape[0]

    def __getitem__(self, idx):
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

In [12]:
# Build an 80/20 train/validation split and a DataLoader pair for each of the
# first N_REGS regression tables. Both lists are indexed by position in
# `reg_data`; each entry is [train, valid].
reg_datasets = []
reg_dataloaders = []
for reg in reg_data[:N_REGS]:
    # No explicit random_state is passed to the split.
    train_data, valid_data = train_test_split(reg, test_size=0.2)
    train_dataset = RegDataset(train_data)
    valid_dataset = RegDataset(valid_data)
    reg_datasets += [[train_dataset, valid_dataset]]
    train_loader = DataLoader(train_dataset, batch_size=REGS_BATCHSIZE, shuffle=True, worker_init_fn=worker_init_fn)
    valid_loader = DataLoader(valid_dataset, batch_size=REGS_BATCHSIZE, shuffle=False, worker_init_fn=worker_init_fn)
    reg_dataloaders += [[train_loader, valid_loader]]

In [13]:
class NetDataset(Dataset):
    """Dataset of stored networks encoded as two-channel 64x64 tensors.

    Channel 0 is the matrix produced by ``converter``; channel 1 is its
    non-zero mask from ``get_weights_mask``.

    Parameters
    ----------
    start, end : int
        Inclusive bounds on ``index % 100``; only rows of the model table whose
        index falls in that range are loaded.
    regs : sequence
        Regression tables. Currently unused because meta-feature extraction is
        disabled; kept for interface compatibility.
    models_df : pandas.DataFrame, optional
        Table with ``struct`` (checkpoint file name) and ``dataset_id``
        columns. Defaults to the notebook-level ``df``.

    Each item is ``(matrix, net_id, dataset_id, meta_features)``, all tensors.
    """

    def __init__(self, start, end, regs, models_df=None):
        super().__init__()
        if models_df is None:
            # Backward-compatible fallback to the global table of models.
            models_df = df

        self.data = []
        self.mfs_dict = {}

        for index, row in models_df.iterrows():
            if not (start <= index % 100 <= end):
                continue

            # Full-object unpickling: only load checkpoints you trust.
            net = torch.load(f"/kaggle/working/{row['struct']}", weights_only=False)
            weights = converter(net).unsqueeze(0)
            w_mask = get_weights_mask(weights)

            # Detach so the stored sample does not keep an autograd graph
            # (and through it the loaded network) alive.
            converted_net = torch.cat([weights, w_mask], dim=0).detach()

            reg_id = row['dataset_id']
            if reg_id not in self.mfs_dict:
                # Meta-feature extraction (get_meta_features(regs[reg_id])) is
                # disabled; a scalar placeholder is stored instead.
                self.mfs_dict[reg_id] = torch.tensor(0)

            self.data.append((converted_net, index, reg_id, self.mfs_dict[reg_id]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        converted_net, net_id, dataset_id, meta_features = self.data[idx]
        # torch.tensor(<tensor>) emits a copy-construct UserWarning; clone
        # tensors and only wrap non-tensor values.
        if isinstance(meta_features, torch.Tensor):
            meta_features = meta_features.clone().detach()
        else:
            meta_features = torch.tensor(meta_features)
        return (
            converted_net.clone().detach(),
            torch.tensor(net_id),
            torch.tensor(dataset_id),
            meta_features,
        )

In [14]:
# Table of stored networks; NetDataset reads this global `df` directly.
df = pd.read_csv('/kaggle/working/models.csv')

# Split by `index % 100`: 0-79 go to training, 80-99 to validation.
train_dataset = NetDataset(0, 79, reg_data)
valid_dataset = NetDataset(80, 99, reg_data)

train_dataloader = DataLoader(train_dataset, batch_size=NETS_BATCHSIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=NETS_BATCHSIZE, shuffle=False)

In [15]:
def apply_mask(mask_matrix, to_matrix):
    """Zero every entry of ``to_matrix`` whose mask entry is zero.

    The mask is first cast to ``to_matrix``'s device and dtype; an entry is
    kept when the cast mask value is non-zero.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        The masked copy of ``to_matrix`` and the boolean keep-mask.

    Raises
    ------
    ValueError
        If the two tensors differ in shape.
    """
    if mask_matrix.shape != to_matrix.shape:
        raise ValueError("mask_matrix and to_matrix must have the same shape")

    aligned_mask = mask_matrix.to(to_matrix.device, to_matrix.dtype)
    keep = aligned_mask.ne(0)
    blank = torch.zeros_like(to_matrix)
    masked = torch.where(keep, to_matrix, blank)
    return masked, keep

In [16]:
len(train_dataset), len(valid_dataset)

(1600, 400)

In [17]:
len(train_dataloader), len(valid_dataloader)

(16, 4)

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    """Convolutional encoder for (batch, 2, 64, 64) inputs.

    Two stride-2 convolutions reduce 64x64 to 16x16 with 64 channels; two
    linear heads then produce the latent mean and the latent log-variance.

    Parameters
    ----------
    latent_dim : int, default 32 * 32
        Size of the latent vector.
    """

    def __init__(self, latent_dim=32 * 32):
        super().__init__()
        self.conv1 = nn.Conv2d(2, 32, kernel_size=4, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1)

        self.fc_mu = nn.Linear(64 * 16 * 16, latent_dim)
        self.fc_sigma = nn.Linear(64 * 16 * 16, latent_dim)
        nn.init.constant_(self.fc_sigma.bias, 1e-4)

    def forward(self, x):
        """Return ``(mu, sigma)``, each of shape (batch, latent_dim)."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        mu = self.fc_mu(x)
        # fc_sigma outputs a log-variance; exp(0.5 * logvar) is the std-dev.
        sigma = torch.exp(0.5 * self.fc_sigma(x))
        # Clamp to keep log(sigma) and sigma**2 finite.
        sigma = torch.clamp(sigma, min=1e-6, max=1e2)
        return mu, sigma


class Decoder(nn.Module):
    """Transposed-convolution decoder producing a (batch, 2, 64, 64) tensor.

    Channel 0 holds raw predicted weights; channel 1 is passed through a
    sigmoid and holds per-cell mask probabilities.

    Parameters
    ----------
    latent_dim : int, default 32 * 32
        Size of the latent vector.
    """

    def __init__(self, latent_dim=32 * 32):
        super().__init__()
        self.fc = nn.Linear(latent_dim, 64 * 16 * 16)
        self.deconv1 = nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1)
        self.deconv2 = nn.ConvTranspose2d(32, 2, kernel_size=4, stride=2, padding=1)

    def forward(self, z):
        """Decode latent vectors ``z`` of shape (batch, latent_dim)."""
        # Conditioning on meta-features (concatenating them to z) is disabled.
        x = F.relu(self.fc(z))
        x = x.view(x.size(0), 64, 16, 16)
        x = F.relu(self.deconv1(x))
        x = self.deconv2(x)

        weights = x[:, 0, :, :].unsqueeze(1)
        mask = torch.sigmoid(x[:, 1, :, :]).unsqueeze(1)
        return torch.cat((weights, mask), dim=1)


class VAE(nn.Module):
    """Variational auto-encoder over two-channel 64x64 network encodings.

    After every forward pass ``self.kl`` holds the batch-averaged KL
    divergence between the approximate posterior N(mu, sigma^2) and N(0, I).

    Parameters
    ----------
    latent_dim : int, default 32 * 32
        Size of the latent vector, forwarded to the encoder and decoder.
    """

    def __init__(self, latent_dim=32 * 32):
        super().__init__()
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(latent_dim)
        self.kl = 0

    def reparameterize(self, mu, sigma):
        """Sample ``mu + sigma * eps`` with ``eps`` drawn from N(0, I)."""
        eps = torch.randn_like(mu)
        return mu + sigma * eps

    def forward(self, x):
        mu, sigma = self.encoder(x)
        z = self.reparameterize(mu, sigma)
        # Closed-form Gaussian KL: 0.5 * sum(mu^2 + sigma^2 - log(sigma^2) - 1).
        # log(sigma^2) == 2 * log(sigma); using log(sigma) alone under-weights
        # the term and lets the "KL" go negative.
        self.kl = 0.5 * torch.sum(mu**2 + sigma**2 - 2 * sigma.log() - 1) / x.size(0)
        return self.decoder(z)


In [19]:
from tqdm import tqdm

def valid_model(model, valid_loader, device=None):
    """Evaluate a regression model and return its mean MSE and mean sMAPE.

    Predictions and targets are flattened and compared element-wise, so the
    result does not depend on the loader's batch size. sMAPE uses the same
    definition as the notebook's ``smape`` helper:
    ``2 * |pred - y| / (|pred| + |y| + 1e-8)``.

    Parameters
    ----------
    model : nn.Module
        Model to evaluate; it is switched to eval mode.
        Assumed to emit one value per sample -- TODO confirm for other heads.
    valid_loader : iterable of ``(x, y)`` batches
    device : torch.device or str, optional
        Device for the batches. Defaults to the device of the model's first
        parameter, or the CPU for a parameter-free model.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        0-dim tensors: per-sample mean squared error and mean sMAPE.

    Raises
    ------
    ZeroDivisionError
        If the loader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    eps = 1e-8
    squared_error = 0.0
    smape_total = 0.0
    n_samples = 0

    model.eval()
    with torch.no_grad():
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            # Flatten both sides: subtracting a (B,) target from a (B, 1)
            # prediction would otherwise broadcast to a (B, B) matrix.
            pred = model(x).reshape(-1)
            target = y.reshape(-1).to(pred.dtype)

            abs_err = (pred - target).abs()
            squared_error = squared_error + (abs_err ** 2).sum()
            smape_total = smape_total + (2 * abs_err / (pred.abs() + target.abs() + eps)).sum()
            n_samples += target.numel()

    if n_samples == 0:
        raise ZeroDivisionError("valid_loader yielded no samples")

    return squared_error / n_samples, smape_total / n_samples

In [20]:
def valid_cos_sim(model1, model2, valid_loader):
    """Average cosine similarity between two models' ``emb`` activations.

    Both models are run on every batch (under ``torch.no_grad``); their
    ``emb`` attributes are compared row-wise, averaged over the batch, and the
    per-batch means are averaged over the number of batches.
    """
    running_total = 0
    with torch.no_grad():
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            # The forward passes are run only for their side effect of
            # refreshing `.emb` on each model.
            model1(x)
            model2(x)

            row_similarity = F.cosine_similarity(model1.emb, model2.emb)
            running_total += row_similarity.sum() / x.shape[0]
    return running_total / len(valid_loader)

In [21]:
lr = 2e-4

In [None]:
# Start a Weights & Biases run for this experiment.
# NOTE(review): the run name says "emb=45*45", but the VAE defined in this
# notebook uses latent_dim = 32 * 32 -- confirm the label.
# The transformer-style config keys (d_model, num_heads, ...) are all set to 0.
run = wandb.init(
    project="netformer",
    name=f"cnn, vae, brand_new_vae, emb=45*45, lr={lr}",
    config={
        "learning_rate": lr,
        "d_model": 0,
        "num_heads": 0,
        "num_layers": 0,
        "d_ff": 0,
        "dropout": 0,
        "BATCHSIZE": NETS_BATCHSIZE,
        "data": "random",
        "edge_c": 0
    },
)

In [None]:
from torch.optim.lr_scheduler import StepLR, CosineAnnealingWarmRestarts
# Fresh model; uncomment the line below to resume from a saved checkpoint.
cnnvae = VAE()
# cnnvae = torch.load('/kaggle/input/cnnvae/pytorch/default/5/cnnvae_brand_new(87)(lr2e-4).pt')
cnnvae.to(device)
# `criterion` / `val_criterion` are plain MSE over both output channels; the
# training loop uses them for logging, the optimised loss is loss_function().
criterion = nn.MSELoss()
optimizer = optim.Adam(cnnvae.parameters(), lr=lr, weight_decay=1e-5)
val_criterion = nn.MSELoss()


# Learning-rate scheduling is currently disabled.
# scheduler = CosineAnnealingWarmRestarts(optimizer, 10)

In [22]:
def smape(true, pred):
    """Symmetric mean absolute percentage error, averaged over all elements.

    Computes ``mean(2 * |pred - true| / (|true| + |pred| + 1e-8))``; the small
    constant keeps the ratio finite where both inputs are zero.
    """
    epsilon = 1e-8
    numerator = 2 * (pred - true).abs()
    denominator = true.abs() + pred.abs() + epsilon
    return (numerator / denominator).mean()

def mse(true, pred):
    """Mean of the element-wise squared difference between ``true`` and ``pred``."""
    residual = true - pred
    return residual.pow(2).mean()

In [23]:
def threshold_tensor(gen_matrix, threshold=0.5):
    """Binarise a tensor on the CPU.

    Entries strictly below ``threshold`` become 0 and all others become 1;
    the result is an int64 tensor.
    """
    below = gen_matrix.cpu() < threshold
    return (~below).to(torch.int64)

gen_matrix = torch.rand(5, 5)
threshold_tensor(gen_matrix)

tensor([[0, 1, 0, 0, 0],
        [1, 0, 1, 0, 1],
        [0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 0, 1, 0]])

In [24]:
def loss_function(data, output, kl=0, kl_weight=0):
    """Combined reconstruction loss for the two-channel VAE output.

    ``loss = MSE(weights) + BCE(mask) + kl_weight * kl`` where channel 0 of
    both tensors holds the weights and channel 1 the mask.

    Parameters
    ----------
    data : torch.Tensor
        Target of shape (batch, 2, H, W); the mask channel must hold values in [0, 1].
    output : torch.Tensor
        Prediction of the same shape; the mask channel must hold probabilities.
    kl : torch.Tensor or number, default 0
        KL term to add.
    kl_weight : float, default 0
        Multiplier applied to ``kl``.

    Returns
    -------
    torch.Tensor
        Scalar loss on ``output``'s device.
    """
    # Stay on the prediction's device: copying both tensors to the CPU on
    # every step forces a host transfer in the middle of training.
    target = data.to(output.device)

    weights_true, mask_true = target[:, 0, :, :], target[:, 1, :, :]
    weights_pred, mask_pred = output[:, 0, :, :], output[:, 1, :, :]

    loss_mse = F.mse_loss(weights_pred, weights_true)
    loss_mask = F.binary_cross_entropy(mask_pred, mask_true)
    return loss_mse + loss_mask + kl_weight * kl


# Training

In [None]:
for epoch in range(100):
    train_metrics = {'train_loss': 0, 'train_kl': 0, 'loss_mse': 0}
    val_metrics = {'val_loss': 0, 'mse': 0, 'mape': 0, 'true_model_metric': 0, 'gen_model_metric': 0, 'cos_sim': 0,
                   'smape': 0, 'smape_m': 0, 'mse_m': 0, 'mse_m_fixed': 0, "val_loss2": 0}

    # KL weight decays linearly from 1e-5 to 0 over the first ten epochs.
    # It depends only on the epoch, so compute it once per epoch.
    kl_weight = max(0.00001 - 0.000001 * epoch, 0)

    # ---- training pass -------------------------------------------------
    cnnvae.train()
    for batch in tqdm(train_dataloader):
        data = batch[0].to(device)
        optimizer.zero_grad()

        output = cnnvae(data)
        # Plain MSE over both channels, logged only; the optimised objective
        # is loss_function() below.
        loss_mse = criterion(output, data)
        loss = loss_function(data, output, cnnvae.kl, kl_weight)

        loss.backward()
        optimizer.step()

        train_metrics['train_kl'] += cnnvae.kl.item()
        train_metrics['train_loss'] += loss.item()
        train_metrics['loss_mse'] += loss_mse.item()

    train_metrics['train_loss'] /= len(train_dataloader)
    train_metrics['train_kl'] /= len(train_dataloader)
    train_metrics['loss_mse'] /= len(train_dataloader)
    print("train: epoch:", epoch,
          "loss:", round(train_metrics['train_loss'], 4),
          "kl:", train_metrics['train_kl'])

    # ---- validation pass -----------------------------------------------
    cnnvae.eval()
    nets_evaluated = 0
    for batch in tqdm(valid_dataloader):
        data, net_num, reg, mfs = batch
        data = data.to(device)

        with torch.no_grad():
            output = cnnvae(data)
            val_loss = val_criterion(output, data)
            val_loss2 = loss_function(data, output)

        val_metrics['val_loss2'] += val_loss2.item()
        val_metrics['val_loss'] += val_loss.item()

        # Iterate over the samples actually present: the final batch may be
        # shorter than NETS_BATCHSIZE.
        for j in range(data.size(0)):
            # Assumes checkpoints are named model<index>.pt -- confirm against
            # the `struct` column used by NetDataset.
            true_net = torch.load(f"/kaggle/working/model{int(net_num[j])}.pt", map_location=torch.device('cpu'), weights_only=False)

            true_net = RNet([layer for layer in true_net.layers])
            # Rebuild a network with the generated weights (channel 0).
            net = reverse_converter(output[j][0], true_net)

            net.to(device).eval()
            true_net.to(device).eval()

            reg_valid_loader = reg_dataloaders[int(reg[j])][1]
            true_model_mse, true_model_smape = valid_model(true_net, reg_valid_loader)
            gen_model_mse, gen_model_smape = valid_model(net, reg_valid_loader)

            cos_sim = valid_cos_sim(net, true_net, reg_valid_loader)

            # Mask quality (channel 1), raw and thresholded.
            gen_matrix = output[j][1].cpu()
            true_matrix = data[j][1].cpu()
            fixed_matrix = threshold_tensor(gen_matrix)

            smape_m = smape(true_matrix, gen_matrix)
            mse_m = mse(true_matrix, gen_matrix)
            mse_m_fixed = mse(true_matrix, fixed_matrix)

            val_metrics['smape'] += gen_model_smape
            val_metrics['mse'] += (true_model_mse - gen_model_mse)**2
            safe_denominator = max(gen_model_mse, 1e-8)
            val_metrics['mape'] += (true_model_mse - gen_model_mse).abs() / safe_denominator
            val_metrics['true_model_metric'] += true_model_mse
            val_metrics['gen_model_metric'] += gen_model_mse
            val_metrics['cos_sim'] += cos_sim

            val_metrics['smape_m'] += smape_m.item()
            val_metrics['mse_m'] += mse_m.item()
            val_metrics['mse_m_fixed'] += mse_m_fixed.item()

            nets_evaluated += 1

    # Per-network metrics are averaged over the networks actually evaluated;
    # per-batch losses over the number of batches.
    num_nets = max(nets_evaluated, 1)
    val_metrics['smape'] /= num_nets
    val_metrics['cos_sim'] /= num_nets
    val_metrics['val_loss'] /= len(valid_dataloader)
    val_metrics['val_loss2'] /= len(valid_dataloader)
    val_metrics['mse'] /= num_nets
    val_metrics['mape'] /= num_nets
    val_metrics['true_model_metric'] /= num_nets
    val_metrics['gen_model_metric'] /= num_nets
    val_metrics['smape_m'] /= num_nets
    val_metrics['mse_m'] /= num_nets
    val_metrics['mse_m_fixed'] /= num_nets

    print(val_metrics['true_model_metric'], val_metrics['gen_model_metric'])
    print("loss:", round(val_metrics['val_loss'], 4), "loss2:", round(val_metrics['val_loss2'], 4))

    # Per-epoch checkpointing and LR scheduling are currently disabled:
    # torch.save(cnnvae, f'cnnvae{epoch}.pt')
    # scheduler.step()

    wandb.log(val_metrics | train_metrics)

In [None]:
# Save the whole model object (not just its state_dict) and expose a download
# link. The file name embeds the last epoch index and the last validation
# cosine similarity.
# NOTE(review): val_metrics["cos_sim"] is formatted as-is; if it is a tensor its
# full repr ends up in the file name -- confirm this is intended.
name_f = f'conv-vae-({epoch}ep)-({val_metrics["cos_sim"]}cos_sim).pt'
torch.save(cnnvae, name_f)
from IPython.display import FileLink
FileLink(name_f)

# Testing

In [40]:
# Load the fully pickled VAE saved by the training section.
# weights_only=False is required to unpickle a whole nn.Module (and matches the
# other torch.load calls in this notebook); it executes arbitrary pickle code,
# so only load checkpoints you trust.
# map_location lets a GPU-trained checkpoint load on whatever `device` is active.
model = torch.load('conv-vae-(99ep)-(0.6cos_sim).pt', map_location=device, weights_only=False)
model.eval()

  model = torch.load('/kaggle/input/cnnvae/pytorch/default/3/cnnvae_brand_new(99)1.pt')


VAE(
  (encoder): Encoder(
    (conv1): Conv2d(2, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (fc_mu): Linear(in_features=16384, out_features=1024, bias=True)
    (fc_sigma): Linear(in_features=16384, out_features=1024, bias=True)
  )
  (decoder): Decoder(
    (fc): Linear(in_features=1024, out_features=16384, bias=True)
    (deconv1): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (deconv2): ConvTranspose2d(32, 2, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  )
)

In [45]:
# Reset the metric accumulators before the stand-alone evaluation below.
# Compared with the training cell, val_metrics has an extra "smape_m_fixed" key.
train_metrics = {'train_loss': 0, 'train_kl': 0, 'loss_mse': 0}
val_metrics = {'val_loss': 0, 'mse': 0, 'mape': 0, 'true_model_metric': 0, 'gen_model_metric': 0, 'cos_sim': 0, 
                   'smape': 0, 'smape_m': 0, 'mse_m': 0, 'mse_m_fixed': 0, "val_loss2": 0, "smape_m_fixed": 0}
    

In [46]:
# Evaluate the loaded `model` on the validation networks: reconstruct every
# network, rebuild it from the generated weights and compare it with the
# original on that network's regression validation set.
nets_evaluated = 0
for batch in tqdm(valid_dataloader):
    data, net_num, reg, mfs = batch
    data = data.to(device)

    with torch.no_grad():
        output = model(data)
        val_loss2 = loss_function(data, output)

    val_metrics['val_loss2'] += val_loss2.item()

    # Iterate over the samples actually present: the final batch may be
    # shorter than NETS_BATCHSIZE.
    for j in range(data.size(0)):
        # Assumes checkpoints are named model<index>.pt -- confirm against
        # the `struct` column used by NetDataset.
        true_net = torch.load(f"/kaggle/working/model{int(net_num[j])}.pt", map_location=torch.device('cpu'), weights_only=False)

        true_net = RNet([layer for layer in true_net.layers])
        # Rebuild a network with the generated weights (channel 0).
        net = reverse_converter(output[j][0], true_net)

        net.to(device).eval()
        true_net.to(device).eval()

        reg_valid_loader = reg_dataloaders[int(reg[j])][1]
        true_model_mse, true_model_smape = valid_model(true_net, reg_valid_loader)
        gen_model_mse, gen_model_smape = valid_model(net, reg_valid_loader)

        cos_sim = valid_cos_sim(net, true_net, reg_valid_loader)

        # Mask quality (channel 1), raw and thresholded.
        gen_matrix = output[j][1].cpu()
        true_matrix = data[j][1].cpu()
        fixed_matrix = threshold_tensor(gen_matrix)

        smape_m = smape(true_matrix, gen_matrix)
        smape_m_fixed = smape(true_matrix, fixed_matrix)
        mse_m = mse(true_matrix, gen_matrix)
        mse_m_fixed = mse(true_matrix, fixed_matrix)

        val_metrics['smape'] += gen_model_smape
        val_metrics['mse'] += (true_model_mse - gen_model_mse)**2
        safe_denominator = max(gen_model_mse, 1e-8)
        val_metrics['mape'] += (true_model_mse - gen_model_mse).abs() / safe_denominator
        val_metrics['true_model_metric'] += true_model_mse
        val_metrics['gen_model_metric'] += gen_model_mse
        val_metrics['cos_sim'] += cos_sim

        val_metrics['smape_m'] += smape_m.item()
        val_metrics['mse_m'] += mse_m.item()
        val_metrics['mse_m_fixed'] += mse_m_fixed.item()
        val_metrics["smape_m_fixed"] += smape_m_fixed.item()

        nets_evaluated += 1


# Per-network metrics are averaged over the networks actually evaluated;
# per-batch losses over the number of batches.
num_batches = max(nets_evaluated, 1)
val_metrics['smape'] /= num_batches
val_metrics['cos_sim'] /= num_batches
# 'val_loss' is never accumulated in this cell, so it stays 0.
val_metrics['val_loss'] /= len(valid_dataloader)
val_metrics['val_loss2'] /= len(valid_dataloader)
val_metrics['mse'] /= num_batches
val_metrics['mape'] /= num_batches
val_metrics['true_model_metric'] /= num_batches
val_metrics['gen_model_metric'] /= num_batches
val_metrics['smape_m'] /= num_batches
val_metrics['mse_m'] /= num_batches
val_metrics['mse_m_fixed'] /= num_batches
val_metrics['smape_m_fixed'] /= num_batches


  torch.tensor(meta_features)      # Метафичи, полученные из словаря
100%|██████████| 4/4 [02:46<00:00, 41.55s/it]


In [48]:
print(val_metrics['cos_sim'], val_metrics['smape_m_fixed'])

tensor(0.6037, device='cuda:0') 0.013646240234375


In [72]:
# Generate a network encoding from a latent vector drawn from N(0, I),
# bypassing the encoder. 32 * 32 matches the decoder's latent size.
noise = torch.randn(1, 32 * 32).to(device)

# NOTE(review): this call is not wrapped in torch.no_grad(), so `output`
# carries a grad_fn -- confirm whether gradients are needed here.
output = model.decoder(noise)

print("Output shape:", output.shape)
print("Weights shape:", output[:, 0, :, :].shape)
print("Mask shape:", output[:, 1, :, :].shape)

Output shape: torch.Size([1, 2, 64, 64])
Weights shape: torch.Size([1, 64, 64])
Mask shape: torch.Size([1, 64, 64])


In [73]:
mask = threshold_tensor(output[:, 1, :, :])
mask.sum()

tensor(1825)

In [74]:
a = output[:, 0, :, :].cpu() * mask
a = a.squeeze(0)
a

tensor([[-0.0665, -0.2014, -0.0518,  ..., -0.0000, -0.0549, -0.0000],
        [-0.0000,  0.0000,  0.1940,  ..., -0.0808, -0.0733, -0.0774],
        [ 0.1189,  0.0430, -0.0000,  ..., -0.0000, -0.0395, -0.0000],
        ...,
        [-0.1026, -0.0750, -0.0896,  ..., -0.0000, -0.0792, -0.0758],
        [-0.0000, -0.0000, -0.0484,  ..., -0.0000, -0.0535, -0.0000],
        [-0.0824, -0.0932, -0.0903,  ..., -0.0934, -0.0000, -0.0878]],
       grad_fn=<SqueezeBackward1>)