In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from math import cos, pi, floor, sin
import torch.nn.functional as F

In [None]:
def load_json(path):
    """Load a JSON document or a JSON Lines file from ``path``.

    The file content is first parsed as a single JSON document. If that
    fails, it is parsed as JSON Lines: every non-blank line is decoded as its
    own JSON value and the values are returned in file order.

    Args:
        path: Location of the file to read (decoded as UTF-8).

    Returns:
        The decoded document, or a list with one decoded value per line when
        the file is in JSON Lines form.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is neither valid JSON nor valid
            JSON Lines.
    """
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Files with a ".jsonl" extension may hold one record per line, which
        # is not a single valid JSON document.
        return [json.loads(line) for line in text.splitlines() if line.strip()]

# Raw train / validation / test records.
train_data = load_json("/kaggle/input/company-revenue/train_V3.jsonl")
validation_data = load_json("/kaggle/input/company-revenue/validation_V3.jsonl")
test_data = load_json("/kaggle/input/company-revenue/test_V3.jsonl")
# Only records whose clustering "label" equals this value are kept.
label = 1

# Ids whose entry in the clustering result carries the selected label.
labels = load_json("/kaggle/input/company-revenue/two_labels_clustering_result.json")
labels = [key for key in labels.keys() if labels[key]["label"] == label]
# Set copy of the ids so each membership test below is O(1) instead of a
# linear scan of the list for every sample.
label_ids = set(labels)
train_data = [sample for sample in train_data if sample["soleadify_id"] in label_ids]
validation_data = [sample for sample in validation_data if sample["soleadify_id"] in label_ids]
test_data = [sample for sample in test_data if sample["soleadify_id"] in label_ids]

In [None]:
# Field names taken from the first training record, in record order, with the
# first field dropped (presumably the `soleadify_id` identifier; confirm).
keys = list(train_data[0])[1:]

def flatten(lst):
    """Return the non-list items of ``lst`` as one flat list, depth first.

    Only ``list`` instances are descended into; any other item (including
    tuples) is copied to the result unchanged. Item order is preserved.
    """
    flat = []
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(lst)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator resumes once this one is done.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # The current list is exhausted, go back to its parent.
            pending.pop()
    return flat

def make_tensors(data):
    """Build one tensor from ``data`` with one flattened row per sample.

    For every sample the values of the module-level ``keys`` are collected in
    order and flattened with ``flatten``; the rows are then handed to
    ``torch.tensor``.
    """
    rows = []
    for sample in data:
        values = [sample[key] for key in keys]
        rows.append(flatten(values))
    return torch.tensor(rows)

# Convert every split with the same feature layout (train, validation, test).
train_tensors, validation_tensors, test_tensors = [
    make_tensors(split) for split in (train_data, validation_data, test_data)
]

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

# Client for the secrets stored with this Kaggle notebook.
user_secrets = UserSecretsClient()

# The W&B API token is read from the secret labelled "wandb_api".
# If the secret is stored under a different label, change the name below.
wandb_api = user_secrets.get_secret("wandb_api") 

# Authenticate with the token instead of hard-coding it in the notebook.
wandb.login(key=wandb_api)

# Start the run, then record the hyperparameters on its config; later cells
# read batch_size, epochs, learning_rate and bottleneck_dim from wandb.config.
wandb.init(entity="mihail-chirobocea", project="eda", name="test_label1.5_plot")
wandb.config.batch_size = 64
wandb.config.epochs = 14
# Previously tried value, kept for reference:
# wandb.config.learning_rate = 1e-6
wandb.config.learning_rate = 2e-7
wandb.config.bottleneck_dim = 256

In [None]:
class CycleAnnealScheduler:
    """Cyclic triangular learning-rate schedule followed by a linear cool-down.

    One cycle lasts ``step_size`` calls to :meth:`step` and is organised
    around ``cycle_step = int(step_size * (1 - cut_point / 100) / 2)``:

    * iterations ``0 .. cycle_step``: the rate rises linearly from
      ``lr_max / lr_divider`` to ``lr_max``;
    * iterations ``cycle_step .. 2 * cycle_step``: it falls back linearly to
      ``lr_max / lr_divider``;
    * remaining iterations: it decays linearly from ``lr_max / lr_divider``
      towards 1% of that value.

    After ``step_size`` calls the iteration counter wraps to zero and the
    cycle repeats.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts.
        lr_max: Peak learning rate.
        lr_divider: Ratio between the peak and the base learning rate.
        cut_point: Percentage of the cycle reserved for the cool-down.
        step_size: Number of iterations in one full cycle.
        momentum: Optional pair ``(m0, m1)``. The momentum moves from ``m0``
            to ``m1`` while the rate rises, back to ``m0`` while it falls, and
            stays at ``m0`` during the cool-down. ``None`` leaves the
            optimizer's momentum untouched.
    """

    def __init__(
        self, optimizer, lr_max, lr_divider, cut_point, step_size, momentum=None
    ):
        self.lr_max = lr_max
        self.lr_divider = lr_divider
        # Stored for reference only; nothing in this class reads it.
        self.cut_point = step_size // cut_point
        self.step_size = step_size
        self.iteration = 0
        # Length of each half (rise / fall) of the triangular part.
        self.cycle_step = int(step_size * (1 - cut_point / 100) / 2)
        self.momentum = momentum
        self.optimizer = optimizer

    def get_lr(self):
        """Return the learning rate for the current iteration."""
        if self.iteration > 2 * self.cycle_step:
            # Cool-down: `cut` is the fraction of the cool-down already done.
            cut = (self.iteration - 2 * self.cycle_step) / (
                self.step_size - 2 * self.cycle_step
            )
            lr = self.lr_max * (1 + (cut * (1 - 100) / 100)) / self.lr_divider

        elif self.iteration > self.cycle_step:
            # Falling half of the triangle: `cut` goes from 1 down to 0.
            cut = 1 - (self.iteration - self.cycle_step) / self.cycle_step
            lr = self.lr_max * (1 + cut * (self.lr_divider - 1)) / self.lr_divider

        else:
            # Rising half of the triangle: `cut` goes from 0 up to 1.
            cut = self.iteration / self.cycle_step
            lr = self.lr_max * (1 + cut * (self.lr_divider - 1)) / self.lr_divider

        return lr

    def get_momentum(self):
        """Return the momentum for the current iteration.

        Must only be called when ``momentum`` was given to the constructor.
        """
        if self.iteration > 2 * self.cycle_step:
            momentum = self.momentum[0]

        elif self.iteration > self.cycle_step:
            cut = 1 - (self.iteration - self.cycle_step) / self.cycle_step
            momentum = self.momentum[0] + cut * (self.momentum[1] - self.momentum[0])

        else:
            cut = self.iteration / self.cycle_step
            momentum = self.momentum[0] + cut * (self.momentum[1] - self.momentum[0])

        return momentum

    def step(self):
        """Apply the schedule for the current iteration and advance by one.

        The learning rate (and the momentum, when configured) is computed
        before the counter is incremented and then written to every parameter
        group. Groups with a ``'betas'`` entry get the momentum as their first
        beta; all other groups get it under ``'momentum'``.

        Returns:
            The learning rate that was written to the parameter groups.
        """
        lr = self.get_lr()
        momentum = self.get_momentum() if self.momentum is not None else None

        self.iteration += 1

        if self.iteration == self.step_size:
            self.iteration = 0

        for group in self.optimizer.param_groups:
            group['lr'] = lr

            if self.momentum is not None:
                if 'betas' in group:
                    group['betas'] = (momentum, group['betas'][1])

                else:
                    # Optimizers without betas keep momentum under its own key.
                    group['momentum'] = momentum

        return lr


def anneal_linear(start, end, proportion):
    """Interpolate linearly: ``start`` at proportion 0, ``end`` at proportion 1."""
    delta = end - start
    return start + proportion * delta


def anneal_cos(start, end, proportion):
    """Interpolate along a half cosine: ``start`` at proportion 0, ``end`` at 1."""
    half_range = (start - end) / 2
    # cos(pi * p) + 1 goes from 2 (p = 0) down to 0 (p = 1).
    weight = cos(pi * proportion) + 1

    return end + half_range * weight


class Phase:
    """One segment of a schedule: anneals from ``start`` to ``end`` in ``n_iter`` steps.

    ``anneal_fn(start, end, proportion)`` provides the interpolation; ``n``
    counts the steps taken since construction or the last :meth:`reset`.
    """

    def __init__(self, start, end, n_iter, anneal_fn):
        self.start = start
        self.end = end
        self.n_iter = n_iter
        self.anneal_fn = anneal_fn
        self.n = 0

    def step(self):
        """Advance by one step and return the annealed value at the new position."""
        self.n += 1
        progress = self.n / self.n_iter

        return self.anneal_fn(self.start, self.end, progress)

    def reset(self):
        """Rewind the phase to its beginning."""
        self.n = 0

    @property
    def is_done(self):
        """Whether at least ``n_iter`` steps have been taken."""
        return self.n_iter <= self.n


class CycleScheduler:
    """Two-phase cyclic schedule for the learning rate and, optionally, momentum.

    The first phase covers ``int(n_iter * warmup_proportion)`` steps and
    raises the rate from ``lr_max / divider`` to ``lr_max``; the second phase
    covers the remaining steps and lowers it to ``lr_max / divider / 1e4``.
    ``phase`` names the annealing curve of each phase (``'linear'`` or
    ``'cos'``). When ``momentum`` is a pair ``(mom1, mom2)``, momentum moves
    from ``mom1`` to ``mom2`` in the first phase and back in the second.
    Once both phases are finished they are reset and the cycle starts again.
    """

    def __init__(
        self,
        optimizer,
        lr_max,
        n_iter,
        momentum=(0.95, 0.85),
        divider=25,
        warmup_proportion=0.3,
        phase=('linear', 'cos'),
    ):
        self.optimizer = optimizer

        anneal_fns = {'linear': anneal_linear, 'cos': anneal_cos}
        warmup_fn = anneal_fns[phase[0]]
        decay_fn = anneal_fns[phase[1]]

        warmup_iters = int(n_iter * warmup_proportion)
        decay_iters = n_iter - warmup_iters
        lr_min = lr_max / divider

        self.lr_phase = [
            Phase(lr_min, lr_max, warmup_iters, warmup_fn),
            Phase(lr_max, lr_min / 1e4, decay_iters, decay_fn),
        ]

        self.momentum = momentum

        if momentum is None:
            self.momentum_phase = []

        else:
            mom_first, mom_second = momentum
            self.momentum_phase = [
                Phase(mom_first, mom_second, warmup_iters, warmup_fn),
                Phase(mom_second, mom_first, decay_iters, decay_fn),
            ]

        # Index of the phase currently being stepped.
        self.phase = 0

    def step(self):
        """Advance the active phase by one step and update the optimizer.

        Returns:
            ``(lr, momentum)`` as written to the parameter groups; ``momentum``
            is ``None`` when momentum scheduling is disabled.
        """
        current = self.phase
        schedules_momentum = self.momentum is not None

        lr = self.lr_phase[current].step()
        momentum = self.momentum_phase[current].step() if schedules_momentum else None

        for group in self.optimizer.param_groups:
            group['lr'] = lr

            if not schedules_momentum:
                continue

            if 'betas' in group:
                # Adam-style groups: momentum is the first beta.
                group['betas'] = (momentum, group['betas'][1])

            else:
                group['momentum'] = momentum

        if self.lr_phase[current].is_done:
            self.phase = current + 1

        if self.phase >= len(self.lr_phase):
            # Every phase is finished: rewind all of them for the next cycle.
            for finished in self.lr_phase + self.momentum_phase:
                finished.reset()

            self.phase = 0

        return lr, momentum

In [None]:
class MyDataset(Dataset):
    """Thin ``Dataset`` wrapper around an indexable, sized container."""

    def __init__(self, data):
        # Kept by reference; no copy is made.
        self.data = data

    def __len__(self):
        """Number of items in the wrapped container."""
        size = len(self.data)
        return size

    def __getitem__(self, index):
        """Return the item stored at ``index`` in the wrapped container."""
        item = self.data[index]
        return item

# One dataset per split, each wrapping the tensor built for that split.
train_dataset, validation_dataset, test_dataset = (
    MyDataset(train_tensors),
    MyDataset(validation_tensors),
    MyDataset(test_tensors),
)

# Only the training loader shuffles; validation and test keep the data order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=wandb.config.batch_size,
    shuffle=True,
)
validation_dataloader = DataLoader(
    dataset=validation_dataset,
    batch_size=wandb.config.batch_size,
    shuffle=False,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=wandb.config.batch_size,
    shuffle=False,
)

In [None]:
class QuantizationLayer(nn.Module):
    """Maps every input vector to its nearest row of a learnable codebook.

    The codebook holds ``num_clusters`` vectors of size ``latent_dim`` and is
    initialised with ``torch.rand``.
    """

    def __init__(self, num_clusters, latent_dim):
        super().__init__()
        self.num_clusters = num_clusters
        self.codebook = nn.Parameter(torch.rand(num_clusters, latent_dim))

    def forward(self, x):
        """Return ``(nearest codebook vectors, their indices)`` for ``x``.

        The input is detached before the distances are computed, so no
        gradient flows back into ``x`` through this layer.
        """
        # Insert an axis at position 1 so every input broadcasts against every
        # codebook row, then take the norm over the feature axis.
        offsets = x.detach()[:, None] - self.codebook
        distances = offsets.norm(dim=-1)

        # Closest codebook entry per input, and the entries themselves.
        nearest = distances.argmin(dim=-1)
        return self.codebook[nearest], nearest

class FCBlock(nn.Module):
    """Linear layer optionally followed by LayerNorm and a SiLU activation.

    ``norm`` and ``activation`` hold the corresponding module when enabled
    and the value ``False`` when disabled.
    """

    def __init__(self, in_dim, out_dim, use_norm=True, activation=True):
        super().__init__()
        self.fc = nn.Linear(in_dim, out_dim)
        self.use_norm = use_norm
        self.activation = nn.SiLU() if activation else False
        self.norm = nn.LayerNorm(out_dim) if use_norm else False

    def forward(self, x):
        """Apply the linear layer, then the norm and activation if enabled."""
        out = self.fc(x)
        if self.use_norm:
            out = self.norm(out)
        return self.activation(out) if self.activation else out

    
class Encoder(nn.Module):
    """Stack of ``FCBlock`` layers that narrows ``in_dim`` features to ``embed_dim``.

    The hidden widths are ``8 * embed_dim``, ``4 * embed_dim`` and
    ``2 * embed_dim``, followed by four blocks of width ``embed_dim``.

    Args:
        in_dim: Size of the last dimension of the input.
        embed_dim: Size of the produced embedding.
    """

    def __init__(
        self,
        in_dim,
        embed_dim,
    ):
        super().__init__()

        self.encoder = nn.Sequential(
            # The input width comes from the constructor argument, not from a
            # module-level `input_dim` variable.
            FCBlock(in_dim, 8 * embed_dim),
            FCBlock(8 * embed_dim, 4 * embed_dim),
            FCBlock(4 * embed_dim, 2 * embed_dim),
            FCBlock(2 * embed_dim, embed_dim),
            FCBlock(embed_dim, embed_dim),
            FCBlock(embed_dim, embed_dim),
            FCBlock(embed_dim, embed_dim),
        )

    def forward(self, x):
        """Return the embedding of ``x``."""
        return self.encoder(x)
    
    
class VQVAE(nn.Module):
    """Variational autoencoder whose sampled latent is snapped to a codebook.

    The encoder emits ``embed_dim`` values per sample; the first
    ``embed_dim // 2`` are used as the mean and the rest as the log-variance
    of the latent. The sampled latent is quantized and then decoded back to
    ``in_dim`` features.
    """

    def __init__(
        self,
        in_dim,
        embed_dim,
        num_clusters=4096,
    ):
        super().__init__()
        self.latent_dim = embed_dim // 2

        self.encoder = Encoder(in_dim, embed_dim)

        # Widths of the activated decoder layers, from the latent upwards.
        widths = (
            [self.latent_dim]
            + [embed_dim] * 6
            + [2 * embed_dim] * 2
            + [4 * embed_dim] * 2
            + [8 * embed_dim] * 2
        )
        hidden = [FCBlock(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])]
        # The output layer has no activation.
        self.decoder = nn.Sequential(
            *hidden,
            FCBlock(8 * embed_dim, in_dim, activation=False),
        )

        # Codebook used to quantize the sampled latent.
        self.quantization = QuantizationLayer(num_clusters, self.latent_dim)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` with ``eps`` drawn from a standard normal."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def encode(self, x):
        """Return the raw encoder output for ``x`` (no column masking)."""
        return self.encoder(x)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar, codebook indices)`` for ``x``.

        Column 7 of a copy of the input is zeroed before encoding; ``x``
        itself is left unmodified.
        """
        masked = x.clone()
        masked[:, 7] = torch.zeros(masked.shape[0])

        # Split the encoder output into the two halves of the latent stats.
        stats = self.encoder(masked)
        mu = stats[:, :self.latent_dim]
        logvar = stats[:, self.latent_dim:]

        latent = self.reparameterize(mu, logvar)

        # NOTE(review): the quantization layer detaches its input and returns
        # codebook rows, so the decoder output carries no gradient path back
        # to the encoder - confirm this is intended before training with it.
        quantized_z, quantized_indices = self.quantization(latent)

        x_recon = self.decoder(quantized_z)

        return x_recon, mu, logvar, quantized_indices

In [None]:
class Autoencoder(nn.Module):
    """Encoder plus a widening decoder that maps back to ``input_dim`` features."""

    def __init__(self, input_dim, embed_dim):
        super().__init__()

        self.encoder = Encoder(input_dim, embed_dim)

        # Widths of the activated decoder layers, from the embedding upwards.
        widths = [
            embed_dim,
            2 * embed_dim, 2 * embed_dim,
            4 * embed_dim, 4 * embed_dim,
            8 * embed_dim, 8 * embed_dim,
        ]
        hidden = [FCBlock(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])]
        # The output layer has no activation.
        self.decoder = nn.Sequential(
            *hidden,
            FCBlock(8 * embed_dim, input_dim, activation=False),
        )

    def forward(self, x):
        """Reconstruct ``x`` from a copy whose column 7 has been zeroed."""
        masked = x.clone()
        masked[:, 7] = torch.zeros(masked.shape[0])
        return self.decoder(self.encoder(masked))
    
class Predictor(nn.Module):
    """Three ``FCBlock`` layers narrowing ``input_dim`` to ``output_dim``.

    The two hidden widths are ``input_dim // 4`` and ``input_dim // 8``.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        quarter = input_dim // 4
        eighth = input_dim // 8
        self.predict = nn.Sequential(
            FCBlock(input_dim, quarter),
            FCBlock(quarter, eighth),
            FCBlock(eighth, output_dim),
        )

    def forward(self, x):
        """Return the prediction for ``x``."""
        out = self.predict(x)
        return out
    
class FullModel(nn.Module):
    """An existing encoder followed by a ``Predictor`` head.

    ``input_dim`` is the width of the encoder output fed to the head and
    ``output_dim`` is the width of the prediction.
    """

    def __init__(self, input_dim, output_dim, encoder):
        super().__init__()

        self.encoder = encoder
        self.predict = Predictor(input_dim, output_dim)

    def forward(self, x):
        """Predict from a copy of ``x`` whose column 7 has been zeroed.

        Column 7 is the value the training loop compares the prediction
        against, so it is hidden from the model; ``x`` itself is not modified.
        """
        masked = x.clone()
        masked[:, 7] = torch.zeros(masked.shape[0])
        embedding = self.encoder(masked)
        return self.predict(embedding)

In [None]:
# def train_vqautoencoder(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, num_epochs=10, device="cpu", save_path="best_model.pth"):
#     model.to(device)
#     best_val_loss = float('inf')
    
#     for epoch in range(num_epochs):
#         model.train()
#         running_loss = 0.0
        
#         for k, batch_data in enumerate(train_dataloader):
            
#             optimizer.zero_grad()
#             inputs = batch_data.to(device)
            
#             # Forward pass
#             recon_batch, mu, logvar, quantized_indices = model(inputs)

#             # Compute reconstruction loss and KL divergence
#             reconstruction_loss = criterion(recon_batch, batch_data)
#             kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

#             # Quantization loss - encourage the codebook to adapt to the data
#             quantization_loss = torch.mean(torch.norm(mu - model.quantization.codebook[quantized_indices], dim=-1))

#             # Total loss
#             loss = reconstruction_loss + kl_divergence + quantization_loss
#             running_loss += reconstruction_loss.item()
            
#             # Backward pass and optimization
#             loss.backward()
#             scheduler.step()
#             optimizer.step()
            
#             if k % 100 == 0:
#                 print(f"Training - Epoch [{epoch + 1}/{num_epochs}], Step [{k}], Loss: {reconstruction_loss.item():.4f}")
#                 wandb.log({"Train Loss": reconstruction_loss.item()})
#                 wandb.log({"Learning Rate": optimizer.param_groups[0]['lr']})
                
#         # Calculate average training loss for the epoch
#         average_train_loss = running_loss / len(train_dataloader)
#         print(f"Training - Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_train_loss:.4f}")
#         wandb.log({"Train Epoch Loss": average_train_loss, "Epoch": epoch})
    
#         # Validation
#         model.eval()
#         val_running_loss = 0.0
#         with torch.no_grad():
#             for k, val_data in enumerate(val_dataloader):

#                 inputs = val_data.to(device)

#                 # Forward pass
#                 recon_batch, mu, logvar, quantized_indices = model(inputs)

#                 # Compute reconstruction loss and KL divergence
#                 reconstruction_loss = criterion(recon_batch, val_data)
#                 kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

#                 # Quantization loss - encourage the codebook to adapt to the data
#                 quantization_loss = torch.mean(torch.norm(mu - model.quantization.codebook[quantized_indices], dim=-1))

#                 # Total loss
#                 val_loss = reconstruction_loss + kl_divergence + quantization_loss
#                 val_running_loss += reconstruction_loss.item()
                
#         # Calculate average validation loss for the epoch
#         average_val_loss = val_running_loss / len(val_dataloader)
#         print(f"Validation - Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_val_loss:.8f}")
#         wandb.log({"Validation Epoch Loss": average_val_loss, "Epoch": epoch})

#         # Save the best model
#         if average_val_loss < best_val_loss:
#             best_val_loss = average_val_loss
#             torch.save(model.state_dict(), save_path)
#             print(f"Best model saved with validation loss: {best_val_loss:.4f} at epoch {epoch + 1}")

#     print("Training complete.")
#     wandb.finish()

In [None]:
def train_autoencoder(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, num_epochs=10, device="cpu", save_path="best_model.pth"):
    """Evaluate ``model`` on ``val_dataloader`` once per epoch and keep the best epoch.

    The optimisation pass is currently disabled (it is kept below as
    commented-out code), so ``train_dataloader``, ``criterion``, ``optimizer``
    and ``scheduler`` are accepted but not used.

    For every epoch the model output is flattened and clamped to ``[0, 1]``,
    and the L1 distance to column 7 of the batch is measured after both are
    rescaled with the constants below. The per-batch metric is averaged,
    logged to wandb, and whenever it improves the weights are written to
    ``save_path``. The wandb run is finished before returning.

    Args:
        model: Module mapping a batch to one prediction per row.
        train_dataloader: Unused while the training pass is disabled.
        val_dataloader: Loader of 2-D batches with the target in column 7.
        criterion: Unused while the training pass is disabled.
        optimizer: Unused while the training pass is disabled.
        scheduler: Unused while the training pass is disabled.
        num_epochs: Number of evaluation passes.
        device: Device the model and the batches are moved to.
        save_path: File the best weights are saved to.

    Returns:
        A list with one numpy array of clamped predictions per batch, taken
        from the epoch with the lowest average metric (the one whose weights
        were saved). The list is empty when no epoch was recorded.
    """
    model.to(device)
    best_val_loss = float('inf')
    best_preds = []
#     std = 0.006515

    # Affine rescaling applied to predictions and targets before the metric
    # (presumably the min-max normalisation of the revenue - TODO confirm).
    revenue_offset = 7000
    revenue_scale = 789800000000 - revenue_offset

    for epoch in range(num_epochs):
        model.train()
        # Accumulators used only by the disabled training pass below.
        running_loss = 0.0
        running_metric = 0.0

#         for k, batch_data in enumerate(train_dataloader):

#             optimizer.zero_grad()
#             inputs = batch_data.to(device)

#             # Forward pass
#             recon_batch = model(inputs).flatten().clamp(0, 1)
#             reconstruction_loss = criterion((recon_batch * (789800000000-7000) + 7000), batch_data[:, 7] * (789800000000-7000) + 7000)
#             metric = F.l1_loss((recon_batch * (789800000000-7000) + 7000), batch_data[:, 7] * (789800000000-7000) + 7000)

#             if epoch > -1:
#                 if k % 100 == 0:
#                     print(f"Original revenue: {batch_data[0][7]}")
#                     print(f"Predicted revenue: {recon_batch[0]}")
#                     print(f"Error: {abs((batch_data[0][7]) - (recon_batch[0]))}")

#             # Total loss
#             loss = reconstruction_loss
#             running_loss += reconstruction_loss.item()
#             running_metric += metric.item()

#             # Backward pass and optimization
#             loss.backward()
#             scheduler.step()
#             optimizer.step()

#             if k % 100 == 0:
#                 print(f"Training - Epoch [{epoch + 1}/{num_epochs}], Step [{k}], Loss: {reconstruction_loss.item():.4f}")
#                 wandb.log({"Train MSE Loss": reconstruction_loss.item()})
#                 wandb.log({"Train L1 Metric": metric.item()})
#                 wandb.log({"Learning Rate": optimizer.param_groups[0]['lr']})

#         # Calculate average training loss for the epoch
#         average_train_loss = running_loss / len(train_dataloader)
#         average_train_metric = running_metric / len(train_dataloader)
#         print(f"Training - Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_train_loss:.4f}")
#         wandb.log({"Train Epoch Loss": average_train_loss, "Epoch": epoch})
#         wandb.log({"Train Epoch Metric": average_train_metric, "Epoch": epoch})

        # Validation
        model.eval()
        val_running_loss = 0.0
        # Collected per epoch so the result never mixes several passes.
        epoch_preds = []
        with torch.no_grad():
            for val_data in val_dataloader:

                inputs = val_data.to(device)

                # Forward pass
                recon_batch = model(inputs).flatten().clamp(0, 1)
                epoch_preds.append(recon_batch.detach().cpu().numpy())

                # The target is read from the batch that already lives on
                # `device`, so predictions and targets share a device.
                metric = F.l1_loss(
                    recon_batch * revenue_scale + revenue_offset,
                    inputs[:, 7] * revenue_scale + revenue_offset,
                )
                val_running_loss += metric.item()

        # Average metric for the epoch; an empty loader yields 0.0 instead of
        # a division by zero.
        average_val_loss = val_running_loss / max(len(val_dataloader), 1)
        print(f"Validation - Epoch [{epoch + 1}/{num_epochs}], Average Metric: {average_val_loss:.8f}")
        wandb.log({"Validation Epoch Metric": average_val_loss, "Epoch": epoch})

        # Save the best model and remember the predictions that belong to it
        if average_val_loss < best_val_loss:
            best_val_loss = average_val_loss
            best_preds = epoch_preds
            torch.save(model.state_dict(), save_path)
            print(f"Best model saved with validation loss: {best_val_loss:.4f} at epoch {epoch + 1}")

    print("Training complete.")
    wandb.finish()
    return best_preds

In [None]:
# import torch
# import numpy as np
# import matplotlib.pyplot as plt
# from torch.autograd import Variable

# def validate_autoencoder(model, val_dataloader, criterion, device="cpu"):
#     model.eval()
#     val_running_loss = 0.0
#     all_gradients = []  # List to store gradients for each batch
    
#     for k, val_data in enumerate(val_dataloader):
#         inputs = val_data.to(device)

#         # Forward pass
#         recon_batch = model(inputs)

#         reconstruction_loss = criterion(recon_batch, val_data[7])
#         val_running_loss += reconstruction_loss.item()

#         # Compute gradients for input features
#         input_data_tensor = Variable(val_data, requires_grad=True).to(device)
#         gradients = input_gradients(model, input_data_tensor)
#         all_gradients.append(gradients.numpy())

#     # Calculate average validation loss
#     average_val_loss = val_running_loss / len(val_dataloader)
#     print(f"Average Validation Loss: {average_val_loss:.8f}")

#     # Calculate average gradients over all batches
#     avg_gradients = np.mean(all_gradients, axis=0)

#     # Plot the average gradients
#     plot_gradients(avg_gradients)

# def plot_gradients(avg_gradients):
#     num_features = avg_gradients.shape[1]
#     feature_names = [f"Feature {i}" for i in range(1, num_features + 1)]

#     plt.bar(feature_names, avg_gradients)
#     plt.xlabel('Input Features')
#     plt.ylabel('Average Gradients')
#     plt.title('Average Gradients for Input Features')
#     plt.show()

# def input_gradients(model, input_data, target_index=0):
#     input_data = Variable(input_data, requires_grad=True)
#     model.eval()  # Set the model to evaluation mode

#     output = model(input_data)
    
#     # Choose a specific element or aggregation of elements from the output for gradient computation
#     target_value = output[0, target_index]  # You may need to adjust this based on your model's output shape

#     target_value.backward()
#     gradients = input_data.grad.data

#     # You can return gradients as a numpy array if needed
#     return gradients

In [None]:
# Width of one flattened input row.
input_dim = 1740
autoencoder = Autoencoder(input_dim, wandb.config.bottleneck_dim)
# autoencoder.load_state_dict(torch.load("/kaggle/input/models-eda/autoencoder_v16.pth"))
# Reuse the autoencoder's encoder under a prediction head; the head's input
# width must equal the encoder's output width, so both come from the config.
autoencoder = FullModel(wandb.config.bottleneck_dim, 1, autoencoder.encoder)
# map_location lets a checkpoint saved on a GPU load on a CPU-only host.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
autoencoder.load_state_dict(
    torch.load(
        "/kaggle/input/models-eda/N-L1_label_1_autoencoder_V0.5.pth",
        map_location="cpu",
    )
)
# Define training parameters
criterion = nn.MSELoss()
optimizer = AdamW(autoencoder.parameters(), lr=wandb.config.learning_rate)
scheduler = CycleScheduler(
    optimizer,
    wandb.config.learning_rate,
    n_iter=len(train_dataloader) * wandb.config.epochs,
    momentum=None,
    warmup_proportion=0.15,
)

# Run the loop with the test loader in the validation slot, so the returned
# predictions are for the test split.
y_val_pred = train_autoencoder(autoencoder, train_dataloader, test_dataloader, criterion, optimizer, scheduler, num_epochs=wandb.config.epochs, save_path="N-L1_label_1_autoencoder_V0.5.pth")

In [None]:
# Number of batches produced by the test loader.
len(test_dataloader)

In [None]:
# Flatten the per-batch prediction arrays into one list.
y_val_pred_f = []
for sample in y_val_pred:
    y_val_pred_f.extend(sample)
# One pass over the unshuffled test loader yields exactly len(test_data)
# predictions. Any surplus (e.g. repeated evaluation epochs) is dropped here,
# so the plots below use the same values as this shape check.
y_val_pred_f = y_val_pred_f[:len(test_data)]
np.array(y_val_pred_f).shape

In [None]:
# Reference values: the "estimated_revenue" field of every test record.
y_val = [record["estimated_revenue"] for record in test_data]
np.asarray(y_val).shape

In [None]:
import seaborn as sns

# Kernel density estimates of the reference and predicted values on one axes.
fig, ax = plt.subplots()
sns.kdeplot(y_val, label='True Values', fill=True, ax=ax)
sns.kdeplot(y_val_pred_f, label='Predicted Values', fill=True, ax=ax)
ax.set_title('True vs Predicted Values on Test - Label 1 ')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Normalised histograms of y_val and y_val_pred_f drawn on one axes.
# The two series use different bin counts (70 vs 10).
fig, ax = plt.subplots()
shared_style = dict(alpha=0.5, density=True, edgecolor='black')

ax.hist(y_val, label='True Values', bins=70, color='blue', **shared_style)
ax.hist(y_val_pred_f, label='Predicted Values', bins=10, color='orange', **shared_style)

ax.set_title('True vs Predicted Values on Test - Label 1 ')
ax.legend()
plt.show()


In [None]:
# input_dim = 1740
# # autoencoder = Autoencoder(input_dim, wandb.config.bottleneck_dim)
# # autoencoder.load_state_dict(torch.load("/kaggle/input/models-eda/autoencoder_v16.pth"))
# autoencoder = FullModel(256, 1, Encoder(input_dim, 256))
# autoencoder.load_state_dict(torch.load("/kaggle/input/models-eda/predictor_v19.pth"))
# validate_autoencoder(autoencoder, validation_dataloader, criterion)