## Import libraries

This notebook reproduces the exact architecture of GrowNet, following the Kaggle notebook implementation.

In [None]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, PowerTransformer, QuantileTransformer, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import time

import warnings
warnings.filterwarnings('ignore')

## Parameters

In [70]:
# Hyperparameters shared by the model-building and training cells below.
params = dict(
    # --- dimensions ---
    feat_d=200,              # input width of a stage-0 weak learner
    hidden_size=128,         # hidden width handed to every weak learner
    n_classes=7,             # width of the one-hot target
    # --- boosting ---
    num_nets=20,             # number of boosting stages (weak learners)
    boost_rate=0.05,         # initial value of the ensemble's boost rate
    # --- optimisation ---
    lr=1e-3,                 # starting learning rate
    weight_decay=1e-5,       # weight decay passed to the optimizer
    batch_size=128,          # DataLoader batch size
    epochs_per_stage=10,     # epochs spent fitting each new weak learner
    correct_epoch=3,         # epochs of the corrective step after each stage
    early_stopping_steps=10, # tolerated stages without validation improvement
)


In [4]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    the current CUDA device), exports ``PYTHONHASHSEED`` to the environment
    and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

## Preprocessing

In [9]:
# Function to process the data to feed it into the neural network
def process_data(data_path):
    """Load a CSV and derive encoded targets and a float32 feature matrix.

    The CSV must contain the columns 'city', 'continent', 'latitude' and
    'longitude'. Every column that is not listed in ``non_feature_columns``
    below is treated as an input feature.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        A tuple ``(in_data, X, y, le_continent, le_city, coordinate_scaler,
        continent_encoding_map, city_encoding_map)`` where

        * ``in_data`` is the loaded DataFrame with the derived columns added,
        * ``X`` is the float32 feature array,
        * ``y`` is a float32 array with columns
          ``[continent_encoding, city_encoding, scaled_x, scaled_y, scaled_z]``,
        * the two label encoders and the xyz scaler are the fitted objects,
        * the two maps translate integer codes back to the original labels.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
    """
    try:
        in_data = pd.read_csv(data_path)
    except FileNotFoundError as err:
        # Printing and continuing would only fail later with an unrelated
        # "in_data is not defined" error, so surface the real problem here.
        raise FileNotFoundError(f"Error: File not found at {data_path}") from err

    # Initialize label encoders and scalers
    le_continent = LabelEncoder()
    le_city = LabelEncoder()
    stdscaler_lat = StandardScaler()
    stdscaler_long = StandardScaler()
    coordinate_scaler = StandardScaler()

    # Convert all the categorical variables into numbers
    in_data['city_encoding'] = le_city.fit_transform(in_data['city'])
    in_data['continent_encoding'] = le_continent.fit_transform(in_data['continent'])
    in_data['lat_scaled'] = stdscaler_lat.fit_transform(in_data[['latitude']])
    in_data['long_scaled'] = stdscaler_long.fit_transform(in_data[['longitude']])

    # Another way of scaling latitude and longitude data to avoid the exploding gradient problem.
    # https://datascience.stackexchange.com/questions/13567/ways-to-deal-with-longitude-latitude-feature
    # Convert latitude and longitude into radians
    in_data['latitude_rad'] = np.deg2rad(in_data['latitude'])
    in_data['longitude_rad'] = np.deg2rad(in_data['longitude'])

    # Calculate x, y, z coordinates - converting spherical coordinates into cartesian coordinates
    in_data['x'] = np.cos(in_data['latitude_rad']) * np.cos(in_data['longitude_rad'])
    in_data['y'] = np.cos(in_data['latitude_rad']) * np.sin(in_data['longitude_rad'])
    in_data['z'] = np.sin(in_data['latitude_rad'])

    # Scale the x, y, z coordinates together
    in_data[['scaled_x', 'scaled_y', 'scaled_z']] = coordinate_scaler.fit_transform(in_data[['x', 'y', 'z']])

    # Encoding dictionaries for simpler plotting and understanding of the results
    continent_encoding_map = dict(zip(le_continent.transform(le_continent.classes_), le_continent.classes_))
    city_encoding_map = dict(zip(le_city.transform(le_city.classes_), le_city.classes_))

    # Define all non-feature columns
    non_feature_columns = [
        'city', 'continent', 'latitude', 'longitude',  # Original identifier/target columns
        'city_encoding', 'continent_encoding',         # Encoded categorical targets
        'lat_scaled', 'long_scaled',                   # Standard-scaled lat/long (not used as features)
        'latitude_rad', 'longitude_rad',               # Intermediate radian values
        'x', 'y', 'z',                                 # Intermediate cartesian coordinates
        'scaled_x', 'scaled_y', 'scaled_z',            # Final XYZ targets
        'Unnamed: 0',                                  # Index column left over from a CSV export, if present
    ]

    # Select X by dropping non-feature columns.
    # errors='ignore' tolerates columns that do not exist (e.g. no 'Unnamed: 0').
    X = in_data.drop(columns=non_feature_columns, errors='ignore').values.astype(np.float32)

    # Define target columns explicitly
    y_columns = ['continent_encoding', 'city_encoding', 'scaled_x', 'scaled_y', 'scaled_z']
    y = in_data[y_columns].values.astype(np.float32)

    return in_data, X, y, le_continent, le_city, coordinate_scaler, continent_encoding_map, city_encoding_map

# Inverse transform xyz coordinates into latitude and longitude values
def inverse_transform_spherical(scaled_xyz, coordinate_scaler):
    """Map scaled cartesian (x, y, z) rows back to latitude/longitude in degrees.

    Args:
        scaled_xyz: 2-D array whose three columns are the scaled x, y, z values.
        coordinate_scaler: Object exposing ``inverse_transform`` that undoes the
            scaling applied to the xyz columns.

    Returns:
        Tuple ``(latitude_deg, longitude_deg)`` of arrays, one value per row.
    """
    unscaled = coordinate_scaler.inverse_transform(scaled_xyz)
    x_comp, y_comp, z_comp = unscaled[:, 0], unscaled[:, 1], unscaled[:, 2]
    # z is clipped to [-1, 1] before arcsin so values slightly outside the
    # valid domain do not produce NaN.
    lat_deg = np.degrees(np.arcsin(np.clip(z_comp, -1, 1)))
    lon_deg = np.degrees(np.arctan2(y_comp, x_comp))
    return lat_deg, lon_deg


In [11]:
# Load the MetaSUB training/testing table.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv("/home/chandru/binp37/results/metasub/metasub_training_testing_data.csv")
# Keep every column except the last four, then append 'continent' as the final column.
df = pd.concat([df.iloc[:,:-4],df['continent']],axis=1)
# Features: all columns except the last one.
x_data = df[df.columns[:-1]][:].to_numpy()
print(x_data.shape)
# Target: the last column ('continent'), integer-encoded just below.
y_data = df[df.columns[-1]][:].to_numpy()
le = LabelEncoder()
y_data = le.fit_transform(y_data)
print(le.classes_)

# Map each integer code back to its continent label.
continent_encoding_map = dict(zip(le.transform(le.classes_), le.classes_))
print(continent_encoding_map)

(4070, 200)
['east_asia' 'europe' 'middle_east' 'north_america' 'oceania'
 'south_america' 'sub_saharan_africa']
{np.int64(0): 'east_asia', np.int64(1): 'europe', np.int64(2): 'middle_east', np.int64(3): 'north_america', np.int64(4): 'oceania', np.int64(5): 'south_america', np.int64(6): 'sub_saharan_africa'}


## Dataset Classes

In [12]:
class TrainDataset:
    """Map-style dataset pairing one feature row with one target row.

    ``features`` and ``targets`` are indexed as ``array[idx, :]`` and the
    number of samples is taken from ``features.shape[0]``.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        n_samples = self.features.shape[0]
        return n_samples

    def __getitem__(self,idx):
        """Return ``{'x': features[idx], 'y': targets[idx]}`` as float tensors."""
        feature_row = self.features[idx, :]
        target_row = self.targets[idx, :]
        return {
            'x': torch.tensor(feature_row, dtype=torch.float),
            'y': torch.tensor(target_row, dtype=torch.float),
        }

class TestDataset:
    """Map-style dataset that yields feature rows only (no targets).

    ``features`` is indexed as ``features[idx, :]`` and the number of samples
    is taken from ``features.shape[0]``.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_samples = self.features.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return ``{'x': features[idx]}`` as a float tensor."""
        feature_row = self.features[idx, :]
        return {'x': torch.tensor(feature_row, dtype=torch.float)}

## Dynamic Model

In [55]:
from enum import Enum
class ForwardType(Enum):
    """Enumeration of forward-pass variants.

    NOTE(review): not referenced anywhere else in the visible part of this
    notebook. The meaning of each member is presumably inherited from the
    GrowNet reference code — confirm before relying on it.
    """
    SIMPLE = 0
    STACKED = 1
    CASCADE = 2
    GRADIENT = 3

class DynamicNet(object):
    """Boosted ensemble of weak learners (GrowNet-style).

    Each weak learner is called as ``model(x, lower_f)`` and must return
    ``(middle_features, prediction)``. The middle features of one learner are
    fed to the next one as ``lower_f``; the predictions of all learners are
    summed and combined as ``c0 + boost_rate * sum(predictions)``.

    Attributes:
        models: List of weak learners, in the order they were added.
        c0: Tensor holding the constant initial prediction.
        lr: Initial value of the boost rate.
        boost_rate: Trainable scalar ``nn.Parameter`` scaling the summed
            predictions.
    """

    def __init__(self, c0, lr):
        self.models = []
        self.c0 = c0
        self.lr = lr
        self.boost_rate  = nn.Parameter(torch.tensor(lr, requires_grad=True))

    def to(self, device):
        """Move ``c0``, ``boost_rate`` and every weak learner to ``device``.

        Returns:
            ``self``, so the call can be chained.
        """
        self.c0 = self.c0.to(device)
        # Swap the parameter's storage instead of rebinding the attribute to
        # the result of Parameter.to(): after a real device move that result
        # is a non-leaf tensor, which optimizers refuse to optimize.
        self.boost_rate.data = self.boost_rate.data.to(device)
        if self.boost_rate.grad is not None:
            self.boost_rate.grad.data = self.boost_rate.grad.data.to(device)
        for m in self.models:
            m.to(device)
        return self

    def add(self, model):
        """Append a trained weak learner to the ensemble."""
        self.models.append(model)

    def parameters(self):
        """Return a list with every weak learner's parameters plus ``boost_rate``."""
        params = []
        for m in self.models:
            params.extend(m.parameters())

        params.append(self.boost_rate)
        return params

    def zero_grad(self):
        """Clear the gradients of every weak learner and of ``boost_rate``."""
        for m in self.models:
            m.zero_grad()
        # boost_rate is exposed by parameters(), so reset it here as well.
        self.boost_rate.grad = None

    def to_cuda(self):
        """Move the whole ensemble (including ``c0`` and ``boost_rate``) to CUDA."""
        self.to("cuda")

    def to_eval(self):
        """Put every weak learner in evaluation mode."""
        for m in self.models:
            m.eval()

    def to_train(self):
        """Put every weak learner in training mode."""
        for m in self.models:
            m.train(True)

    def _initial_prediction(self, x):
        """Repeat ``c0`` once per row of ``x`` as float32, on the device of ``x``."""
        base = self.c0.detach().reshape(1, -1).to(device=x.device, dtype=torch.float32)
        return base.repeat(x.shape[0], 1)

    def _sum_models(self, x):
        """Chain the weak learners; return the last middle features and the summed prediction."""
        middle_feat_cum = None
        prediction = None
        for m in self.models:
            middle_feat_cum, pred = m(x, middle_feat_cum)
            # Out-of-place add keeps every learner's own output tensor intact.
            prediction = pred if prediction is None else prediction + pred
        return middle_feat_cum, prediction

    def forward(self, x):
        """Ensemble prediction with the weak learners evaluated under ``no_grad``.

        Returns:
            ``(middle_features, prediction)``; ``middle_features`` is ``None``
            while the ensemble has no weak learner.
        """
        if len(self.models) == 0:
            return None, self._initial_prediction(x)
        with torch.no_grad():
            middle_feat_cum, prediction = self._sum_models(x)
        return middle_feat_cum, self.c0 + self.boost_rate * prediction

    def forward_grad(self, x):
        """Same as :meth:`forward` but keeps the autograd graph of the weak learners."""
        if len(self.models) == 0:
            return None, self._initial_prediction(x)
        # at least one model
        middle_feat_cum, prediction = self._sum_models(x)
        return middle_feat_cum, self.c0 + self.boost_rate * prediction

    @classmethod
    def from_file(cls, path, builder):
        """Rebuild an ensemble saved by :meth:`to_file`.

        Args:
            path: File written by :meth:`to_file`.
            builder: Callable ``builder(stage)`` returning an uninitialised weak
                learner for the given stage index; its state dict is loaded
                from the file.
        """
        d = torch.load(path)
        net = cls(d['c0'], d['lr'])
        net.boost_rate = d['boost_rate']
        for stage, m in enumerate(d['models']):
            submod = builder(stage)
            submod.load_state_dict(m)
            net.add(submod)
        return net

    def to_file(self, path):
        """Save the weak learners' state dicts together with ``c0``, ``lr`` and ``boost_rate``."""
        models = [m.state_dict() for m in self.models]
        d = {'models': models, 'c0': self.c0, 'lr': self.lr, 'boost_rate': self.boost_rate}
        torch.save(d, path)

## Weak Models

In [56]:
class MLP_1HL(nn.Module):
    """Weak learner with one hidden layer.

    ``layer1`` (Linear -> BatchNorm -> ReLU -> Dropout) produces the hidden
    features that are handed to the next boosting stage; ``layer2`` is a
    single Linear layer mapping those features to the prediction.

    Args:
        dim_in: Width of the input (features, plus the previous stage's hidden
            features when they are concatenated).
        dim_hidden1: Width of the hidden layer.
        dim_out: Width of the prediction.
    """

    def __init__(self, dim_in, dim_hidden1, dim_out):
        super(MLP_1HL, self).__init__()

        # Layer 1: Input -> Hidden
        self.layer1 = nn.Sequential(
            nn.Linear(dim_in,dim_hidden1),
            nn.BatchNorm1d(dim_hidden1),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Layer 2: Hidden -> Output
        self.layer2 = nn.Sequential(
           nn.Linear(dim_hidden1,dim_out)
        )

    def forward(self, x, lower_f):
        """Return ``(hidden_features, prediction)``.

        Args:
            x: Input features.
            lower_f: Hidden features of the previous stage, concatenated to
                ``x`` along dim 1, or ``None`` for the first stage.
        """
        if lower_f is not None:
            x = torch.cat([x, lower_f], dim=1)
        out = self.layer1(x)
        return out, self.layer2(out)

    @classmethod
    def get_model(cls, stage, params):
        """Build the weak learner for boosting stage ``stage``.

        Stage 0 sees only the ``feat_d`` raw features; later stages also
        receive the previous stage's ``hidden_size`` hidden features.

        The prediction width is ``params["n_classes"]`` so the output can be
        added to the ensemble's per-class prediction. When that key is absent
        the previous behaviour (``hidden_size``) is kept.
        """
        if stage == 0:
            dim_in = params["feat_d"]
        else:
            dim_in = params["feat_d"] + params["hidden_size"]
        dim_out = params["n_classes"] if "n_classes" in params else params["hidden_size"]
        model = cls(dim_in, params["hidden_size"], dim_out)
        return model


class MLP_2HL(nn.Module):
    """Weak learner with two hidden layers.

    ``layer1`` maps the input to the middle features that are handed to the
    next boosting stage; ``layer2`` (ReLU -> Linear) maps them to the
    prediction.

    Args:
        dim_in: Width of the input (features, plus the previous stage's middle
            features when they are concatenated).
        dim_hidden1: Width of the first hidden layer.
        dim_hidden2: Width of the second hidden layer (the middle features).
        dim_out: Width of the prediction.
        sparse: Accepted for signature compatibility; not used.
        bn: Accepted for signature compatibility; not used.
    """

    def __init__(self, dim_in, dim_hidden1, dim_hidden2,dim_out=7, sparse=False, bn=True):
        super(MLP_2HL, self).__init__()
        # Normalises the concatenated [x, lower_f] input; only applied when
        # lower_f is given (see forward).
        self.bn2 = nn.BatchNorm1d(dim_in)

        self.layer1 = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(dim_in, dim_hidden1),
            nn.ReLU(),
            nn.BatchNorm1d(dim_hidden1),
            nn.Dropout(0.4),
            nn.Linear(dim_hidden1, dim_hidden2)
        )
        self.layer2 = nn.Sequential(
            nn.ReLU(),
            nn.Linear(dim_hidden2, dim_out)
        )

    def forward(self, x, lower_f):
        """Return ``(middle_features, prediction)``.

        Args:
            x: Input features.
            lower_f: Middle features of the previous stage, concatenated to
                ``x`` along dim 1 and batch-normalised, or ``None`` for the
                first stage (in which case ``x`` is used as is).
        """
        if lower_f is not None:
            x = torch.cat([x, lower_f], dim=1)
            x = self.bn2(x)

        middle_feat = self.layer1(x)
        out = self.layer2(middle_feat)
        return middle_feat, out

    @classmethod
    def get_model(cls, stage, params):
        """Build the weak learner for boosting stage ``stage``.

        Stage 0 sees only the ``feat_d`` raw features; later stages also
        receive the previous stage's ``hidden_size`` middle features.

        The prediction width follows ``params["n_classes"]`` instead of the
        constructor's hard-coded default; 7 is kept as the fallback when the
        key is absent.
        """
        if stage == 0:
            dim_in = params["feat_d"]
        else:
            dim_in = params["feat_d"] + params["hidden_size"]
        dim_out = params["n_classes"] if "n_classes" in params else 7
        model = cls(dim_in, params["hidden_size"], params["hidden_size"], dim_out)
        return model


In [57]:
from torch.nn.modules.loss import _WeightedLoss
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are smoothed as ``t * (1 - smoothing) + 0.5 * smoothing`` before
    the element-wise loss is computed.

    Args:
        weight: Optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'`` or ``'sum'`` reduce the element-wise loss; any
            other value returns the unreduced loss tensor.
        smoothing: Smoothing factor in ``[0, 1)``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Pull targets towards 0.5 by ``smoothing``; ``n_labels`` is unused."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Return the smoothed BCE-with-logits loss, reduced per ``self.reduction``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss. The functional defaults to
        # reduction='mean', which made the branches below no-ops and turned
        # 'sum' into a mean.
        loss = F.binary_cross_entropy_with_logits(inputs, targets,
            weight=self.weight, reduction='none')

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss

In [58]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [59]:
def get_optim(params, lr, weight_decay):
    """Create the Adam optimizer used for both the stage-wise and corrective steps.

    Args:
        params: Iterable of parameters to optimize.
        lr: Learning rate.
        weight_decay: Weight-decay coefficient.

    Returns:
        A ``torch.optim.Adam`` instance over ``params``.
    """
    # SGD with the same arguments was the alternative tried here.
    return optim.Adam(params, lr, weight_decay=weight_decay)

def logloss(net_ensemble, test_loader):
    """Average BCE-with-logits loss of the ensemble over a data loader.

    Each batch contributes its mean loss; the result is the mean of those
    per-batch means.

    Args:
        net_ensemble: Object whose ``forward(x)`` returns ``(features, logits)``.
        test_loader: Iterable of batches shaped like ``{"x": ..., "y": ...}``.

    Returns:
        The averaged loss (a tensor when the loader yields at least one batch).
    """
    criterion = nn.BCEWithLogitsLoss()  # mean reduction by default
    running_loss, n_batches = 0, 0
    for batch in test_loader:
        features = batch["x"].to(device)
        labels = batch["y"].to(device)
        # Predictions only: no autograd graph is needed for evaluation.
        with torch.no_grad():
            logits = net_ensemble.forward(features)[1]
        running_loss = running_loss + criterion(logits, labels)
        n_batches += 1

    return running_loss / n_batches

## Training

In [61]:
# One-hot encode the integer labels: row i of the identity matrix is picked for class i.
y_onehot = np.eye(params["n_classes"])[y_data]

# Split into train and validation sets (80/20, stratified on the one-hot rows)
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_onehot,
    test_size=0.2,
    random_state=42,
    stratify=y_onehot,
)

# Wrap both splits; the held-out part serves as the validation set.
train_ds, val_ds = TrainDataset(X_train, y_train), TrainDataset(X_test, y_test)

# Only the training batches are shuffled.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=params["batch_size"])
val_loader = DataLoader(val_ds, shuffle=False, batch_size=params["batch_size"])

print(f"Train size: {len(train_ds)}, Val size: {len(val_ds)}")

Train size: 3256, Val size: 814


In [62]:
# Initialize GrowNet
# c0 is the constant starting prediction: the log of each class's frequency in
# the training targets, shaped (1, n_classes).
c0 = torch.tensor(np.log(np.mean(y_train, axis=0)), dtype=torch.float).unsqueeze(0).to(device)

net_ensemble = DynamicNet(c0, params["boost_rate"])
net_ensemble.to(device)

# Stage-wise fitting weights the element-wise squared error itself, hence no reduction.
loss_stagewise = nn.MSELoss(reduction="none")
# "none" is the valid spelling of the unreduced mode; the training loop
# averages the returned loss itself.
loss_corrective = SmoothBCEwLogits(smoothing=0.001, reduction="none")

# Early-stopping bookkeeping
best_val_loss = float("inf")
best_stage = 0
early_stop = 0
lr = params["lr"]

print("Initial Logloss:", logloss(net_ensemble,val_loader).item())


Initial Logloss: 0.33429235219955444


In [None]:
# Boosting loop: each stage fits one new weak learner to the second-order
# (Newton) direction of the loss, appends it to the ensemble, runs a
# corrective step over the whole ensemble and then validates.
for stage in range(params["num_nets"]):
    t0 = time.time()

    # Single quotes inside the f-string keep this valid before Python 3.12.
    print(f"\n Training weak learner {stage+1}/{params['num_nets']}")

    model = MLP_2HL.get_model(stage,params).to(device)
    optimizer = get_optim(model.parameters(), lr, params["weight_decay"])
    net_ensemble.to_train()

    stage_train_losses = []

    for epoch in range(params["epochs_per_stage"]):
        for batch in train_loader:
            x = batch["x"].to(device)
            y = batch["y"].to(device)

            with torch.no_grad():
                # One no-grad pass yields both the current ensemble output and
                # the features fed to the new learner (None while the ensemble
                # is empty), so the two are always consistent.
                middle_prev, out_prev = net_ensemble.forward(x)
                # The Newton formulas below are written for labels in {-1, +1};
                # the targets here are one-hot {0, 1}, so map them first.
                y_signed = 2 * y - 1
                h = 1 / ((1 + torch.exp(y_signed * out_prev)) * (1 + torch.exp(-y_signed * out_prev)))
                grad_direction = y_signed * (1 + torch.exp(-y_signed * out_prev))

            middle_feat, out = model(x, middle_prev)
            # Hessian-weighted squared error between the scaled output and the Newton target.
            loss = loss_stagewise(net_ensemble.boost_rate*out, grad_direction)
            loss = (loss*h).mean()

            model.zero_grad()
            loss.backward()
            optimizer.step()
            stage_train_losses.append(loss.item())

    net_ensemble.add(model)
    avg_stage_loss = np.mean(stage_train_losses)
    print(f"Stage {stage+1} finished | Avg Train Loss: {avg_stage_loss:.5f} | Time: {time.time() - t0:.1f}s")


    # Corrective step: fine-tune every weak learner and the boost rate jointly
    if stage > 0:
        # Halve the learning rate every third stage.
        if stage % 3 == 0:
            lr /= 2
        corrective_optimizer = get_optim(net_ensemble.parameters(), lr/2, params["weight_decay"])
        corrective_losses = []

        for _ in range(params["correct_epoch"]):
            for batch in train_loader:
                x = batch["x"].to(device)
                y = batch["y"].to(device)

                _, out = net_ensemble.forward_grad(x)
                loss = loss_corrective(out,y).mean()
                corrective_optimizer.zero_grad()
                loss.backward()
                corrective_optimizer.step()
                corrective_losses.append(loss.item())
        print(f"Fully corrective step avg losse: {np.mean(corrective_losses):.3f}")

    # Validation
    # Evaluate with dropout disabled and batch-norm running statistics; the
    # next stage switches back to training mode via to_train().
    net_ensemble.to_eval()
    val_loss = logloss(net_ensemble, val_loader).item()
    print(f"Validation LogLoss: {val_loss:.5f} | Boost rate: {net_ensemble.boost_rate.item():.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_stage = stage
        early_stop = 0
    else:
        early_stop += 1
        if early_stop > params["early_stopping_steps"]:
            print("🛑 Early stopping!")
            break

print(f"\nBest model was at stage {best_stage+1} with Val LogLoss: {best_val_loss:.5f}")



 Training weak learner 1/5


KeyboardInterrupt: 