# Nursery dataset
Was originally created to rank and evaluate nursery school applications. 
So an application with for example, a family that is financially stable, has good housing, and has no social or health problems would be classified as priority.
And applications that may involve severe financial, social, or health issues that make it highly unlikely for the application to be accepted, would be classified as

Dit weet ik niet zeker??..

So the ranking from best to worst is:

Very Recommended > Recommended > Priority > Special Priority > Not Recommended.

In [1]:
#!pip install -r ../requirements.txt

In [2]:
# ucimlrepo is a tool that provides easy access to datasets hosted on the UCI Machine Learning Repository
from ucimlrepo import fetch_ucirepo

# Fetch dataset from UCI repository, which has ud 76
nursery = fetch_ucirepo(id=76)

# Display metadata and variable information
print(nursery.metadata) # metadata 
print(nursery.variables) # variable information 
print("\n"+ "The first 5 rows of the dataset:")
print(nursery.data.features.head())  # Display first 5 rows of features
# Show the target variable, possibilities

# Show the target variable and its unique possibilities
unique_targets = nursery.data.targets['class'].unique()  # Access the 'class' column and get unique values
print("\nPossible target classes:")
print(unique_targets)  # Display unique target classes

{'uci_id': 76, 'name': 'Nursery', 'repository_url': 'https://archive.ics.uci.edu/dataset/76/nursery', 'data_url': 'https://archive.ics.uci.edu/static/public/76/data.csv', 'abstract': ' Nursery Database was derived from a hierarchical decision model originally developed to rank applications for nursery schools.', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 12960, 'num_features': 8, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1989, 'last_updated': 'Sun Jan 14 2024', 'dataset_doi': '10.24432/C5P88W', 'creators': ['Vladislav Rajkovic'], 'intro_paper': {'ID': 372, 'type': 'NATIVE', 'title': 'An application for admission in public school systems', 'authors': 'M. Olave, V. Rajkovic, M. Bohanec', 'venue': 'Expert Systems in Public Administration', 'year': 1989, 'journal': None, 'DOI': None, 

# Imports

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
# from model import Mamba, ModelArgs  # Import your custom Mamba implementation

# Preprocessing
The datta will be preprocessed, and converted into tensors

In [4]:
X = nursery.data.features  # These are all the feature columns in the dataset
Y = nursery.data.targets  # This is the target column in the dataset
print("\nOriginal  values (before encoding):")
print(X[1:4])  # Display a sample of the feature values
print(Y[1:4])  # Display a sample of the target values

# In case of future errors: Y = Y.values.ravel()  # Flatten Y to make it a 1D array if needed
label_encoder = LabelEncoder()  # Used to encode the categorical target variables into numerical values
X = X.apply(label_encoder.fit_transform)  # Encode the feature variables (X)
Y = label_encoder.fit_transform(Y)  # Encode the target variable (Y)

print("\nEncoded target values (after encoding):")
print(X[1:4])  # Display the encoded feature values
print(Y[1:4])  # Display the encoded target values

# Split into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Convert the train/test data into PyTorch tensors
# We must do this because PyTorch models only accept tensors as input
# Both the MambaClassifier and Mamba classes inherit from torch.nn.Module
# which is the base class for all neural network modules in PyTorch.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.long)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.long)

# Lets see how these tensors look like
print("\nSample of training data tensor:")
print(X_train_tensor[0:2])  # Display a sample of the training data tensor
print("\nSample of training target tensor:")
print(Y_train_tensor[0:2])  # Display a sample of the training target tensor
print("\nSample of testing data tensor:")
print(X_test_tensor[0:2])  # Display a sample of the testing data tensor
print("\nSample of testing target tensor:")
print(Y_test_tensor[0:2])  # Display a sample of the testing target tensor

# Create PyTorch datasets and data loaders
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# DataLoader to help in batch processing during model training/testing
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)


Original  values (before encoding):
  parents has_nurs      form children     housing     finance         social  \
1   usual   proper  complete        1  convenient  convenient        nonprob   
2   usual   proper  complete        1  convenient  convenient        nonprob   
3   usual   proper  complete        1  convenient  convenient  slightly_prob   

        health  
1     priority  
2    not_recom  
3  recommended  
       class
1   priority
2  not_recom
3  recommend

Encoded target values (after encoding):
   parents  has_nurs  form  children  housing  finance  social  health
1        2         3     0         0        0        0       0       1
2        2         3     0         0        0        0       0       0
3        2         3     0         0        0        0       2       2
[1 0 2]

Sample of training data tensor:
tensor([[2., 0., 2., 1., 1., 0., 2., 1.],
        [0., 4., 3., 3., 0., 0., 0., 0.]])

Sample of training target tensor:
tensor([3, 0])

Sample of testing da

  y = column_or_1d(y, warn=True)


# Defining the model

In [13]:
from __future__ import annotations
import math
import json
from typing import Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from einops import rearrange, repeat, einsum

from MAMBARIS.models.model import RMSNorm, ResidualBlock
@dataclass
class ModelArgs:
    d_model: int
    n_layer: int
    vocab_size: int
    d_state: int = 16
    expand: int = 2
    dt_rank: Union[int, str] = 'auto'
    d_conv: int = 4 
    pad_vocab_size_multiple: int = 8
    conv_bias: bool = True
    bias: bool = False
    
    def __post_init__(self):
        self.d_inner = int(self.expand * self.d_model)
        
        if self.dt_rank == 'auto':
            self.dt_rank = math.ceil(self.d_model / 16)
            
        if self.vocab_size % self.pad_vocab_size_multiple != 0:
            self.vocab_size += (self.pad_vocab_size_multiple
                                - self.vocab_size % self.pad_vocab_size_multiple)

class Mambasss(nn.Module):
    def __init__(self, args: ModelArgs):
        """Full Mamba model."""
        super().__init__()
        self.args = args
        
        self.embedding = nn.Embedding(args.vocab_size, args.d_model)
        self.layers = nn.ModuleList([ResidualBlock(args) for _ in range(args.n_layer)])
        self.norm_f = RMSNorm(args.d_model)

        self.lm_head = nn.Linear(args.d_model, args.vocab_size, bias=False)
        self.lm_head.weight = self.embedding.weight  # Tie output projection to embedding weights.
                                                     # See "Weight Tying" paper


    def forward(self, input_ids):
        """
        Args:
            input_ids (long tensor): shape (b, l)    (See Glossary at top for definitions of b, l, d_in, n...)
    
        Returns:
            logits: shape (b, l, vocab_size)

        Official Implementation:
            class MambaLMHeadModel, https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py#L173

        """
        x = self.embedding(input_ids)
        
        for layer in self.layers:
            x = layer(x)
            
        x = self.norm_f(x)
        logits = self.lm_head(x)

        return logits

# Define model parameters
input_size = X_train.shape[1]  # Number of input features
num_classes = len(nursery.data.targets['class'].unique())  # Number of target classes
d_model = 64
n_layer = 4

# Instantiate the Mamba model
model = Mambasss(input_size=input_size, num_classes=num_classes, d_model=d_model, n_layer=n_layer)

ModuleNotFoundError: No module named 'MAMBARIS'

In [5]:
# Custom MambaClassifier for tabular data
class MambaClassifier(nn.Module):
    def __init__(self, input_size, num_classes, d_model=64, n_layer=4):
        super(MambaClassifier, self).__init__()
        
        # Initial Linear Layer to map tabular data to a higher-dimensional space
        self.initial_fc = nn.Linear(input_size, d_model)

        # Simulate Mamba-like architecture using transformer blocks
        self.transformer_blocks = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model),
            nn.ReLU()
        )

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(d_model, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        # Pass through initial fully connected layer
        x = self.initial_fc(x)

        # Pass through transformer-like blocks
        x = self.transformer_blocks(x)

        # Classification head
        output = self.classifier(x)
        return output

# Define model parameters
input_size = X_train.shape[1]  # Number of input features
num_classes = len(nursery.data.targets['class'].unique())  # Number of target classes
d_model = 64
n_layer = 4

# Instantiate the Mamba model
model = MambaClassifier(input_size=input_size, num_classes=num_classes, d_model=d_model, n_layer=n_layer)

In [10]:
from einops import rearrange, repeat
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
class Mamba(nn.Module):
    def __init__(
        self,
        d_model,
        d_state=16,
        d_conv=4,
        expand=2,
        dt_rank="auto",
        dt_min=0.001,
        dt_max=0.1,
        dt_init="random",
        dt_scale=1.0,
        dt_init_floor=1e-4,
        conv_bias=True,
        bias=False,
        use_fast_path=True,  # Fused kernel options
        layer_idx=None,
        device=None,
        dtype=None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.d_model = d_model
        self.d_state = d_state
        self.d_conv = d_conv
        self.expand = expand
        self.d_inner = int(self.expand * self.d_model)
        self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
        self.use_fast_path = use_fast_path
        self.layer_idx = layer_idx

        self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)

        self.conv1d = nn.Conv1d(
            in_channels=self.d_inner,
            out_channels=self.d_inner,
            bias=conv_bias,
            kernel_size=d_conv,
            groups=self.d_inner,
            padding=d_conv - 1,
            **factory_kwargs,
        )

        self.activation = "silu"
        self.act = nn.SiLU()

        self.x_proj = nn.Linear(
            self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
        )
        self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)

        # Initialize special dt projection to preserve variance at initialization
        dt_init_std = self.dt_rank**-0.5 * dt_scale
        if dt_init == "constant":
            nn.init.constant_(self.dt_proj.weight, dt_init_std)
        elif dt_init == "random":
            nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
        else:
            raise NotImplementedError

        # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
        dt = torch.exp(
            torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
            + math.log(dt_min)
        ).clamp(min=dt_init_floor)
        # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
        inv_dt = dt + torch.log(-torch.expm1(-dt))
        with torch.no_grad():
            self.dt_proj.bias.copy_(inv_dt)
        # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
        self.dt_proj.bias._no_reinit = True

        # S4D real initialization
        A = repeat(
            torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
            "n -> d n",
            d=self.d_inner,
        ).contiguous()
        A_log = torch.log(A)  # Keep A_log in fp32
        self.A_log = nn.Parameter(A_log)
        self.A_log._no_weight_decay = True

        # D "skip" parameter
        self.D = nn.Parameter(torch.ones(self.d_inner, device=device))  # Keep in fp32
        self.D._no_weight_decay = True

        self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)

    def forward(self, hidden_states, inference_params=None):
        """
        hidden_states: (B, L, D)
        Returns: same shape as hidden_states
        """
        batch, seqlen, dim = hidden_states.shape

        conv_state, ssm_state = None, None
        if inference_params is not None:
            conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
            if inference_params.seqlen_offset > 0:
                # The states are updated inplace
                out, _, _ = self.step(hidden_states, conv_state, ssm_state)
                return out

        # We do matmul and transpose BLH -> HBL at the same time
        xz = rearrange(
            self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"),
            "d (b l) -> b d l",
            l=seqlen,
        )
        if self.in_proj.bias is not None:
            xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1")

        A = -torch.exp(self.A_log.float())  # (d_inner, d_state)
        # In the backward pass we write dx and dz next to each other to avoid torch.cat
        if self.use_fast_path and causal_conv1d_fn is not None and inference_params is None:  # Doesn't support outputting the states
            out = mamba_inner_fn(
                xz,
                self.conv1d.weight,
                self.conv1d.bias,
                self.x_proj.weight,
                self.dt_proj.weight,
                self.out_proj.weight,
                self.out_proj.bias,
                A,
                None,  # input-dependent B
                None,  # input-dependent C
                self.D.float(),
                delta_bias=self.dt_proj.bias.float(),
                delta_softplus=True,
            )
        else:
            x, z = xz.chunk(2, dim=1)
            # Compute short convolution
            if conv_state is not None:
                # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
                # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
                conv_state.copy_(F.pad(x, (self.d_conv - x.shape[-1], 0)))  # Update state (B D W)
            if causal_conv1d_fn is None:
                x = self.act(self.conv1d(x)[..., :seqlen])
            else:
                assert self.activation in ["silu", "swish"]
                x = causal_conv1d_fn(
                    x=x,
                    weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
                    bias=self.conv1d.bias,
                    activation=self.activation,
                )

            # We're careful here about the layout, to avoid extra transposes.
            # We want dt to have d as the slowest moving dimension
            # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
            x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d"))  # (bl d)
            dt, B, C = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1)
            dt = self.dt_proj.weight @ dt.t()
            dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
            B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
            C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
            assert self.activation in ["silu", "swish"]
            y = selective_scan_fn(
                x,
                dt,
                A,
                B,
                C,
                self.D.float(),
                z=z,
                delta_bias=self.dt_proj.bias.float(),
                delta_softplus=True,
                return_last_state=ssm_state is not None,
            )
            if ssm_state is not None:
                y, last_state = y
                ssm_state.copy_(last_state)
            y = rearrange(y, "b d l -> b l d")
            out = self.out_proj(y)
        return out
    
# Define model parameters
input_size = X_train.shape[1]  # Number of input features
num_classes = len(nursery.data.targets['class'].unique())  # Number of target classes
d_model = 64
n_layer = 4

# Instantiate the Mamba model
model = MambaClassifier(input_size=input_size, num_classes=num_classes, d_model=d_model, n_layer=n_layer)

# Training MAMBA on Nursery

In [6]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Set the device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

# Define model parameters
input_size = X_train.shape[1]  # Number of input features
num_classes = len(nursery.data.targets['class'].unique())  # Number of target classes
d_model = 64
n_layer = 4

# Instantiate the Mamba model
model = MambaClassifier(input_size=input_size, num_classes=num_classes, d_model=d_model, n_layer=n_layer)

Epoch [1/10], Loss: 1.3713
Epoch [2/10], Loss: 0.7990
Epoch [3/10], Loss: 0.5517
Epoch [4/10], Loss: 0.4732
Epoch [5/10], Loss: 0.4202
Epoch [6/10], Loss: 0.3816
Epoch [7/10], Loss: 0.3568
Epoch [8/10], Loss: 0.3380
Epoch [9/10], Loss: 0.3209
Epoch [10/10], Loss: 0.3061


# Evaluating the model

In [7]:
# Switch to evaluation mode
model.eval()
y_pred = []
y_true = []

# Test the model
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy on the test set: {accuracy:.4f}')

Accuracy on the test set: 0.8688
