In [1]:
import math

import torch
from datasets import load_dataset
from fastai.tabular.all import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

In [2]:
import datasets

# Suppress logs to keep things tidy
datasets.logging.set_verbosity_error()

In [3]:
nsubjet_ds = load_dataset("dl4phys/top_tagging_nsubjettiness")
df = nsubjet_ds["train"].to_pandas()
df.head()

Unnamed: 0,pT,mass,tau_1_0.5,tau_1_1,tau_1_2,tau_2_0.5,tau_2_1,tau_2_2,tau_3_0.5,tau_3_1,...,tau_4_0.5,tau_4_1,tau_4_2,tau_5_0.5,tau_5_1,tau_5_2,tau_6_0.5,tau_6_1,tau_6_2,label
0,543.633944,25.846792,0.165122,0.032661,0.002262,0.04883,0.003711,4.4e-05,0.030994,0.00163,...,0.024336,0.001115,8e-06,0.004252,0.000234,7.706005e-07,0.0,0.0,0.0,0
1,452.41186,13.388679,0.162938,0.027598,0.000876,0.095902,0.015461,0.000506,0.07975,0.009733,...,0.056854,0.005454,7.2e-05,0.044211,0.00443,6.175314e-05,0.037458,0.003396,3.670517e-05,0
2,429.495258,32.021091,0.244436,0.065901,0.005557,0.155202,0.038807,0.002762,0.123285,0.025339,...,0.078205,0.012678,0.000567,0.052374,0.005935,9.395772e-05,0.037572,0.002932,2.237277e-05,0
3,512.675443,6.684734,0.10258,0.011369,0.00017,0.086306,0.00776,7.1e-05,0.068169,0.005386,...,0.044705,0.002376,8e-06,0.027895,0.001364,4.400042e-06,0.009012,0.000379,6.731099e-07,0
4,527.956859,133.985415,0.407009,0.191839,0.065169,0.29146,0.105479,0.029753,0.209341,0.049187,...,0.143768,0.033249,0.003689,0.135407,0.029054,0.00259346,0.110805,0.023179,0.002202088,0


In [4]:
train_df, valid_df = train_test_split(df, random_state=42)
train_df.shape, valid_df.shape

((908250, 21), (302750, 21))

In [5]:
train_df.describe()

Unnamed: 0,pT,mass,tau_1_0.5,tau_1_1,tau_1_2,tau_2_0.5,tau_2_1,tau_2_2,tau_3_0.5,tau_3_1,...,tau_4_0.5,tau_4_1,tau_4_2,tau_5_0.5,tau_5_1,tau_5_2,tau_6_0.5,tau_6_1,tau_6_2,label
count,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,...,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0,908250.0
mean,487.107393,88.09052,0.366716,0.198446,0.319559,0.222759,0.079243,0.072535,0.148137,0.035372,...,0.112024,0.02215,0.00867,0.0884,0.015329,0.004875,0.070679,0.011019,0.002914,0.500366
std,48.568267,48.393646,0.186922,0.339542,2.003898,0.110955,0.125155,0.674091,0.072627,0.051869,...,0.059393,0.032004,0.155468,0.051949,0.022866,0.107641,0.046571,0.017133,0.078247,0.5
min,225.490387,-0.433573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,452.879289,39.958178,0.224456,0.058381,0.006443,0.139269,0.025638,0.001565,0.094603,0.013308,...,0.069037,0.007949,0.000188,0.051012,0.004936,7.9e-05,0.036142,0.002977,3.3e-05,0.0
50%,485.89405,99.887418,0.380172,0.166016,0.045887,0.222763,0.061597,0.008788,0.14881,0.028501,...,0.11022,0.017609,0.000787,0.086045,0.011755,0.000387,0.067797,0.008028,0.000193,1.0
75%,520.506446,126.518545,0.477122,0.24055,0.074417,0.299708,0.108207,0.022441,0.196156,0.046588,...,0.151137,0.02999,0.002006,0.121905,0.021089,0.001103,0.100437,0.015359,0.000635,1.0
max,647.493145,299.211555,2.431888,6.013309,37.702422,2.218956,5.392683,33.352249,1.917912,4.502011,...,1.61628,3.753716,21.161948,1.407356,3.158352,17.645603,1.388879,3.127371,17.34097,1.0


In [6]:
# Slice out all feature columns
train_x = train_df.iloc[:, :-1].values
# Slice out the label column
train_y = train_df.iloc[:, -1].values

In [7]:
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
# Sanity check the normalization worked
np.min(train_x), np.max(train_x)

(np.float64(0.0), np.float64(1.0))

In [8]:
# Cast to float32
train_x = torch.from_numpy(train_x).float()
train_y = torch.from_numpy(train_df.iloc[:, -1].values)
# Sanity check on the shapes
train_x.shape, train_y.shape

(torch.Size([908250, 20]), torch.Size([908250]))

In [9]:
set_seed(42)
# Xavier initialisation
weights = torch.randn(20, 2) / math.sqrt(20)
# Track grads after initialization
weights.requires_grad_()
bias = torch.zeros(2, requires_grad=True)

In [11]:
def log_softmax(x):
    return (x - x.max()) - (x - x.max()).exp().sum(-1).log().unsqueeze(-1)

In [12]:
def model(xb):
    return log_softmax(xb @ weights + bias)

In [13]:
# Batch size
bs = 1024
# A mini-batch from x
xb = train_x[0:bs]
# Model predictions
preds = model(xb)
preds[0], preds.shape

(tensor([-0.5103, -0.9171], grad_fn=<SelectBackward0>), torch.Size([1024, 2]))

In [14]:
def nll_loss(predictions, target):
    # Mask predictions according to whether y_hat is 1 or 0
    return -predictions[range(target.shape[0]), target].mean()


loss_func = nll_loss

In [15]:
yb = train_y[0:bs]
print(loss_func(preds, yb))

tensor(0.7619, grad_fn=<NegBackward0>)


In [16]:
def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()


accuracy(preds, yb)

tensor(0.5020)

In [17]:
# Learning rate
lr = 1e-2
# Number of epochs
epochs = 3
n = len(train_df)

for epoch in tqdm(range(epochs), desc="num_epochs"):
    for i in tqdm(range((n - 1) // bs + 1), leave=False):
        # 1. Select mini-batch
        start_i = i * bs
        end_i = start_i + bs
        xb = train_x[start_i:end_i]
        yb = train_y[start_i:end_i]
        # 2. Generate predictions
        pred = model(xb)
        # 3. Compute the loss
        loss = loss_func(pred, yb)
        # 4. Compute the gradients
        loss.backward()
        # 5. Update the weights and biases
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            # Set current gradients to zero
            weights.grad.zero_()
            bias.grad.zero_()

num_epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

In [18]:
def print_scores():
    print(f"Loss: {loss_func(model(xb), yb):.3f}")
    print(f"Accuracy: {accuracy(model(xb), yb):.3f}")

In [19]:
print_scores()

Loss: 0.561
Accuracy: 0.857


In [20]:
loss_func = F.cross_entropy


def model(xb):
    return xb @ weights + bias


# Sanity check we get the same scores as before
print_scores()

Loss: 0.561
Accuracy: 0.857


In [23]:
class LogisticRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(20, 2) / math.sqrt(20))
        self.bias = nn.Parameter(torch.zeros(2))

    def forward(self, xb):
        return xb @ self.weights + self.bias

In [24]:
model = LogisticRegressor()
loss_func(model(xb), yb)

tensor(0.7114, grad_fn=<NllLossBackward0>)

In [25]:
def fit():
    for epoch in tqdm(range(epochs), desc="num_epochs"):
        for i in tqdm(range((n - 1) // bs + 1), leave=False):
            # 1. Select mini-batch
            start_i = i * bs
            end_i = start_i + bs
            xb = train_x[start_i:end_i]
            yb = train_y[start_i:end_i]
            # 2. Generate predictions
            pred = model(xb)
            # 3. Compute the loss
            loss = loss_func(pred, yb)
            # 4. Compute the gradients
            loss.backward()
            # 5. Update the weights and biases
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()


fit()
print_scores()

num_epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

Loss: 0.547
Accuracy: 0.859


In [27]:
class LogisticRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(20, 2)

    def forward(self, xb):
        return self.linear(xb)


model = LogisticRegressor()
fit()
print_scores()

num_epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

Loss: 0.530
Accuracy: 0.859


In [29]:
def get_model():
    model = LogisticRegressor()
    return model, torch.optim.SGD(model.parameters(), lr=lr)

model, optimizer = get_model()
loss_func(model(xb), yb)

tensor(0.7041, grad_fn=<NllLossBackward0>)

In [30]:
def fit():
    for epoch in tqdm(range(epochs), desc="num_epochs"):
        for i in tqdm(range((n - 1) // bs + 1), leave=False):
            # 1. Select mini-batch
            start_i = i * bs
            end_i = start_i + bs
            xb = train_x[start_i:end_i]
            yb = train_y[start_i:end_i]
            # 2. Generate predictions
            pred = model(xb)
            # 3. Compute the loss
            loss = loss_func(pred, yb)
            # 4. Compute the gradients
            loss.backward()
            # 5. Update the weights and biases
            optimizer.step()
            optimizer.zero_grad()


fit()
print_scores()

num_epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

Loss: 0.539
Accuracy: 0.860


In [31]:
train_ds = TensorDataset(train_x, train_y)

In [32]:
len(train_ds)

908250

In [33]:
train_ds[0]

(tensor([6.1467e-01, 3.4553e-01, 2.0827e-01, 6.1930e-02, 2.9217e-02, 1.1426e-01,
         3.9722e-02, 3.0127e-02, 1.1553e-01, 4.4638e-02, 3.7694e-02, 1.1224e-01,
         5.0610e-02, 4.7183e-02, 7.6042e-02, 4.5295e-03, 2.0119e-05, 5.5926e-02,
         2.8786e-03, 8.6186e-06]),
 tensor(1))

In [34]:
model, optimizer = get_model()


def fit():
    for epoch in tqdm(range(epochs), desc="num_epochs"):
        for i in tqdm(range((n - 1) // bs + 1), leave=False):
            # 1. Select mini-batch
            xb, yb = train_ds[i * bs : i * bs + bs]
            # 2. Generate predictions
            pred = model(xb)
            # 3. Compute the loss
            loss = loss_func(pred, yb)
            # 4. Compute the gradients
            loss.backward()
            # 5. Update the weights and biases
            optimizer.step()
            optimizer.zero_grad()


fit()
print_scores()

num_epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

Loss: 0.541
Accuracy: 0.857


In [35]:
train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)
next(iter(train_dl))

[tensor([[6.3268e-01, 7.1266e-02, 6.7265e-02,  ..., 1.5679e-02, 5.9481e-04,
          1.0084e-06],
         [5.5470e-01, 3.2794e-01, 1.4114e-01,  ..., 3.8751e-02, 2.0627e-03,
          1.1118e-05],
         [5.8864e-01, 4.2167e-01, 1.9011e-01,  ..., 4.6003e-02, 2.2277e-03,
          5.5698e-06],
         ...,
         [7.7654e-01, 4.6069e-01, 1.9646e-01,  ..., 5.1050e-02, 2.0193e-03,
          4.1955e-06],
         [5.1658e-01, 5.9622e-02, 5.8521e-02,  ..., 5.2730e-02, 2.2624e-03,
          6.7991e-06],
         [8.5039e-01, 3.4621e-02, 4.1556e-02,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00]]),
 tensor([0, 1, 1,  ..., 1, 0, 0])]

In [36]:
model, optimizer = get_model()


def fit():
    for epoch in tqdm(range(epochs), desc="num_epochs"):
        # 1. Select mini-batch
        for xb, yb in tqdm(train_dl, leave=False):
            # 2. Generate predictions
            pred = model(xb)
            # 3. Compute the loss
            loss = loss_func(pred, yb)
            # 4. Compute the gradients
            loss.backward()
            # 5. Update the weights and biases
            optimizer.step()
            optimizer.zero_grad()


fit()
print_scores()

num_epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

Loss: 0.541
Accuracy: 0.861


In [37]:
model = nn.Sequential(
    nn.Linear(20,200),
    nn.ReLU(),
    nn.Linear(200,200),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(200,50),
    nn.ReLU(),
    nn.Linear(50,50),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(50,2),
    
)

In [38]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

fit()
print_scores()

num_epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

  0%|          | 0/887 [00:00<?, ?it/s]

Loss: 0.257
Accuracy: 0.888
