In [1]:
import numpy
import uproot

In [2]:
# Open the ROOT file and pick out the object stored under the key "Events".
events = uproot.open("~/data/DYJetsToLL.root")["Events"]

In [None]:
# Inspect what `events` contains before pulling arrays out of it.
events.show()

# Dropping data into machine learning libraries

Define a neural network with two hidden layers in PyTorch.

In [3]:
import torch

class SimpleNN(torch.nn.Module):
    """Fully connected network: input -> hidden1 -> hidden2 -> output.

    A ReLU follows each of the two hidden linear layers; the output layer
    is purely linear (no activation is applied to it).

    Args:
        input_dim: number of features per input row.
        hidden1_dim: width of the first hidden layer.
        hidden2_dim: width of the second hidden layer.
        output_dim: number of values produced per input row.
    """

    def __init__(self, input_dim, hidden1_dim, hidden2_dim, output_dim):
        super(SimpleNN, self).__init__()

        # Submodules are registered in the same order the forward pass uses them.
        self.layer1 = torch.nn.Linear(input_dim, hidden1_dim)
        self.relu1 = torch.nn.ReLU()

        self.layer2 = torch.nn.Linear(hidden1_dim, hidden2_dim)
        self.relu2 = torch.nn.ReLU()

        self.layer3 = torch.nn.Linear(hidden2_dim, output_dim)

    def forward(self, x):
        """Push `x` through the three linear layers and return the final output."""
        hidden1 = self.relu1(self.layer1(x))
        hidden2 = self.relu2(self.layer2(hidden1))
        return self.layer3(hidden2)

# Architecture: 25 input parameters -> 20-node hidden layer -> 10-node hidden layer -> 1 output.
simplenn = SimpleNN(input_dim=25, hidden1_dim=20, hidden2_dim=10, output_dim=1)

# Plain stochastic gradient descent over the network's parameters,
# scored with a mean-squared-error loss.
optimizer = torch.optim.SGD(simplenn.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

The 25 input parameters are all of the jet attributes except the b-tagging branches (those whose names start with `Jet_btag`).

The single output is the supervised-learning target: `Jet_btagCMVA`.

In [4]:
# Read every branch whose name matches "Jet_*"; the result is indexed by branch name.
jetarrays = events.arrays("Jet_*")

# Feature matrix: one column per non-btag branch, in sorted name order so the
# column layout is deterministic. numpy.vstack needs a real sequence (a bare
# generator is deprecated and rejected by newer NumPy), hence the list.
inputs = numpy.vstack(
    [jetarrays[n] for n in sorted(jetarrays) if not n.startswith("Jet_btag")]
).T.astype("float32")

# Target column vector, cast to float32 so its dtype matches `inputs`.
expected_output = numpy.array(jetarrays["Jet_btagCMVA"], dtype="float32").reshape(-1, 1)

# Sanity check: both arrays must have the same number of rows.
inputs.shape, expected_output.shape

((7388405, 25), (7388405, 1))

PyTorch, like other Pythonic ML libraries, has methods to get batches of data from NumPy.

In [5]:
# Convert the NumPy arrays to tensors. torch.as_tensor reuses the array's
# memory (no copy) and returns an existing tensor unchanged, so re-running this
# cell is harmless. The old torch.autograd.Variable wrapper is deprecated and
# no longer needed: tensors take part in autograd directly.
inputs = torch.as_tensor(inputs)
expected_output = torch.as_tensor(expected_output)

And now we use PyTorch to run one training step; it doesn't matter where the data came from.

In [6]:
# One gradient-descent step over the whole dataset at once.
optimizer.zero_grad()  # clear gradients left over from any previous step

# Call the module itself rather than .forward(): Module.__call__ runs any
# registered hooks, which a direct forward() call silently skips.
computed_output = simplenn(inputs)

loss = criterion(computed_output, expected_output)
loss.backward()   # back-propagate to populate parameter gradients
optimizer.step()  # apply the SGD update

# Display the loss computed before this step's parameter update.
loss

tensor(1.4066)