# Download essential datasets and packages

Setting up PyTorch Geometric

In [3]:
# energyflow supplies the `qg_jets` loader used in the "Load data" section below.
!pip install energyflow



In [4]:
# Print the installed PyTorch version, then whether PyTorch can see a CUDA device.
! python -c "import torch; print(torch.__version__)"
! python -c "import torch; print(torch.cuda.is_available())"

1.4.0
True


In [0]:
import os

# A line such as `! PATH=/usr/local/cuda/bin:$PATH` only assigns the variable
# inside the temporary subshell spawned for that one line, so nothing persists.
# Writing to os.environ changes the kernel's own environment, which later
# `!` commands (nvcc, pip builds) inherit.
_CUDA_SEARCH_PATHS = (
    ('PATH', '/usr/local/cuda/bin'),
    ('CPATH', '/usr/local/cuda/include'),
    ('LD_LIBRARY_PATH', '/usr/local/cuda/lib64'),
    ('DYLD_LIBRARY_PATH', '/usr/local/cuda/lib'),
)

for _var, _cuda_dir in _CUDA_SEARCH_PATHS:
    _current = os.environ.get(_var, '')
    # Skip entries that are already present so re-running the cell is a no-op.
    if _cuda_dir in _current.split(':'):
        continue
    # Only add the separator when there is an existing value: a dangling ':'
    # would introduce an empty search-path entry.
    os.environ[_var] = _cuda_dir + (':' + _current if _current else '')

In [4]:
# Show the system CUDA toolkit version (nvcc) next to the CUDA version that
# the installed PyTorch build reports, so the two can be compared.
! nvcc --version
! python -c "import torch; print(torch.version.cuda)"

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130
10.1


This will take some time (around 10–15 minutes).

In [5]:
# Install a pinned PyTorch Geometric stack (the kernel reported torch 1.4.0 above).
# The recorded output shows the extension packages being built from source
# tarballs, which is what makes this cell slow.
# NOTE(review): the recorded check above shows nvcc at CUDA 10.0 while torch
# reports CUDA 10.1 - confirm the source builds are compatible with this torch build.
! pip install --no-cache-dir torch-scatter==2.0.2
! pip install --no-cache-dir torch-sparse==0.4.4
! pip install --no-cache-dir torch-cluster==1.4.5
! pip install torch-geometric==1.4.1
# torch-spline-conv is intentionally left disabled here.
#! pip install torch-spline-conv==10.1 -f https://pytorch-geometric.com/whl/torch-1.4.0.html

Collecting torch-scatter==2.0.2
  Downloading https://files.pythonhosted.org/packages/28/f7/fa4e61587169924203a32416f7835939fdd79994eaa429b4f8ef8f0e07e2/torch_scatter-2.0.2.tar.gz
Building wheels for collected packages: torch-scatter
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch-scatter: filename=torch_scatter-2.0.2-cp36-cp36m-linux_x86_64.whl size=7260831 sha256=71dee0690ad517666ba81eb7145cc97f61a0a84256cbc16003d09b4f933a1741
  Stored in directory: /tmp/pip-ephem-wheel-cache-y_7r5eju/wheels/21/5b/b3/0299be203ab2eb6b5d74c85a968111ec09be6a30f65c9a3a68
Successfully built torch-scatter
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.0.2
Collecting torch-sparse==0.4.4
  Downloading https://files.pythonhosted.org/packages/0e/bf/6242893c898621e7e4756e1ad298e903df6dfae208aec1c32adf8cfd1f7f/torch_sparse-0.4.4.tar.gz
Building wheels for collected packages: torch-sparse
  Building wheel for torch-sparse (setup.py)

This installs RDKit (takes 2–3 minutes).

In [10]:
# !wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
# !chmod +x Miniconda3-latest-Linux-x86_64.sh
# !time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# !time conda install -q -y -c conda-forge rdkit

--2020-03-30 07:57:31--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c84f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ done
Solving environment: / - \ | / - \ | / done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.3.0=py37_0
    - ca-certificates==2020.1.1=0
    - certifi==2019.11.28=py37_0
    - cffi==1.14.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-handling==1.6.0=py37h7b6447c_0
    - conda==4.8.2=py37_0
    - cryptography==2.8=py37h1ba5d50_0
    - idna==2.8=py37

In [0]:
# import sys
# sys.path.append('/usr/local/lib/python3.7/site-packages/')

# Load data

In [0]:
import energyflow

# Load 100,000 Pythia-generated quark/gluon jets (padded, without b/c jets),
# using ~/.energyflow as the cache directory. The result is indexed below
# as data[0] (features) and data[1] (labels).
data = energyflow.qg_jets.load(
    num_data=100000,
    generator='pythia',
    pad=True,
    with_bc=False,
    cache_dir='~/.energyflow',
)


`X` has shape (100000, M, 4), where 100,000 is the number of jets per file, M is the maximum multiplicity of the jets, and 4 is the number of features per particle (each jet is a point cloud of particles). The four features are:

1. the particle's pt (transverse momentum)
2. rapidity
3. azimuthal angle
4. pdgid

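M is the multiplicity: the number of particles in a jet. Because the data is loaded with `pad=True`, every jet is zero-padded to the largest multiplicity in the sample, so all jets share the same M.

This is different from the jet multiplicity, which is the number of jets in an event.

The number of jets can be very interesting depending on the kind of events you want to study. Sometimes the jet multiplicity is part of the event selection.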

In [0]:
# Unpack the loaded arrays: features first, labels second (labels cast to int).
X, y = data[0], data[1].astype(int)

In [39]:
# Display the feature and label array shapes side by side.
(X.shape, y.shape)

((100000, 139, 4), (100000,))

In [0]:
import glob
import os

import torch
# NOTE(review): this import rebinds the notebook-level name `data`, which the
# "Load data" cell also uses for the energyflow result.
from torch.utils import data
from torch_geometric.data import Data
from torch_geometric.data import DataLoader
from torch_geometric.data import InMemoryDataset, download_url, extract_zip
from torch_geometric.io import read_txt_array

# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class QuarkGluonDataset(InMemoryDataset):
    """In-memory dataset that downloads, parses and caches train/test splits.

    Raw archives are fetched from ``url``, each (data, label) pair of raw
    folders is parsed into ``Data(pos=..., y=...)`` samples, and the collated
    training and test sets are saved under ``<processed_dir>/<category>/``.

    Args:
        root: Dataset root directory, forwarded to ``InMemoryDataset``.
        category: Key into ``categories``; also used as the sub-directory
            name for the processed files.
        train: Load ``training.pt`` when true, ``test.pt`` otherwise.
        transform: Forwarded to ``InMemoryDataset``.
        pre_transform: Applied to every sample in ``process`` before saving.
        pre_filter: Samples for which this returns a falsy value are dropped
            in ``process``.

    NOTE(review): ``url`` points at a single ``.npz`` file, while ``download``
    appends ``/<name>.zip`` to it and ``process`` looks for ``*.pts`` /
    ``*.seg`` text files. Confirm the hosted layout matches before relying on
    this class.
    """

    url = 'https://www.dropbox.com/s/fclsl7pukcpobsb/QG_jets.npz'

    # Maps a category key to the sub-folder name looked up inside each raw path.
    categories = {
        '0': '50000',
        '1': '50000',
    }

    def __init__(self,
                 root,
                 category,
                 train=True,
                 transform=None,
                 pre_transform=None,
                 pre_filter=None):
        #assert category in self.categories.keys()
        # Must be set before the base constructor runs: it resolves
        # processed_file_names, which depends on the category.
        self.category = category
        super(QuarkGluonDataset, self).__init__(root, transform, pre_transform,
                                                pre_filter)
        path = self.processed_paths[0] if train else self.processed_paths[1]
        self.data, self.slices = torch.load(path)

    @property
    def raw_file_names(self):
        """Raw folder names, ordered as (data, label) pairs per split."""
        return [
            'train_data', 'train_label', 'test_data', 'test_label'
        ]

    @property
    def processed_file_names(self):
        """Processed file paths (train first, test second) under the category."""
        names = ['training.pt', 'test.pt']
        return [os.path.join(self.category, name) for name in names]

    def download(self):
        """Download and unpack one zip archive per raw file name."""
        for name in self.raw_file_names:
            url = '{}/{}.zip'.format(self.url, name)
            path = download_url(url, self.raw_dir)
            extract_zip(path, self.raw_dir)
            # The archive is no longer needed once extracted.
            os.unlink(path)

    def process(self):
        """Parse the raw files and save collated train and test sets.

        Raises:
            KeyError: If ``self.category`` is not a key of ``categories``.
        """
        idx = self.categories[self.category]
        paths = [os.path.join(raw_path, idx) for raw_path in self.raw_paths]

        # raw_file_names alternates data/label folders, so pair even entries
        # with odd ones: one (data, label) pair per split.
        datasets = []
        for data_dir, label_dir in zip(paths[::2], paths[1::2]):
            pos_paths = sorted(glob.glob(os.path.join(data_dir, '*.pts')))
            y_paths = sorted(glob.glob(os.path.join(label_dir, '*.seg')))
            data_list = []
            for pos_path, y_path in zip(pos_paths, y_paths):
                pos = read_txt_array(pos_path)
                y = read_txt_array(y_path, dtype=torch.long)
                sample = Data(y=y, pos=pos)
                if self.pre_filter is not None and not self.pre_filter(sample):
                    continue
                if self.pre_transform is not None:
                    sample = self.pre_transform(sample)
                data_list.append(sample)
            datasets.append(data_list)

        os.makedirs(os.path.join(self.processed_dir, self.category),
                    exist_ok=True)

        # Four raw names yield exactly two splits: index 0 is train, index 1
        # is test. Indexing a third split would fail, and concatenating the
        # first two would merge the test samples into the training set.
        train_data, train_slices = self.collate(datasets[0])
        test_data, test_slices = self.collate(datasets[1])

        # Re-index labels to consecutive integers starting at 0.
        _, train_data.y = train_data.y.unique(return_inverse=True)
        _, test_data.y = test_data.y.unique(return_inverse=True)

        torch.save((train_data, train_slices), self.processed_paths[0])
        torch.save((test_data, test_slices), self.processed_paths[1])

    def __repr__(self):
        return '{}({}, category={})'.format(self.__class__.__name__, len(self),
                                            self.category)


# Exploratory Data Analysis

In [0]:
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class GCNConv(MessagePassing):
    """Graph convolution layer built on ``MessagePassing`` with sum aggregation.

    Each forward pass adds self-loops, applies a shared linear map to the
    node features, scales every message by ``deg^-1/2`` of both edge
    endpoints, and sums the incoming messages per node.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output node feature vector.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__(aggr='add')  # messages are summed per node
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Compute new node embeddings.

        Args:
            x: Node features, shape [N, in_channels].
            edge_index: Edge list, shape [2, E].

        Returns:
            Node embeddings of shape [N, out_channels].
        """
        num_nodes = x.size(0)

        # Every node also sends a message to itself.
        edge_index, _ = add_self_loops(edge_index, num_nodes=num_nodes)

        # Project the features before they are propagated.
        x = self.lin(x)

        # Per-edge normalisation coefficient from the endpoint degrees.
        row_idx, col_idx = edge_index
        inv_sqrt_deg = degree(row_idx, num_nodes, dtype=x.dtype).pow(-0.5)
        norm = inv_sqrt_deg[row_idx] * inv_sqrt_deg[col_idx]

        # Hands over to message() -> aggregation -> update().
        return self.propagate(edge_index, size=(num_nodes, num_nodes), x=x,
                              norm=norm)

    def message(self, x_j, norm):
        """Scale each neighbour feature row (x_j: [E, out_channels]) by its coefficient."""
        return x_j * norm.view(-1, 1)

    def update(self, aggr_out):
        """Return the aggregated messages ([N, out_channels]) unchanged."""
        return aggr_out


In [0]:
# Small 3-node path graph (0 - 1 - 2), written as (source, target) pairs and
# transposed into the [2, E] layout the layer expects.
edge_index = torch.tensor([[0, 1],
                           [1, 0],
                           [1, 2],
                           [2, 1]], dtype=torch.long).t().contiguous()
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

# One input feature per node, 32 output channels.
conv = GCNConv(1, 32)
x = conv(x, edge_index)

In [0]:
from torch.nn import Sequential as Seq, Linear, ReLU
from torch_geometric.nn import MessagePassing

class EdgeConv(MessagePassing):
    """Edge convolution layer built on ``MessagePassing`` with max aggregation.

    For every edge, an MLP is applied to the concatenation of the centre
    node feature ``x_i`` and the offset ``x_j - x_i``; the per-node result
    is the element-wise maximum over its incoming edge messages.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output node feature vector.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__(aggr='max')  # element-wise max over messages
        layers = [
            Linear(2 * in_channels, out_channels),
            ReLU(),
            Linear(out_channels, out_channels),
        ]
        self.mlp = Seq(*layers)

    def forward(self, x, edge_index):
        """Propagate over ``edge_index`` ([2, E]) for node features ``x`` ([N, in_channels])."""
        num_nodes = x.size(0)
        return self.propagate(edge_index, size=(num_nodes, num_nodes), x=x)

    def message(self, x_i, x_j):
        """Build one message per edge from x_i and x_j (each [E, in_channels])."""
        # Concatenated edge features have shape [E, 2 * in_channels].
        edge_features = torch.cat((x_i, x_j - x_i), dim=1)
        return self.mlp(edge_features)

    def update(self, aggr_out):
        """Return the aggregated messages ([N, out_channels]) unchanged."""
        return aggr_out

In [0]:
from torch_geometric.nn import knn_graph

class DynamicEdgeConv(EdgeConv):
    """EdgeConv variant that builds its own k-nearest-neighbour graph.

    Instead of taking an edge list, every forward pass calls ``knn_graph``
    on the input features (self-loops disabled) and runs ``EdgeConv`` over
    the resulting edges.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output node feature vector.
        k: Number of neighbours passed to ``knn_graph``.
    """

    def __init__(self, in_channels, out_channels, k=6):
        super().__init__(in_channels, out_channels)
        self.k = k

    def forward(self, x, batch=None):
        """Build the k-NN graph for ``x`` (optionally per ``batch``) and convolve."""
        neighbour_edges = knn_graph(x, self.k, batch, loop=False,
                                    flow=self.flow)
        return super().forward(x, neighbour_edges)

In [37]:
# `pos` and `batch` are not defined by any earlier cell, so build a small
# synthetic point cloud here to keep the cell runnable on a fresh kernel:
# 64 random 3-D points that all belong to a single example (batch index 0).
pos = torch.rand(64, 3)
batch = torch.zeros(pos.size(0), dtype=torch.long)

# 3 input features per point, 128 output channels, 6 nearest neighbours.
# NOTE(review): the recorded run of this cell ended in an AttributeError whose
# traceback was not kept; if it persists with defined inputs, check that
# torch-cluster (which provides knn_graph) installed correctly.
conv = DynamicEdgeConv(3, 128, k=6)
x = conv(pos, batch)

AttributeError: ignored

In [40]:
!wget http://ls7-www.cs.uni-dortmund.de/cvpr_geometric_dl/mnist_superpixels.tar.gz

--2020-03-30 12:07:03--  http://ls7-www.cs.uni-dortmund.de/cvpr_geometric_dl/mnist_superpixels.tar.gz
Resolving ls7-www.cs.uni-dortmund.de (ls7-www.cs.uni-dortmund.de)... 129.217.52.147
Connecting to ls7-www.cs.uni-dortmund.de (ls7-www.cs.uni-dortmund.de)|129.217.52.147|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ls7-www.cs.tu-dortmund.de/cvpr_geometric_dl/mnist_superpixels.tar.gz [following]
--2020-03-30 12:07:04--  https://ls7-www.cs.tu-dortmund.de/cvpr_geometric_dl/mnist_superpixels.tar.gz
Resolving ls7-www.cs.tu-dortmund.de (ls7-www.cs.tu-dortmund.de)... 129.217.52.147
Connecting to ls7-www.cs.tu-dortmund.de (ls7-www.cs.tu-dortmund.de)|129.217.52.147|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://ls7-www.cs.tu-dortmund.de/misc/cvpr/mnist_superpixels.tar.gz [following]
--2020-03-30 12:07:06--  http://ls7-www.cs.tu-dortmund.de/misc/cvpr/mnist_superpixels.tar.gz
Connecting to ls7-w

In [47]:
# Unpack the downloaded archive into the working directory; -v lists each
# extracted file (training.pt and test.pt in the recorded run).
!tar -xvf mnist_superpixels.tar.gz

training.pt
test.pt


In [49]:
import torch
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; test.pt comes from the archive downloaded above, so only load it if
# that source is trusted.
# `vocab` holds whatever object test.pt contains; the recorded run printed a
# length of 5.
vocab = torch.load("test.pt")
print(len(vocab))

5
