In [1]:
import sys
# Competition slug; interpolated into the project path appended to sys.path below.
COMP_NAME = "icecube-neutrinos-in-deep-ice"
# Extend the import search path with the competition directory (presumably so that the
# `src.config` import further down resolves — confirm).
# NOTE(review): absolute, user-specific path; it only exists on the author's machine.
sys.path.append(f"/home/anjum/kaggle/{COMP_NAME}/")

import pandas as pd
import numpy as np
import torch

from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataLoader

from graphnet.models.gnn import DynEdge
from graphnet.models.graph_builders import KNNGraphBuilder

from src.config import INPUT_PATH, OUTPUT_PATH

[1;34mgraphnet[0m: [32mINFO    [0m 2023-01-22 18:58:54 - get_logger - Writing log to [1mlogs/graphnet_20230122-185854.log[0m


In [42]:
# Compact dtypes for each metadata column, applied immediately after loading.
_dtype = dict(
    batch_id="int16",
    event_id="int64",
    first_pulse_index="int32",
    last_pulse_index="int32",
    azimuth="float32",
    zenith="float32",
)

# Read the complete training metadata, then cast all columns in one step.
meta = pd.read_parquet(INPUT_PATH / "train_meta.parquet")
meta = meta.astype(_dtype)
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131953924 entries, 0 to 131953923
Data columns (total 6 columns):
 #   Column             Dtype  
---  ------             -----  
 0   batch_id           int16  
 1   event_id           int64  
 2   first_pulse_index  int32  
 3   last_pulse_index   int32  
 4   azimuth            float32
 5   zenith             float32
dtypes: float32(2), int16(1), int32(2), int64(1)
memory usage: 3.2 GB


In [43]:
# Summary statistics for every metadata column.
meta.describe()

Unnamed: 0,batch_id,event_id,first_pulse_index,last_pulse_index,azimuth,zenith
count,131953900.0,131953900.0,131953900.0,131953900.0,131953900.0,131953900.0
mean,330.3849,1073797000.0,16433360.0,16433520.0,3.144117,1.534421
std,190.4591,619985300.0,9505803.0,9505803.0,1.81351,0.6901115
min,1.0,24.0,0.0,27.0,6.436839e-08,8.631674e-05
25%,165.0,536833200.0,8202315.0,8202470.0,1.575369,1.005033
50%,330.0,1073825000.0,16419430.0,16419590.0,3.14137,1.526995
75%,495.0,1610750000.0,24652480.0,24652660.0,4.717619,2.054609
max,660.0,2147484000.0,35535340.0,35535410.0,6.283185,3.141562


In [28]:
# Choose one training batch, and one event inside it, to prototype on.
batch_id, event_id = 1, 24

# Pulse table for the chosen batch.
batch = pd.read_parquet(INPUT_PATH / "train" / f"batch_{batch_id}.parquet")

# Metadata restricted to the rows that belong to the chosen batch.
meta = pd.read_parquet(INPUT_PATH / "train_meta.parquet")
meta = meta.query(f"batch_id == {batch_id}")

# Sensor geometry table, keyed by sensor_id.
sensors = pd.read_csv(INPUT_PATH / "sensor_geometry_v2.csv", index_col="sensor_id")

In [35]:
# Inspect column dtypes, non-null counts and memory usage of `meta`.
meta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   batch_id           200000 non-null  int64  
 1   event_id           200000 non-null  Int64  
 2   first_pulse_index  200000 non-null  int64  
 3   last_pulse_index   200000 non-null  int64  
 4   azimuth            200000 non-null  float64
 5   zenith             200000 non-null  float64
dtypes: Int64(1), float64(2), int64(3)
memory usage: 10.9 MB


In [29]:
# Preview the first rows of the sensor geometry table.
sensors.head()

Unnamed: 0_level_0,x,y,z,string,qe
sensor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-256.14,-521.08,496.03,0,1.0
1,-256.14,-521.08,479.01,0,1.0
2,-256.14,-521.08,461.99,0,1.0
3,-256.14,-521.08,444.97,0,1.0
4,-256.14,-521.08,427.95,0,1.0


In [30]:
# Preview the first rows of the metadata.
meta.head()

Unnamed: 0,batch_id,event_id,first_pulse_index,last_pulse_index,azimuth,zenith
0,1,24,0,60,5.029555,2.087498
1,1,41,61,111,0.417742,1.549686
2,1,59,112,147,1.160466,2.401942
3,1,67,148,289,5.845952,0.759054
4,1,72,290,351,0.653719,0.939117


In [31]:
# Preview the first rows of the loaded pulse batch.
batch.head()

Unnamed: 0_level_0,sensor_id,time,charge,auxiliary
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24,3918,5928,1.325,True
24,4157,6115,1.175,True
24,3520,6492,0.925,True
24,5041,6665,0.225,True
24,2948,8054,1.575,True


In [32]:
# Same preview of the sensor geometry table as shown earlier in the notebook.
sensors.head()

Unnamed: 0_level_0,x,y,z,string,qe
sensor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-256.14,-521.08,496.03,0,1.0
1,-256.14,-521.08,479.01,0,1.0
2,-256.14,-521.08,461.99,0,1.0
3,-256.14,-521.08,444.97,0,1.0
4,-256.14,-521.08,427.95,0,1.0


In [34]:
%%timeit
event = pd.merge(batch.loc[event_id], sensors, on="sensor_id")

# event = batch.loc[event_id].set_index("sensor_id", drop=True).join(sensors)
event

1.33 ms ± 3.93 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [44]:
def preprocessing(event):
    """Normalise one event's pulse table into the model's input features.

    Parameters
    ----------
    event : pandas.DataFrame
        Must contain the columns ``x``, ``y``, ``z``, ``time``, ``charge``,
        ``qe`` and ``auxiliary``. Extra columns are ignored. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding exactly those seven columns, in that order:
        ``x``/``y``/``z`` divided by 500, ``time`` shifted by 1e4 and divided
        by 3e4, ``charge`` replaced by ``log10(charge) / 3``, ``qe`` unchanged
        and ``auxiliary`` cast to int.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    feature_columns = ["x", "y", "z", "time", "charge", "qe", "auxiliary"]

    # Work on an explicit copy: scaling the caller's frame in place would make a
    # second call on the same frame rescale already-normalised values (and take
    # log10 of an already-logged charge).
    features = event[feature_columns].copy()

    features["x"] = features["x"] / 500
    features["y"] = features["y"] / 500
    features["z"] = features["z"] / 500
    features["time"] = (features["time"] - 1.0e04) / 3.0e4
    features["charge"] = np.log10(features["charge"]) / 3.0
    features["auxiliary"] = features["auxiliary"].astype(int)

    return features


# Normalise the merged event table and keep only the model's feature columns.
# NOTE(review): the result is bound back to `event`, so this cell is not idempotent:
# running it a second time feeds the already-normalised values through
# `preprocessing` again instead of the raw merged table.
event = preprocessing(event)
event

Unnamed: 0,x,y,z,time,charge,qe,auxiliary
0,0.60682,0.67128,0.41316,-0.135733,0.040739,1.0,1
1,-0.29090,0.74848,0.42546,-0.129500,0.023346,1.0,1
2,1.01054,0.51576,-0.34920,-0.116933,-0.011286,1.0,1
3,1.01054,0.51576,-0.34920,0.150767,0.040739,1.0,1
4,-0.01936,-0.15900,0.36200,-0.111167,-0.215939,1.0,1
...,...,...,...,...,...,...,...
56,0.76470,0.47780,-0.09224,0.260400,-0.046554,1.0,1
57,-0.51228,-1.04216,-0.64194,0.268433,-0.003665,1.0,1
58,0.02374,0.35838,0.08556,0.269833,0.017051,1.0,1
59,0.02374,0.35838,0.08556,0.270067,0.051272,1.0,1


In [45]:
# Check the value ranges of the normalised feature columns.
event.describe()

Unnamed: 0,x,y,z,time,charge,qe,auxiliary
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,0.023824,-0.058633,0.052818,0.092286,-0.02003,1.034426,0.786885
std,0.610632,0.520048,0.502226,0.113998,0.078332,0.105096,0.412907
min,-1.05326,-1.04216,-0.87616,-0.135733,-0.252321,1.0,0.0
25%,-0.4699,-0.53504,-0.41094,0.0214,-0.046554,1.0,1.0
50%,0.02374,-0.0312,0.20436,0.0869,-0.011286,1.0,1.0
75%,0.31788,0.35838,0.42426,0.1878,0.029379,1.0,1.0
max,1.15274,0.98044,0.99902,0.301033,0.199779,1.35,1.0


In [46]:
# Pack the feature table into a float32 tensor, one tensor row per row of `event`.
x = torch.tensor(event.to_numpy(), dtype=torch.float32)
(x.shape, x.dtype)

(torch.Size([61, 7]), torch.float32)

In [56]:
# Find the metadata row for the chosen event and turn its two angle columns
# into a float32 tensor.
target = meta.query(f"batch_id == {batch_id} & event_id == {event_id}")
y = torch.tensor(
    target.loc[:, ["azimuth", "zenith"]].to_numpy(),
    dtype=torch.float32,
)
y

tensor([[5.0296, 2.0875]])

In [57]:
# Wrap the feature tensor and the target angles in a single torch_geometric `Data` object.
data = Data(x=x, y=y)
# Store the number of rows of `x` on the object as an int32 scalar tensor.
data.n_pulses = torch.tensor(x.shape[0], dtype=torch.int32)
data

Data(x=[61, 7], y=[1, 2], n_pulses=61)

In [48]:
# Graph builder configured with 8 nearest neighbours.
builder = KNNGraphBuilder(nb_nearest_neighbours=8)

# Run the builder on the single-event `Data`; the displayed result carries an
# `edge_index` of shape [2, 488], i.e. 8 edges for each of the 61 rows of x.
data_out = builder(data)
data_out

Data(x=[61, 7], n_pulses=61, edge_index=[2, 488])

In [49]:
# In final version use a PTGeo dataset & dataloader
# For now, collate two copies of the same graph by hand to get a batch of two graphs.
# NOTE(review): this rebinds `batch`, which earlier in the notebook holds the pulse
# DataFrame read from parquet; cells that index that DataFrame will not work
# once this cell has run.
batch = Batch.from_data_list([data_out, data_out])
batch

DataBatch(x=[122, 7], n_pulses=[2], edge_index=[2, 976], batch=[122], ptr=[3])

In [50]:
# Instantiate DynEdge for 7 inputs (matching the 7 feature columns of x), with
# min, max, mean and sum as the global pooling schemes.
model = DynEdge(nb_inputs=7, global_pooling_schemes=["min", "max", "mean", "sum"])

In [51]:
# Forward pass on the two-graph batch; the shape shown is one 128-wide row per graph.
out = model(batch)
out.shape

torch.Size([2, 128])

In [52]:
# Round-trip check, step 1: serialise the single-event `Data` object to disk.
# NOTE(review): relative path, so the file lands in the current working directory.
torch.save(data, "test.pt")

In [53]:
# Round-trip check, step 2: load the file back and display it to confirm that the
# object survives serialisation.
# NOTE(review): torch.load unpickles the file; only use it on files you created yourself.
test = torch.load("test.pt")
test

Data(x=[61, 7], n_pulses=61, edge_index=[2, 488])