-
Notifications
You must be signed in to change notification settings - Fork 120
/
test_skyhook_job.py
99 lines (83 loc) · 2.63 KB
/
test_skyhook_job.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import awkward as ak
import toml
import uproot
from coffea import processor
from coffea.nanoevents import schemas
from coffea.processor.test_items import NanoEventsProcessor
if __name__ == "__main__":
    # Integration script: converts the sample ROOT files to parquet, stages
    # several copies of each on the CephFS mount, then runs
    # NanoEventsProcessor over them through a Dask-backed processor.Runner,
    # once with use_skyhook=True and once without pushdown, checking the
    # cutflow counts each time.
    import shutil

    from dask.distributed import Client, LocalCluster

    # Write the skyhook settings to /root/.coffea.toml.
    # NOTE(review): presumably coffea reads this file to locate the Ceph
    # cluster -- confirm against the coffea configuration docs.
    config_dict = {
        "skyhook": {
            "ceph_config_path": "/tmp/testskyhookjob/ceph.conf",
            "ceph_data_pool": "cephfs_data",
        }
    }
    with open("/root/.coffea.toml", "w") as f:
        toml.dump(config_dict, f)

    # (ROOT tree to convert, local parquet file, CephFS dataset directory)
    samples = [
        (
            "tests/samples/nano_dy.root:Events",
            "nano_dy.parquet",
            "/mnt/cephfs/nanoevents/ZJets",
        ),
        (
            "tests/samples/nano_dimuon.root:Events",
            "nano_dimuon.parquet",
            "/mnt/cephfs/nanoevents/Data",
        ),
    ]
    copies_per_dataset = 6

    for root_events, parquet_name, dataset_dir in samples:
        ak.to_parquet(
            uproot.lazy(root_events),
            parquet_name,
            list_to32=True,
            use_dictionary=False,
            compression="GZIP",
            compression_level=1,
        )

    for _, parquet_name, dataset_dir in samples:
        # exist_ok so the script can be re-run against an already populated
        # mount instead of crashing with FileExistsError.
        os.makedirs(dataset_dir, exist_ok=True)
        stem = os.path.splitext(parquet_name)[0]
        for i in range(copies_per_dataset):
            # shutil.copyfile raises on failure; the previous os.system("cp")
            # silently ignored a non-zero exit status.
            shutil.copyfile(
                parquet_name,
                os.path.join(dataset_dir, f"{stem}.{i}.parquet"),
            )

    fileset = {
        "ZJets": "/mnt/cephfs/nanoevents/ZJets",
        "Data": "/mnt/cephfs/nanoevents/Data",
    }
    expected_cutflow = {
        "ZJets_pt": 108,
        "ZJets_mass": 36,
        "Data_pt": 504,
        "Data_mass": 396,
    }

    # Context managers make sure the worker processes and the client
    # connection are shut down even when an assertion below fails.
    with LocalCluster(processes=True, threads_per_worker=1) as cluster, Client(
        cluster
    ) as client:
        executor = processor.DaskExecutor(client=client)

        # First pass uses skyhook; the second pass runs again on the same
        # parquet files in cephfs without any pushdown. Both passes must
        # produce the same cutflow.
        for use_skyhook in (True, False):
            run = processor.Runner(
                executor=executor,
                use_skyhook=use_skyhook,
                format="parquet",
                schema=schemas.NanoAODSchema,
            )
            hists = run(
                fileset,
                "Events",
                processor_instance=NanoEventsProcessor(),
            )
            for key, count in expected_cutflow.items():
                assert hists["cutflow"][key] == count