In [1]:
from pprint import pprint
from itertools import chain, compress
from functools import reduce, partial
from glob import iglob
from typing import Generator

from numba import jit
from yaml import safe_load
from cytoolz import compose, merge_with
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, Row, DataFrame, functions as f
from pyspark.sql.types import ArrayType, BooleanType
from dltools import SpkHits, load_combiner
from dltools.sacla import restructure, load_analyzer

In [2]:
# %% Load PySpark
# Extra JVM packages (comma-separated Maven coordinates) that Spark pulls in
# when the session starts: hadoop-aws, the MongoDB Spark connector, and the
# spark-root reader.
builder = SparkSession.builder.config(
    "spark.jars.packages",
    "org.apache.hadoop:hadoop-aws:2.7.0,"
    "org.mongodb.spark:mongo-spark-connector_2.11:2.3.1,"
    "org.diana-hep:spark-root_2.11:0.1.15,",
)

print("Loading PySpark...")
# Returns the already-running session if there is one, otherwise starts it.
spark = builder.getOrCreate()
print("Done!")

Loading PySpark...
Done!


In [3]:
# %% Load data
# One ROOT file per run (or pair of runs). Every entry needs its own trailing
# comma: adjacent string literals without one are silently concatenated into
# a single (non-existent) path.
files = [
    "/helium/analysis/SACLA_2017B8065_Takanashi/resort_201809/aq028--aq029/aq028_aq029_SortEvent_aq.root",
    "/helium/analysis/SACLA_2017B8065_Takanashi/resort_201809/aq030/aq030_SortEvent_aq.root",
    "/helium/analysis/SACLA_2017B8065_Takanashi/resort_201809/aq032/aq032_SortEvent_aq.root",
    "/helium/analysis/SACLA_2017B8065_Takanashi/resort_201809/aq033--aq034/aq033_aq034_SortEvent_aq.root",
    "/helium/analysis/SACLA_2017B8065_Takanashi/resort_201809/aq035--aq036/aq035_aq036_SortEvent_aq.root",
]

print("Loading data...")
# Open each file with the spark-root reader. The loop variable is `path`
# rather than `f` so it cannot be mistaken for the `functions as f` alias
# imported at the top of the notebook.
loaded = (
    spark.read.format("org.dianahep.sparkroot").load(path)
    for path in files
)
# Stack the per-file frames into one DataFrame, then pass it through
# `restructure`; the resulting schema is printed just below.
df = restructure(reduce(DataFrame.union, loaded))
df.printSchema()
df.show()
print("Done!")

Loading data...
root
 |-- tag: long (nullable = true)
 |-- hits: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- t: double (nullable = false)
 |    |    |-- x: double (nullable = false)
 |    |    |-- y: double (nullable = false)
 |    |    |-- as_: map (nullable = false)
 |    |    |    |-- key: string
 |    |    |    |-- value: struct (valueContainsNull = true)
 |    |    |    |    |-- pz: double (nullable = false)
 |    |    |    |    |-- px: double (nullable = false)
 |    |    |    |    |-- py: double (nullable = false)
 |    |    |    |    |-- ke: double (nullable = false)
 |    |    |-- flag: integer (nullable = true)

+---------+--------------------+
|      tag|                hits|
+---------+--------------------+
|158648231|[[803.53128890943...|
|158648232|[[804.66343712302...|
|158648233|[[794.79463683844...|
|158648234|[[786.43060318885...|
|158648235|[[709.28913731595...|
|158648236|[[627.36191628571...|
|158648237|[[899.62290167172..

In [6]:
# Insert data to MongoDB
print("Inserting data...")
# Append `df` to the target collection through the MongoDB Spark connector;
# all connector settings are passed in a single `options` call.
(
    df.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .options(
        uri="mongodb://mongodb/sacla_2017b8065.resorted",
        replaceDocument="false",
        shardKey="{tag: true}",
    )
    .mode("append")
    .save()
)
print("Done!")

Inserting data...
Done!
