In [31]:
from fink_filters.classification import extract_fink_classification
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import ArrayType, DoubleType
import pandas as pd
from pyspark.sql import Window
import pyspark.sql.functions as F

from dateutil import rrule
from datetime import datetime, timedelta, date

In [32]:
def save_mpc_data(df, save_filename):
    """Extract the alerts classified as "Solar System MPC" and save them as parquet.

    A ``class`` column is computed with ``extract_fink_classification``; only
    the rows whose class equals ``"Solar System MPC"`` are kept. A fixed set of
    columns is selected, and an ``nb_detection`` column is added holding the
    count of ``candidate.ssnamenr`` values over a window partitioned by
    ``candidate.ssnamenr``. The count only covers the rows of ``df`` itself.

    The selection is converted to a pandas DataFrame with ``toPandas()`` and
    written with ``DataFrame.to_parquet``, partitioned by ``year`` and ``month``.

    Parameters
    ----------
    df : Spark DataFrame
        Must expose the columns read below: the classification inputs,
        the ``candidate.*`` fields, ``objectId``, ``year``, ``month`` and ``day``.
    save_filename : str
        Destination passed as-is to ``pandas.DataFrame.to_parquet``.

    Returns
    -------
    None
    """
    # Columns copied unchanged into the output, in output order.
    kept_fields = (
        "objectId",
        "candidate.candid",
        "candidate.ra",
        "candidate.dec",
        "candidate.jd",
        "candidate.nid",
        "candidate.fid",
        "candidate.ssnamenr",
        "candidate.ssdistnr",
        "candidate.magpsf",
        "candidate.sigmapsf",
        "candidate.magnr",
        "candidate.sigmagnr",
        "candidate.magzpsci",
        "candidate.isdiffpos",
        "year",
        "month",
        "day",
    )

    # Classification label computed from the input columns.
    classification = extract_fink_classification(
        df["cdsxmatch"],
        df["roid"],
        df["mulens"],
        df["snn_snia_vs_nonia"],
        df["snn_sn_vs_all"],
        df["rf_snia_vs_nonia"],
        df["candidate.ndethist"],
        df["candidate.drb"],
        df["candidate.classtar"],
        df["candidate.jd"],
        df["candidate.jdstarthist"],
        df["rf_kn_vs_nonkn"],
        df["tracklet"]
    )
    labelled = df.withColumn("class", classification)
    mpc_alerts = labelled.filter(labelled["class"] == "Solar System MPC")

    # Count of rows sharing the same ssnamenr, attached to each of those rows.
    per_object = Window.partitionBy('candidate.ssnamenr')
    detection_count = F.count('candidate.ssnamenr').over(per_object).alias('nb_detection')

    kept_columns = [mpc_alerts[field] for field in kept_fields]
    mpc_selection = mpc_alerts.select(*kept_columns, detection_count)

    # Materialize locally, then write one parquet dataset partitioned by year/month.
    mpc_selection.toPandas().to_parquet(save_filename, partition_cols=["year", "month"])

In [33]:
# Date range of the export; rrule yields one datetime per month from `first`
# up to (and including, if it falls on a recurrence) `last`.
first = date(2019, 11, 1)
last = date(2022, 12, 21)

for dt in rrule.rrule(rrule.MONTHLY, dtstart=first, until=last):
    year = dt.year
    month = '{:02d}'.format(dt.month)  # zero-padded, as used in the archive path
    print("year: {} / month: {}".format(year, month))

    # Load a single month of the archive and attach the year/month values as
    # literal columns, since save_mpc_data selects and partitions on them.
    df = (
        spark.read.format("parquet")
        .load("/user/julien.peloton/archive/science/year={}/month={}".format(year, month))
        .withColumn("year", F.lit(year))
        .withColumn("month", F.lit(month))
    )

    save_mpc_data(df, "sso_data/")
    print()

year: 2019 / month: 11

year: 2019 / month: 12

year: 2020 / month: 01

year: 2020 / month: 02

year: 2020 / month: 03

year: 2020 / month: 04

year: 2020 / month: 05

year: 2020 / month: 06

year: 2020 / month: 07

year: 2020 / month: 08

year: 2020 / month: 09

year: 2020 / month: 10

year: 2020 / month: 11

year: 2020 / month: 12

year: 2021 / month: 01

year: 2021 / month: 02

year: 2021 / month: 03

year: 2021 / month: 04

year: 2021 / month: 05

year: 2021 / month: 06

year: 2021 / month: 07

year: 2021 / month: 08

year: 2021 / month: 09

year: 2021 / month: 10

year: 2021 / month: 11

year: 2021 / month: 12

year: 2022 / month: 01

year: 2022 / month: 02

year: 2022 / month: 03

year: 2022 / month: 04

year: 2022 / month: 05

year: 2022 / month: 06

year: 2022 / month: 07

year: 2022 / month: 08

year: 2022 / month: 09

year: 2022 / month: 10

year: 2022 / month: 11

year: 2022 / month: 12

