In [31]:
from fink_filters.classification import extract_fink_classification
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import ArrayType, DoubleType
import pandas as pd
from pyspark.sql import Window
import pyspark.sql.functions as F

from dateutil import rrule
from datetime import datetime, timedelta, date

In [32]:
def save_mpc_data(df, save_filename):
    class_df = df.withColumn(
        "class",
        extract_fink_classification(
            df["cdsxmatch"],
            df["roid"], 
            df["mulens"],
            df["snn_snia_vs_nonia"], 
            df["snn_sn_vs_all"], 
            df["rf_snia_vs_nonia"],
            df["candidate.ndethist"], 
            df["candidate.drb"], 
            df["candidate.classtar"], 
            df["candidate.jd"], 
            df["candidate.jdstarthist"], 
            df["rf_kn_vs_nonkn"], 
            df["tracklet"]
        )
    )
    
    sso_class = class_df.filter(class_df["class"] == "Solar System MPC")
    w = Window.partitionBy('candidate.ssnamenr')
    sso_class = sso_class.select(
        sso_class["objectId"],
        sso_class["candidate.candid"],
        sso_class["candidate.ra"],
        sso_class["candidate.dec"],
        sso_class["candidate.jd"],
        sso_class["candidate.nid"],
        sso_class["candidate.fid"],
        sso_class["candidate.ssnamenr"],
        sso_class["candidate.ssdistnr"],
        sso_class["candidate.magpsf"],
        sso_class["candidate.sigmapsf"],
        sso_class["candidate.magnr"],
        sso_class["candidate.sigmagnr"],
        sso_class["candidate.magzpsci"],
        sso_class["candidate.isdiffpos"],
        sso_class["year"],
        sso_class["month"],
        sso_class["day"],
        F.count('candidate.ssnamenr').over(w).alias('nb_detection')
    )
    local_sso = sso_class.toPandas()
    local_sso.to_parquet(save_filename, partition_cols=["year", "month"])

In [33]:
first = date(2019, 11, 1)
last = date(2022, 12, 21)

for dt in rrule.rrule(rrule.MONTHLY, dtstart=first, until=last):
    month = '{:02d}'.format(dt.month)
    year = dt.year
    print("year: {} / month: {}".format(year, month))
    df = spark.read.format("parquet").load("/user/julien.peloton/archive/science/year={}/month={}".format(year, month))
    df = df.withColumn("year", F.lit(year))
    df = df.withColumn("month", F.lit(month))
    save_mpc_data(df, "sso_data/")
    print()

year: 2019 / month: 11

year: 2019 / month: 12

year: 2020 / month: 01

year: 2020 / month: 02

year: 2020 / month: 03

year: 2020 / month: 04

year: 2020 / month: 05

year: 2020 / month: 06

year: 2020 / month: 07

year: 2020 / month: 08

year: 2020 / month: 09

year: 2020 / month: 10

year: 2020 / month: 11

year: 2020 / month: 12

year: 2021 / month: 01

year: 2021 / month: 02

year: 2021 / month: 03

year: 2021 / month: 04

year: 2021 / month: 05

year: 2021 / month: 06

year: 2021 / month: 07

year: 2021 / month: 08

year: 2021 / month: 09

year: 2021 / month: 10

year: 2021 / month: 11

year: 2021 / month: 12

year: 2022 / month: 01

year: 2022 / month: 02

year: 2022 / month: 03

year: 2022 / month: 04

year: 2022 / month: 05

year: 2022 / month: 06

year: 2022 / month: 07

year: 2022 / month: 08

year: 2022 / month: 09

year: 2022 / month: 10

year: 2022 / month: 11

year: 2022 / month: 12



In [1]:
import pandas as pd
import os
import numpy as np
import exploring_script as es
from fink_utils.photometry.vect_conversion import vect_dc_mag
from fink_fat.orbit_fitting.orbfit_cluster import orbit_wrapper

from pyspark.sql import functions as F

In [2]:
confirmed_sso = es.load_data()

In [3]:
sso_6pts = confirmed_sso[confirmed_sso["nb_detection"] >= 6]

In [4]:
sso_samples = sso_6pts[sso_6pts["ssnamenr"].isin(sso_6pts["ssnamenr"].unique()[:100])]

In [5]:
sso_samples["dcmag"], sso_samples["sigdcmag"] = vect_dc_mag(
    sso_samples["fid"],
    sso_samples["magpsf"],
    sso_samples["sigmapsf"],
    sso_samples["magnr"],
    sso_samples["sigmagnr"],
    sso_samples["magzpsci"],
    sso_samples["isdiffpos"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
sso_samples

Unnamed: 0,objectId,candid,ra,dec,jd,nid,fid,ssnamenr,ssdistnr,magpsf,...,magnr,sigmagnr,magzpsci,isdiffpos,day,nb_detection,year,month,dcmag,sigdcmag
0,ZTF19actuiia,1052243905515015004,47.623635,28.648661,2.458807e+06,1052,2,461543,0.0,19.409599,...,23.073000,0.344,26.119598,t,19,17,2019,11,19.367519,0.113719
1,ZTF19acoarlo,1037297706315015003,51.248975,28.758817,2.458792e+06,1037,1,461543,0.0,20.099430,...,23.003000,0.286,26.136431,t,4,17,2019,11,20.013827,0.156177
2,ZTF19acodiwv,1037355986315015002,51.234882,28.759560,2.458792e+06,1037,2,461543,0.0,19.665506,...,20.236000,0.049,26.057507,t,4,17,2019,11,19.075148,0.069201
3,ZTF19acmhaos,1035286515815015012,51.707255,28.725952,2.458790e+06,1035,2,461543,0.0,19.507608,...,22.714001,0.252,26.150608,t,2,17,2019,11,19.445875,0.190843
4,ZTF19actkbtj,1051342775515015008,47.839864,28.671509,2.458806e+06,1051,2,461543,0.0,19.390882,...,22.986000,0.319,26.152884,t,18,17,2019,11,19.347451,0.131720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15101156,ZTF22abyrsbf,2174194253115015006,26.970717,10.406012,2.459929e+06,2174,1,55745,0.0,19.996458,...,22.872999,0.229,26.232458,t,15,6,2022,12,19.915935,0.180188
15101157,ZTF22abyrioi,2174171493115015011,26.971346,10.405367,2.459929e+06,2174,2,55745,0.0,19.445528,...,23.232000,0.307,26.234528,t,15,6,2022,12,19.411603,0.095147
15101158,ZTF22abzvbkf,2180248193115015007,26.972597,10.623364,2.459935e+06,2180,2,55745,0.0,19.146715,...,21.584999,0.089,26.254715,t,21,6,2022,12,19.035518,0.072805
15101159,ZTF22abztywr,2180197013115015007,26.971821,10.621190,2.459935e+06,2180,1,55745,0.0,19.970112,...,21.930000,0.100,26.306112,t,21,6,2022,12,19.802118,0.149626


In [7]:
unique_sso_name = sso_samples["ssnamenr"].unique()
name_to_id = {name:traj_id for traj_id, name in zip(np.arange(len(unique_sso_name)), unique_sso_name)}
sso_samples["trajectory_id"] = sso_samples["ssnamenr"].map(name_to_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
sparkDF = spark.createDataFrame(sso_samples)

In [9]:
spark_gb = (
            sparkDF.groupby("trajectory_id")
            .agg(
                F.sort_array(
                    F.collect_list(F.struct("jd", "ra", "dec", "fid", "dcmag"))
                ).alias("collected_list")
            )
            .withColumn("ra", F.col("collected_list.ra"))
            .withColumn("dec", F.col("collected_list.dec"))
            .withColumn("fid", F.col("collected_list.fid"))
            .withColumn("dcmag", F.col("collected_list.dcmag"))
            .withColumn("jd", F.col("collected_list.jd"))
            .drop("collected_list")
        )

In [20]:
spark_column = spark_gb.withColumn(
            "orbital_elements",
            orbit_wrapper(
                spark_gb.ra,
                spark_gb.dec,
                spark_gb.dcmag,
                spark_gb.fid,
                spark_gb.jd,
                spark_gb.trajectory_id,
                "/tmp/ramdisk/roman",
                30,
                20,
                None,
                verbose=3,
            ),
        )

In [21]:
spark_column.select("orbital_elements").show()

+--------------------+
|    orbital_elements|
+--------------------+
|[-1.0, -1.0, -1.0...|
|[2459791.67768734...|
|[-1.0, -1.0, -1.0...|
|[2459760.81260634...|
|[-1.0, -1.0, -1.0...|
|[2459790.75306924...|
|[2458812.83052294...|
|[2459760.73288404...|
|[2459323.75584704...|
|[2459392.69540724...|
|[2459362.73573134...|
|[2459756.69209704...|
|[2459791.67862484...|
|[2459725.87801134...|
|[-1.0, -1.0, -1.0...|
|[2459936.70944654...|
|[2459906.74986324...|
|[2459699.76252524...|
|[2459694.85387944...|
|[-1.0, -1.0, -1.0...|
+--------------------+
only showing top 20 rows

