In [1]:
from collections import Counter
from datetime import datetime, timedelta
from glob import glob
import re
from astropy.coordinates import SkyCoord
from astropy.table import Table
from astropy.time import Time
import astropy.units as u
import numpy as np

## First, explore around to determine how to identify lines representing observations vs other lines in the log files.

In [8]:
# figure out how to find log lines representing observations vs log lines describing procedures
logs_dir = r"..\..\Files\DSSI Files\DSSI Logs"
logs_to_process = glob(f"{logs_dir}/**/*.olist", recursive=True)

observation_lines = "DSSI observations.txt"
other_lines = "Non observation lines.txt"

with open(observation_lines, mode="w") as obs, open(other_lines, mode="w") as non_obs:
    for file in logs_to_process:
        datestr = ''.join(re.findall("([0-9]{4})....(...[0-9]{2})", file)[0])
        utc_date = datetime.strptime(datestr, "%Y%b%d")
        with open(file) as f:
            title = f"===========\nFile {file} from {utc_date}\n==========="
            obs.write(title + "\n")
            non_obs.write(title + "\n")
            for line in f.readlines():
                if line.count(":") >= 4: # and line.count("TIC") > 0:
                    obs.write(f"{datestr} {line}")
                else:
                    non_obs.write(line)

# Conclusion: requiring 4 or more colon characters exactly splits the logs

## Now extract all observation lines from all files
* Use more than one regex to match the target lines
  * For speed, make first regex attempted be the one that is empirically observed to match the most lines
* Keep track of lines that "fall through" and aren't matched by any regex pattern
* Standardize things for easier use in analysis
  * Output times both in ISO format and in JD
  * Convert sexagesimal coordinates to decimal degrees
* Depending on if the pattern matching a line includes IR information or not, add a comma-separated list of wavelengths
  * Currently, wavelength values are hard coded at the top of this code.
  * Scanning all log files for lines with "Camera A =" (or B), all files unsurprisingly cite the same set of wavelengths
* Output a CSV of all parsed & standardized observation lines

In [9]:
# now extract useful information from the various lines

# logs_dir = r"..\..\Files\DSSI Files\DSSI Logs"
# logs_to_process = glob(f"{logs_dir}/**/*.olist", recursive=True)

wavelengths_optical = "692, 880"
wavelengths_ir = "1450"

dssi_observations = Table(
    names=["Target Name", "TIC ID", "Wavelengths", "Image Number", "UTC DateTime", "Time JD", "Gain 1", "Gain 2", "RA", "Dec", "PMRA", "PMDec", "Mag", "Notes", ],
    dtype=["str", "str", "str", "int", "str", "float", "int", "int", "float", "float", "float", "float", "float", "str", ],
)

line_counts = Counter()

failed_lines = []
for file in logs_to_process:
    with open(file) as f:
        datestr = ''.join(re.findall("([0-9]{4})....(...[0-9]{2})", file)[0])
        utc_date = datetime.strptime(datestr, "%Y%b%d")
        for line in f.readlines():
            fields = {}
            if line.count(":") < 4: # all observation lines follow this pattern
                continue # skip non-observation lines
            if match := re.match(r"(?P<target_name>.{7,13})\s+(?P<image_num>\d{1,3})\s+(?P<hours>\d\d):(?P<minutes>\d\d)\s+(?P<gain_1>\d{1,3})\s+(?P<gain_2>\d{1,3})\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9.+-]*)\s+(?P<pmdec>[0-9.+-]*)\s+(?P<mag>[0-9.-]+)\s*(?P<notes>.*)", line):
                line_counts["Standard Pattern"] += 1
            elif match := re.match(r"(?P<target_name>\"{0,1}.{7,13}\"{0,1})\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9.+-]*)\s+(?P<pmdec>[0-9.+-]*)\s+(?P<mag>[0-9.-]+)\s*(?P<notes>.*)", line):
                line_counts["No Image Num or Gains"] += 1
                continue # per Jimmy, these were possible targets that never resulted in an actual observation
            elif match := re.match(r"(?P<target_name>.{7})\s+(?P<image_beg>\d{1,3})-(?P<image_end>\d{1,3})\s+(?P<image_ir>\d{1,3})\s+(?P<hours>\d\d):(?P<minutes>\d\d)\s+(?P<gain>\d{1,3})\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9\.+-]*)\s+(?P<pmdec>[0-9\.+-]*)\s+(?P<mag>[0-9\.]+)\s*(?P<notes>.*)", line):
                line_counts["Infrared Observations"] += 1
            elif match := re.match(r"(?P<target_name>\".{7}\")\s+(?P<image_num>\d{1,3})\s+(?P<gain>\d{1,3})\s+(?P<hours>\d\d):(?P<minutes>\d\d)\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9\.+-]*)\s+(?P<pmdec>[0-9\.+-]*)\s+(?P<mag>[0-9\.]+)\s*(?P<notes>.*)", line): #
                line_counts["Single Gain Value"] += 1
            if match:
                fields = {**match.groupdict(), **fields}
                try:
                    if not "pmra" in fields or fields["pmra"] == "":
                        fields["pmra"] = 0
                        fields["pmdec"] = 0
                except Exception as e:
                    pass
                if "gain" in fields: # assume gains are the same for both arms if only one gain specified
                    fields["gain_1"] = fields["gain"]
                    fields["gain_2"] = fields["gain"]
                if "hours" in fields:
                    obs_time = Time(utc_date + timedelta(hours=int(fields["hours"]), minutes=int(fields["minutes"])))
                    datetime_utc = str(obs_time.utc)
                    datetime_jd = obs_time.jd
                else:
                    datetime_utc = str(utc_date)
                    datetime_jd = 0
                if "image_ir" in fields:
                    observations = [(image_num, wavelengths_optical) for image_num in range(int(fields["image_beg"]), int(fields["image_end"]) + 1)]
                    observations.append((fields["image_ir"], wavelengths_ir))
                else:
                    observations = [(fields["image_num"], wavelengths_optical)]
                coord = SkyCoord(ra=fields["ra"], dec=fields["dec"], unit=(u.hourangle, u.deg))
                if matches := re.findall('TIC ?(?:ID)? ?=? ?([0-9]+)', line):
                    fields["tic_id"] = "TIC " + matches[0]
                else:
                    fields["tic_id"] = ""
                for image_num, wavelengths in observations:
                    try:
                        dssi_observations.add_row([fields["target_name"].replace('"', ''), fields["tic_id"], wavelengths, image_num, datetime_utc, datetime_jd, fields["gain_1"], fields["gain_2"], coord.ra, coord.dec, float(fields["pmra"]), float(fields["pmdec"]), fields["mag"], fields["notes"]])
                    except Exception as e:
                        print("error:", fields["tic_id"], line[:-1])
            else:
                failed_lines.append(line)
total_observation_lines = 0
for (regex, count) in line_counts.items():
    print(f"{count:4d} {regex}")
    total_observation_lines += count
print(f"Total lines identified as observations: {total_observation_lines}")
print()
print(len(failed_lines), "failed matches")
for failed_line in failed_lines:
    print(failed_line[:-1])
dssi_observations.sort("Time JD")
dssi_observations.write("DSSI Observations.csv", overwrite=True)


  self.insert_row(len(self), vals, mask)


  81 Single Gain Value
 344 No Image Num or Gains
4299 Standard Pattern
 134 Infrared Observations
Total lines identified as observations: 4858

21 failed matches
H003998 049 03:39 300 300  00:51:21.8 +18:44:21.3â€‚    53.397  -268.549  9.20  Xavier                      
H003998 050 03:41 300 300  00:51:21.8 +18:44:21.3â€‚    53.397  -268.549  9.20  Xavier
H003998 051 03:43 300 300  00:51:21.8 +18:44:21.3â€‚    53.397  -268.549  9.20  Xavier
H003493 204 04:21 300 300  00:44:37.2 -18:56:48.2â€‚   272.691   184.911 10.07  Xavier
H003493 205 04:22 300 300  00:44:37.2 -18:56:48.2â€‚   272.691   184.911 10.07  Xavier
H003493 206 04:24 300 300  00:44:37.2 -18:56:48.2â€‚   272.691   184.911 10.07  Xavier
H003493 207 04:25 300 300  00:44:37.2 -18:56:48.2â€‚   272.691   184.911 10.07  Xavier
H003493 208 04:27 300 300  00:44:37.2 -18:56:48.2â€‚   272.691   184.911 10.07  Xavier
H003493 209 04:29 300 300  00:44:37.2 -18:56:48.2â€‚   272.691   184.911 10.07  Xavier
H003493 210 04:30 300 300  00:44

The above count of lines of observations is smaller than the number of lines output to the `DSSI observations.csv` file.  This is because each line of IR observation results in *n* lines of optical observation and one line of IR observation.  Doing otherwise would require throwing away the image numbers.  Preserving them in the CSV file makes future projects easier, such as correlating the FITS files from the cameras to these records of observations.

## Now collect observations into *sessions*
A session:
* is on the same target
* is one or more speckle observations taken back to back (no other objects targeted in between)
* contains measurements at one or more wavelength, typically 2 but sometimes 3 if the IR arm is used

Each (target + session) combination results in one line in the sessions table.

In [10]:
# group the observations by (target, wavelength) sequences

# first, add a speckle session column that changes with each new target
prev_target = ""
speckle_session = 0
dssi_observations["Speckle Session"] = 0
dssi_observations.sort("Time JD")
for observation in dssi_observations:
    target_name = observation["Target Name"]
    if target_name != prev_target:
        speckle_session += 1
        prev_target = target_name
    observation["Speckle Session"] = speckle_session

dssi_observations

Target Name,TIC ID,Wavelengths,Image Number,UTC DateTime,Time JD,Gain 1,Gain 2,RA,Dec,PMRA,PMDec,Mag,Notes,Speckle Session
str12,str13,str8,int64,str19,float64,int64,int64,float64,float64,float64,float64,float64,str55,int64
HR 583,,"692, 880",1,2022-09-27 07:45:00,2459849.8229166665,20,20,29.94208333333333,-20.824444444444445,18.826,15.86,5.41,,1
H007396,,"692, 880",2,2022-09-27 07:49:00,2459849.8256944446,20,20,23.8375,-21.201083333333333,140.035,-73.844,8.75,Xavier,2
H200063,,"692, 880",3,2022-09-27 07:54:00,2459849.8291666666,100,100,33.94208333333333,-18.23813888888889,-33.316,-124.197,8.09,pri1 RKS0215-1814,3
H011452,,"692, 880",4,2022-09-27 07:59:00,2459849.832638889,100,100,36.94125,4.432138888888889,86.42,240.0,8.67,"SCALE, rho=0.618Â±0.058, rot=0",4
H011452,,"692, 880",5,2022-09-27 08:04:00,2459849.836111111,100,100,36.94125,4.432138888888889,86.42,240.0,8.67,"SCALE, rho=0.618Â±0.058, rot=-30",4
H011452,,"692, 880",6,2022-09-27 08:08:00,2459849.8388888887,100,100,36.94125,4.432138888888889,86.42,240.0,8.67,"SCALE, rho=0.618Â±0.058, rot=-60",4
H011452,,"692, 880",7,2022-09-27 08:11:00,2459849.8409722224,100,100,36.94125,4.432138888888889,86.42,240.0,8.67,"SCALE, rho=0.618Â±0.058, rot=-90",4
H011452,,"692, 880",8,2022-09-27 08:14:00,2459849.8430555556,100,100,36.94125,4.432138888888889,86.42,240.0,8.67,"SCALE, rho=0.618Â±0.058, rot=-120",4
H011452,,"692, 880",9,2022-09-27 08:16:00,2459849.8444444444,100,100,36.94125,4.432138888888889,86.42,240.0,8.67,"SCALE, rho=0.618Â±0.058, rot=-150",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [11]:

# now, make a group summary of each wavelength in a speckle session
dssi_sessions = Table(
    names=["Target Name", "TIC ID", "Speckle Session", "StartTime JD", "MidTime JD", "EndTime JD", "MidTime UTC", "Num Sequences"],
    dtype=["str", "str", "str", "float", "float", "float", "str", "int"],
)

obs_by_session = dssi_observations.group_by(["Speckle Session", "Target Name", "TIC ID"])
for keys, observations in zip(obs_by_session.groups.keys, obs_by_session.groups):
    start_time = observations["Time JD"].min()
    end_time = observations["Time JD"].max()
    mid_time = (end_time + start_time) / 2
    mid_utc = str(Time(mid_time, format="jd").iso)[:19] if mid_time > 0 else ""
    dssi_sessions.add_row((keys["Target Name"], keys["TIC ID"], str(keys["Speckle Session"]), start_time, mid_time, end_time, mid_utc, len(observations)))

dssi_sessions.write("DSSI sessions.csv", overwrite=True)
dssi_sessions

Target Name,TIC ID,Speckle Session,StartTime JD,MidTime JD,EndTime JD,MidTime UTC,Num Sequences
str12,str13,str4,float64,float64,float64,str19,int64
HR 583,,1,2459849.8229166665,2459849.8229166665,2459849.8229166665,2022-09-27 07:45:00,1
H007396,,2,2459849.8256944446,2459849.8256944446,2459849.8256944446,2022-09-27 07:49:00,1
H200063,,3,2459849.8291666666,2459849.8291666666,2459849.8291666666,2022-09-27 07:54:00,1
H011452,,4,2459849.832638889,2459849.840625,2459849.8486111113,2022-09-27 08:10:30,7
HR 0689,,5,2459849.85,2459849.85,2459849.85,2022-09-27 08:24:00,1
H200064,,6,2459849.853472222,2459849.853472222,2459849.853472222,2022-09-27 08:29:00,1
HR 737,,7,2459849.855555556,2459849.855555556,2459849.855555556,2022-09-27 08:32:00,1
H011565,,8,2459849.857638889,2459849.857638889,2459849.857638889,2022-09-27 08:35:00,1
H200065,,9,2459849.859722222,2459849.859722222,2459849.859722222,2022-09-27 08:38:00,1
...,...,...,...,...,...,...,...


For observations of TIC objects, it looks like the average number of sequences is 6.24, a bit higher than I expected.