In [24]:
from collections import Counter
from datetime import datetime, timedelta
from glob import glob
import re
from astropy.coordinates import SkyCoord
from astropy.table import Table
from astropy.time import Time
import astropy.units as u
import numpy as np

## First, explore around to determine how to identify lines representing observations vs other lines in the log files.

In [28]:
# Exploration: find a cheap test that separates observation lines from the
# procedural/commentary lines in the DSSI .olist logs.  Every log line is copied
# into one of two text files so the split can be inspected by eye.
logs_dir = r"..\..\Files\DSSI Files\DSSI Logs"
logs_to_process = glob(f"{logs_dir}/**/*.olist", recursive=True)

observation_lines = "Observations lines.txt"
other_lines = "Non observation lines.txt"

with open(observation_lines, mode="w") as obs, open(other_lines, mode="w") as non_obs:
    for file in logs_to_process:
        # The path supplies the date: a 4-digit year, four skipped characters,
        # then a 3-character month followed by a 2-digit day.
        year_part, month_day_part = re.findall("([0-9]{4})....(...[0-9]{2})", file)[0]
        datestr = year_part + month_day_part
        utc_date = datetime.strptime(datestr, "%Y%b%d")
        with open(file) as f:
            # Same banner goes to both outputs so they can be read side by side.
            title = f"===========\nFile {file} from {utc_date}\n==========="
            for sink in (obs, non_obs):
                sink.write(title + "\n")
            for line in f:
                looks_like_observation = line.count(":") >= 4
                if not looks_like_observation:
                    non_obs.write(line)
                    continue
                obs.write(f"{datestr} {line}")

# Conclusion: requiring 4 or more colon characters exactly splits the logs

## Now extract all observation lines from all files
* Use more than one regex to match the target lines
  * For speed, make first regex attempted be the one that is empirically observed to match the most lines
* Keep track of lines that "fall through" and aren't matched by any regex pattern
* Standardize things for easier use in analysis
  * Output times both in ISO format and in JD
  * Convert sexagesimal coordinates to decimal degrees
* Depending on if the pattern matching a line includes IR information or not, add a comma-separated list of wavelengths
  * Currently, wavelength values are hard coded at the top of this code.
  * Scanning all log files for lines with "Camera A =" (or B), all files unsurprisingly cite the same set of wavelengths
* Output a CSV of all parsed & standardized observation lines

In [32]:
# Extract every observation line from the logs into one standardized table.
#
# Uses `logs_to_process` (paths of the .olist logs) built by the exploration cell.
# For each log the UTC date is read from the file path; each observation line is
# matched against an ordered list of patterns (first match wins) and expanded into
# one table row per image number.  Lines with >= 4 colons that match no pattern are
# collected in `failed_lines` and listed at the end.

# logs_dir = r"..\..\Files\DSSI Files\DSSI Logs"
# logs_to_process = glob(f"{logs_dir}/**/*.olist", recursive=True)

wavelengths_optical = "692, 880"
wavelengths_ir = "1450"

dssi_observations = Table(
    names=["Target Name", "TIC ID", "Wavelengths", "Image Number", "UTC DateTime", "Time JD", "Gain 1", "Gain 2", "RA", "Dec", "PMRA", "PMDec", "Mag", "Notes", ],
    dtype=["str", "str", "str", "int", "str", "float", "int", "int", "float", "float", "float", "float", "float", "str", ],
)

# Regex fragments shared by all line layouts.  The declination sign class is
# `[+-]?`; the earlier `[+|-]{0,1}` also accepted a literal "|".
TIME_RE = r"(?P<hours>\d\d):(?P<minutes>\d\d)"
COORD_RE = r"(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+-]?\d\d:\d\d:\d\d\.\d+)"
PM_RE = r"(?P<pmra>[0-9.+-]*)\s+(?P<pmdec>[0-9.+-]*)"
NOTES_RE = r"\s*(?P<notes>.*)"

# Label of the layout that has coordinates but no image number, gains, or time.
UNOBSERVED_LABEL = "No Image Num, Gains, or Time"

# (label, compiled pattern) in the order they are tried.  The layout that
# matches the most lines comes first, for speed.
line_patterns = [
    ("Standard Pattern", re.compile(
        r"(?P<target_name>.{7,13})\s+(?P<image_num>\d{1,3})\s+" + TIME_RE
        + r"\s+(?P<gain_1>\d{1,3})\s+(?P<gain_2>\d{1,3})\s+" + COORD_RE
        + r"\s+" + PM_RE + r"\s+(?P<mag>[0-9.-]+)" + NOTES_RE)),
    (UNOBSERVED_LABEL, re.compile(
        r"(?P<target_name>\"{0,1}.{7,13}\"{0,1})\s+" + COORD_RE
        + r"\s+" + PM_RE + r"\s+(?P<mag>[0-9.-]+)" + NOTES_RE)),
    ("Infrared Observations", re.compile(
        r"(?P<target_name>.{7})\s+(?P<image_beg>\d{1,3})-(?P<image_end>\d{1,3})\s+(?P<image_ir>\d{1,3})\s+" + TIME_RE
        + r"\s+(?P<gain>\d{1,3})\s+" + COORD_RE
        + r"\s+" + PM_RE + r"\s+(?P<mag>[0-9.]+)" + NOTES_RE)),
    ("Single Gain Value", re.compile(
        r"(?P<target_name>.{7,15})\s+(?P<image_num>\d{1,4})\s+(?P<gain>\d{1,3})\s+" + TIME_RE
        + r"\s+" + COORD_RE
        + r"\s+" + PM_RE + r"\s+(?P<mag>[0-9.]+)" + NOTES_RE)),
]

# A TIC number mentioned anywhere on the line ("TIC 123", "TIC=123", "TIC ID = 123", ...).
tic_reference = re.compile('TIC ?(?:ID)? ?=? ?([0-9]+)')

line_counts = Counter()

failed_lines = []
for file in logs_to_process:
    file_matches = 0
    datestr = ''.join(re.findall("([0-9]{4})....(...[0-9]{2})", file)[0])
    utc_date = datetime.strptime(datestr, "%Y%b%d")
    with open(file) as f:
        for line in f:
            if line.count(":") < 4:  # all observation lines have at least 4 colons
                continue  # skip non-observation lines

            match_h = None
            for label, pattern in line_patterns:
                match_h = pattern.match(line)
                if match_h:
                    line_counts[label] += 1
                    break
            if match_h is None:
                failed_lines.append(line)
                continue
            if label == UNOBSERVED_LABEL:
                continue  # per Jimmy, these were possible targets that never resulted in an actual observation

            file_matches += 1
            fields = match_h.groupdict()

            # A blank PMRA means no proper motion was logged: default both to 0.
            # (A blank PMDec with a non-blank PMRA is left alone and reported
            # below as an error rather than guessed at.)
            if not fields.get("pmra"):
                fields["pmra"] = 0
                fields["pmdec"] = 0
            if "gain" in fields:  # assume gains are the same for both arms if only one gain specified
                fields["gain_1"] = fields["gain"]
                fields["gain_2"] = fields["gain"]

            if "hours" in fields:
                obs_time = Time(utc_date + timedelta(hours=int(fields["hours"]), minutes=int(fields["minutes"])))
                datetime_utc = str(obs_time.utc)
                datetime_jd = obs_time.jd
            else:
                datetime_utc = str(utc_date)
                datetime_jd = 0

            # An IR line covers a range of optical images plus one IR image;
            # every image number becomes its own row.
            if "image_ir" in fields:
                observations = [
                    (image_num, wavelengths_optical)
                    for image_num in range(int(fields["image_beg"]), int(fields["image_end"]) + 1)
                ]
                observations.append((int(fields["image_ir"]), wavelengths_ir))
            else:
                observations = [(int(fields["image_num"]), wavelengths_optical)]

            raw_name = fields["target_name"]
            if tic_ref := tic_reference.search(line):
                fields["tic_id"] = "TIC " + tic_ref.group(1)
            elif raw_name[:3] == "TIC":
                # Built by concatenation: nesting the same quote type inside an
                # f-string is a SyntaxError before Python 3.12.
                fields["tic_id"] = "TIC " + raw_name[3:].strip()
            else:
                fields["tic_id"] = ""
            # Drop quotes and the padding the fixed-width regex captures, so the
            # same target always gets the same name.
            target_name = raw_name.replace('"', '').strip()

            # Best effort: a line whose values cannot be converted is reported
            # (with the reason) and skipped instead of aborting the whole run.
            try:
                coord = SkyCoord(ra=fields["ra"], dec=fields["dec"], unit=(u.hourangle, u.deg))
                gain_1, gain_2 = int(fields["gain_1"]), int(fields["gain_2"])
                pmra, pmdec = float(fields["pmra"]), float(fields["pmdec"])
                mag = float(fields["mag"])
                for image_num, wavelengths in observations:
                    dssi_observations.add_row([
                        target_name, fields["tic_id"], wavelengths, image_num,
                        datetime_utc, datetime_jd, gain_1, gain_2,
                        coord.ra.deg, coord.dec.deg,  # decimal degrees
                        pmra, pmdec, mag, fields["notes"],
                    ])
            except Exception as exc:
                print("error:", fields["tic_id"], line.rstrip("\n"), f"[{type(exc).__name__}: {exc}]")
    print(f"{file_matches:4d} {file}")

total_observation_lines = 0
for (regex, count) in line_counts.items():
    print(f"{count:4d} {regex}")
    total_observation_lines += count
print(f"Total lines identified as observations: {total_observation_lines}")
print()
print(len(failed_lines), "failed matches")
for failed_line in failed_lines:
    print(failed_line.rstrip("\n"))

# Times have one-minute resolution, so ties are common; a stable sort keeps
# tied rows in the order they appear in the logs.
dssi_observations.sort("Time JD", kind="stable")
dssi_observations.write("DSSI Observations.csv", overwrite=True)


  self.insert_row(len(self), vals, mask)


  50 ..\..\Files\DSSI Files\DSSI Logs\2022-Q2\may10raw.olist
error:  "HR 5635"           106   20 08:49 15:06:16.7 +54:33:22.7  6  1.2061
  56 ..\..\Files\DSSI Files\DSSI Logs\2022-Q2\may11raw.olist
  63 ..\..\Files\DSSI Files\DSSI Logs\2022-Q2\may12raw.olist
  49 ..\..\Files\DSSI Files\DSSI Logs\2022-Q2\may15raw.olist
  81 ..\..\Files\DSSI Files\DSSI Logs\2022-Q3\sep27raw.olist
 115 ..\..\Files\DSSI Files\DSSI Logs\2022-Q3\sep28raw.olist
 281 ..\..\Files\DSSI Files\DSSI Logs\2022-Q3\sep29raw.olist
 136 ..\..\Files\DSSI Files\DSSI Logs\2022-Q3\sep30raw.olist
 140 ..\..\Files\DSSI Files\DSSI Logs\2022-Q4\nov12raw.olist
 123 ..\..\Files\DSSI Files\DSSI Logs\2022-Q4\nov13raw.olist
   0 ..\..\Files\DSSI Files\DSSI Logs\2022-Q4\nov14raw.olist
 279 ..\..\Files\DSSI Files\DSSI Logs\2022-Q4\nov15raw.olist
 109 ..\..\Files\DSSI Files\DSSI Logs\2022-Q4\nov16raw.olist
  81 ..\..\Files\DSSI Files\DSSI Logs\2023-Q1\mar07raw.olist
 203 ..\..\Files\DSSI Files\DSSI Logs\2023-Q1\mar08raw.olist
 175 ..\

The count of observation lines above is smaller than the number of rows written to the `DSSI Observations.csv` file.  This is because each IR observation line expands into *n* optical rows (one per image number in its range) plus one IR row.  Doing otherwise would require throwing away the image numbers.  Preserving them in the CSV file makes future projects easier, such as correlating the FITS files from the cameras to these records of observations.

## Now collect observations into *sessions*
A session:
* is on the same target
* is one or more speckle observations taken back to back (no other objects targeted in between)
* contains measurements at one or more wavelengths, typically 2 but sometimes 3 if the IR arm is used

Each (target + session) combination results in one line in the sessions table.

In [None]:
# Group the observations into speckle sessions.
#
# Adds a "Speckle Session" column to `dssi_observations`: walking the rows in
# time order, the session number increases by one every time the target name
# differs from the previous row's target name.

# Session boundaries depend on row order, and the logged times only have
# one-minute resolution, so many rows tie.  A stable sort keeps tied rows in
# their existing order instead of letting them interleave between targets.
dssi_observations.sort("Time JD", kind="stable")

target_names = np.asarray(dssi_observations["Target Name"])
# Each row's predecessor; the first row is compared against "" so numbering
# starts at 1 for any non-empty name.  Slicing after the concatenate keeps this
# valid for an empty table.
previous_names = np.concatenate(([""], target_names))[:-1]
dssi_observations["Speckle Session"] = np.cumsum(target_names != previous_names)

dssi_observations

In [None]:

# Collapse each speckle session into a single summary row: target, TIC ID,
# session number, start / middle / end time (JD), the middle time as a UTC
# string, and the number of observation rows in the session.
dssi_sessions = Table(
    names=["Target Name", "TIC ID", "Speckle Session", "StartTime JD", "MidTime JD", "EndTime JD", "MidTime UTC", "Num Sequences"],
    dtype=["str", "str", "str", "float", "float", "float", "str", "int"],
)

obs_by_session = dssi_observations.group_by(["Speckle Session", "Target Name", "TIC ID"])
for session_key, session_rows in zip(obs_by_session.groups.keys, obs_by_session.groups):
    session_jd = session_rows["Time JD"]
    start_time = session_jd.min()
    end_time = session_jd.max()
    mid_time = (end_time + start_time) / 2
    # Only a positive JD is turned into a UTC string, cut to 19 characters.
    if mid_time > 0:
        mid_utc = str(Time(mid_time, format="jd").iso)[:19]
    else:
        mid_utc = ""
    summary_row = (
        session_key["Target Name"],
        session_key["TIC ID"],
        str(session_key["Speckle Session"]),
        start_time,
        mid_time,
        end_time,
        mid_utc,
        len(session_rows),
    )
    dssi_sessions.add_row(summary_row)

dssi_sessions.write("DSSI sessions.csv", overwrite=True)
dssi_sessions

For observations of TIC objects, it looks like the average number of sequences is 6.24, a bit higher than I expected.

# Map Hxxxxxx IDs to TIC IDs via references to TIC IDs in olist files

In [None]:
from glob import glob
import re

# Look at the correlation between Elliott's H30xxxx target ids and the TIC
# numbers referenced on the same log line.
#
# Scans the 2023 and 2024 logs, records every occurrence of an H30xxxx target
# together with the TIC id found on that line (or None), then prints one
# "<most common TIC>, <H30xxxx id>" line per target.
logs_dir = r"..\..\Files\DSSI Files\DSSI Logs"
# "[34]" selects 2023 and 2024; the earlier "[3,4]" class also matched a comma,
# and recursive=True had no effect without a "**" component.
logs_to_process = glob(f"{logs_dir}/202[34]-Q?/*.olist")
# logs_to_process = [log for log in logs_to_process if log.find("2022-Q3") < 0]
# logs_to_process = [log for log in logs_to_process if log.find("2022-Q4") < 0]

# Despite the name, these are the spellings of a TIC reference, tried in order.
coord_patterns = [
    r".*TIC(?P<tic>\d+)",
    r".*TIC (?P<tic>\d+)",
    r".*TIC=(?P<tic>\d+)",
    r".*TIC ID = (?P<tic>\d+)",
]

elliott_id_map = {}  # key is elliott id, value is list of (file name, TIC id or None) for each occurrence of that H3xxxxx target
elliott_tic_counts = {}  # key is elliott id, value is a counter of TICs seen
# Not used in this cell; kept in case other cells rely on them.
problem_locations = []
h3_no_tic = []
h3_no_tic_ids = set()
observation_count, match_count, tic_count = 0, 0, 0
for olist_path in logs_to_process:
    with open(olist_path) as f:
        for line in f:
            # 5+ colons = a time (hh:mm) plus sexagesimal RA and Dec.
            if line.count(":") < 5:
                continue
            observation_count += 1
            match_h = re.match(r"(?P<elliott_id>H30\d{4}) ", line)
            if not match_h:
                continue
            match_count += 1
            elliott_id = match_h["elliott_id"]
            # The 13 characters before the 9-character file-name tail,
            # e.g. "2023-Q1\mar07" for "...\2023-Q1\mar07raw.olist".
            olist_file = olist_path[-22:-9]
            tic_id = None
            if "tic" in line.lower():
                for tic_pattern in coord_patterns:
                    if match_tic := re.match(tic_pattern, line):
                        tic_count += 1
                        tic_id = "TIC " + match_tic["tic"]
                        break
            elliott_id_map.setdefault(elliott_id, []).append((olist_file, tic_id))
            elliott_tic_counts.setdefault(elliott_id, Counter())[tic_id] += 1

print(tic_count, match_count, observation_count)

# Targets seen more than once.  This used to be a bare expression in the middle
# of the cell, which displays nothing; it is now kept under a name.
repeated_targets = {elliott_id: occurrences for elliott_id, occurrences in elliott_id_map.items() if len(occurrences) > 1}

for key, value in elliott_tic_counts.items():
    # Prefer the most common real TIC.  A target whose lines never name a TIC
    # used to raise TypeError here (None + str); it now prints a placeholder.
    best_tic = next((tic for tic, _ in value.most_common() if tic is not None), None)
    print((best_tic or "(no TIC)") + ", " + key)

607 607 3751
TIC 309025182, H300063
TIC 336882813, H300029
TIC 317863971, H300038
TIC 348651800, H300045
TIC 95928255, H300053
TIC 27543409, H300046
TIC 414969157, H300058
TIC 367448265, H300025
TIC 443862276, H300044
TIC 161043618, H300075
TIC 123098844, H300083
TIC 63459761, H300086
TIC 219469945, H300079
TIC 265274458, H300097
TIC 327885074, H300092
TIC 328181241, H300010
TIC 441794509, H300081
TIC 285681367, H300008
TIC 89278612, H300084
TIC 283940788, H300002
TIC 9493888, H300022
TIC 470710327, H300096
TIC 139650665, H300020
TIC 178953404, H300021
TIC 79140936, H300033
TIC 80914862, H300035
TIC 292318612, H300009
TIC 251757935, H300013
TIC 309262405, H300028
TIC 282005870, H300132
TIC 304713857, H300137
TIC 278465736, H300131
TIC 346000664, H300144
TIC 284806955, H300133
TIC 407060024, H300158
TIC 191283915, H300197
TIC 266395331, H300127
TIC 470397849, H300171
TIC 392229331, H300015
TIC 260056937, H300018
TIC 167800999, H300116
TIC 349124978, H300012
TIC 20212631, H300118
TIC 270

Conclusions about naming consistency between Hxxxxxx and TIC identifiers:
* Lines that refer to an H3xxxxx target but don't specify a TIC all have IDs for other targets with a J21273580-0038375 pattern, and all occur in 2022
* H3xxxxx targets that 

# Map Hxxxxxx IDs to TIC IDs via coordinates in olist files

In [None]:
from glob import glob
import re

# Look at the correlation between Elliott's H30xxxx target ids and the TIC
# numbers referenced on the same log line.
#
# NOTE(review): this cell matches TIC references in the line text only; no
# coordinate matching is done here yet -- presumably work in progress.
logs_dir = r"..\..\Files\DSSI Files\DSSI Logs"
# "[34]" selects 2023 and 2024; the earlier "[3,4]" class also matched a comma,
# and recursive=True had no effect without a "**" component.
logs_to_process = glob(f"{logs_dir}/202[34]-Q?/*.olist")

# Despite the name, these are the spellings of a TIC reference, tried in order.
coord_patterns = [
    r".*TIC(?P<tic>\d+)",
    r".*TIC (?P<tic>\d+)",
    r".*TIC=(?P<tic>\d+)",
    r".*TIC ID = (?P<tic>\d+)",
]

elliott_id_map = {}  # key is elliott id, value is list of (file name, TIC id or None) for each occurrence of that H3xxxxx target
elliott_tic_counts = {}  # key is elliott id, value is a counter of TICs seen
# Not used in this cell; kept in case other cells rely on them.
problem_locations = []
h3_no_tic = []
h3_no_tic_ids = set()
observation_count, match_count, tic_count = 0, 0, 0
for olist_path in logs_to_process:
    with open(olist_path) as f:
        for line in f:
            # 5+ colons = a time (hh:mm) plus sexagesimal RA and Dec.
            if line.count(":") < 5:
                continue
            observation_count += 1
            match_h = re.match(r"(?P<elliott_id>H30\d{4}) ", line)
            if not match_h:
                continue
            match_count += 1
            elliott_id = match_h["elliott_id"]
            # The 13 characters before the 9-character file-name tail,
            # e.g. "2023-Q1\mar07" for "...\2023-Q1\mar07raw.olist".
            olist_file = olist_path[-22:-9]
            tic_id = None
            if "tic" in line.lower():
                for tic_pattern in coord_patterns:
                    if match_tic := re.match(tic_pattern, line):
                        tic_count += 1
                        tic_id = "TIC " + match_tic["tic"]
                        break
            elliott_id_map.setdefault(elliott_id, []).append((olist_file, tic_id))
            elliott_tic_counts.setdefault(elliott_id, Counter())[tic_id] += 1

print(tic_count, match_count, observation_count)

# Targets seen more than once.  This used to be a bare expression in the middle
# of the cell, which displays nothing; it is now kept under a name.
repeated_targets = {elliott_id: occurrences for elliott_id, occurrences in elliott_id_map.items() if len(occurrences) > 1}

for key, value in elliott_tic_counts.items():
    # Prefer the most common real TIC.  A target whose lines never name a TIC
    # used to raise TypeError here (None + str); it now prints a placeholder.
    best_tic = next((tic for tic, _ in value.most_common() if tic is not None), None)
    print((best_tic or "(no TIC)") + ", " + key)