In [1]:
from collections import Counter
from datetime import datetime, timedelta
from glob import glob
import re
from astropy.coordinates import SkyCoord
from astropy.table import Table
from astropy.time import Time
import astropy.units as u
import numpy as np

In [2]:
# figure out how to find log lines representing observations vs log lines describing procedures
logs_dir = r"..\..\Files\DSSI Logs"
logs_to_process = glob(f"{logs_dir}/**/*.olist", recursive=True)

observation_lines = "DSSI observations.txt"
other_lines = "Non observation lines.txt"

with open(observation_lines, mode="w") as obs, open(other_lines, mode="w") as non_obs:
    for file in logs_to_process:
        datestr = ''.join(re.findall("([0-9]{4})....(...[0-9]{2})", file)[0])
        utc_date = datetime.strptime(datestr, "%Y%b%d")
        with open(file) as f:
            title = f"===========\nFile {file} from {utc_date}\n==========="
            obs.write(title + "\n")
            non_obs.write(title + "\n")
            for line in f.readlines():
                if line.count(":") >= 4: # and line.count("TIC") > 0:
                    obs.write(f"{datestr} {line}")
                else:
                    non_obs.write(line)

# Conclusion: requiring 4 or more colon characters exactly splits the logs

In [7]:
#pattern = '(?:TIC)([0-9]+)'
pattern = 'TIC ?(?:ID)? ?=? ?([0-9]+)'
lines = [
    "TIC=63459761",
    "TIC12345",
    "# TIC ID=123",
    "TIC =123",
]
[re.findall(pattern, line) for line in lines]

pattern = r"(?P<target_name>.{7,13})\s+(?P<image_num>\d{1,3})\s+(?P<hours>\d\d):(?P<minutes>\d\d)\s+(?P<gain_1>\d{1,3})\s+(?P<gain_2>\d{1,3})\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9.+-]*)\s+(?P<pmdec>[0-9.+-]*)\s+(?P<mag>[0-9.-]+)\s*(?P<notes>.*)"
line = 'HR 7358 016 03:31  20  20  19:22:50.9 +26:15:45.0                       5.18'
re.match(pattern, line).groupdict()

{'target_name': 'HR 7358',
 'image_num': '016',
 'hours': '03',
 'minutes': '31',
 'gain_1': '20',
 'gain_2': '20',
 'ra': '19:22:50.9',
 'dec': '+26:15:45.0',
 'pmra': '',
 'pmdec': '',
 'mag': '5.18',
 'notes': ''}

In [29]:
# now extract useful information from the various lines

logs_dir = r"..\..\Files\DSSI Logs"
logs_to_process = glob(f"{logs_dir}/**/*.olist", recursive=True)
# logs_to_process = ['foo.olist']

filter_blue = 692
filter_red = 880
filter_ir = 1450

dssi_observations = Table(
    names=["Target Name", "TIC ID", "Wavelength", "Image Number", "Time UTC", "Time JD", "Gain 1", "Gain 2", "RA", "Dec", "PMRA", "PMDec", "Mag", "Notes", ],
    dtype=["str", "str", "int", "int", "str", "str", "int", "int", "float", "float", "float", "float", "float", "str", ],
)

line_counts = Counter()
failed_lines = []
for file in logs_to_process:
    with open(file) as f:
        datestr = ''.join(re.findall("([0-9]{4})....(...[0-9]{2})", file)[0])
        utc_date = datetime.strptime(datestr, "%Y%b%d")
        for line in f.readlines():
            fields = {}
            if line.count(":") < 4: # all observation lines follow this pattern
                continue # skip non-observation lines
            if match := re.match(r"(?P<target_name>\".{7}\")\s+(?P<image_num>\d{1,3})\s+(?P<gain>\d{1,3})\s+(?P<hours>\d\d):(?P<minutes>\d\d)\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9\.+-]*)\s+(?P<pmdec>[0-9\.+-]*)\s+(?P<mag>[0-9\.]+)\s*(?P<notes>.*)", line): #
                line_counts["Pattern 1"] += 1
            elif match := re.match(r"(?P<target_name>\"{0,1}.{7,13}\"{0,1})\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9.+-]+)\s+(?P<pmdec>[0-9.+-]+)\s+(?P<mag>[0-9.]+)\s*(?P<notes>.*)", line):
                fields["image_num"] = 0
                fields["gain_1"] = 0
                fields["gain_2"] = 0
                line_counts["Pattern 2"] += 1
            elif match := re.match(r"(?P<target_name>.{7,13})\s+(?P<image_num>\d{1,3})\s+(?P<hours>\d\d):(?P<minutes>\d\d)\s+(?P<gain_1>\d{1,3})\s+(?P<gain_2>\d{1,3})\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9.+-]*)\s+(?P<pmdec>[0-9.+-]*)\s+(?P<mag>[0-9.-]+)\s*(?P<notes>.*)", line):
                line_counts["Pattern 3"] += 1
            elif match := re.match(r"(?P<target_name>.{7})\s+(?P<image_beg>\d{1,3})-(?P<image_end>\d{1,3})\s+(?P<image_ir>\d{1,3})\s+(?P<hours>\d\d):(?P<minutes>\d\d)\s+(?P<gain>\d{1,3})\s+(?P<ra>\d\d:\d\d:\d\d\.\d+)\s+(?P<dec>[+|-]{0,1}\d\d:\d\d:\d\d\.\d+)\s+(?P<pmra>[0-9\.+-]*)\s+(?P<pmdec>[0-9\.+-]*)\s+(?P<mag>[0-9\.]+)\s*(?P<notes>.*)", line):
                line_counts["Pattern 4"] += 1
            if match:
                fields = {**match.groupdict(), **fields}
                try:
                    if not "pmra" in fields or fields["pmra"] == "":
                        fields["pmra"] = 0
                        fields["pmdec"] = 0
                except Exception as e:
                    pass
                if "gain" in fields: # assume gains are the same for both arms if only one gain specified
                    fields["gain_1"] = fields["gain"]
                    fields["gain_2"] = fields["gain"]
                if "hours" in fields:
                    obs_time = Time(utc_date + timedelta(hours=int(fields["hours"]), minutes=int(fields["minutes"])))
                    datetime_utc = str(obs_time.utc)
                    datetime_jd = str(obs_time.jd)
                else:
                    fields["datetime_utc"] = None
                    fields["datetime_jd"] = None
                if "image_ir" in fields:
                    observations = [(fields["image_ir"], filter_ir)]
                    observations += [(image_num, filter_x)
                                     for filter_x in [filter_red, filter_blue]
                                     for image_num in range(int(fields["image_beg"]), int(fields["image_end"]) + 1)]
                else:
                    image_num = fields["image_num"]
                    observations = [(image_num, filter_blue), (image_num, filter_red)]
                coord = SkyCoord(ra=fields["ra"], dec=fields["dec"], unit=(u.hourangle, u.deg))
                if matches := re.findall('TIC ?(?:ID)? ?=? ?([0-9]+)', line):
                    fields["tic_id"] = "TIC " + matches[0]
                else:
                    fields["tic_id"] = ""
                for (image_num, wavelength) in observations:
                    try:
                        dssi_observations.add_row([fields["target_name"], fields["tic_id"], wavelength, image_num, datetime_utc, datetime_jd, fields["gain_1"], fields["gain_2"], coord.ra, coord.dec, float(fields["pmra"]), float(fields["pmdec"]), fields["mag"], fields["notes"]])
                    except Exception as e:
                        print("error:", fields["tic_id"], line[:-1])
            else:
                failed_lines.append(line)
for (regex, count) in line_counts.items():
    print(f"{count:4d} {regex}")
print()
print(len(failed_lines), "failed matches")
for failed_line in failed_lines:
    print(failed_line[:-1])
dssi_observations.write("DSSI Observations.csv", overwrite=True)



Next steps:
  * Edit the produced `DSSI observations.txt` file in Notepad++ to align the columns properly.
  * Remove ~230 lines that are in a different format.
    * Some are different only because the target name is longer.
    * Some represent optical + infrared measurements and therefore have a range of image numbers for optical and a single image number for infrared.
  * Rename resulting file to `DSSI observations.csv` so manual work doesn't get clobbered if code is rerun.

In [4]:
observations = Table.read("DSSI observations.csv")
tic_observations = observations[np.char.find(observations["Notes"], "TIC") >= 0]

tic_observations = tic_observations[~(tic_observations["Name"] == "H300111")] #bad time format

tic_observations["DateTimeUtc"] = None
tic_observations["TIC ID"] = None
tic_observations["BYear"] = None
tic_observations["JD"] = None

for row in tic_observations:
    datetimeutc = datetime.strptime(row["Date"]+row["Time"], "%Y%b%d%H:%M")
    row["DateTimeUtc"] = datetimeutc
    time = Time(datetimeutc, format="datetime")
    row["BYear"] = time.byear
    row["JD"] = time.jd
    row["TIC ID"] = "TIC " + re.findall("[#]*TIC[ ]*[ID ]*[=]*[ ]*([0-9]+)", row["Notes"])[0]
    coord = SkyCoord(ra=row["RA"], dec=row["Dec"], unit=[u.hourangle, u.deg])
    row["RA"] = coord.ra
    row["Dec"] = coord.dec

tic_observations.remove_columns(["Date", "Time", "Notes"])
tic_observations.write("TIC observations.csv", overwrite=True)
tic_observations

  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table.columns[item][self._index] = val
  self._table

Name,ImageNum,Gain1,Gain2,RA,Dec,PMRA,PMDec,Mag,DateTimeUtc,TIC ID,BYear,JD
str7,int32,int32,int32,str10,str13,float64,float64,float64,object,object,object,object
H300032,91,300,300,8.85125,62.9015833333,-2.727,-0.154,11.73,2022-09-28 08:22:00,TIC 283940788,2022.7419373794528,2459850.8486111113
H300032,92,300,300,8.85125,62.9015833333,-2.727,-0.154,11.73,2022-09-28 08:23:00,TIC 283940788,2022.7419392807785,2459850.8493055557
H300032,93,300,300,8.85125,62.9015833333,-2.727,-0.154,11.73,2022-09-28 08:25:00,TIC 283940788,2022.7419430834302,2459850.8506944445
H300032,94,300,300,8.85125,62.9015833333,-2.727,-0.154,11.73,2022-09-28 08:26:00,TIC 283940788,2022.7419449847562,2459850.851388889
H300032,95,300,300,8.85125,62.9015833333,-2.727,-0.154,11.73,2022-09-28 08:28:00,TIC 283940788,2022.7419487874079,2459850.8527777777
H300033,97,300,300,11.86375,64.8180277777,-2.559,-0.239,12.02,2022-09-28 08:36:00,TIC 284814380,2022.7419639980149,2459850.8583333334
H300033,98,300,300,11.86375,64.8180277777,-2.559,-0.239,12.02,2022-09-28 08:37:00,TIC 284814380,2022.7419658993408,2459850.859027778
H300033,99,300,300,11.86375,64.8180277777,-2.559,-0.239,12.02,2022-09-28 08:39:00,TIC 284814380,2022.7419697019927,2459850.8604166666
H300033,100,300,300,11.86375,64.8180277777,-2.559,-0.239,12.02,2022-09-28 08:41:00,TIC 284814380,2022.7419735046444,2459850.8618055554
H300033,101,300,300,11.86375,64.8180277777,-2.559,-0.239,12.02,2022-09-28 08:42:00,TIC 284814380,2022.7419754059702,2459850.8625
