In [None]:
import os
import tensorflow as tf

# Alternate configuration (vetting model). Kept commented out; swap it with the
# active block below to validate that dataset instead.
# tces_file = '/mnt/tess/astronet/tces-vetting-v4-toi-train.csv'
# file_pattern = '/mnt/tess/astronet/tfrecords-vetting-5-toi-train/*'
# model_name = 'AstroCNNModelVetting'
# config_name = 'vrevised'
# labels = ['p', 'e', 'n']

# Active configuration: TCE table, TFRecord shards, model/config names and the
# label suffixes used to build the `disp_<label>` column names.
tces_file = '/mnt/tess/astronet/tces-v14-val.csv'
file_pattern = '/mnt/tess/astronet/tfrecords-38-val/*'
model_name = 'AstroCNNModel'
config_name = 'final_alpha_1'
labels = ['E', 'N', 'J', 'S', 'B']

# Sort so the per-file processing order is the same on every run.
filenames = sorted(tf.io.gfile.glob(file_pattern))
if not filenames:
  # Fail here instead of letting later cells operate on empty tables.
  raise FileNotFoundError(f'No files match pattern: {file_pattern}')

filenames

In [None]:
import pandas as pd

# Read the TCE table configured above; the first CSV row supplies the column names.
tce_table = pd.read_csv(tces_file, low_memory=False, header=0)
# Row count, followed by a preview of the first three rows.
print(tce_table.shape[0])
tce_table.head(n=3)

In [None]:
# Column name -> list holding one scalar per parsed record. Every list is kept
# at the same length so the columns stay row-aligned when turned into a table.
series = {}
total_records = 0

for filename in filenames:
  num_records = 0
  for record in tf.data.TFRecordDataset(filename):
    num_records += 1
    ex = tf.train.Example.FromString(record.numpy())
    for k, f in ex.features.feature.items():
      # Keep only the first element of whichever typed value list is populated.
      if f.int64_list.value:
        v = f.int64_list.value[0]
      elif f.float_list.value:
        v = f.float_list.value[0]
      elif f.bytes_list.value:
        v = f.bytes_list.value[0].decode()
      else:
        # Empty feature: leave a gap that is back-filled with None below.
        continue

      column = series.setdefault(k, [])
      # Back-fill earlier records in which this feature was absent or empty;
      # without this, a skipped value would shift every later row of the column.
      column.extend([None] * (total_records - len(column)))
      column.append(v)
    total_records += 1
  print(filename, num_records)

# Pad columns whose feature was missing from the trailing records.
for column in series.values():
  column.extend([None] * (total_records - len(column)))

In [None]:
import pandas as pd

# One column per feature name collected in `series`.
examples_table = pd.DataFrame(series)

# Show every column when the summary below is rendered.
pd.set_option('display.max_columns', None)
# To summarise a single column instead: examples_table[['secondary_scale']].describe()
examples_table.describe()

In [None]:
from matplotlib import pyplot as plt

# Number of examples carrying each label, i.e. rows whose `disp_<label>` flag is > 0.
counts = [int((examples_table['disp_{}'.format(l)] > 0).sum()) for l in labels]
total = sum(counts)

fig, ax = plt.subplots()
bars = ax.bar(labels, counts)
for bar, count in zip(bars, counts):
    # Guard the percentage so a table without any positive flag still plots.
    share = count / total if total else 0.0
    ax.annotate(
        '{} - {:.0%}'.format(count, share),
        (bar.get_x() + bar.get_width() / 2, bar.get_height()),
        # Offset in points, so the text sits just above the bar at any count scale.
        xytext=(0, 3), textcoords='offset points',
        ha='center', va='bottom')
ax.set_xlabel('Label')
ax.set_ylabel('Number of examples')
ax.set_title('Label distribution in TFRecords')
plt.show()

In [None]:
# Preview the first three rows parsed from the TFRecords.
examples_table.head(3)

In [None]:
# Show the TCE-table row whose index label is 8209. A boolean mask is used, so
# a missing label yields an empty frame rather than an error.
# NOTE(review): 8209 is hard-coded — confirm this row is still the one of interest.
tce_table[tce_table.index == 8209]

In [None]:
import numpy as np

print('Label mismtaches between TCE and tfrecords:')
# Astro IDs flagged with the first configured label in the TCE table ...
tce_ids = set(tce_table[tce_table[f'disp_{labels[0]}'] > 0]['Astro ID'].values)
# ... and those flagged with the same label in the parsed TFRecords.
example_ids = set(examples_table[examples_table[f'disp_{labels[0]}'] > 0]['astro_id'].values)
# np.array(<set>) wraps the whole set in a 0-d object array; converting the
# difference to a sorted list first yields a real 1-D array of IDs.
# Only one direction is reported: IDs labelled in the TCE table but not in the TFRecords.
np.array(sorted(tce_ids - example_ids))

In [None]:
import tensorflow as tf

from astronet import models
from astronet.astro_cnn_model import input_ds

config = models.get_model_config(model_name, config_name)

# Arguments shared by both datasets: one example per batch, files read in a
# fixed order and a single pass, so the two datasets can be walked side by side.
common_kwargs = dict(
    file_pattern=file_pattern,
    input_config=config.inputs,
    batch_size=1,
    shuffle_filenames=False,
    repeat=1,
    include_identifiers=True)

# Same inputs built twice: once without and once with the label tensors.
ds = input_ds.build_dataset(include_labels=False, **common_kwargs)
labels_ds = input_ds.build_dataset(include_labels=True, **common_kwargs)
labels_iter = iter(labels_ds)

# Lower-cased label column name -> position in the label vector.
label_index = {k.lower(): i for i, k in enumerate(config.inputs.label_columns)}
# Derived from `labels` so the check follows whichever label set is configured.
cols = [f'disp_{l}' for l in labels]


def lam(e):
  """Return `e` unchanged; raise ValueError if a non-int64 tensor contains a NaN."""
  if e.dtype == tf.int64:
    return e
  if tf.reduce_any(tf.math.is_nan(e)):
    tf.print(e, summarize=-1)
    raise ValueError('data has NaNs.')
  return e


all_ids = []
bad_labels = []
for d in ds:
  # Advance the labelled dataset in lockstep with the unlabelled one.
  lab = next(labels_iter)

  # d[1] is read as the example identifier (datasets are built with
  # include_identifiers=True and batch_size=1).
  ex_id = d[1].numpy().item()
  all_ids.append(ex_id)

  # Sanity check that both datasets are positioned on the same example.
  assert lab[0]['duration'] == d[0]['duration']
  rec = tce_table[tce_table['Astro ID'] == ex_id]
  if rec.empty:
    # Report the missing ID instead of failing with an IndexError on `.values[0]`.
    bad_labels.append(ex_id)
    print('bad example: ', ex_id)
    print('Astro ID not found in TCE table.')
    break
  for c in cols:
    # A label is bad when exactly one of (dataset flag, table flag) is zero.
    if (lab[1][0][label_index[c.lower()]].numpy() == 0) != (rec[c].values[0] == 0):
      bad_labels.append(ex_id)
      print('bad example: ', ex_id)
      print(rec)
      print(cols)
      print(lab[1][0])
      break
  if bad_labels:
    break

  try:
    # Visit every tensor of the example; `lam` raises on the first NaN.
    tf.nest.map_structure(lam, d)
  except ValueError as e:
    print(e)
    print(d[1])
    break
else:
  # Reached only when the loop finished without hitting a `break`.
  print('No NaNs or mismtached labels found.')

from collections import Counter

# Count every ID in one pass; calling all_ids.count() per distinct ID is quadratic.
id_counts = Counter(all_ids)
duplicate_ids = [t for t, n in id_counts.items() if n > 1]

if not duplicate_ids:
  print('No duplicates found.')
else:
  # Number of surplus entries, i.e. total IDs minus distinct IDs.
  print('Found duplicates!', len(all_ids) - len(id_counts))
  print(duplicate_ids)

In [None]:
def astro_id(tic_id):
  """Return the 'Astro ID' of the first `tce_table` row whose 'TIC ID' equals `tic_id`.

  Args:
    tic_id: value compared against the 'TIC ID' column of the global `tce_table`.

  Returns:
    The 'Astro ID' value of the first matching row. If several rows share the
    TIC ID, only the first one is returned.

  Raises:
    IndexError: if no row has the given TIC ID.
  """
  matches = tce_table.loc[tce_table['TIC ID'] == tic_id, 'Astro ID']
  if matches.empty:
    # Same exception type as the bare `.values[0]` lookup, but naming the ID.
    raise IndexError(f'TIC ID {tic_id} not found in tce_table')
  return matches.values[0]

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

from astronet.preprocess import preprocess


# Path to a TESS data directory (presumably light curves — confirm).
# NOTE(review): not referenced by any cell visible here; verify it is still needed.
tess_data_dir = '/mnt/tess/lc'

def find_tce(astro_id):
  """Return the first TFRecord example whose `astro_id` feature equals `astro_id`.

  Scans every file in the global `filenames`, and for the matching record
  prints its TIC ID, the `disp_<label>` flag of each entry in `labels`, and
  its `Duration` feature.

  Args:
    astro_id: integer ID compared against each record's `astro_id` feature.

  Returns:
    The matching `tf.train.Example`.

  Raises:
    ValueError: if no record in `filenames` has the given ID.
  """
  # Resolve the TIC ID from the TCE table instead of reading a notebook-global
  # `tic_id`, which may be unset or belong to a different target.
  tic_ids = tce_table.loc[tce_table['Astro ID'] == astro_id, 'TIC ID'].values

  with tf.device('cpu'):
    for filename in filenames:
      for record in tf.data.TFRecordDataset(filename):
        ex = tf.train.Example.FromString(record.numpy())
        features = ex.features.feature
        ids = features["astro_id"].int64_list.value
        # Skip records without an `astro_id` value as well as non-matching ones.
        if not ids or ids[0] != astro_id:
          continue
        print('TIC ID:', tic_ids[0] if len(tic_ids) else 'unknown')
        for l in labels:
          print(f'{l}:', features[f"disp_{l}"].int64_list.value[0])
        print('Duration:', features["Duration"].float_list.value[0])
        return ex

  raise ValueError("{} not found in files: {}".format(astro_id, filenames))

In [None]:
# Rebind `ds` to a cached version (tf.data `cache`) before it is scanned by
# plot_ds_tce. Note that re-running this cell wraps the already-cached dataset again.
ds = ds.cache()
def plot_ds_tce(ds, astro_id):
    """Print the scalar features and plot the views of one example in `ds`.

    Iterates `ds` until an element whose identifier (`d[1]`) equals `astro_id`
    is found, prints every feature whose name does not start with `local_`,
    `global_`, `secondary_` or `sample_`, then draws a 2x3 grid with the
    global, local and secondary views, the global mask and the
    `global_view_0.3` / `global_view_5.0` features.

    Args:
        ds: iterable of `(features, identifier, ...)` elements, where
            `features` maps feature names to tensors.
        astro_id: identifier of the example to plot.

    Returns:
        None. If no element matches, a notice is printed and nothing is plotted.
    """
    # Features with these prefixes are plotted (or ignored) rather than printed.
    skipped_prefixes = ('local_', 'global_', 'secondary_', 'sample_')

    for d in ds:
        if d[1] == astro_id:
            for k, v in d[0].items():
                if k.startswith(skipped_prefixes):
                    continue
                print(f'{k:25}: {v.numpy()}')

            # Index [0] selects the first entry along the leading axis.
            global_view = np.array(d[0]['global_view'][0].numpy())
            local_view = np.array(d[0]['local_view'][0].numpy())
            secondary_view = np.array(d[0]['secondary_view'][0].numpy())

            fig, axes = plt.subplots(2, 3, figsize=(20, 12))
            axes[0, 0].plot(global_view, '.-')
            axes[0, 1].plot(local_view, '.-')
            axes[0, 2].plot(secondary_view, '.-')
            axes[1, 0].plot(d[0]['global_mask'][0].numpy(), '.-')
            axes[1, 1].plot(d[0]['global_view_0.3'][0].numpy(), '.-')
            axes[1, 2].plot(d[0]['global_view_5.0'][0].numpy(), '.-')
            plt.show()
            plt.close('all')
            return

    # Make a miss visible instead of silently producing no output.
    print(f'Astro ID {astro_id} not found in dataset.')

In [None]:
# Target under inspection; `tic_id` is also read by the cells that follow.
tic_id = 349412074
# Translate the TIC ID to its Astro ID and plot that example from the dataset.
plot_ds_tce(ds, astro_id(tic_id))

In [None]:
# All TCE-table rows for the current `tic_id`.
tce_table[tce_table['TIC ID'] == tic_id]

In [None]:
# Astro ID of the first TCE-table row for the current `tic_id`.
astro_id(tic_id)

In [None]:
# Rows parsed from the TFRecords whose `astro_id` matches the current `tic_id`.
examples_table[examples_table['astro_id'] == astro_id(tic_id)]

In [None]:
# Shell: list the entries of /mnt/tess/lc-v whose name contains 237320326.
# NOTE(review): this ID is hard-coded and differs from the `tic_id` values set
# elsewhere in the notebook — confirm it is intentional.
!ls /mnt/tess/lc-v | grep 237320326

In [None]:
# Switch to another target and fetch its raw tf.train.Example from the TFRecords.
tic_id = 334227600
tce = find_tce(astro_id(tic_id))

# Names of all features stored in that example.
list(tce.features.feature.keys())

In [4]:
import pandas as pd

# Read the labelled TCE export. Lines beginning with '#' are treated as
# comments, and the first remaining row provides the column names.
df = pd.read_csv(
    r"C:\Users\danie\data\NASA\tces_with_labels_v3.csv",
    comment="#",
    header=0,
    low_memory=False,
)

# Discard rows that have no TIC ID.
df = df.dropna(subset=['TIC ID'])

# Sanity check after the drop: count the missing TIC IDs that remain.
nan_count = df['TIC ID'].isnull().sum()

print("Number of NaNs in TIC ID:", nan_count)


Number of NaNs in TIC ID: 0


In [7]:
# Preview the first three rows of the labelled TCE table.
df.head(3)

Unnamed: 0,TIC ID,Epoch,Period,Duration,Depth,SMass,SRad,SRadEst,Tmag,Year,...,Consensus Label,L1,L2,L3,L4,L5,L6,L7,L8,Notes
1,387242167.0,1655.407925,4.081254,0.1808811773,330,,11.2034,11.46885329,6.7236,1.0,...,,J,N,J,,,,,,
2,141709436.0,1338.471105,24.959872,0.2188980774,640,0.98,0.886142,0.8787521896,6.9129,1.0,...,J,J,J,J,,,,,,
3,407089973.0,1626.073791,0.365865,0.0647507877,24320,1.31,1.64818,1.653170476,11.3884,1.0,...,B,B,B,B,,,,,,
