In [1]:
import ast
import time
import math
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from numba import njit, prange
from concurrent.futures import ProcessPoolExecutor, as_completed
from scipy.signal import chirp, find_peaks, peak_widths
import ast
import numpy as np
import pandas as pd
from typing import Dict, Tuple, List
from collections import Counter

In [2]:
# Load the bandpass table, keep the original index values in a 'uid' column,
# then switch to a plain 0..n-1 positional index.
df = (
    pd.read_parquet('Data/bandpass_qa0_no_partitions.parquet')
    .assign(uid=lambda frame: frame.index)
    .reset_index(drop=True)
)

In [3]:
# Materialise every row's spectrum and frequency axis as a float array.
specs = [np.array(row, dtype=float) for row in df['amplitude_corr_tsys'].tolist()]
freqs = [np.array(row, dtype=float) for row in df['frequency_array'].tolist()]

# A row is kept unless its spectrum is identically zero.
keep = [bool(np.any(spectrum != 0.0)) for spectrum in specs]
specs = [spectrum for spectrum, kept in zip(specs, keep) if kept]
freqs = [axis for axis, kept in zip(freqs, keep) if kept]

# Per-row metadata, filtered with the same mask so it stays aligned with specs/freqs.
uid, ref, ant, pol = (
    df[column].values[keep]
    for column in ('uid', 'ref_antenna_name', 'antenna', 'polarization')
)

In [4]:
# Map each spectrum length to the positions (into `specs`) of the rows that have it.
length_groups: Dict[int, List[int]] = {}
for position, spectrum in enumerate(specs):
    n_channels = spectrum.shape[0]
    if n_channels not in length_groups:
        length_groups[n_channels] = []
    length_groups[n_channels].append(position)

In [5]:
# For every spectrum length, stack the member rows into 2-D arrays and slice the
# metadata to match. Tuple layout: (specs, uid, ref, ant, pol, freqs).
result: Dict[int, Tuple[np.ndarray, ...]] = {}
for n_channels, members in length_groups.items():
    stacked_specs = np.vstack([specs[m] for m in members])
    stacked_freqs = np.vstack([freqs[m] for m in members])
    metadata = tuple(column[members] for column in (uid, ref, ant, pol))
    result[n_channels] = (stacked_specs, *metadata, stacked_freqs)

In [6]:
# Work on the group of rows whose spectra have exactly L samples.
L = 1920
specs_L, uid_L, ref_L, ant_L, pol_L, freqs_L = result[L]

# (lowest, highest) frequency of every row, taken along the sample axis.
endpoints = list(zip(freqs_L.min(axis=1), freqs_L.max(axis=1)))

# How many rows share each distinct frequency range.
cnt = Counter(endpoints)

for endpoint_pair in cnt:
    fmin, fmax = endpoint_pair
    freq = cnt[endpoint_pair]
    print(f"Range = ({fmin:.3f}, {fmax:.3f})  →  {freq} rows")


Range = (308095367364.451, 309032379083.201)  →  92 rows
Range = (309002875183.549, 309939886902.299)  →  92 rows
Range = (309910429626.909, 310847441345.659)  →  92 rows
Range = (310817956824.174, 311754968542.924)  →  92 rows
Range = (330364493283.570, 332238516721.070)  →  86 rows
Range = (344197005002.320, 346071028439.820)  →  86 rows
Range = (330374586932.094, 332248610369.594)  →  84 rows
Range = (344207098650.844, 346081122088.344)  →  84 rows
Range = (215754178095.913, 217628201533.413)  →  88 rows
Range = (229961689814.663, 231835713252.163)  →  88 rows
Range = (131704101922.256, 133578125359.756)  →  84 rows
Range = (129808601922.256, 131682625359.756)  →  84 rows
Range = (143703613641.006, 145577637078.506)  →  84 rows
Range = (305372788111.420, 306309799830.170)  →  92 rows
Range = (304465280107.149, 305402291825.899)  →  92 rows
Range = (306280342554.779, 307217354273.529)  →  92 rows
Range = (307187869752.045, 308124881470.795)  →  92 rows
Range = (330353827905.328, 3322

In [7]:
# The frequency range shared by the largest number of rows becomes the target.
top_range, top_count = cnt.most_common(1)[0]
fmin_target, fmax_target = top_range

# Rows whose frequency axis spans exactly the target range.
row_min = freqs_L.min(axis=1)
row_max = freqs_L.max(axis=1)
mask = (row_min == fmin_target) & (row_max == fmax_target)

# Pull the matching rows out of the full table by uid.
uids_to_keep = uid_L[mask]
df_filtered = df.loc[df["uid"].isin(uids_to_keep)].reset_index(drop=True)

In [8]:
# Sanity check: row count, column names and dtypes of the filtered table.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   eb_uid               100 non-null    object             
 1   cal_data_id          100 non-null    object             
 2   cal_reduction_id     100 non-null    object             
 3   start_valid_time     100 non-null    datetime64[ns, UTC]
 4   receiver_band        100 non-null    object             
 5   ref_antenna_name     100 non-null    object             
 6   antenna              100 non-null    object             
 7   polarization         100 non-null    object             
 8   sideband             100 non-null    object             
 9   baseband_name        100 non-null    object             
 10  spw_name             100 non-null    object             
 11  frequency_array      100 non-null    object             
 12  amplitude_corr_tsys  10

In [9]:
def match_and_correct(
    freq_array: np.ndarray,
    trans_freqs: np.ndarray,
    trans_vals: np.ndarray
) -> np.ndarray:
    """Look up, for every frequency, the value at the nearest tabulated frequency.

    Parameters
    ----------
    freq_array : np.ndarray
        Frequencies to look up.
    trans_freqs : np.ndarray
        Tabulated frequencies. The lookup uses ``np.searchsorted``, so this
        must be sorted in ascending order.
    trans_vals : np.ndarray
        Values aligned element-for-element with ``trans_freqs``.

    Returns
    -------
    np.ndarray
        ``trans_vals`` sampled at the nearest ``trans_freqs`` entry for each
        element of ``freq_array``. On an exact tie the lower-frequency
        neighbour is used; frequencies outside the table take the value of
        the closest end of the table.
    """
    last = len(trans_freqs) - 1

    # Insertion point = index of the first tabulated frequency >= the query.
    upper = np.searchsorted(trans_freqs, freq_array)
    # Queries beyond the table get pinned to the final entry.
    upper[upper > last] = last
    # Neighbour just below, pinned to the first entry.
    lower = np.maximum(upper - 1, 0)

    gap_below = np.abs(freq_array - trans_freqs[lower])
    gap_above = np.abs(trans_freqs[upper] - freq_array)

    # Start from the upper neighbour and switch to the lower one wherever it
    # is at least as close.
    nearest = upper.copy()
    prefer_lower = gap_below <= gap_above
    nearest[prefer_lower] = lower[prefer_lower]

    return trans_vals[nearest]

In [10]:
# Rescale every row's frequency axis by 1e-9 (presumably Hz -> GHz, to match the
# 'Frequency (GHz)' transmission table - confirm) and store it as a list of
# plain Python floats.
# `.tolist()` matters: a list of np.float64 scalars is written by to_csv as
# "[np.float64(...), ...]" under NumPy >= 2, which ast.literal_eval cannot parse.
# NOTE: this overwrites the column in place, so running the cell a second time
# divides by 1e9 again; re-create df_filtered before re-running.
df_filtered['frequency_array'] = df_filtered['frequency_array'].apply(
    lambda s: (np.asarray(s, dtype=float) / 1e9).tolist()
)

In [11]:
# Reference transmission table: one frequency column and one transmission column.
trans_df = pd.read_parquet('Data/full_spectrum.gzip')
trans_freqs = trans_df['Frequency (GHz)'].values
trans_vals  = trans_df['Transmission (%)'].values

# The nearest-frequency lookup relies on np.searchsorted, which silently returns
# wrong positions unless the searched array is ascending. Sort both arrays
# together (this leaves them unchanged when the table is already ordered).
_sort_order = np.argsort(trans_freqs, kind='stable')
trans_freqs = trans_freqs[_sort_order]
trans_vals = trans_vals[_sort_order]

In [12]:
def _row_transmission(row):
    """Return the transmission values matched to one row's frequency axis."""
    row_freqs = np.array(row['frequency_array'], dtype=float)
    return match_and_correct(row_freqs, trans_freqs, trans_vals)


# One array of matched transmission values per row of df_filtered.
results = df_filtered.apply(_row_transmission, axis=1)

In [13]:
# Attach the per-row matched transmission values as a new column.
df_filtered['transmission_array'] = results

In [14]:
def find_trough_index_ranges(freq_axis, transmission, prominence=1, rel_height=0.75):
    """Locate troughs in a transmission curve and return their index ranges.

    Parameters
    ----------
    freq_axis : array-like
        Frequency of every sample.
    transmission : array-like
        Transmission value of every sample, aligned with ``freq_axis``.
    prominence : float, optional
        Minimum prominence passed to ``scipy.signal.find_peaks``.
    rel_height : float, optional
        Relative height passed to ``scipy.signal.peak_widths`` at which each
        trough's width is measured.

    Returns
    -------
    list of (int, int)
        One ``(start_index, end_index)`` pair per trough: the samples of
        ``freq_axis`` closest to the trough frequency minus / plus half of the
        measured width. Empty when no trough is found.
    """
    freq_axis = np.asarray(freq_axis, dtype=float)
    transmission = np.asarray(transmission, dtype=float)

    # Troughs of the curve are peaks of its negation.
    trough_idxs, _props = find_peaks(-transmission, prominence=prominence)
    _, _, left_ips, right_ips = peak_widths(-transmission, trough_idxs, rel_height=rel_height)

    # The width edges are fractional sample positions; convert them to frequencies.
    sample_positions = np.arange(len(freq_axis))
    left_freqs = np.interp(left_ips, sample_positions, freq_axis)
    right_freqs = np.interp(right_ips, sample_positions, freq_axis)
    widths_freq = right_freqs - left_freqs

    index_ranges = []
    for centre, width in zip(freq_axis[trough_idxs], widths_freq):
        # Window of the measured width, centred on the trough frequency,
        # snapped to the nearest samples.
        start_idx = int(np.abs(freq_axis - (centre - width / 2)).argmin())
        end_idx = int(np.abs(freq_axis - (centre + width / 2)).argmin())
        index_ranges.append((start_idx, end_idx))
    return index_ranges


# One list of trough index ranges per row. The per-row arrays stay local to the
# helper so the notebook-level `freqs` list built earlier is not overwritten.
interference = [
    find_trough_index_ranges(row_freqs, row_trans)
    for row_freqs, row_trans in zip(
        df_filtered['frequency_array'], df_filtered['transmission_array']
    )
]

In [15]:
# Store each row's list of (start_index, end_index) trough ranges as a new column.
df_filtered['atmospheric_interference'] = interference

In [16]:
# Re-inspect the table now that the derived columns have been added.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   eb_uid                    100 non-null    object             
 1   cal_data_id               100 non-null    object             
 2   cal_reduction_id          100 non-null    object             
 3   start_valid_time          100 non-null    datetime64[ns, UTC]
 4   receiver_band             100 non-null    object             
 5   ref_antenna_name          100 non-null    object             
 6   antenna                   100 non-null    object             
 7   polarization              100 non-null    object             
 8   sideband                  100 non-null    object             
 9   baseband_name             100 non-null    object             
 10  spw_name                  100 non-null    object             
 11  frequency_array     

In [17]:
def _array_to_list(value):
    """Return ``value.tolist()`` for numpy arrays; pass anything else through."""
    return value.tolist() if isinstance(value, np.ndarray) else value


# Turn every array-valued cell into a plain Python list before the CSV export.
# A numpy array is written to CSV via its str(), which elides the middle of
# arrays longer than 1000 elements with "...", so any column still holding
# arrays would lose data. Checking the cell type also makes this safe to re-run
# (a list has no .tolist()).
for col in df_filtered.columns:
    column = df_filtered[col]
    if column.dtype != object:
        continue
    if column.apply(lambda v: isinstance(v, np.ndarray)).any():
        df_filtered[col] = column.apply(_array_to_list)

In [18]:
# Write the filtered table to CSV without the positional index column.
df_filtered.to_csv('Data/bandpass_filtered_same_freq.csv', index=False)