In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import streak_cut as sc
import gradient_image as gi
import numpy as np
import matplotlib.pyplot as plt
import utils
import pandas as pd
import os
from tqdm import tqdm as tqdm
import astropy.io.fits as fits



In [None]:
# Sanity check of the streak cutter on a synthetic image: build a 100x100 test
# image via gi.create_gradient_image_with_line and cut around the same two points.
width = 100
height = 100
point1 = (99, 80)
point2 = (0,20)
img = gi.create_gradient_image_with_line(height, width, point1, point2)
# NOTE(review): 32 is presumably the width of the cut strip in pixels — confirm in streak_cut.
cut = sc.cut_around_line(img, point1, point2, 32)
# plt.subplots returns (figure, axes) in that order; name them accordingly.
fig, ax = plt.subplots(1, 2)
ax[0].imshow(img, cmap='gray')
ax[1].imshow(cut, cmap='gray')
plt.show()



In [None]:
# Load the streak table through the project helper and keep it in `streaks_csv`.
# NOTE(review): no other cell visible in this notebook reads `streaks_csv`.
streaks_csv = utils.read_streaks_csv()


In [None]:
# Rewrite the file names in r_SDSS.correlated_streaks.csv: every name is replaced
# by the last path component of whatever utils.get_fits_path resolves it to
# (the intent, per the original note, is to add "astred.cal.fits" where needed).
# The result is written to a separate "_fixed" CSV.

correlated_df = pd.read_csv('data/r_SDSS.correlated_streaks.csv')
resolved_names = [
    utils.get_fits_path(original_name, True).split('/')[-1]
    for original_name in correlated_df['file_name']
]
correlated_df['file_name'] = resolved_names
correlated_df.to_csv('data/r_SDSS.correlated_streaks_fixed.csv', index=False)



In [None]:
# Aggregate all .streaks files into a single CSV file

# Root directory that is walked recursively for '.fits.streaks' files
root_dir = os.path.join('/media/dofri/OBSERVATIONS/VST_BUFFER/')

# One dataframe per .streaks file; concatenated once after the walk
dataframes = []

# Two header spellings are harmonised: type_1 column names are renamed to their
# type_2 counterparts (position by position).
type_1 = ['#extension', 'start_x', 'start_y', 'end_x', 'end_y', 'start_ra', 'start_dec', 'end_ra', 'end_dec', 'mjd_start', 'mjd_end']
type_2 = ['extension', 'x_start[px]', 'y_start[px]', 'x_end[px]', 'y_end[px]', 'ra_start[deg]', 'dec_start[deg]', 'ra_end[deg]', 'dec_end[deg]', 'JD_start[UTC]', 'JD_end[UTC]']
column_mapping = dict(zip(type_1, type_2))

# Suffix removed from '<name>.fits.streaks' so that 'file_name' holds '<name>.fits'
streaks_suffix = '.streaks'

# Walk through the directory structure
for subdir, dirs, files in os.walk(root_dir):
    # Skip every directory whose path contains 'L1_DETECTION'
    if 'L1_DETECTION' in subdir:
        continue

    for file in files:
        if not file.endswith('.fits.streaks'):
            continue
        file_path = os.path.join(subdir, file)
        file_name = file[:-len(streaks_suffix)]

        # Read the .streaks file and tag every row with the FITS file it belongs to
        df = pd.read_csv(file_path)
        df.insert(0, 'file_name', file_name)
        # The ' time_method' column (note the leading space) is dropped when present
        df = df.drop(columns=[' time_method'], errors='ignore')
        df = df.rename(columns=column_mapping)
        dataframes.append(df)

# pd.concat raises an unhelpful error on an empty list; fail with the actual cause.
if not dataframes:
    raise FileNotFoundError(f"No '.fits.streaks' files found under {root_dir!r}")

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a single CSV file
combined_df.to_csv('combined_streaks.csv', index=False)

print(combined_df.columns)

print("All CSV files have been successfully concatenated into 'combined_streaks.csv'")

# Number of rows that contain no missing value in any remaining column
print(combined_df.dropna().shape[0])
# Create histogram for each column
# for column in combined_df.columns:
#     if column in ['file_name', 'file_path']:
#         continue
#     combined_df[column].hist()
#     plt.title(column)
#     plt.show()

In [None]:
# Outer-merge the aggregated streaks (combined_streaks.csv, suffix _df1) with the
# correlated streaks (suffix _df2) on file name, extension and ID, then write the
# side-by-side table to comparison.csv.
# Original note: only one source ends up being used, so much of this is redundant.

combined_df = pd.read_csv('combined_streaks.csv')
correlated_df = pd.read_csv('data/r_SDSS.correlated_streaks_fixed.csv')
match_columns = ['file_name', 'extension', 'ID']
merged_df = combined_df.merge(
    correlated_df,
    how='outer',
    on=match_columns,
    suffixes=('_df1', '_df2'),
    indicator=True,
)

# Every non-key column of the aggregated table, paired as <col>_df1, <col>_df2
difference_columns = [col for col in combined_df.columns if col not in match_columns]
zipped_columns = [
    f'{col}{suffix}'
    for col in difference_columns
    for suffix in ('_df1', '_df2')
]

# Key columns first, then the interleaved pairs
final_columns_order = match_columns + zipped_columns
merged_df = merged_df[final_columns_order]
print(merged_df.columns)

merged_df.to_csv('comparison.csv', index=False)
print("Saved comparison to 'comparison.csv'")

def find_diff(col1, col2):
    """Return the rows of the global `merged_df` where `col1` and `col2` disagree.

    Only rows in which both columns are non-null are considered; the result
    contains just the two compared columns.
    """
    left = merged_df[col1]
    right = merged_df[col2]
    differs = left != right
    both_present = left.notna() & right.notna()
    return merged_df.loc[differs & both_present, [col1, col2]]

columns_of_interest = ['x_start[px]', 'y_start[px]', 'x_end[px]', 'y_end[px]', 'ang_vel[deg/s]']

# for col in columns_of_interest:
#     diff = find_diff(f'{col}_df1', f'{col}_df2')
#     if not diff.empty:
#         print(f"Differences in column '{col}':")
#         print(diff)
#
# Write file_name, extension, ID plus the columns of interest to good_data.csv.
# NOTE(review): an earlier comment said the first dataframe is prioritised, but the
# values are read from the `_df2` columns (the correlated streaks in the merge).
new_df = merged_df[['file_name', 'extension', 'ID']].copy()
print(merged_df.columns)
for position, col in enumerate(columns_of_interest, start=3):
    new_df.insert(position, col, merged_df[f'{col}_df2'])
# Rows with any missing value are discarded
new_df = new_df.dropna()
new_df.to_csv('good_data.csv', index=False)


# # Print differences between matching row
# 
# # Print differences between matching rows
# for col in difference_columns:
#     diff = merged_df[f'{col}_df1'] != merged_df[f'{col}_df2']
#     differing_rows = merged_df[diff]
#     if not differing_rows.empty:
#         print(f"Differences in column '{col}':")
#         print(differing_rows[[f'{col}_df1', f'{col}_df2', *match_columns]])
# 
# print("Comparison complete.")
# Math by file_name, extention and ID. Print differences in other columns


In [None]:
good_data = pd.read_csv('good_data.csv')

# Display the strip cut around 10 randomly chosen streaks, one figure per streak
sample = good_data.sample(10)
for _, streak in sample.iterrows():
    fits_path = utils.get_fits_path(streak['file_name'], True)
    with fits.open(fits_path) as hdul:
        data = hdul[streak['extension']].data
        start = (streak['x_start[px]'], streak['y_start[px]'])
        end = (streak['x_end[px]'], streak['y_end[px]'])
        cut = sc.cut_around_line(data, start, end, 32)[:, :]
        title = f"{streak['file_name']} - {streak['extension']} - {streak['ID']}"
        plt.imshow(cut, cmap='gray', norm=utils.LogNorm())
        plt.title(title)
        print(title)
        plt.show()
        

In [None]:
# Display one specific streak through the project helper.
# NOTE(review): the numeric arguments (7, 6) are presumably the FITS extension and
# the streak ID — confirm against utils.show_streak_with_endpoints.
# utils.show_streak_with_endpoints('OMEGA.2022-01-05T06:08:00.955.fits', 19, 1, True)
utils.show_streak_with_endpoints('OMEGA.2022-01-08T04:30:31.115.astred.cal.fits', 7, 6, True)

In [None]:
good_data = pd.read_csv('good_data.csv')
# Tally how often each BZERO value occurs in the header of extension 1,
# counting once per row of good_data (a file listed N times is counted N times).
bzero_counts = {}

for idx, row in good_data.iterrows():
    with fits.open(utils.get_fits_path(row['file_name'], True)) as hdul:
        # Only extension 1 is inspected
        i = 1
        bzero = hdul[i].header['BZERO']
        bzero_counts[bzero] = bzero_counts.get(bzero, 0) + 1

print(bzero_counts)

In [None]:
good_data = pd.read_csv('good_data.csv')
# Row at position 1 (i.e. the second row) of the table
row = good_data.iloc[1]

# Print the data shape of extension 1 and the keyword names of the primary header
with fits.open(utils.get_fits_path(row['file_name'], True)) as hdul:
    print(hdul[1].data.shape)
    # `keys` is a method: call it so the keyword names are printed rather than
    # the repr of the bound method.
    print(list(hdul[0].header.keys()))

In [None]:
good_data = pd.read_csv('good_data.csv')

# Histogram (left) and boxplot (right) of the raw ang_vel[deg/s] column
ang_vel = good_data['ang_vel[deg/s]']
fig, (hist_ax, box_ax) = plt.subplots(1, 2)
ang_vel.hist(bins=50, ax=hist_ax)
ang_vel.plot(kind='box', ax=box_ax)
fig.suptitle('ang_vel[deg/s]')
plt.show()

# Same pair of plots for log10 of the column; log_ang_vel is reused by the next cell
log_ang_vel = np.log10(ang_vel)
fig, (hist_ax, box_ax) = plt.subplots(1, 2)
log_ang_vel.hist(bins=50, ax=hist_ax)
log_ang_vel.plot(kind='box', ax=box_ax)
fig.suptitle('log(ang_vel[deg/s])')
plt.show()

In [None]:
# Standardise log_ang_vel (subtract its mean, divide by its standard deviation)
# and plot the result. Requires log_ang_vel from the previous cell.
log_mean = log_ang_vel.mean()
log_std = log_ang_vel.std()
print(log_mean)
print(log_std)
log_ang_vel_standardized = (log_ang_vel - log_mean) / log_std
print(log_ang_vel_standardized.mean())
# Histogram (left) and boxplot (right) of the standardised values
fig, (hist_ax, box_ax) = plt.subplots(1, 2)
log_ang_vel_standardized.hist(bins=50, ax=hist_ax)
log_ang_vel_standardized.plot(kind='box', ax=box_ax)
fig.suptitle('log(ang_vel[deg/s]) standardized')
plt.show()

In [None]:
# Look at the distribution of fits metadata
good_data = pd.read_csv('good_data.csv')
# Primary-header keywords to collect. Every entry needs its own trailing comma:
# adjacent string literals are concatenated into a single (non-existent) key.
keys_of_interest = ['HIERARCH ESO TEL AMBI IRSKY TEMP',
                    'HIERARCH ESO TEL AMBI TEMP',
                    'HIERARCH ESO TEL AMBI WINDDIR',
                    'HIERARCH ESO TEL AMBI WINDSP',
                    'HIERARCH ESO TEL AMBI PRES START',
                    'HIERARCH ESO TEL AMBI FWHM START',
                    'HIERARCH ESO TEL AMBI FWHM END',
                    'HIERARCH ESO TEL AMBI RHUM',
                    'HIERARCH ESO TEL AMBI TAU0',
]
# keyword -> list of values, one entry per good_data row whose file has the keyword
key_data = {}

for idx, row in good_data.iterrows():
    with fits.open(utils.get_fits_path(row['file_name'], True)) as hdul:
        primary_header = hdul[0].header
        for key in keys_of_interest:
            if key in primary_header:
                key_data.setdefault(key, []).append(primary_header[key])

# Show histograms for each key
for key, data in key_data.items():
    plt.hist(data, bins=50)
    plt.title(key)
    plt.show()

In [None]:

good_data = pd.read_csv('good_data.csv')

# Shape of extension 1 of the first listed file
with fits.open(utils.get_fits_path(good_data.iloc[0]['file_name'], True)) as hdul:
    (height, width) = hdul[1].data.shape

# Gather the pixels of the strip cut around every streak into one flat array.
# The cuts are collected in a list and concatenated once: growing the array inside
# the loop is quadratic, and seeding it with np.zeros((1,)) would add a fake
# zero-valued pixel to all later statistics.
cuts = []

for (idx, streak) in tqdm(good_data.iterrows(), total=good_data.shape[0]):
    streakName = streak["file_name"]
    extension = streak["extension"]
    with fits.open(utils.get_fits_path(streakName, True)) as hdul:
        streak_x_start = streak["x_start[px]"]
        streak_y_start = streak["y_start[px]"]
        streak_x_end = streak["x_end[px]"]
        streak_y_end = streak["y_end[px]"]
        cut_flat = sc.cut_around_line(hdul[extension].data, (streak_x_start, streak_y_start), (streak_x_end, streak_y_end), 32).flatten()
        cuts.append(cut_flat)

# float64 keeps the dtype the later cells have always worked with
if cuts:
    pixels = np.concatenate(cuts).astype(np.float64, copy=False)
else:
    pixels = np.zeros((0,))

                
        

In [None]:
# Histogram of all collected pixel values (requires `pixels` from the previous cell)
fig, ax = plt.subplots()
ax.hist(pixels, bins=512)
ax.set_title("Pixel Value Histogram")


In [None]:

# Positions of the non-zero pixel values; zeros are removed before taking log10
nonzero_indices = np.flatnonzero(pixels)
pixels_nonzero = pixels[nonzero_indices]
log_pixels_nonzero = np.log10(pixels_nonzero)

# Histogram (left) and boxplot (right) of the log of the non-zero pixel values
fig, (hist_ax, box_ax) = plt.subplots(1, 2)
hist_ax.hist(log_pixels_nonzero, bins=500)
box_ax.boxplot(log_pixels_nonzero)
fig.suptitle('log(pixel values) non-zero')
plt.show()

In [None]:
# Standardise the log pixel values (zero mean, unit standard deviation) and plot them
log_pixel_mean = log_pixels_nonzero.mean()
log_pixel_std = log_pixels_nonzero.std()
pixels_standardized = (log_pixels_nonzero - log_pixel_mean) / log_pixel_std
fig, (hist_ax, box_ax) = plt.subplots(1, 2)
hist_ax.hist(pixels_standardized, bins=100)
box_ax.boxplot(pixels_standardized)
fig.suptitle('log(pixel values) standardized')
plt.show()

In [None]:

# Summary statistics of the collected pixel values (requires `pixels`)
max_pixel, min_pixel = pixels.max(), pixels.min()
max_count = np.count_nonzero(pixels == max_pixel)
min_count = np.count_nonzero(pixels == min_pixel)
negative_count = np.count_nonzero(pixels < 0)
mean_pixel = pixels.mean()
median_pixel = np.median(pixels)
std_pixel = pixels.std()

for report_line in (
    f"Max pixel value: {max_pixel}, count: {max_count}",
    f"Min pixel value: {min_pixel}, count: {min_count}",
    f"Negative pixel value count: {negative_count}. That's {negative_count / pixels.size * 100}% of the pixels.",
    f"Mean pixel value: {mean_pixel}",
    f"Median pixel value: {median_pixel}",
    f"Standard deviation of pixel values: {std_pixel}",
):
    print(report_line)

In [None]:
good_data = pd.read_csv('good_data.csv')

# Scan every extension header of every listed file for BZERO. A value other than
# 32768.0 writes "HI"; the first header without BZERO is printed and ends the scan.
stop_scan = False
for _, streak in tqdm(good_data.iterrows(), total=good_data.shape[0]):
    fits_path = utils.get_fits_path(streak['file_name'], True)
    with fits.open(fits_path, memmap=False, lazy_load_hdus=True) as hdul:
        for ext in range(1, len(hdul)):
            try:
                bzero = hdul[ext].header['BZERO']
            except KeyError:
                print(f"KeyError: {streak['file_name']} - {ext}")
                print(hdul[ext].header)
                stop_scan = True
                break
            if bzero != 32768.0:
                tqdm.write("HI")
    if stop_scan:
        break

In [None]:
# Check that the path resolved for every good_data.csv row exists on disk and
# report the file names for which it does not.
good_data = pd.read_csv('good_data.csv')
missing_files = [
    row['file_name']
    for idx, row in tqdm(good_data.iterrows(), total=good_data.shape[0])
    if not os.path.exists(utils.get_fits_path(row['file_name'], False))
]
if missing_files:
    print(f"Missing files: {missing_files}")
else:
    print("All files are present.")

In [None]:
good_data = pd.read_csv('good_data.csv')
file_names = good_data['file_name'].unique()

# Set BZERO to 32768.0 in every extension header of the first 50 unique files.
# The files are opened in update mode, so they are modified in place.
batch = file_names[:50]
# The progress total follows the real batch size (the slice may hold fewer than 50 files)
for idx, file_name in tqdm(enumerate(batch), total=len(batch)):
    with fits.open(utils.get_fits_path(file_name, True), mode='update', output_verify='fix', memmap=True) as hdul:
        modified = False
        for i in range(1, len(hdul)):
            if hdul[i].header['BZERO'] != 32768.0:
                hdul[i].header['BZERO'] = 32768.0
                modified = True
        # One flush per file instead of one per changed extension
        if modified:
            hdul.flush()

In [None]:
good_data = pd.read_csv('good_data.csv')
file_names = good_data['file_name'].unique()

# Set BZERO to 32768.0 in every extension header of unique files 50..99.
# The files are opened in update mode, so they are modified in place.
batch = file_names[50:100]
# The progress total follows the real batch size (the slice may hold fewer than 50 files)
for idx, file_name in tqdm(enumerate(batch), total=len(batch)):
    with fits.open(utils.get_fits_path(file_name, True), mode='update', output_verify='fix', memmap=True) as hdul:
        modified = False
        for i in range(1, len(hdul)):
            if hdul[i].header['BZERO'] != 32768.0:
                hdul[i].header['BZERO'] = 32768.0
                modified = True
        # One flush per file instead of one per changed extension
        if modified:
            hdul.flush()

In [None]:
good_data = pd.read_csv('good_data.csv')
file_names = good_data['file_name'].unique()

# Set BZERO to 32768.0 in every extension header of unique files 100..149.
# The files are opened in update mode, so they are modified in place. A failure on
# one file is reported and the loop moves on to the next file.
batch = file_names[100:150]
# The progress total follows the real batch size (the slice may hold fewer than 50 files)
for idx, file_name in tqdm(enumerate(batch), total=len(batch)):
    try:
        with fits.open(utils.get_fits_path(file_name, True), mode='update', output_verify='fix', memmap=True) as hdul:
            modified = False
            for i in range(1, len(hdul)):
                if hdul[i].header['BZERO'] != 32768.0:
                    hdul[i].header['BZERO'] = 32768.0
                    modified = True
            # One flush per file instead of one per changed extension
            if modified:
                hdul.flush()
    except Exception as e:
        print(f"Error in file {file_name}: {e}")

In [None]:
# Set BZERO to 32768.0 in every extension header of one sandbox file (in place).
path = 'data/sandbox/OMEGA.2022-01-23T03:36:39.644.fits'
with fits.open(path, mode='update', output_verify='fix') as hdul:
    modified = False
    for i in range(1, len(hdul)):
        if hdul[i].header['BZERO'] != 32768.0:
            hdul[i].header['BZERO'] = 32768.0
            modified = True
    # One flush for the whole file instead of one per changed extension
    if modified:
        hdul.flush()

In [None]:
import filecmp

file_names = pd.read_csv('good_data.csv')['file_name'].unique()

# Compare the copy of every file on the "MedTina" drive with the copy on the
# "OBSERVATIONS" drive: first by size, then by content.
for file_name in tqdm(file_names, total=file_names.shape[0]):
    drive1_filepath = utils.get_fits_path(file_name, True, drive="MedTina")
    drive2_filepath = utils.get_fits_path(file_name, True, drive="OBSERVATIONS")
    # Check if the filesize is the same
    drive1_size = os.path.getsize(drive1_filepath)
    drive2_size = os.path.getsize(drive2_filepath)
    if drive1_size != drive2_size:
        print(f"File sizes do not match: {file_name}. MedTina: {drive1_size}, OBSERVATIONS: {drive2_size}")
    # Check if the files are the same. filecmp compares the contents in chunks, so
    # the two (potentially very large) files are never held in memory at once.
    if not filecmp.cmp(drive1_filepath, drive2_filepath, shallow=False):
        print(f"Files do not match: {file_name}")

In [None]:
# Print the file differences
# NOTE(review): this module-level `file_name` is not read by the functions defined
# in this cell (their `file_name` is a parameter); it looks like a leftover example.
file_name = "OMEGA.2022-01-03T01:11:55.387.fits"
def compare_fits_files_by_path(file_path1, file_path2):
    """Print the differences between extension 1 of two FITS files.

    Reports a data mismatch, a header mismatch and, when the headers differ,
    whether the two keyword lists differ. Only extension 1 is examined.
    """
    open_options = {'do_not_scale_image_data': False}
    with fits.open(file_path1, **open_options) as hdul1, fits.open(file_path2, **open_options) as hdul2:
        for i in (1,):  # only extension 1; was meant to run up to len(hdul1)
            hdu1, hdu2 = hdul1[i], hdul2[i]
            if not np.array_equal(hdu1.data, hdu2.data):
                print(f"Data in extension {i} does not match.")
            if hdu1.header != hdu2.header:
                print(f"Headers in extension {i} do not match.")
                keys1 = list(hdu1.header.keys())
                keys2 = list(hdu2.header.keys())
                if keys1 != keys2:
                    print(f"Keys in extension {i} do not match.")
                # Only referenced by the disabled per-key comparison that follows
                keyMismatch = False
                # for key in keys1:
                #     if key not in keys2:
                #         print(f"Key: {key} not in file 2. Header {i}.")
                #         keyMismatch = True
                # for key in keys2:
                #     if key not in keys1:
                #         print(f"Key: {key} not in file 1. Header {i}.")
                #         keyMismatch = True
                # if keyMismatch:
                #     continue
                # for key in hdul1[i].header.keys():
                #     if hdul1[i].header[key] != hdul2[i].header[key] and key not in ['CHECKSUM', 'DATASUM']:
                #         print(f"Key: {key}, value1: {hdul1[i].header[key]}, value2: {hdul2[i].header[key]}")


def compare_fits_files(file_name):
    """Compare the "MedTina" and "OBSERVATIONS" copies of `file_name`.

    Both paths are resolved with utils.get_fits_path and handed to
    compare_fits_files_by_path, which prints any differences.
    """
    drive_paths = [
        utils.get_fits_path(file_name, True, drive=drive)
        for drive in ("MedTina", "OBSERVATIONS")
    ]
    compare_fits_files_by_path(*drive_paths)


In [None]:
def compare_fits_files_by_path_debugA(file_path1, file_path2):
    """Debug helper: list the extension-1 header keys of `file_path2` twice.

    The two listings come from the same header, so they are expected to be equal;
    if they are not, "Weird" and the differing keys are printed. `file_path1` is
    accepted but not used.
    """
    with fits.open(file_path2, do_not_scale_image_data=False, lazy_load_hdus=False) as hdulist:
        print("File: " + file_path2)
        header = hdulist[1].header
        keys_a = list(header.keys())
        keys_b = list(header.keys())
        print(keys_a)
        if keys_a == keys_b:
            return
        print("Weird")
        # Report the sizes of both listings, then the keys missing on either side
        print(len(keys_a))
        print(len(keys_b))
        for key in keys_a:
            if key not in keys_b:
                print(f"Key: {key} not in file 2.")
        for key in keys_b:
            if key not in keys_a:
                print(f"Key: {key} not in file 1.")
            

In [None]:
# Print the file differences

def compare_fits_files_by_path_debugB(file_path1, file_path2):
    """Debug helper: print how extension 1 of two FITS files differs.

    Reports a data mismatch, a header mismatch and, when the headers differ,
    every keyword that is present in one header but not in the other.
    """
    with fits.open(file_path1, do_not_scale_image_data=False) as hdul1, fits.open(file_path2, do_not_scale_image_data=False) as hdul2:
        hdu1 = hdul1[1]
        hdu2 = hdul2[1]
        # Compare the two extension-1 HDUs directly; indexing with `i` relied on a
        # global leaked from another cell and raised NameError on a fresh kernel.
        if not np.array_equal(hdu1.data, hdu2.data):
            print(f"Data in extension {1} does not match.")

        if hdu1.header != hdu2.header:
            print(f"Headers in extension {1} do not match.")
            keys1 = list(hdu1.header.keys())
            keys2 = list(hdu2.header.keys())
            if keys1 != keys2:
                print(f"Keys in extension {1} do not match.")
            for key in keys1:
                if key not in keys2:
                    print(f"Key: {key} not in file 2.")
            for key in keys2:
                if key not in keys1:
                    print(f"Key: {key} not in file 1.")



In [None]:
# NOTE(review): astred_path and astred_backup_path are assigned in a cell further
# down in this notebook, so this call fails on a fresh kernel run from top to bottom.
compare_fits_files_by_path_debugB(astred_path, astred_backup_path)


In [None]:
# NOTE(review): astred_path and astred_backup_path are assigned in a cell further
# down in this notebook, so this call fails on a fresh kernel run from top to bottom.
compare_fits_files_by_path(astred_path, astred_backup_path)

In [None]:
# Value passed as `do_not_scale_image_data` to every fits.open call in this cell.
do_not_scale_image_data = False
def try_print_key(hdu, key):
    """Print `key : value` from `hdu.header`, or `KeyError: key` if the key is absent."""
    try:
        value = hdu.header[key]
    except KeyError:
        print(f"KeyError: {key}")
    else:
        print(key, ":", value)

def print_relevant_keys(hdu):
    """Print the BZERO, BSCALE and BITPIX header entries of `hdu`, in that order."""
    for key in ('BZERO', 'BSCALE', 'BITPIX'):
        try_print_key(hdu, key)

# For each sandbox file: print its label, the scaling keywords of extension 1, the
# first data value, and the scaling keywords again.
# NOTE(review): the keywords are printed before and after the data access,
# presumably to see whether loading the data changes the header — confirm.
labelled_paths = [
    ("Astred", astred_path),
    ("Astred Backup", astred_backup_path),
    ("Non-Astred", non_astred_path),
    ("Copy Scale", copy_path_scale),
    ("Copy No Scale", copy_path_no_scale),
]
for label, labelled_path in labelled_paths:
    with fits.open(labelled_path, do_not_scale_image_data=do_not_scale_image_data) as hdul:
        print(label)
        print_relevant_keys(hdul[1])
        print("Data 0,0: ", hdul[1].data[0,0])
        print_relevant_keys(hdul[1])

In [None]:
# Run the drive-to-drive FITS comparison for every unique file in good_data.csv
file_names = pd.read_csv('good_data.csv')['file_name'].unique()
for file_name in tqdm(file_names, total=len(file_names)):
    compare_fits_files(file_name)

In [None]:
def fix_bzero(file_path):
    """Set BZERO to 32768.0 in every extension header of `file_path`, in place.

    The file is opened in update mode with image scaling disabled; headers that
    already hold 32768.0 are left alone, and the file is flushed once at the end.
    """
    open_options = dict(mode='update', output_verify='ignore', memmap=True, do_not_scale_image_data=True)
    with fits.open(file_path, **open_options) as hdul:
        for ext in range(1, len(hdul)):
            header = hdul[ext].header
            if header['BZERO'] != 32768.0:
                header['BZERO'] = 32768.0
        hdul.flush()

In [None]:
def fix_bezero_and_save_as_copy(file_path, new_file_path, do_not_scale_image_data):
    """Write a copy of `file_path` to `new_file_path` with BZERO set to 32768.0.

    The source is opened read-only; `do_not_scale_image_data` is forwarded to
    fits.open. An existing file at `new_file_path` is overwritten.
    """
    open_options = dict(
        mode='readonly',
        output_verify='ignore',
        memmap=False,
        do_not_scale_image_data=do_not_scale_image_data,
    )
    with fits.open(file_path, **open_options) as hdul:
        for ext in range(1, len(hdul)):
            header = hdul[ext].header
            if header['BZERO'] != 32768.0:
                header['BZERO'] = 32768.0
        hdul.writeto(new_file_path, overwrite=True)

In [None]:
# Sandbox files used to try out the BZERO fix. Each has a "_backup" twin
# (presumably an unmodified copy kept for comparison — confirm).
astred_path = 'data/sandbox/OMEGA.2022-01-02T00:35:59.042.astred.cal.fits'
astred_backup_path = 'data/sandbox/OMEGA.2022-01-02T00:35:59.042.astred.cal_backup.fits'
non_astred_path = 'data/sandbox/OMEGA.2022-01-19T01:58:42.111.fits'
non_astred_backup_path = 'data/sandbox/OMEGA.2022-01-19T01:58:42.111_backup.fits'


# fix_bzero opens the file in update mode, so the astred sandbox file is modified in place.
fix_bzero(astred_path)
# fix_bzero(non_astred_path)



In [None]:
# Write two BZERO-fixed copies of the astred sandbox file: one opened with
# do_not_scale_image_data=False ("scale") and one with True ("no_scale").
copy_path_scale = "data/sandbox/OMEGA.2022-01-02T00:35:59.042.astred.cal_fixed_scale.fits"
copy_path_no_scale = "data/sandbox/OMEGA.2022-01-02T00:35:59.042.astred.cal_fixed_no_scale.fits"
fix_bezero_and_save_as_copy(astred_path, copy_path_scale, False)
fix_bezero_and_save_as_copy(astred_path, copy_path_no_scale, True)

In [None]:
# Set BZERO to 32768 in every extension header of all files whose name contains
# "astred.cal". The files are opened in update mode and modified in place.
good_data = pd.read_csv('good_data.csv')
file_names = good_data['file_name'].unique()
astred_files = [candidate for candidate in file_names if "astred.cal" in candidate]
for file_name in tqdm(astred_files, total=len(astred_files), desc="Fixing astred files", unit="file"):
    tqdm.write(f"Fixing file: {file_name}")
    fits_path = utils.get_fits_path(file_name, False)
    with fits.open(fits_path, mode='update', output_verify='ignore', memmap=False, do_not_scale_image_data=True) as hdul:
        for ext in range(1, len(hdul)):
            hdul[ext].header['BZERO'] = 32768.0
        hdul.flush()

In [None]:
# compare the files to their backups
# Requires astred_path / astred_backup_path from the sandbox-paths cell and
# compare_fits_files_by_path from its defining cell to have been run first.
print("Comparing astred files")
compare_fits_files_by_path(astred_path, astred_backup_path)
# print("Comparing non-astred files")
# compare_fits_files_by_path(non_astred_path, non_astred_backup_path)

In [None]:
# Verify that the files on the two drives are the same: a byte-wise comparison
# first, and the detailed FITS comparison only for files that differ.
import filecmp
file_names = pd.read_csv('good_data.csv')['file_name'].unique()
for file_name in tqdm(file_names, total=file_names.shape[0]):
    medtina_path = utils.get_fits_path(file_name, True, drive="MedTina")
    # NOTE(review): this path is resolved with a False flag and the default drive,
    # unlike the MedTina path above — confirm that this is intended.
    default_path = utils.get_fits_path(file_name, False)
    if filecmp.cmp(medtina_path, default_path, shallow=False):
        continue
    compare_fits_files(file_name)
    

In [None]:
# Look up one streak (file name, extension, ID) in good_data.csv and in its
# .streaks file, print both records, and draw the good_data endpoints over the image.
good_data = pd.read_csv('good_data.csv')
name =  'OMEGA.2022-01-07T01:34:16.114.fits'
extension = 23
ID = 2
info_good = good_data[(good_data['file_name'] == name) & (good_data['extension'] == extension) & (good_data['ID'] == ID)]
# Data from .streaks file
streaks_data = pd.read_csv(utils.get_streaks_path(name))
info_streaks = streaks_data[(streaks_data['#extension'] == extension ) & (streaks_data['ID'] == ID )]
print(info_good)
print(info_streaks)
if info_good.empty:
    raise ValueError(f"No row for {name}, extension {extension}, ID {ID} in good_data.csv")
# The endpoints come from the good_data row; `info` was previously never defined.
info = info_good.iloc[0]
with fits.open(utils.get_fits_path(name, False)) as hdul:
    data = hdul[extension].data
    plt.imshow(np.log(data))
    x_start = info['x_start[px]']
    x_end = info['x_end[px]']
    y_start = info['y_start[px]']
    y_end = info['y_end[px]']
    plt.plot([x_start, x_end], [y_start, y_end])

    plt.show()




In [None]:

# Check every good_data.csv row against the matching row (same ID and extension)
# of its .streaks file and count the rows whose endpoints differ.
good_data = pd.read_csv('good_data.csv')
wrong_count = 0
wrong_rows = []

# Endpoint columns in good_data.csv, and the two spellings seen in .streaks files
good_columns = ['x_start[px]', 'y_start[px]', 'x_end[px]', 'y_end[px]']
legacy_columns = ['start_x', 'start_y', 'end_x', 'end_y']

# Parsed .streaks tables by file name, so each file is read once rather than
# once per streak it contains.
streaks_cache = {}

for idx, row in good_data.iterrows():
    name = row['file_name']
    good_ID = row['ID']
    good_extension = row['extension']

    if name not in streaks_cache:
        streaks_cache[name] = pd.read_csv(utils.get_streaks_path(name))
    streaks_data = streaks_cache[name]
    streaks_row = streaks_data[(streaks_data['ID'] == good_ID) & (streaks_data['#extension'] == good_extension)]
    if streaks_row.empty:
        print(f"Streaks file does not contain ID {good_ID} and extension {good_extension}")
        continue

    # Pick the column spelling used by this particular .streaks file
    if 'x_start[px]' in streaks_row.columns:
        streaks_columns = good_columns
    else:
        streaks_columns = legacy_columns
    good_endpoints = [row[col] for col in good_columns]
    streaks_endpoints = [streaks_row[col].values[0] for col in streaks_columns]

    if any(good != streaks for good, streaks in zip(good_endpoints, streaks_endpoints)):
        wrong_count += 1
        wrong_rows.append( (name, good_ID, good_extension))

        print(f"File name: {name} - ID: {good_ID} - Extension: {good_extension} does not match.")

print(f"Wrong count: {wrong_count}")

In [None]:
import pandas as pd

# Toy example of a one-to-many merge: df1 has one row per file name, df2 has
# several rows for some file names.
df1 = pd.DataFrame({
    'filename': ['file1', 'file2', 'file3'],
    'A': [10, 20, 30],
    'B': [40, 50, 60],
    'C': [70, 80, 90],
})

df2 = pd.DataFrame({
    'filename': ['file1', 'file1', 'file2', 'file3', 'file3'],
    'partialkey_1': [1, 2, 1, 1, 2],
    'partialkey_2': [100, 200, 300, 400, 500],
    'D': ['X', 'Y', 'Z', 'W', 'V'],
})

# Merge on 'filename' with df2 on the left, then keep only df1's columns
joined = df2.merge(df1, on='filename')
merged_df = joined.loc[:, df1.columns]

# Show the resulting DataFrame
print(merged_df)

In [1]:
# Start training through the project's artificial_train module.
# NOTE(review): the recorded output of this cell ends in FileNotFoundError for
# '.../datasets/artificial_strips/image_parameters.csv', so that dataset has to
# exist before this cell can run.
from src.models.transformer_artifical_data import artificial_train
artificial_train.train()

Start training


FileNotFoundError: [Errno 2] No such file or directory: '/home/dofri/epfl/semester_project/datasets/artificial_strips/image_parameters.csv'