In [7]:
import os
import gzip
import shutil
import glob

def extract_fits_from_gz(source_dir, destination_dir):
    """
    Extracts all FITS files from .gz files in source_dir to destination_dir.
    
    Args:
        source_dir (str): Path to directory containing .gz files
        destination_dir (str): Path where FITS files should be extracted
    """
    # Create destination directory if it doesn't exist
    os.makedirs(destination_dir, exist_ok=True)
    
    # Get all .gz files in source directory
    gz_files = glob.glob(os.path.join(source_dir, '*.gz'))
    
    if not gz_files:
        print(f"No .gz files found in {source_dir}")
        return
    
    print(f"Found {len(gz_files)} .gz files in {source_dir}")
    
    for gz_path in gz_files:
        try:
            # Determine output filename (remove .gz extension)
            base_name = os.path.basename(gz_path)
            if not base_name.lower().endswith('.gz'):
                print(f"Skipping non-gz file: {gz_path}")
                continue
                
            output_name = base_name[:-3]  # Remove .gz
            output_path = os.path.join(destination_dir, output_name)
            
            # Skip if already exists (comment out to overwrite)
            if os.path.exists(output_path):
                print(f"Skipping {output_name} (already exists)")
                continue
            
            # Extract the .gz file
            with gzip.open(gz_path, 'rb') as f_in:
                with open(output_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            
            print(f"Extracted {output_name} from {base_name}")
            
        except Exception as e:
            print(f"Error processing {gz_path}: {str(e)}")
    
    print("Extraction complete!")

if __name__ == "__main__":
    # Get user input for paths
    source_dir = r'C:\Users\lesze\orbitfolder\l1bfiles\l1bfiles'
    destination_dir = r'C:\Users\lesze\orbitfolder\orbit16750'
    
    # Validate paths
    if not os.path.isdir(source_dir):
        print(f"Error: Source directory '{source_dir}' does not exist")
    else:
        extract_fits_from_gz(source_dir, destination_dir)


Found 24 .gz files in C:\Users\lesze\orbitfolder\l1bfiles\l1bfiles
Extracted mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T014253_v13_s02.fits from mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T014253_v13_s02.fits.gz
Extracted mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T014307_v13_s02.fits from mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T014307_v13_s02.fits.gz
Extracted mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T014539_v13_s02.fits from mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T014539_v13_s02.fits.gz
Extracted mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T015047_v13_s02.fits from mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T015047_v13_s02.fits.gz
Extracted mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T015101_v13_s02.fits from mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T015101_v13_s02.fits.gz
Extracted mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T015409_v13_s02.fits from mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T015409_v13_s02.fits.gz
Extracted mvn_iuv_l1b_apoapse-orbit16750-muv_20220708T015

In [39]:
# Standard library imports
import os
import re
import json
import warnings

# Third-party scientific computing
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
from matplotlib.patches import Patch
import seaborn as sns

# Astronomy/GIS
from astropy.io import fits
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# Image processing
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from astropy.io import fits
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
import os 
import pandas as pd
from matplotlib.patches import Patch
from collections import Counter 

In [41]:
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png')

# Suppress the Cartopy warning
warnings.filterwarnings("ignore", category=UserWarning, 
                        message="The 'NearsidePerspective' projection does not handle elliptical globes.")

  set_matplotlib_formats('png')


In [43]:
#takes a minute
parsed_file = pd.read_excel(r'C:/knime/cloudy_images.xlsx', index_col=0) #path to parsed classifications file
users_removed = [] #what users should we remove?
parsed_file = parsed_file[~parsed_file['user_name'].isin(users_removed)]
parsed_file['ID'] = range(1, len(parsed_file)+1)
orbit_list = [18211, 17959, 16570, 16750] #what orbits to process?

In [44]:
os.environ['PROJ_IGNORE_CELESTIAL_BODY'] = 'YES' 

def pixel_to_latlon(ax, proj, x_pixel, y_pixel):
    # Transform pixel to data coordinates
    x_data, y_data  = ax.transData.inverted().transform((x_pixel, y_pixel))
     
    # Transform data coordinates to geographic coordinates
    lon, lat = proj.transform_point(x_data, y_data, src_crs=ax.projection)
    coords = ax.format_coord(lon, lat).split('(')[1].split(')')[0]
    return coords

def get_files_to_dataframe(folder_path):
    file_data = []
    
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue

        with fits.open(file_path) as hdu_list:
            lat = hdu_list[16].data['PIXEL_CORNER_LAT'][:,:,-1]
            lon = hdu_list[16].data['PIXEL_CORNER_LON'][:,:,-1]
            solar_zenith_angle = hdu_list[16].data['PIXEL_SOLAR_ZENITH_ANGLE']
            emission_angle = hdu_list[16].data['PIXEL_EMISSION_ANGLE']
            zenith_angle = hdu_list[16].data['PIXEL_ZENITH_ANGLE']
            phase_angle = hdu_list[16].data['PIXEL_PHASE_ANGLE']
            local_time = hdu_list[16].data['PIXEL_LOCAL_TIME']
            sspacecraft_lat = hdu_list[15].data['SUB_SPACECRAFT_LAT'][0]
            sspacecraft_lon = hdu_list[15].data['SUB_SPACECRAFT_LON'][0]
            sspacecraft_alt = hdu_list[15].data['SPACECRAFT_ALT'][0]


            # Stack and reshape coordinates
            z = np.stack([lat, lon], axis=-1)
            flattened = z.reshape(-1, 2)
            # Convert longitude from 0-360 to -180-180
            flattened[:, 1] = (flattened[:, 1] - 180)
            all_coords = flattened.tolist()
            

            y = np.stack([lat, lon, solar_zenith_angle, emission_angle,zenith_angle,phase_angle,local_time], axis=-1)
            y = y.tolist()


            file_data.append({
                'file_name': filename,
                'orbit_no': filename.split("orbit")[1].split("-")[0],
                'timestamp': filename.split("muv_")[1].split("_")[0],
                'sspacecraft_lat': sspacecraft_lat,
                'sspacecraft_lon': sspacecraft_lon,
                'sspacecraft_alt': sspacecraft_alt* 10 ** 3,
                'all_coordinates': all_coords,  # New field containing all points,
                'all_columns_data': y
            })
            

    df = pd.DataFrame(file_data)
    df['datetime'] = pd.to_datetime(df['timestamp'], format='%Y%m%dT%H%M%S')
    return df

def dms_to_decimal(coord):
    """
    Convert coordinate string (e.g., "27.915383°S") to decimal float
    Returns: float in [-180, 180] for longitude, [-90, 90] for latitude
    """
    if pd.isna(coord) or coord == '':
        return np.nan
    
    # Remove degree symbol and split direction
    value = coord.replace('°', '').replace('"', '').replace("'", '')
    direction = value[-1]
    number = float(value[:-1])
    
    # Apply sign based on direction
    if direction in ['S', 'W']:
        return -number
    return number
    
def plot_all_files_on_globe(parsed_file, df, orbit_no, df_with_coords):
    max_files=28
    if len(df) == 0:
        print("DataFrame is empty!")
        return
    
    # Limit number of files to display
    if len(df) > max_files:
        print(f"Showing first {max_files} files out of {len(df)}")
        df = df.head(max_files)
        # Make a bounding box such that the image represents 8000 km x 8000 km
    rmars = 3400 * 10 ** 3
    image_width = 4000 * 10 ** 3
    corner_pos = (1 - rmars / image_width) / 2
    bbox = (corner_pos, corner_pos, 1 - 2 * corner_pos, 1 - 2 * corner_pos)

    # Make properties of the image
    dpi = 100  # Adjust DPI if needed
    fig = plt.figure(figsize=(1015/dpi, 1015/dpi), dpi=dpi)
    globe = ccrs.Globe(semimajor_axis=rmars, semiminor_axis=rmars)
    projection=ccrs.NearsidePerspective(
        satellite_height=files_df['sspacecraft_alt'].iloc[len(files_df)//2],
        central_longitude=files_df['sspacecraft_lon'].iloc[len(files_df)//2]-180, 
        central_latitude=files_df['sspacecraft_lat'].iloc[len(files_df)//2], globe=globe)
    ax = plt.axes()
    transform = ccrs.PlateCarree(globe=globe)
    ax = plt.axes(bbox, projection=projection)
    
    # Create a custom colormap with distinct colors
    colors = plt.cm.tab20(np.linspace(0, 1, len(df)))
    cmap = ListedColormap(colors)
    
    # Create legend handles
    legend_handles = []
    
    # Plot each file's data
    for idx, (_, row) in enumerate(df.iterrows()):
        all_coords = np.array(row['all_coordinates'])
        color = colors[idx]
        
        # Plot all points
        ax.scatter(all_coords[:, 1], all_coords[:, 0],
                  color=color, s=8, alpha=0.03,
                  transform=ccrs.PlateCarree(),
                  zorder=4)

    
    classified_coords = parsed_file.loc[parsed_file['subject_data.orbit'] == orbit_no]
    classified_coords = classified_coords[['ID','annotations_1.value.x', 'annotations_1.value.y','annotations_1.value.tool_label']]
    classified_coords['annotations_1.value.y'] = 1015 - classified_coords['annotations_1.value.y']
    classified_coords = classified_coords[classified_coords[['ID','annotations_1.value.x', 'annotations_1.value.y' ,'annotations_1.value.tool_label']].notnull().all(1)]
    classified_coords['annotations_1.value.tool_label'] = classified_coords['annotations_1.value.tool_label'].str.split('!').str[0]
    classified_coords['projection_globe_coords'] = classified_coords.apply(lambda row: pixel_to_latlon(ax,
                                                                                                       projection,
                                                                                                       row['annotations_1.value.x'],
                                                                                                       row['annotations_1.value.y']), axis=1)
    classified_coords[['projection_globe_coords_y', 'projection_globe_coords_x']] = classified_coords['projection_globe_coords'].str.split(',', expand=True)
    classified_coords = classified_coords.drop(columns=['projection_globe_coords'])
    
    df_with_coords = classified_coords[['ID','projection_globe_coords_y', 'projection_globe_coords_x']]
    return df_with_coords
    


columns = ['ID', 'projection_globe_coords_y', 'projection_globe_coords_x']
df_with_coords = pd.DataFrame(columns=columns)
rows_list = []

for orbit_no in orbit_list:
    try:
        print(f'processing orbit {orbit_no}')
        ###----CHANGE THIS ONE---
        folder_path = rf'C:\Users\lesze\orbitfolder\orbit{orbit_no}'
        files_df = get_files_to_dataframe(folder_path)
        new_coords = plot_all_files_on_globe(parsed_file, files_df, orbit_no, df_with_coords)
        df_with_coords = pd.concat([df_with_coords, new_coords], ignore_index=True)
        
        new_coords['projection_globe_coords_y'] =  new_coords['projection_globe_coords_y'].apply(dms_to_decimal)
        new_coords['projection_globe_coords_x'] =  new_coords['projection_globe_coords_x'].apply(dms_to_decimal)
    
        pre_exploded_df = files_df[['file_name', 'orbit_no', 'timestamp', 'sspacecraft_lat',
               'sspacecraft_lon', 'sspacecraft_alt',
               'all_columns_data', 'datetime']]
        exploded_df = pre_exploded_df.explode('all_columns_data').explode('all_columns_data')
        
        coord_columns = ['lat', 'lon','solar_zenith_angle', 'emission_angle','zenith_angle','phase_angle','local_time']
        exploded_df[coord_columns] = pd.DataFrame(
            exploded_df['all_columns_data'].tolist(),
            index=exploded_df.index
        )
        exploded_df['lon'] = (exploded_df['lon'] - 180) 
    
        
        for i in range(len(new_coords)):
            target_lat = new_coords.iloc[i]['projection_globe_coords_x']
            target_lon = new_coords.iloc[i]['projection_globe_coords_y']
        
            # Calculate distances
            exploded_df['distance'] = np.sqrt(
                (exploded_df['lat'] - target_lat)**2 + 
                (exploded_df['lon'] - target_lon)**2
            )
            
            # Get closest row (as Series)
            closest_row = exploded_df.nsmallest(1, 'distance').iloc[0]
            
            # Combine data into a single dictionary
            combined_data = {
                **new_coords.iloc[i].to_dict(),
                **{f'closest_{k}': v for k, v in closest_row.to_dict().items()}
            }
            
            rows_list.append(combined_data)
    except:        
        print(f'Error processing orbit: {orbit_no}. Most likely missing FITS files or no classifications for this orbit. Check it')


result_df = pd.DataFrame(rows_list)

classifications_with_coordinates = parsed_file.merge(
result_df,
on='ID',  # or left_on/right_on if column names differ
how='inner'       # keeps all rows from parsed_file
)

classifications_with_coordinates.drop(['closest_all_columns_data', 'closest_timestamp', 'closest_orbit_no'], axis=1) #remove unnecassary columns

processing orbit 18211
processing orbit 17959
processing orbit 16570
processing orbit 16750
Error processing orbit: 16750. Most likely missing FITS files or no classifications for this orbit. Check it


Unnamed: 0,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,created_at,subject_ids,metadata.session,metadata.started_at,...,closest_sspacecraft_alt,closest_datetime,closest_lat,closest_lon,closest_solar_zenith_angle,closest_emission_angle,closest_zenith_angle,closest_phase_angle,closest_local_time,closest_distance
0,LMB60,2670845.0,5aff11e92630bab1c1bb,26820,Are Clouds Present?,40.87,2024-11-07 18:27:21,104282707,3ff8dc8bde930b5653d10b7788882ef6d57bfb699ce93d...,2024-11-07T18:26:16.669Z,...,4.193374e+06,2023-02-15 07:19:05,76.659787,67.986682,89.357347,89.999995,153.450041,65.253713,3.003979,62.082045
1,not-logged-in-950908914f5575dc1433,,950908914f5575dc1433,26820,Are Clouds Present?,40.87,2024-11-10 23:41:47,104284703,05f8a2d52ef7b2fedb92131598bbe7952b4500964c4bcf...,2024-11-10T23:30:13.622Z,...,4.133966e+06,2023-01-07 21:55:11,-28.610144,-2.542928,36.834445,89.999995,153.438661,53.533081,13.355971,64.518383
2,not-logged-in-950908914f5575dc1433,,950908914f5575dc1433,26820,Are Clouds Present?,40.87,2024-11-10 23:41:47,104284703,05f8a2d52ef7b2fedb92131598bbe7952b4500964c4bcf...,2024-11-10T23:30:13.622Z,...,4.133966e+06,2023-01-07 21:55:11,-28.610144,-2.542928,36.834445,89.999995,153.438661,53.533081,13.355971,64.577601
3,not-logged-in-950908914f5575dc1433,,950908914f5575dc1433,26820,Are Clouds Present?,40.87,2024-11-10 23:41:47,104284703,05f8a2d52ef7b2fedb92131598bbe7952b4500964c4bcf...,2024-11-10T23:30:13.622Z,...,4.133966e+06,2023-01-07 21:55:11,-28.610144,-2.542928,36.834445,89.999995,153.438661,53.533081,13.355971,70.742225
4,not-logged-in-950908914f5575dc1433,,950908914f5575dc1433,26820,Are Clouds Present?,40.87,2024-11-10 23:41:47,104284703,05f8a2d52ef7b2fedb92131598bbe7952b4500964c4bcf...,2024-11-10T23:30:13.622Z,...,4.133966e+06,2023-01-07 21:55:11,-28.610144,-2.542928,36.834445,89.999995,153.438661,53.533081,13.355971,65.951832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,silentium_noctis,2784769.0,f0a2e0c0530a16433c81,26820,Are Clouds Present?,40.87,2024-12-30 04:38:57,104284523,380a485109b15f357f1587fb21d7ea2febe01da8ecb249...,2024-12-30T04:37:56.884Z,...,4.053149e+06,2022-06-11 01:43:48,0.994575,-77.105487,112.471082,89.999995,152.784236,66.888489,4.399493,153.020864
561,anaghagaikwad,2727779.0,1cf185278a4f2619f430,26820,Are Clouds Present?,40.87,2025-01-02 05:25:12,104284523,222e226dfa8ed58b5b5fefbfa6022f3130f4e6568db9a3...,2025-01-02T05:15:09.560Z,...,4.053149e+06,2022-06-11 01:43:48,0.994575,-77.105487,112.471082,89.999995,152.784236,66.888489,4.399493,155.098311
562,anaghagaikwad,2727779.0,1cf185278a4f2619f430,26820,Are Clouds Present?,40.87,2025-01-02 05:25:12,104284523,222e226dfa8ed58b5b5fefbfa6022f3130f4e6568db9a3...,2025-01-02T05:15:09.560Z,...,4.319829e+06,2022-06-11 01:27:26,-64.259239,-84.019874,83.148341,13.972537,173.793680,79.427935,3.702083,0.076579
563,anaghagaikwad,2727779.0,1cf185278a4f2619f430,26820,Are Clouds Present?,40.87,2025-01-02 05:25:12,104284523,222e226dfa8ed58b5b5fefbfa6022f3130f4e6568db9a3...,2025-01-02T05:15:09.560Z,...,4.053149e+06,2022-06-11 01:43:48,-9.530926,-77.834082,108.504351,78.964228,153.325381,66.655970,4.350649,0.148963


In [99]:
MIN_POINTS = 0
MIN_UNIQUE_USERS = 5

def get_cluster_dataframe(points, cloud_types, weights, user_names, orbit_no, original_df):
    """Identify clusters and create dataframe with weighted dominant type calculation."""
    if len(points) < MIN_POINTS:
        return pd.DataFrame()
    
    X = np.array(points)
    cloud_types = np.array(cloud_types)
    weights = np.array(weights)
    user_names = np.array(user_names)
    
    radius_rules = {
        'Disk Clouds': 30,
        'Dotted Clouds': 30,
        'Dotted Cloud': 30,
        'Streak Clouds': 50,
        'Streak Cloud': 50,
        'Vortex Clouds': 50,
        'Twilight Clouds': 50,
        'Gravity Waves': 30
    }
    
    point_radii = np.array([radius_rules.get(ctype.strip(), 30) * weights[i] 
                         for i, ctype in enumerate(cloud_types)])
    
    labels = np.full(len(X), -1)
    current_label = 0
    cluster_data = []
    
    for i in range(len(X)):
        if labels[i] != -1:
            continue
            
        distances = np.sqrt(np.sum((X - X[i])**2, axis=1))
        neighbors = np.where((distances <= point_radii[i]) & (labels == -1))[0]
        
        if len(neighbors) >= MIN_POINTS:
            unique_users = list(set(user_names[neighbors]))
            if len(unique_users) >= MIN_UNIQUE_USERS:
                # Calculate weighted votes for each cloud type
                weighted_votes = {}
                for idx in neighbors:
                    ctype = cloud_types[idx]
                    weight = weights[idx]
                    weighted_votes[ctype] = weighted_votes.get(ctype, 0) + weight
                if weighted_votes:
                    # Get dominant type based on weighted votes
                    dominant_type = max(weighted_votes.items(), key=lambda x: x[1])[0]
                    total_weight = sum(weighted_votes.values())
                    confidence = weighted_votes[dominant_type] / total_weight
                    dominant_radius = radius_rules.get(dominant_type, 30)
                    
                    labels[neighbors] = current_label
                    center = np.mean(X[neighbors], axis=0)
                    
                    # Generate cluster name (A, B, ..., Z, AA, AB, etc.)
                    cluster_name = ''
                    n = current_label
                    while n >= 0:
                        cluster_name = chr(ord('A') + n % 26) + cluster_name
                        n = n // 26 - 1
                    
                    # Create cluster info dictionary
                    cluster_data.append({
                        'orbit_no': orbit_no,
                        'cluster_name': cluster_name,
                        'cluster_center_x': center[0],
                        'cluster_center_y': center[1],
                        'cluster_cloud_type': dominant_type,
                        'cluster_user_count': len(unique_users),
                        'cluster_confidence': confidence,  # Weighted confidence
                        'cluster_radius': dominant_radius,
                        'cluster_total_weight': total_weight,  # Added total weight info
                        'cluster_weighted_votes': weighted_votes
                    })
            
                    current_label += 1
    
    return pd.DataFrame(cluster_data)

def process_orbit_data(parsed_file, orbit_no):
    """
    Process data for a specific orbit and return combined DataFrame.
    """
    df = parsed_file.loc[parsed_file['subject_data.orbit'] == orbit_no].copy()
    
    cols = ['ID', 'user_name', 'annotations_1.value.x', 'annotations_1.value.y', 
            'annotations_1.value.tool_label', 'weight']
    df = df[cols][df[cols].notnull().all(1)]
    df['Cloud type'] = df['annotations_1.value.tool_label'].str.split('!').str[0].str.strip()
    df['annotations_1.value.y'] = df['annotations_1.value.y']
    df['weight'] = pd.to_numeric(df['weight'])
    
    points = df[['annotations_1.value.x', 'annotations_1.value.y']].values
    cloud_types = df['Cloud type'].values
    weights = df['weight'].values
    user_names = df['user_name'].values
    
    return get_cluster_dataframe(points, cloud_types, weights, user_names, orbit_no, df)

def process_all_orbits(parsed_file, orbit_list, logged_in_users=True):
    """
    Process multiple orbits and return concatenated DataFrame with all data.
    """
    all_data = []
    
    for orbit_no in orbit_list:
        orbit_data = process_orbit_data(parsed_file, orbit_no)
        if not orbit_data.empty:
            all_data.append(orbit_data)
    
    if all_data:
        return pd.concat(all_data, ignore_index=True)
    return pd.DataFrame()

# Usage example:
# orbit_list = [16570, 16571, 16572]  # Your orbit numbers
all_clusters_df = process_all_orbits(classifications_with_coordinates, orbit_list)
all_clusters_df.to_csv('cloud_clusters.csv', index=False)

In [101]:
all_clusters_df

Unnamed: 0,orbit_no,cluster_name,cluster_center_x,cluster_center_y,cluster_cloud_type,cluster_user_count,cluster_confidence,cluster_radius,cluster_total_weight,cluster_weighted_votes
0,17959,A,295.260276,363.715119,Streak Clouds,5,0.867347,50,98,"{'Streak Clouds': 85, 'Gravity Waves': 12, 'Vo..."
1,17959,B,782.181671,347.696518,Dotted Clouds,5,0.409091,30,22,"{'Dotted Clouds': 9, 'Gravity Waves': 4, 'Stre..."
2,16570,A,447.313565,669.173754,Disk Clouds,5,0.674419,30,86,"{'Disk Clouds': 58, 'Streak Clouds': 6, 'Other..."
3,16570,B,439.710783,520.007529,Streak Clouds,6,0.652174,50,69,"{'Vortex Clouds': 5, 'Twilight Clouds': 2, 'St..."


In [103]:
import pandas as pd
import numpy as np

def join_closest(df_clusters, df_classifications):
    # Create an empty list to store the joined rows
    joined_rows = []
    
    for _, cluster_row in df_clusters.iterrows():
        orbit = cluster_row['orbit_no']
        center_x = cluster_row['cluster_center_x']
        center_y = cluster_row['cluster_center_y']
        
        # Filter classifications for this orbit
        classifications = df_classifications[df_classifications['subject_data.orbit'] == orbit].copy()
        
        if not classifications.empty:
            # Calculate distance from cluster center to each point
            classifications['distance'] = np.sqrt(
                (classifications['annotations_1.value.y'] - center_x)**2 + 
                (classifications['annotations_1.value.x'] - center_y)**2
            )
            
            # Find the row with minimum distance
            closest_row = classifications.loc[classifications['distance'].idxmin()]
            
            # Combine cluster and classification data
            joined_row = {**cluster_row.to_dict(), **closest_row.to_dict()}
            joined_rows.append(joined_row)
    
    # Create DataFrame from joined rows
    return pd.DataFrame(joined_rows)

# Usage:
result_df = join_closest(all_clusters_df, classifications_with_coordinates)

In [105]:
with pd.option_context('display.max_columns', 80):
    print(result_df)

   orbit_no cluster_name  cluster_center_x  cluster_center_y  \
0     17959            A        295.260276        363.715119   
1     17959            B        782.181671        347.696518   
2     16570            A        447.313565        669.173754   
3     16570            B        439.710783        520.007529   

  cluster_cloud_type  cluster_user_count  cluster_confidence  cluster_radius  \
0      Streak Clouds                   5            0.867347              50   
1      Dotted Clouds                   5            0.409091              30   
2        Disk Clouds                   5            0.674419              30   
3      Streak Clouds                   6            0.652174              50   

   cluster_total_weight                             cluster_weighted_votes  \
0                    98  {'Streak Clouds': 85, 'Gravity Waves': 12, 'Vo...   
1                    22  {'Dotted Clouds': 9, 'Gravity Waves': 4, 'Stre...   
2                    86  {'Disk Clouds': 58,

In [107]:
result_df.to_csv('cloud_clusters_enchanced.csv', index=False)