In [6]:
import pandas as pd
import numpy as np
import re

def clean_numeric_string(s):
    """Clean numeric string by replacing Unicode minus with ASCII minus"""
    return s.replace('−', '-').strip()

def magnitude_to_distance(dm, e_dm):
    """Convert distance modulus to distance in Mpc with error propagation"""
    # Distance modulus: dm = 5*log10(d) - 5, where d is in pc
    # Therefore: d = 10^((dm + 5)/5) pc = 10^((dm - 25)/5) Mpc
    if pd.isna(dm):
        return np.nan, np.nan
    
    distance_mpc = 10**((dm - 25) / 5)
    

    if pd.isna(e_dm):
        error_mpc = np.nan
    else:
        error_mpc_min = distance_mpc - 10**((dm - e_dm - 25) / 5)  
        error_mpc_max = 10**((dm + e_dm - 25) / 5) - distance_mpc
    
    return distance_mpc, error_mpc_min, error_mpc_max

def process_hyperleda_data(filename="EDD_data.txt"):
    """Process the HyperLEDA data from file and create a structured DataFrame"""
    
    try:
        # Read data from file
        with open(filename, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        print(f"Successfully loaded data from {filename}")
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        print("Please make sure the file is in the same directory as this script.")
        return None
    except Exception as e:
        print(f"Error reading file: {e}")
        return None
    
    # Find the data lines (skip headers and metadata)
    data_lines = []
    for line in lines:
        line = line.strip()
        # Skip empty lines, headers, and metadata lines
        if (line and 
            not line.startswith('HYPERLEDA') and 
            not line.startswith('pgc,') and 
            not line.startswith('%') and 
            not line.startswith('---') and 
            not line.startswith('"The Catalogue') and
            not line.startswith('"') and
            line[0].isdigit()):
            data_lines.append(line)
    
    print(f"Found {len(data_lines)} data rows")
    
    # Initialize lists to store processed data
    data = {
        'Name': [],
        'PGC': [],
        'RA': [],
        'Dec': [],
        'Dis': [],
        'e_dis_min': [],
        'e_dis_max': [],
        'ref_dis': [],
        'V_h': [],
        'e_V_h': [],
        'ref_V_h': [],
        'Source': [],
        'Type': []
    }
    
    for line in data_lines:
        try:
            # Split by comma
            parts = [part.strip() for part in line.split(',')]
            
            if len(parts) < 10:
                print(f"Skipping line with insufficient columns: {line}")
                continue
            
            # Extract data according to the mapping:
            # 1st: PGC number
            # 2nd: v_h (heliocentric velocity)
            # 3rd: e_v_h (velocity error)
            # 4th: name
            # 5th, 6th: ignored (RA2000, DEC2000 in h:m:s format)
            # 7th: magnitude (distance modulus)
            # 8th: e_mag (magnitude error)
            # 9th: RA (in degrees)
            # 10th: Dec (in degrees)
            
            pgc_num = parts[0]
            v_h = float(clean_numeric_string(parts[1]))
            e_v_h = float(clean_numeric_string(parts[2]))
            
            # Skip rows where v_h or e_v_h are 0
            if v_h == 0 or e_v_h == 0:
                print(f"Skipping row with v_h={v_h} or e_v_h={e_v_h}: {name if 'name' in locals() else 'unknown'}")
                continue
            
            name = parts[3]
            # Skip parts[4] and parts[5] (RA2000, DEC2000 in h:m:s format)
            dm = float(clean_numeric_string(parts[6]))  # Distance modulus
            e_dm = float(clean_numeric_string(parts[7]))  # Error in distance modulus
            ra = float(clean_numeric_string(parts[8]))
            dec = float(clean_numeric_string(parts[9]))
            
            # Convert distance modulus to distance in Mpc
            dis, e_dis_min, e_dis_max = magnitude_to_distance(dm, e_dm)
            
            # Append to data dictionary
            data['Name'].append(name)
            data['PGC'].append(f"PGC{pgc_num}")
            data['RA'].append(ra)
            data['Dec'].append(dec)
            data['Dis'].append(dis)
            data['e_dis_min'].append(e_dis_min)
            data['e_dis_max'].append(e_dis_max)            
            data['ref_dis'].append(-1)  # Default reference (not provided in this dataset)
            data['V_h'].append(v_h)
            data['e_V_h'].append(e_v_h)
            data['ref_V_h'].append(-1)  # Default reference (not provided in this dataset)
            data['Source'].append('TRGB')  # Source is TRGB measurements
            data['Type'].append('Gxy')  # General galaxy type (not specified in dataset)
            
        except (ValueError, IndexError) as e:
            print(f"Error processing line: {line}")
            print(f"Error: {e}")
            continue
    
    # Create DataFrame
    df = pd.DataFrame(data)
    return df

def main():
    """Main function to process data and display results"""
    
    # Process the data from file
    df = process_hyperleda_data("EDD_data.txt")
    
    if df is None:
        return None
    
    # Display basic information about the dataset
    print("EDD Data Processing Complete!")
    print(f"Number of objects: {len(df)}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst 5 rows:")
    print(df.head())
    
    print("\nDataset Info:")
    print(df.info())
    
    print("\nBasic Statistics:")
    print(df.describe())
    
    # Display some specific statistics
    print(f"\nDistance range: {df['Dis'].min():.2f} - {df['Dis'].max():.2f} Mpc")
    print(f"Velocity range: {df['V_h'].min():.0f} - {df['V_h'].max():.0f} km/s")
    print(f"Unique PGC entries: {df['PGC'].nunique()}")
    
    # Save to CSV file
    df.to_csv('EDD_data.csv', index=False)
    print("\nData saved to 'EDD_data.csv'")
    
    return df

# Run the script
if __name__ == "__main__":
    df = main()
    
    # Optional: Display the full dataframe
    if df is not None:
        print("\nFull Dataset:")
        print(df.to_string(index=False))

Successfully loaded data from EDD_data.txt
Found 149 data rows
Skipping row with v_h=0.0 or e_v_h=0.0: NGC3368
Skipping row with v_h=0.0 or e_v_h=0.0: NGC3368
Skipping row with v_h=0.0 or e_v_h=0.0: PGC4689210
Skipping row with v_h=0.0 or e_v_h=0.0: NGC3377
Skipping row with v_h=0.0 or e_v_h=0.0: ESO444-078
Skipping row with v_h=0.0 or e_v_h=0.0: NGC5264
Skipping row with v_h=0.0 or e_v_h=0.0: IC4247
Skipping row with v_h=0.0 or e_v_h=0.0: ESO376-016
Skipping row with v_h=0.0 or e_v_h=0.0: ESO318-013
Skipping row with v_h=0.0 or e_v_h=0.0: PGC166152
Skipping row with v_h=0.0 or e_v_h=0.0: ESO324-024
Skipping row with v_h=0.0 or e_v_h=0.0: ESO324-024
Skipping row with v_h=0.0 or e_v_h=0.0: ESO324-024
Skipping row with v_h=0.0 or e_v_h=0.0: ESO324-024
Skipping row with v_h=0.0 or e_v_h=0.0: ESO324-024
Skipping row with v_h=0.0 or e_v_h=0.0: ESO324-024
Skipping row with v_h=0.0 or e_v_h=0.0: ESO324-024
Skipping row with v_h=0.0 or e_v_h=0.0: ESO324-024
Skipping row with v_h=0.0 or e_v_h=0

In [2]:
import pandas as pd
import numpy as np

# Load file, skipping metadata rows
file_path = 'EDD_data_TRGB.txt'
df = pd.read_csv(file_path, skiprows=5, header=None)

# Assign column names manually
df.columns = [
    'pgc', 'v', 'e_v', 'objname', 'RA2000', 'DEC2000', 'pgc_group',
    'D_first', 'DMop', 'eDMop', 'DMir', 'eDMir', 'RAJ_deg', 'DeJ_deg'
]

# Remove rows where v == 0 or e_v == 0
df = df[(df['v'] != 0) & (df['e_v'] != 0)]

# Function to choose best DM and compute D, e_D
def compute_distance(row):
    DMop = row['DMop']
    eDMop = row['eDMop']
    DMir = row['DMir']
    eDMir = row['eDMir']

    # Pick modulus with smaller uncertainty, if both are available
    if pd.notna(DMir) and pd.notna(eDMir) and (pd.isna(eDMop) or eDMir < eDMop):
        DM = DMir
        eDM = eDMir
    else:
        DM = DMop
        eDM = eDMop

    # Compute distance and error
    D = 10 ** ((DM - 25) / 5)
    e_D_min = D - 10**((DM - eDM - 25) / 5)
    e_D_max = 10**((DM + eDM - 25) / 5) - D
    return pd.Series([D, e_D_min,e_D_max])

# Apply function
df[['Dis', 'e_Dis_min','e_Dis_max']] = df.apply(compute_distance, axis=1)

# Convert PGC number to "PGC<number>" format
df['pgc'] = df['pgc'].apply(lambda x: f"PGC{int(x)}")

# Select final columns and rename
final_df = df[['objname', 'pgc', 'RAJ_deg', 'DeJ_deg', 'v', 'e_v', 'Dis', 'e_Dis_min','e_Dis_max']].copy()
final_df.columns = ['Name', 'PGC', 'Ra (en degrees)', 'Dec (en degrees)', 'V_h', 'e_V_h', 'Dis', 'e_Dis_min','e_Dis_max']

# Save to CSV
final_df.to_csv('EDD_data_TRGB.csv', index=False)
print("Saved to EDD_data_TRGB.csv")


Saved to EDD_data_TRGB.csv
