In [8]:
import pandas as pd
import numpy as np
import re

def clean_numeric_string(s):
    """Return *s* trimmed of surrounding whitespace, with every Unicode
    minus sign swapped for the ASCII hyphen-minus so that float()/int()
    can parse the result."""
    trimmed = s.strip()
    return trimmed.replace('−', '-')

def parse_distance_error(dist_str):
    """Split a distance string such as '3.15±0.09' into (distance, error).

    The text is first normalised with clean_numeric_string. When it holds
    no '±' separator, the whole string is taken as the distance and the
    error is reported as NaN.

    Returns:
        A (float, float) tuple: the distance and its error (or NaN).

    Raises:
        ValueError: if a numeric piece cannot be converted by float().
    """
    cleaned = clean_numeric_string(dist_str)

    # No separator: only a central value is available.
    if '±' not in cleaned:
        return float(cleaned), np.nan

    pieces = cleaned.split('±')
    value_text, error_text = pieces[0], pieces[1]
    return float(value_text), float(error_text)

def parse_velocity_error(vel_str):
    """Split a velocity string such as '744±2' into (velocity, error).

    The text is first normalised with clean_numeric_string. When it holds
    no '±' separator, the whole string is taken as the velocity and the
    error is reported as NaN.

    Returns:
        A (float, float) tuple: the velocity and its error (or NaN).

    Raises:
        ValueError: if a numeric piece cannot be converted by float().
    """
    text = clean_numeric_string(vel_str)
    fields = text.split('±')

    # split() yields more than one field exactly when '±' occurs in the text.
    if len(fields) > 1:
        return float(fields[0]), float(fields[1])
    return float(text), np.nan

def process_astronomical_data(filename="CenA_dwarfs.txt"):
    """Read a delimited text table from *filename* and return it as a DataFrame.

    Each non-blank line is expected to carry ten fields, in this order:
    name, PGC identifier, RA, Dec, distance (optionally 'value±error'),
    distance reference number, velocity (optionally 'value±error'),
    velocity reference number, source and object type. Fields are separated
    by tabs when the line contains a tab, otherwise by runs of two or more
    whitespace characters. A first line containing 'Nom' or 'Name' is
    treated as a header and dropped.

    The single parsed distance error is stored in both 'e_dis_min' and
    'e_dis_max'.

    Returns:
        A pandas DataFrame with one row per parsed line, or None when the
        file cannot be read (a message is printed in that case).

    Raises:
        IndexError: if a line has fewer fields than expected.
        ValueError: if a numeric field cannot be converted.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
        print(f"Successfully loaded data from {filename}")
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        print("Please make sure the file is in the same directory as this script.")
        return None
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

    # Keep only non-blank lines, each trimmed of surrounding whitespace.
    trimmed_lines = (chunk.strip() for chunk in raw_text.strip().split('\n'))
    rows = [chunk for chunk in trimmed_lines if chunk]

    # Drop a leading header row when one is recognised.
    if rows and any(marker in rows[0] for marker in ('Nom', 'Name')):
        rows = rows[1:]

    # Output columns, in the order they appear in the resulting DataFrame.
    columns = (
        'Name', 'PGC', 'RA', 'Dec',
        'Dis', 'e_dis_min', 'e_dis_max', 'ref_dis',
        'V_h', 'e_V_h', 'ref_V_h',
        'Source', 'Type',
    )
    table = {column: [] for column in columns}

    for row in rows:
        # Tab-separated when a tab is present; otherwise split on runs of
        # two or more whitespace characters.
        fields = row.split('\t') if '\t' in row else re.split(r'\s{2,}', row)

        # Fields are converted strictly left to right so that a malformed
        # row fails on its first offending field.
        name = fields[0].strip()
        pgc = fields[1].strip() or None  # empty identifier is stored as None
        ra = float(clean_numeric_string(fields[2]))
        dec = float(clean_numeric_string(fields[3]))

        distance, distance_err = parse_distance_error(fields[4])
        distance_ref = int(fields[5])

        velocity, velocity_err = parse_velocity_error(fields[6])
        velocity_ref = int(fields[7])

        source = fields[8].strip()
        obj_type = fields[9].strip()

        record = (
            name, pgc, ra, dec,
            distance, distance_err, distance_err, distance_ref,
            velocity, velocity_err, velocity_ref,
            source, obj_type,
        )
        for column, value in zip(columns, record):
            table[column].append(value)

    return pd.DataFrame(table)

def main():
    """Process 'CenA_dwarfs.txt', print a summary, and export the table to CSV.

    The parsed table is written to 'CenA_dwarfs.csv' in the current working
    directory (without the index column).

    Returns:
        The DataFrame returned by process_astronomical_data, or None when
        that function reports that the input file could not be loaded.
    """
    output_path = 'CenA_dwarfs.csv'

    # Process the data from file
    df = process_astronomical_data("CenA_dwarfs.txt")

    if df is None:
        return None

    # Display basic information about the dataset
    print("Astronomical Data Processing Complete!")
    print(f"Number of objects: {len(df)}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nDataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()

    print("\nBasic Statistics:")
    print(df.describe())

    # Display some specific statistics
    print(f"\nDistance range: {df['Dis'].min():.2f} - {df['Dis'].max():.2f} Mpc")
    print(f"Velocity range: {df['V_h'].min():.0f} - {df['V_h'].max():.0f} km/s")
    print(f"Object types: {df['Type'].unique()}")
    print(f"Sources: {df['Source'].unique()}")

    # Save to CSV file and report the path that was actually written
    df.to_csv(output_path, index=False)
    print(f"\nData saved to '{output_path}'")

    return df

# Run the script
if __name__ == "__main__":
    df = main()

    # main() returns None when the input file could not be loaded; in that
    # case there is no table to display.
    if df is not None:
        # Optional: Display the full dataframe
        print("\nFull Dataset:")
        print(df.to_string(index=False))

    


Successfully loaded data from CenA_dwarfs.txt
Astronomical Data Processing Complete!
Number of objects: 28
Columns: ['Name', 'PGC', 'RA', 'Dec', 'Dis', 'e_dis_min', 'e_dis_max', 'ref_dis', 'V_h', 'e_V_h', 'ref_V_h', 'Source', 'Type']

First 5 rows:
         Name        PGC        RA      Dec   Dis  e_dis_min  e_dis_max  \
0  ESO269-037  PGC045916  195.8875 -46.5842  3.15       0.09       0.09   
1     NGC4945  PGC045279  196.3583 -49.4711  3.72       0.03       0.03   
2  ESO269-058  PGC045717  197.6333 -46.9908  3.75       0.02       0.02   
3       KK189  PGC166158  198.1883 -41.8320  4.21       0.17       0.17   
4  ESO269-066  PGC045916  198.2875 -44.8900  3.75       0.03       0.03   

   ref_dis    V_h  e_V_h  ref_V_h Source  Type  
0       -1  744.0    2.0       -2     HI  dIrr  
1       -3  563.0    3.0       -4     HI   Scd  
2       -1  400.0   18.0       -5     HI  dIrr  
3       -6  753.0    4.0       -7  stars  dSph  
4       -1  784.0   31.0       -8  stars  dSph  

Datas