In [4]:
import pandas as pd
import numpy as np

def clean_numeric_string(s):
    """Normalise a numeric text field so that ``float()`` can parse it.

    Strings have every Unicode minus sign (U+2212) replaced by an ASCII
    hyphen-minus and surrounding whitespace removed. Any non-string value
    is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    with_ascii_minus = s.replace('−', '-')
    return with_ascii_minus.strip()

def process_updated_data(filename="updated_data.txt"):
    """Parse a tab-separated data file into a pandas DataFrame.

    The first line of the file is treated as a header and skipped, as are
    blank lines. Every remaining line must contain at least 10 tab-separated
    fields; shorter lines are reported and skipped. Fields are read by
    position:

    * 0      comma-separated names; the first token starting with ``"PGC"``
             becomes the ``PGC`` column, the remaining tokens are re-joined
             into ``Name``
    * 1, 2   ``RA`` and ``Dec`` (required floats)
    * 3, 4   ``V_h`` and ``e_V_h`` (required floats)
    * 5      ``ref_V_h`` (kept as text)
    * 6, 7   optional floats; validated but not stored
    * 8, 9   ``Dis`` and ``e_dis`` (optional floats, NaN when empty)
    * 10     distance-method text; ``Source`` is ``"Cep"`` when it contains
             "cep" (case-insensitive), otherwise ``"TRGB"``
    * 11     ``ref_dis`` (``"-1"`` when the column is absent)

    A line on which any numeric field cannot be converted is reported and
    skipped.

    Args:
        filename: Path of the UTF-8 encoded input file.

    Returns:
        A DataFrame with one row per successfully parsed line and a fixed
        set of columns (present even when no row was parsed), or ``None``
        if the file could not be read.
    """
    # Fixed schema so that an input without valid rows still produces a
    # DataFrame (and therefore a CSV header) with the expected columns.
    columns = [
        'Name', 'PGC', 'RA', 'Dec', 'Dis', 'e_dis', 'ref_dis',
        'V_h', 'e_V_h', 'ref_V_h', 'Source', 'Type',
    ]

    try:
        with open(filename, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path such as an embedded NUL.
        print(f"Error reading file: {e}")
        return None

    def required_float(field):
        return float(clean_numeric_string(field))

    def optional_float(field):
        return required_float(field) if field else np.nan

    data = []

    for line in lines[1:]:  # lines[0] is the header
        if not line.strip():
            continue

        # Remove only the line terminator before splitting: stripping all
        # whitespace would also drop trailing tabs and make rows with empty
        # trailing fields look too short. Each field is stripped on its own
        # so that whitespace-only fields count as empty.
        parts = [field.strip() for field in line.rstrip('\r\n').split('\t')]
        if len(parts) < 10:
            print(f"Skipping line with too few columns: {line}")
            continue

        # Parse names and PGC
        name_tokens = [token.strip() for token in parts[0].split(',')]
        pgc = next((t for t in name_tokens if t.startswith("PGC")), None)
        name_only = ','.join(t for t in name_tokens if not t.startswith("PGC"))

        try:
            ra = required_float(parts[1])
            dec = required_float(parts[2])
            v_h = required_float(parts[3])
            e_v_h = required_float(parts[4])
            # Columns 6 and 7 are not part of the output, but a malformed
            # value there still rejects the whole line.
            optional_float(parts[6])
            optional_float(parts[7])
            dis = optional_float(parts[8])
            e_dis = optional_float(parts[9])
        except ValueError as ve:
            print(f"ValueError on line: {line}")
            print(f"Error: {ve}")
            continue

        f_dis = parts[10] if len(parts) > 10 else ""
        ref_dis = parts[11] if len(parts) > 11 else "-1"
        ref_v = parts[5]  # always present: len(parts) >= 10 was checked above

        # Source determination
        source = "Cep" if "cep" in f_dis.lower() else "TRGB"

        data.append({
            'Name': name_only,
            'PGC': pgc,
            'RA': ra,
            'Dec': dec,
            'Dis': dis,
            'e_dis': e_dis,
            'ref_dis': ref_dis,
            'V_h': v_h,
            'e_V_h': e_v_h,
            'ref_V_h': ref_v,
            'Source': source,
            'Type': 'Gxy',
        })

    df = pd.DataFrame(data, columns=columns)
    print(f"Processed {len(df)} rows.")
    return df

def main():
    """Convert ``updated_data.txt`` to ``updated_data.csv`` and preview it.

    Does nothing further when ``process_updated_data`` returns ``None``;
    otherwise the result is written without the index column and its
    first rows are printed.
    """
    frame = process_updated_data("updated_data.txt")
    if frame is None:
        return

    frame.to_csv("updated_data.csv", index=False)
    print("Saved to 'updated_data.csv'")
    print(frame.head())

# Run the conversion only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()



Processed 63 rows.
Saved to 'updated_data.csv'
         Name       PGC          RA        Dec   Dis  e_dis ref_dis    V_h  \
0     E059-01  PGC21199  112.830417 -68.186111  4.57   0.36    K06a  530.0   
1       N2915  PGC26761  141.547917 -76.626389  3.78   0.43     CNG  465.0   
2  SexB,DDO70  PGC28913  150.000417   5.332222  1.36   0.07     CNG  302.0   
3       N3109    PGC128  150.780000 -26.160000  1.33   0.08     CNG  403.0   
4      Antlia  PGC29194  151.016667 -27.331944  1.28   0.13     T06  361.0   

   e_V_h ref_V_h Source Type  
0    4.0           TRGB  Gxy  
1    3.0           TRGB  Gxy  
2    2.0           TRGB  Gxy  
3    2.0           TRGB  Gxy  
4    3.0           TRGB  Gxy  
