In [34]:
import pandas as pd

In [36]:
def combine_columns(input_file, output_file):
    combined_lines = []
    
    with open(input_file, 'r') as f:
        for line in f:
            values = [val.strip() for val in line.split(',')]
            combined = ' '.join(val for val in values if val)
            combined_lines.append(combined)
    
    with open(output_file, 'w') as f:
        for line in combined_lines:
            f.write(f"{line}\n")
    
    return combined_lines

In [37]:
input_path = "../original data/iac_exoplanet_atmospheres-20241215.csv"
output_path = "../preprocessed_data/planet_atmosphere.csv"
combine_columns(input_path, output_path)

['name;planet_status;mass;radius;type;orbital_period;semi_major_axis;star_name;star_distance;star_teff;star_radius;temp_calculated;mag_v;mag_j;mag_k;scale_factor;tsm;esm;alternate_names;updated;observation_type;reference;molecules;albedo;phase_curve;comments',
 '"2M 0103-55 (AB) b";"Confirmed";13.0;9.7604677202041;"Jupiter";;84.0;"2M 0103-55 (AB)";47.2;;;;;7.5;;1.15;;;"Delorme 1 (AB)";"8/12/22";"G-B: High resolution visible";2212.03207;"{"H":"Detection"}";;"No";"UVES/VLT. H emission lines and tentative He I Ca II H/K emission lines"',
 '"2M0437 b";"Confirmed";4.0;4.8793589717681;"Hot Jupiter";;118.0;"2M0437";128.1;3100.0;0.84;1450.0;15.7;11.2980000;10.3860000;1.15;1668.9209;4695.6494;;"10/19/21";"G-B: High resolution IR";2308.13745;"{"CO":"Detection" "CH4":"Detection" "H2O":"Detection"}";;"No";',
 '"51 Eri b";"Confirmed";2.6;1.11;"Warm Jupiter";10260.0;11.1;"51 Eri";29.4;;;700.0;5.223;4.7440000;4.5370000;1.15;;;;"8/30/22";"G-B: High resolution IR";2211.14330;"{"Molecular ratios: C/O et

In [38]:
def process_columns(input_file, output_file):
    df = pd.read_csv(input_file, header=None)
    original_column = df.columns[0]
    column_names = df.iloc[0][original_column].split(';')
    num_columns = len(column_names)
    
    # Process each row individually to handle missing values
    rows_list = []
    for row in df[original_column].iloc[1:]:
        # Split the row by semicolon
        values = row.split(';')
        # If row has fewer values than columns, extend with None
        if len(values) < num_columns:
            values.extend([None] * (num_columns - len(values)))
        # If row has more values than columns, truncate
        elif len(values) > num_columns:
            values = values[:num_columns]
        rows_list.append(values)
    
    # Create DataFrame from processed rows
    data_rows = pd.DataFrame(rows_list, columns=column_names)
    
    # Keep only specified columns if they exist
    selected_columns = ['name', 'mass', 'radius', 'orbital_period', 'tsm', 'observation_type', 'reference','molecules']
    existing_columns = [col for col in selected_columns if col in data_rows.columns]
    
    if len(existing_columns) < len(selected_columns):
        missing_columns = set(selected_columns) - set(existing_columns)
        print(f"WARNING: Missing columns: {missing_columns}")
    
    filtered_data = data_rows[existing_columns]

    filtered_data = filtered_data.rename(columns={'name': 'pl_name'})
    
    # Convert 'tsm' to numeric, handling any errors
    if 'tsm' in filtered_data.columns:
        filtered_data['tsm'] = pd.to_numeric(filtered_data['tsm'], errors='coerce')
    
    filtered_data.to_csv(output_file, index=False)
    
    return

In [39]:
process_columns(output_path, output_path)

Merge data from two sources

In [36]:
def merge_csv_files(file1_path, file2_path, output_path, columns_to_keep=None):
    df1 = pd.read_csv(file1_path)
    df2 = pd.read_csv(file2_path)
    
    if columns_to_keep:
        df2 = df2[columns_to_keep]
    
    merged_df = df1.merge(df2, on='pl_name', how='left')
    merged_df.to_csv(output_path, index=False)

    return

In [None]:
file1_path = '/Users/ghj/Desktop/Databases/final project/preprocessed_data/planet_atmosphere.csv'
file2_path = '/Users/ghj/Desktop/Databases/final project/original data/PS_2024.12.13_18.57.54.csv'
output_path = '/Users/ghj/Desktop/Databases/final project/preprocessed_data/planet_and_system.csv'
columns_to_keep = ['pl_name', 'hostname', 'sy_snum', 'sy_pnum', 'discoverymethod', 'disc_year', 'disc_refname', 'disc_pubdate', 'disc_locale', 'disc_facility', 'disc_telescope', 'disc_instrument']
merge_csv_files(file1_path, file2_path, output_path, columns_to_keep)

  df2 = pd.read_csv(file2_path)
