In [5]:
import pandas as pd
import numpy as np

In [None]:


# 1. Separate the Default Rows from the rest
# We keep the full dataset to 'harvest' data from
df_full = pd.read_csv('nasaconfirmedplanets.csv') 

# Create our working dataframe (only default solutions)
df_default = df_full[df_full['default_flag'] == 1].copy()

# 2. Define the Harvesting Function
def harvest_from_non_default(df_target, df_source):
    # Get a list of all columns except the name and flag
    cols_to_fix = [c for c in df_target.columns if c not in ['pl_name', 'default_flag']]
    
    # We group the source data by planet name to make searching faster
    source_grouped = df_source.groupby('pl_name')

    for col in cols_to_fix:
        # Identify which planets have a null in this specific column
        null_mask = df_target[col].isnull()
        planets_with_nulls = df_target.loc[null_mask, 'pl_name'].unique()

        for planet in planets_with_nulls:
            # Look into the source group for this planet
            if planet in source_grouped.groups:
                potential_values = source_grouped.get_group(planet)[col]
                
                # Pick the first non-null value available in other rows
                actual_value = potential_values.dropna().iloc[0] if not potential_values.dropna().empty else None
                
                if actual_value is not None:
                    # Patch the value into the target dataframe
                    df_target.loc[(df_target['pl_name'] == planet) & (df_target[col].isnull()), col] = actual_value

    return df_target

# 3. Execute the Harvest
df_cleaned = harvest_from_non_default(df_default, df_full)

# 4. Check results
# print(f"Nulls before harvesting: {df_default['pl_masse'].isnull().sum()}")
# print(f"Nulls after harvesting: {df_cleaned['pl_masse'].isnull().sum()}")
print(f"Final shape of high-accuracy dataset: {df_default.shape}")
print(f"Nulls remaining in pl_masse: {df_cleaned.isnull().sum().sum()}")

  df_full = pd.read_csv('nasaconfirmedplanets.csv')


Final shape of high-accuracy dataset: (6065, 289)
Nulls remaining in pl_masse: 635881


In [4]:


# 1. Read the full dataset (reverting your previous drop)
df = pd.read_csv('nasaconfirmedplanets.csv')

# 2. Optimized "Internal Harvest" Logic
# We sort so that default_flag=1 is NOT necessarily the first, 
# but we ensure we preserve the planet groups.
# 'first' finds the first non-null value within each planet group.
harvested_data = df.groupby('pl_name').first()

# 3. Create our final "Default" set
# We filter the full dataset for the default rows
df_default = df[df['default_flag'] == 1].copy()

# 4. Perform the "Patch": Fill nulls in df_default using the harvested data
# We set the index to pl_name to align the rows perfectly
df_default.set_index('pl_name', inplace=True)

# This single line fills EVERY null in the default rows using ANY available 
# data from the non-default rows for that specific planet.
df_default.fillna(harvested_data, inplace=True)

# 5. Reset index to bring pl_name back as a column
df_final = df_default.reset_index()

# Check Results
print(f"Final shape of high-accuracy dataset: {df_final.shape}")
print(f"Nulls remaining in pl_masse: {df_final.isnull().sum().sum()}")

  df = pd.read_csv('nasaconfirmedplanets.csv')


Final shape of high-accuracy dataset: (6065, 289)
Nulls remaining in pl_masse: 635881
