In [1]:
import pandas as pd  #Importing pandas
file_path="/kaggle/input/nasa-exoplanet/nasa_exoplanet.csv"   #Giving file path to kaggle notebook
df=pd.read_csv(file_path,comment='#',engine="python",on_bad_lines="skip")
## comment ---removes header lines starting with '#'
## engine=Makes reading flexible for irregular NASA CSV formatting.
#on_bad_lines="skip"?  Skips corrupted or incomplete rows safely.
needed = [
    "pl_rade", "pl_bmasse", "pl_dens", "pl_eqt",
    "pl_orbper", "sy_dist", "st_teff", 
    "st_lum", "st_spectype", "st_metfe",
    "pl_orbeccen", "pl_orbsmax"
] #required columns as parameters

[col for col in needed if col in df.columns]

df = df.rename(columns={
    "pl_rade": "radius",
    "pl_bmasse": "mass",
    "pl_eqt": "temp",
    "pl_orbper": "orbital_period",
    "sy_dist": "distance_star",
    "st_teff": "star_temp",
    "st_spectype": "star_type",
    "pl_orbeccen": "eccentricity",
    "pl_orbsmax": "semi_major_axis"
}) #renaming to sensible namess :)

selected_cols = [
    "radius", "mass", "temp", "orbital_period",
    "distance_star", "star_temp", "star_type",
    "eccentricity", "semi_major_axis"
] #selected columns from renamed

df=df[selected_cols] #dataframe shd now contain only selected columns
df.isnull().sum()
df.shape
#returns sum of entries with null values for each columns


(39158, 9)

In [2]:
df=df.dropna(subset=["radius","temp","mass","star_temp"])  #drop major columns with null values
df['star_type'] = df['star_type'].fillna("Unknown") #fill null values as unknown
df['eccentricity'] = df['eccentricity'].fillna(df['eccentricity'].mean())  #fill unknown values with mean
df['semi_major_axis'] = df['semi_major_axis'].fillna(df['semi_major_axis'].mean())  #fill unknown values with mean
df.shape


(1746, 9)

In [3]:
df.head()
df.info()
df.shape


<class 'pandas.core.frame.DataFrame'>
Index: 1746 entries, 29 to 39111
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   radius           1746 non-null   float64
 1   mass             1746 non-null   float64
 2   temp             1746 non-null   float64
 3   orbital_period   1733 non-null   float64
 4   distance_star    1729 non-null   float64
 5   star_temp        1746 non-null   float64
 6   star_type        1746 non-null   object 
 7   eccentricity     1746 non-null   float64
 8   semi_major_axis  1746 non-null   float64
dtypes: float64(8), object(1)
memory usage: 136.4+ KB


(1746, 9)

In [4]:
file2_path="/kaggle/input/exoplanetsdata1/exoplanetsdata1.csv"
df2=pd.read_csv(file2_path,comment='#',engine='python',on_bad_lines='skip')
df2 = df2.loc[:, ~df2.columns.str.contains('^Unnamed')]


In [5]:
df2.head()
df2.columns


Index(['pl_name', 'hostname', 'sy_snum', 'sy_pnum', 'discoverymethod',
       'disc_year', 'disc_facility', 'facility_type', 'pl_controv_flag',
       'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2', 'pl_orbperlim',
       'pl_orbsmax', 'pl_orbsmaxerr1', 'pl_orbsmaxerr2', 'pl_orbsmaxlim',
       'pl_rade', 'pl_radeerr1', 'pl_radeerr2', 'pl_radelim', 'pl_radj',
       'pl_radjerr1', 'pl_radjerr2', 'pl_radjlim', 'pl_bmasse',
       'pl_bmasseerr1', 'pl_bmasseerr2', 'pl_bmasselim', 'pl_bmassj',
       'pl_bmassjerr1', 'pl_bmassjerr2', 'pl_bmassjlim', 'pl_bmassprov',
       'pl_orbeccen', 'pl_orbeccenerr1', 'pl_orbeccenerr2', 'pl_orbeccenlim',
       'pl_insol', 'pl_insolerr1', 'pl_insolerr2', 'pl_insollim', 'pl_eqt',
       'pl_eqterr1', 'pl_eqterr2', 'pl_eqtlim', 'ttv_flag', 'st_spectype',
       'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_tefflim', 'st_rad',
       'st_raderr1', 'st_raderr2', 'st_radlim', 'st_mass', 'st_masserr1',
       'st_masserr2', 'st_masslim', 'st_met', 'st_meterr1

In [6]:
df2 = df2.rename(columns={
    "pl_rade": "radius",
    "pl_bmasse": "mass",
    "pl_eqt": "temp",
    "pl_orbper": "orbital_period",
    "sy_dist": "distance_star",
    "st_teff": "star_temp",
    "st_spectype": "star_type",
    "pl_orbeccen": "eccentricity",
    "pl_orbsmax": "semi_major_axis"
})
df2 = df2[[
    "radius", "mass", "temp", "orbital_period",
    "distance_star", "star_temp", "star_type",
    "eccentricity", "semi_major_axis"
]]
df2.shape

(5470, 9)

In [7]:
combined_df=pd.concat([df,df2],ignore_index=True)
combined_df=combined_df.drop_duplicates()
combined_df.info()
combined_df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 7078 entries, 0 to 7215
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   radius           7060 non-null   float64
 1   mass             7053 non-null   float64
 2   temp             5650 non-null   float64
 3   orbital_period   6827 non-null   float64
 4   distance_star    7039 non-null   float64
 5   star_temp        6869 non-null   float64
 6   star_type        3538 non-null   object 
 7   eccentricity     6330 non-null   float64
 8   semi_major_axis  6783 non-null   float64
dtypes: float64(8), object(1)
memory usage: 553.0+ KB


(7078, 9)

In [8]:
critical = ["radius", "mass", "temp", "star_temp", "orbital_period"]
df_clean = combined_df.dropna(subset=critical)
df_clean.shape

(5585, 9)

In [9]:
df_clean["star_type"] = df_clean["star_type"].fillna("Unknown")
df_clean["eccentricity"] = df_clean["eccentricity"].fillna(df_clean["eccentricity"].mean())
df_clean["semi_major_axis"] = df_clean["semi_major_axis"].fillna(df_clean["semi_major_axis"].mean())
df_clean["distance_star"] = df_clean["distance_star"].fillna(df_clean["distance_star"].mean())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["star_type"] = df_clean["star_type"].fillna("Unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["eccentricity"] = df_clean["eccentricity"].fillna(df_clean["eccentricity"].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["semi_major_axis"] = df_clean["semi_majo

In [10]:
df_clean.info()
df_clean.shape
df_clean.head()


<class 'pandas.core.frame.DataFrame'>
Index: 5585 entries, 1 to 7202
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   radius           5585 non-null   float64
 1   mass             5585 non-null   float64
 2   temp             5585 non-null   float64
 3   orbital_period   5585 non-null   float64
 4   distance_star    5585 non-null   float64
 5   star_temp        5585 non-null   float64
 6   star_type        5585 non-null   object 
 7   eccentricity     5585 non-null   float64
 8   semi_major_axis  5585 non-null   float64
dtypes: float64(8), object(1)
memory usage: 436.3+ KB


Unnamed: 0,radius,mass,temp,orbital_period,distance_star,star_temp,star_type,eccentricity,semi_major_axis
1,2.08,7.81,1958.0,0.736544,12.5855,5234.0,Unknown,0.061,0.01544
2,4.07,17.0,593.0,8.463,9.7221,3700.0,Unknown,0.0,0.0645
3,3.24,13.6,454.0,18.859019,9.7221,3700.0,Unknown,0.0,0.1101
4,2.23,16.3,546.0,41.6855,179.461,5766.0,G,0.0,0.241
6,21.59,3932.0,2001.0,4.288973,589.423,6935.0,Unknown,0.066,0.0656


In [11]:
df_clean.shape

(5585, 9)