## Loading data into Pandas DataFrame

In [None]:
#importing the necessary library
import pandas as pd
import numpy as np

In [None]:
# downloading the csv file using the id which is taken from the sharable link.
! gdown --id 1wG9iNFB5cmqCTPbEobQUlFMVlcQsF7pY

Downloading...
From: https://drive.google.com/uc?id=1wG9iNFB5cmqCTPbEobQUlFMVlcQsF7pY
To: /content/nasa_exoplanets.csv
100% 603k/603k [00:00<00:00, 121MB/s]


In [None]:
# Load the CSV file into a DataFrame.
# index_col=0: the first column of the file is an unnamed row counter; reading it
# as the index keeps it from appearing as a spurious 'Unnamed: 0' data column.
df = pd.read_csv("/content/nasa_exoplanets.csv", index_col=0)
# Show the first 5 rows of the dataset
df.head()

Unnamed: 0,Planet Name,Parsecs from Earth,Planet Mass,Stellar Magnitude,Discovery Date,Planet Radius,Planet Type,Discovery Method,Orbital Radius,Orbital Period,Eccentricity
0,HD 21520 b,79.208,17.7 Earths,9.17,2024,0.241 x Jupiter,Neptune-like,Transit,0.1726 AU,25.1 days,0.0
1,TOI-3568 b,197.906,26.4 Earths,12.879,2024,0.483 x Jupiter,Neptune-like,Transit,0.0485 AU,4.4 days,0.04
2,TOI-2384 b,191.51,1.966 Jupiters,15.103,2024,1.025 x Jupiter,Gas Giant,Transit,0.02793 AU,2.1 days,0.0
3,TOI-2379,211.247,5.76 Jupiters,15.548,2024,1.046 x Jupiter,Gas Giant,Transit,0.05263 AU,5.5 days,0.34
4,TOI-6034 b,117.992,0.798 Jupiters,15.189,2024,1.063 x Jupiter,Gas Giant,Transit,0.02949 AU,2.6 days,0.04


# Check for Null Values and Unknown Values

In [None]:
# Count the null entries in every column and print the per-column totals
missing_values = df.isna().sum()
print('Missing values :\n', missing_values)


Missing values :
 Planet Name              0
Parsecs from Earth       0
Planet Mass              0
Stellar Magnitude        0
Discovery Date           0
Planet Radius          229
Planet Type            230
Discovery Method       230
Orbital Radius         231
Orbital Period         231
Eccentricity          1013
dtype: int64


In [None]:
# Treat the placeholder string 'Unknown' and empty strings as missing data (NaN),
# then recount the nulls per column
df.replace(to_replace=['Unknown', ''], value=np.nan, inplace=True)
df.isna().sum()

Unnamed: 0,0
Planet Name,0
Parsecs from Earth,9
Planet Mass,23
Stellar Magnitude,239
Discovery Date,0
Planet Radius,251
Planet Type,237
Discovery Method,230
Orbital Radius,500
Orbital Period,231


# Handling the missing values

Handling the Planet Mass and Planet Radius

In [None]:
def convert_mass(value):
    """Convert a 'Planet Mass' entry to a float expressed in Jupiter masses.

    Entries are expected to look like '17.7 Earths' or '1.966 Jupiters':
    a leading number followed by the unit.

    Parameters
    ----------
    value : str or missing
        Raw cell value from the 'Planet Mass' column.

    Returns
    -------
    float
        The mass in Jupiter masses, or NaN when the value is missing, has no
        recognised unit, or does not start with a parseable number.
    """
    # 1 Jupiter mass = 317.8 Earth masses
    earth_masses_per_jupiter = 317.8

    if pd.isna(value):
        return np.nan

    text = str(value).strip()
    tokens = text.split()
    if not tokens:
        return np.nan

    # Malformed numbers become NaN instead of raising, matching the other cleaners
    try:
        magnitude = float(tokens[0])
    except ValueError:
        return np.nan

    # Case-insensitive substring match so that singular units ('1 Earth',
    # '1 Jupiter') are converted too instead of being silently dropped
    unit_text = text.lower()
    if 'earth' in unit_text:
        return magnitude / earth_masses_per_jupiter
    if 'jupiter' in unit_text:
        return magnitude
    return np.nan

In [None]:
# Apply the conversion function to the 'Planet Mass' column
df['Planet Mass (Jupiter)'] = df['Planet Mass'].apply(convert_mass)
# 'Planet Mass (Jupiter)' is now numeric, so fill its missing values with the mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[...] is chained assignment, which pandas warns about and which does not
# update df when Copy-on-Write is enabled.
df['Planet Mass (Jupiter)'] = df['Planet Mass (Jupiter)'].fillna(df['Planet Mass (Jupiter)'].mean())
# The raw text column is no longer needed
df = df.drop(columns=['Planet Mass'])

In [None]:
# Same approach as for the mass column, applied to 'Planet Radius'
def convert_radius(value):
    """Convert a 'Planet Radius' entry (e.g. '0.241 x Jupiter') to Jupiter radii.

    Returns NaN for missing values and for strings that mention neither
    'Earth' nor 'Jupiter'.
    """
    if pd.isna(value):
        return np.nan

    # (unit name, how many of that unit's radii make one Jupiter radius).
    # Jupiter's diameter is about 11 times Earth's, hence the 11.2 divisor.
    # 'Earth' is tested first, as in the if/elif ordering this replaces.
    for unit_name, radii_per_jupiter in (('Earth', 11.2), ('Jupiter', 1.0)):
        if unit_name in value:
            leading_number = value.split(' ')[0]
            return float(leading_number) / radii_per_jupiter

    return np.nan

In [None]:
# Apply the conversion function to the 'Planet Radius' column
df['Planet Radius (Jupiter)'] = df['Planet Radius'].apply(convert_radius)

# Fill missing values in 'Planet Radius (Jupiter)' with the mean.
# Assigned back rather than using fillna(inplace=True) on df[...], which is
# chained assignment and does not update df when Copy-on-Write is enabled.
df['Planet Radius (Jupiter)'] = df['Planet Radius (Jupiter)'].fillna(df['Planet Radius (Jupiter)'].mean())
# The raw text column is no longer needed
df = df.drop(columns=['Planet Radius'])


In [None]:
def clean_eccentricity(value):
    """Convert an 'Eccentricity' entry to a number.

    Upper-limit entries written as '<0.05' or '< 0.05' are reduced to the
    numeric limit itself (0.05).

    Parameters
    ----------
    value : str, number or missing
        Raw cell value from the 'Eccentricity' column.

    Returns
    -------
    float
        The parsed value; NaN for missing, empty or unparseable entries.
        Values that are already numeric are returned unchanged.
    """
    if pd.isna(value):
        return np.nan
    if not isinstance(value, str):
        return value

    text = value.strip()
    if text.startswith('<'):
        # Drop only the '<' and any following whitespace. Slicing a fixed two
        # characters would eat the first digit of '<1.5' and yield 0.5.
        text = text[1:].strip()

    # Empty or malformed text becomes NaN, consistent with the other cleaners
    try:
        return float(text)
    except ValueError:
        return np.nan

# Apply the cleaning function to the Eccentricity column
df['Eccentricity'] = df['Eccentricity'].apply(clean_eccentricity)

# Handle extreme outliers first: values above 1 are not physically meaningful
# here, so blank them out (NaN also fails the `<= 1` test and stays NaN)
df['Eccentricity'] = df['Eccentricity'].apply(lambda x: x if x <= 1 else np.nan)
# Now fill NaN values with the mean of the remaining values.
# Assigned back rather than using fillna(inplace=True) on df[...], which is
# chained assignment and does not update df when Copy-on-Write is enabled.
df['Eccentricity'] = df['Eccentricity'].fillna(df['Eccentricity'].mean())


In [None]:
# Convert the distance from parsecs to light-years
df['Parsecs from Earth'] = pd.to_numeric(df['Parsecs from Earth'], errors='coerce')
df['Distance (light-years)'] = df['Parsecs from Earth'] * 3.26156  # 1 parsec = 3.26156 light-years
# Fill missing distances with the mean (or drop those rows instead, if appropriate).
# Assigned back rather than using fillna(inplace=True) on df[...], which is
# chained assignment and does not update df when Copy-on-Write is enabled.
df['Distance (light-years)'] = df['Distance (light-years)'].fillna(df['Distance (light-years)'].mean())
# The parsec column is superseded by the light-year column
df = df.drop(columns=['Parsecs from Earth'])

In [None]:
# Convert 'Stellar Magnitude' to numeric; non-numeric values are coerced to NaN
df['Stellar Magnitude'] = pd.to_numeric(df['Stellar Magnitude'], errors='coerce')
# Fill the missing magnitudes with the column mean.
# Assigned back rather than using fillna(inplace=True) on df[...], which is
# chained assignment and does not update df when Copy-on-Write is enabled.
df['Stellar Magnitude'] = df['Stellar Magnitude'].fillna(df['Stellar Magnitude'].mean())

In [None]:
def clean_orbital_radius(value):
    """Turn an 'Orbital Radius' entry such as '0.1726 AU' into a float.

    Missing values and text that is not a number once the 'AU' unit has been
    removed both come back as NaN.
    """
    if pd.isna(value):
        return np.nan

    # Remove the unit, then trim whatever whitespace is left around the number
    number_text = str(value).replace('AU', '').strip()

    try:
        radius = float(number_text)
    except ValueError:
        radius = np.nan
    return radius

# Apply the function to the 'Orbital Radius' column
df['Orbital Radius'] = df['Orbital Radius'].apply(clean_orbital_radius)

# Fill missing values in 'Orbital Radius' with the mean.
# Assigned back rather than using fillna(inplace=True) on df[...], which is
# chained assignment and does not update df when Copy-on-Write is enabled.
# NOTE(review): this mean is computed before the outlier rows below are removed,
# so extreme radii influence the imputed value - confirm that is intended.
df['Orbital Radius'] = df['Orbital Radius'].fillna(df['Orbital Radius'].mean())
# Handle extreme outliers: keep only rows at or below the 99th percentile.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
# Re-running this cell trims a further 1% of rows each time.
df = df[df['Orbital Radius'] <= df['Orbital Radius'].quantile(0.99)].copy()


In [None]:
# Impute each categorical column with its most frequent value (the mode),
# then store the column with the pandas 'category' dtype
for column in ('Discovery Method', 'Planet Type'):
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common).astype('category')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Discovery Method'] = df['Discovery Method'].fillna(df['Discovery Method'].mode()[0]).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Planet Type'] = df['Planet Type'].fillna(df['Planet Type'].mode()[0]).astype('category')


In [None]:
def clean_orbital_period(value):
    """Convert an 'Orbital Period' entry to a float number of years.

    Accepts text such as '25.1 days', '1 day', '2.5 years' or '1 year'
    (case-insensitive). Text without a unit is assumed to already be in years.

    Parameters
    ----------
    value : str, number or missing
        Raw cell value from the 'Orbital Period' column.

    Returns
    -------
    float
        The period in years, or NaN when the value is missing or its numeric
        part cannot be parsed.
    """
    days_per_year = 365.25

    if pd.isna(value):
        return np.nan

    # Lowercase so the unit comparison is case-insensitive
    text = str(value).strip().lower()
    try:
        # Match on the singular stem so both 'day' and 'days' are recognised;
        # matching only 'days' would turn '1 day' into NaN.
        if 'day' in text:
            days_value = float(text.replace('days', '').replace('day', '').strip())
            return days_value / days_per_year
        # 'year' is a substring of 'years', so one test covers both forms
        if 'year' in text:
            return float(text.replace('years', '').replace('year', '').strip())
        # Otherwise assume the value is already a plain number of years
        return float(text)
    except ValueError:
        # The numeric part could not be parsed
        return np.nan

# Apply the function to the 'Orbital Period' column
df['Orbital Period (Years)'] = df['Orbital Period'].apply(clean_orbital_period)

# Fill missing values with the mean of 'Orbital Period (Years)'.
# Assigned back rather than using fillna(inplace=True) on df[...], which is
# chained assignment and does not update df when Copy-on-Write is enabled.
df['Orbital Period (Years)'] = df['Orbital Period (Years)'].fillna(df['Orbital Period (Years)'].mean())
# The raw text column is no longer needed; reassigning (instead of an in-place
# drop) also works when df was produced by filtering another DataFrame.
df = df.drop(columns=['Orbital Period'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Orbital Period (Years)'] = df['Orbital Period'].apply(clean_orbital_period)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Orbital Period (Years)'].fillna(df['Orbital Period (Years)'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Orbital Period'], inplace=True)


In [None]:
# Final check: count the remaining missing values per column
df.isna().sum()

Unnamed: 0,0
Planet Name,0
Stellar Magnitude,0
Discovery Date,0
Planet Type,0
Discovery Method,0
Orbital Radius,0
Eccentricity,0
Planet Mass (Jupiter),0
Planet Radius (Jupiter),0
Distance (light-years),0


In [None]:
# Preview the cleaned DataFrame (first five rows, plain-text form)
cleaned_preview = df.head()
print(cleaned_preview)

# Write the cleaned data to a new CSV, leaving the row index out of the file
cleaned_csv_path = 'nasa_exoplanets_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)

print("Data cleaning complete. Cleaned CSV saved as 'nasa_exoplanets_cleaned.csv'.")

  Planet Name  Stellar Magnitude  Discovery Date   Planet Type  \
0  HD 21520 b              9.170            2024  Neptune-like   
1  TOI-3568 b             12.879            2024  Neptune-like   
2  TOI-2384 b             15.103            2024     Gas Giant   
3    TOI-2379             15.548            2024     Gas Giant   
4  TOI-6034 b             15.189            2024     Gas Giant   

  Discovery Method  Orbital Radius  Eccentricity  Planet Mass (Jupiter)  \
0          Transit         0.17260          0.00               0.055695   
1          Transit         0.04850          0.04               0.083071   
2          Transit         0.02793          0.00               1.966000   
3          Transit         0.05263          0.34               5.760000   
4          Transit         0.02949          0.04               0.798000   

   Planet Radius (Jupiter)  Distance (light-years)  Orbital Period (Years)  
0                    0.241              258.341644                0.068720 