In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score, mean_absolute_percentage_error

In [149]:
df = pd.read_csv('exoplanet_dataset.csv', skiprows=294, low_memory=False)
df


Unnamed: 0,pl_name,hostname,pl_letter,hd_name,hip_name,tic_id,gaia_id,default_flag,sy_snum,sy_pnum,...,rowupdate,pl_pubdate,releasedate,pl_nnotes,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec,pl_ndispec
0,AU Mic b,AU Mic,b,HD 197481,HIP 102409,TIC 441420236,Gaia DR2 6794047652729201024,1,1,3,...,2022-05-09,2021-12,2022-05-09,1,1,0,1,0,0,0
1,AU Mic c,AU Mic,c,HD 197481,HIP 102409,TIC 441420236,Gaia DR2 6794047652729201024,1,1,3,...,2022-05-09,2021-12,2022-05-09,2,1,0,1,0,0,0
2,BD-14 3065 b,BD-14 3065 A,b,,,TIC 293607057,Gaia DR2 3751877374435102720,1,3,1,...,2024-06-25,2024-03,2024-06-25,0,0,0,0,0,0,0
3,DS Tuc A b,DS Tuc A,b,HD 222259 A,HIP 116748 A,TIC 410214986,Gaia DR2 6387058411482257536,1,2,1,...,2019-07-03,2019-07,2019-07-11,2,1,0,0,0,0,0
4,GJ 1252 b,GJ 1252,b,,,TIC 370133522,Gaia DR2 6468968316900356736,1,1,1,...,2023-07-10,2022-09,2023-07-10,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,WASP-18 c,WASP-18,c,HD 10069,HIP 7562,TIC 100100827,Gaia DR2 4955371367334610048,1,2,2,...,2019-12-02,2019-12,2019-12-05,4,8,0,0,0,0,0
541,WASP-84 c,WASP-84,c,,,TIC 350293646,Gaia DR2 3078836109158636928,1,1,2,...,2023-07-10,2023-06,2023-07-10,0,0,0,0,0,0,0
542,WD 1856+534 b,WD 1856+534,b,,,TIC 267574918,Gaia DR2 2146576589564898688,1,3,1,...,2020-12-10,2020-09,2020-12-17,4,0,0,0,0,0,0
543,Wolf 327 b,Wolf 327,b,,,TIC 4918918,Gaia DR2 796185407950360192,1,1,1,...,2024-02-05,2024-01,2024-02-05,1,0,0,0,0,0,0


In [151]:
print(df.head())

        pl_name      hostname pl_letter      hd_name      hip_name  \
0      AU Mic b        AU Mic         b    HD 197481    HIP 102409   
1      AU Mic c        AU Mic         c    HD 197481    HIP 102409   
2  BD-14 3065 b  BD-14 3065 A         b          NaN           NaN   
3    DS Tuc A b      DS Tuc A         b  HD 222259 A  HIP 116748 A   
4     GJ 1252 b       GJ 1252         b          NaN           NaN   

          tic_id                       gaia_id  default_flag  sy_snum  \
0  TIC 441420236  Gaia DR2 6794047652729201024             1        1   
1  TIC 441420236  Gaia DR2 6794047652729201024             1        1   
2  TIC 293607057  Gaia DR2 3751877374435102720             1        3   
3  TIC 410214986  Gaia DR2 6387058411482257536             1        2   
4  TIC 370133522  Gaia DR2 6468968316900356736             1        1   

   sy_pnum  ...   rowupdate  pl_pubdate releasedate  pl_nnotes st_nphot  \
0        3  ...  2022-05-09     2021-12  2022-05-09          1   

In [153]:
# List of columns to keep
columns_of_interest = [
    # Orbital Characteristics
    'pl_orbper',    # Orbital Period [days]
    'pl_orbsmax',   # Orbit Semi-Major Axis [au]
    'pl_insol',     # Insolation Flux [Earth Flux]
    'pl_eqt',       # Equilibrium Temperature [K]
    'pl_orbeccen',  # Eccentricity
    
    # Planetary Characteristics
    'pl_rade',      # Planet Radius [Earth Radius]
    'pl_masse',     # Planet Mass [Earth Mass]
    'pl_dens',      # Planet Density [g/cm**3]

    # Optional but Useful
    'st_teff',      # Stellar Effective Temperature [K]
    'pl_orbincl',   # Inclination [deg]
    'pl_ratror'     # Ratio of Planet to Stellar Radius
]


# Filter the DataFrame to keep only the relevant columns
data_filtered = df[columns_of_interest]


In [155]:
data_filtered

Unnamed: 0,pl_orbper,pl_orbsmax,pl_insol,pl_eqt,pl_orbeccen,pl_rade,pl_masse,pl_dens,st_teff,pl_orbincl,pl_ratror
0,8.462999,0.06450,,,0.186,,20.120,1.32,,,
1,18.858991,0.11010,,,,,9.600,1.22,,,
2,4.288973,0.06560,,2001.0,0.066,21.590,3932.000,2.15,6935.0,80.780,0.08430
3,8.138268,,,850.0,0.000,5.700,,,5428.0,89.500,0.05419
4,0.518233,,,,0.000,1.193,1.320,4.20,3458.0,,
...,...,...,...,...,...,...,...,...,...,...,...
540,2.155800,0.03500,,,0.015,,55.200,,6400.0,,
541,1.446885,0.02359,,1329.0,,1.950,15.200,11.20,,83.200,0.02330
542,1.407941,0.02040,0.18,163.0,0.000,10.400,4386.054,,4710.0,88.778,7.28000
543,0.573474,0.01000,233.90,996.0,0.000,1.240,2.530,7.24,3542.0,84.890,0.02800


In [157]:
print(data_filtered.isnull().sum())

pl_orbper        0
pl_orbsmax      49
pl_insol       304
pl_eqt          72
pl_orbeccen    105
pl_rade         11
pl_masse       119
pl_dens        181
st_teff         10
pl_orbincl     143
pl_ratror       94
dtype: int64


In [159]:
# Drop columns with more than 50% missing values
missing_percentage = data_filtered.isnull().mean() * 100
columns_to_drop = missing_percentage[missing_percentage > 50].index
data_filtered_dropped = data_filtered.drop(columns=columns_to_drop)

# Drop rows with more than 50% missing values
row_missing_percentage = data_filtered_dropped.isnull().mean(axis=1) * 100
rows_to_drop = row_missing_percentage[row_missing_percentage > 50].index
data_cleaned = data_filtered_dropped.drop(index=rows_to_drop)

In [161]:
print(data_cleaned.isnull().sum())

pl_orbper        0
pl_orbsmax      46
pl_eqt          67
pl_orbeccen    101
pl_rade          9
pl_masse       115
pl_dens        177
st_teff          8
pl_orbincl     138
pl_ratror       89
dtype: int64


In [163]:
# Fill missing values with the mean of each column
df_cleaned = data_cleaned.fillna(data_cleaned.mean())

In [165]:
print(df_cleaned.isnull().sum())

pl_orbper      0
pl_orbsmax     0
pl_eqt         0
pl_orbeccen    0
pl_rade        0
pl_masse       0
pl_dens        0
st_teff        0
pl_orbincl     0
pl_ratror      0
dtype: int64


In [167]:
data=df_cleaned

In [169]:
def calculate_habitability_score(row):
    score = 0
    if 0.1 < row['pl_masse'] < 5:
        score += 0.1
    if 0 < row['pl_orbeccen'] < 0.6:
        score += 0.15
    if 0 < row['pl_orbsmax'] < 1:
        score += 0.15
    if 3900 < row['st_teff'] < 7400:
        score += 0.1
    if 0.5 < row['pl_ratror'] < 1.5:
        score += 0.1
    if 0 < row['pl_rade'] < 1.5:
        score += 0.1
    if 0 < row['pl_dens'] < 7.6:
        score += 0.1
    if 175 < row['pl_eqt'] < 270:
        score += 0.2
    return score

def calculate_earth_likeness_score(row):
    score = 0
    if 100 < row['pl_orbper'] < 500:
        score += 0.1
    if 0 < row['pl_orbsmax'] < 1.5:
        score += 0.1
    if 0 < row['pl_rade'] < 2.0:
        score += 0.2
    if 0 < row['pl_masse'] < 2.0:
        score += 0.1
    if 4 < row['pl_dens'] < 9:
        score += 0.1
    if 200 < row['pl_eqt'] < 300:
        score += 0.1
    if 10 < row['pl_orbincl'] < 26:
        score += 0.5
    if 0 < row['pl_ratror'] < 5:
        score += 0.5
    if 0 < row['pl_orbeccen'] < 0.08:
        score += 0.2
    return score


In [171]:
data['habitability_score'] = data.apply(calculate_habitability_score, axis=1)
data['earth_likeness_score'] = data.apply(calculate_earth_likeness_score, axis=1)

# Normalize the scores to percentages
data['habitability_percentage'] = data['habitability_score'] * 100
data['earth_likeness_percentage'] = data['earth_likeness_score'] * 100

features_habitability= [ 'pl_orbsmax', 'pl_eqt', 'pl_orbeccen', 'pl_rade', 'pl_masse', 'pl_dens', 'st_teff', 'pl_ratror']
features_earth_likeness = ['pl_orbper', 'pl_orbsmax', 'pl_eqt', 'pl_orbeccen', 'pl_rade', 'pl_masse', 'pl_dens', 'pl_orbincl', 'pl_ratror']

# Prepare data for Habitability Model
X_habitability = data[features_habitability]
y_habitability = data['habitability_percentage']

# Prepare data for Earth Likeness Model
X_earth_likeness = data[features_earth_likeness]
y_earth_likeness = data['earth_likeness_percentage']


In [173]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, mean_absolute_percentage_error

# Step 1: Train-Test Split
# Split data for Habitability Model
X_train_hab, X_test_hab, y_train_hab, y_test_hab = train_test_split(X_habitability, y_habitability, test_size=0.2, random_state=42)

# Split data for Earth Likeness Model
X_train_earth, X_test_earth, y_train_earth, y_test_earth = train_test_split(X_earth_likeness, y_earth_likeness, test_size=0.2, random_state=42)

# Step 2: Feature Scaling
# Initialize the StandardScaler
scaler = StandardScaler()

# Scale features for Habitability Model
X_train_hab = scaler.fit_transform(X_train_hab)
X_test_hab = scaler.transform(X_test_hab)

# Scale features for Earth Likeness Model
X_train_earth = scaler.fit_transform(X_train_earth)
X_test_earth = scaler.transform(X_test_earth)

'''# Step 3: Train and Evaluate the Models

# Train Habitability Model
model_hab = LinearRegression()
model_hab.fit(X_train_hab, y_train_hab)

# Predict and evaluate the Habitability Model
y_pred_hab = model_hab.predict(X_test_hab)
print("Habitability Model Performance:")
print(f"R^2 Score: {r2_score(y_test_hab, y_pred_hab)}")
print(f"Mean Squared Error: {mean_squared_error(y_test_hab, y_pred_hab)}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test_hab, y_pred_hab)}")
print(f"Explained Variance Score: {explained_variance_score(y_test_hab, y_pred_hab)}")
print(f"Mean Absolute Percentage Error: {mean_absolute_percentage_error(y_test_hab, y_pred_hab)}")

# Train Earth Likeness Model
model_earth = LinearRegression()
model_earth.fit(X_train_earth, y_train_earth)

# Predict and evaluate the Earth Likeness Model
y_pred_earth = model_earth.predict(X_test_earth)
print("\nEarth Likeness Model Performance:")
print(f"R^2 Score: {r2_score(y_test_earth, y_pred_earth)}")
print(f"Mean Squared Error: {mean_squared_error(y_test_earth, y_pred_earth)}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test_earth, y_pred_earth)}")
print(f"Explained Variance Score: {explained_variance_score(y_test_earth, y_pred_earth)}")
print(f"Mean Absolute Percentage Error: {mean_absolute_percentage_error(y_test_earth, y_pred_earth)}")
'''


'# Step 3: Train and Evaluate the Models\n\n# Train Habitability Model\nmodel_hab = LinearRegression()\nmodel_hab.fit(X_train_hab, y_train_hab)\n\n# Predict and evaluate the Habitability Model\ny_pred_hab = model_hab.predict(X_test_hab)\nprint("Habitability Model Performance:")\nprint(f"R^2 Score: {r2_score(y_test_hab, y_pred_hab)}")\nprint(f"Mean Squared Error: {mean_squared_error(y_test_hab, y_pred_hab)}")\nprint(f"Mean Absolute Error: {mean_absolute_error(y_test_hab, y_pred_hab)}")\nprint(f"Explained Variance Score: {explained_variance_score(y_test_hab, y_pred_hab)}")\nprint(f"Mean Absolute Percentage Error: {mean_absolute_percentage_error(y_test_hab, y_pred_hab)}")\n\n# Train Earth Likeness Model\nmodel_earth = LinearRegression()\nmodel_earth.fit(X_train_earth, y_train_earth)\n\n# Predict and evaluate the Earth Likeness Model\ny_pred_earth = model_earth.predict(X_test_earth)\nprint("\nEarth Likeness Model Performance:")\nprint(f"R^2 Score: {r2_score(y_test_earth, y_pred_earth)}")\n

In [175]:
from sklearn.ensemble import RandomForestRegressor

# Train Random Forest for Habitability Model
model_hab = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_hab = model_hab.fit(X_train_hab, y_train_hab)

joblib.dump(rf_model_hab, 'habitability_model.pkl')

# Predict and evaluate the Habitability Model
y_pred_rf_hab = model_hab.predict(X_test_hab)
print("\nRandom Forest Habitability Model Performance:")
print(f"R^2 Score: {r2_score(y_test_hab, y_pred_rf_hab)}")
print(f"Mean Squared Error: {mean_squared_error(y_test_hab, y_pred_rf_hab)}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test_hab, y_pred_rf_hab)}")
print(f"Explained Variance Score: {explained_variance_score(y_test_hab, y_pred_rf_hab)}")
print(f"Mean Absolute Percentage Error: {mean_absolute_percentage_error(y_test_hab, y_pred_rf_hab)}")

# Train Random Forest for Earth Likeness Model
model_earth = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_earth = model_earth.fit(X_train_earth, y_train_earth)


joblib.dump(rf_model_earth, 'earth_likeness_model.pkl')
# Predict and evaluate the Earth Likeness Model
y_pred_rf_earth = model_earth.predict(X_test_earth)
print("\nRandom Forest Earth Likeness Model Performance:")
print(f"R^2 Score: {r2_score(y_test_earth, y_pred_rf_earth)}")
print(f"Mean Squared Error: {mean_squared_error(y_test_earth, y_pred_rf_earth)}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test_earth, y_pred_rf_earth)}")
print(f"Explained Variance Score: {explained_variance_score(y_test_earth, y_pred_rf_earth)}")
print(f"Mean Absolute Percentage Error: {mean_absolute_percentage_error(y_test_earth, y_pred_rf_earth)}")



Random Forest Habitability Model Performance:
R^2 Score: 0.8741191277917272
Mean Squared Error: 10.208125000000003
Mean Absolute Error: 1.1662037037037036
Explained Variance Score: 0.8742175789612792
Mean Absolute Percentage Error: 0.025301787718454385

Random Forest Earth Likeness Model Performance:
R^2 Score: 0.972647577092511
Mean Squared Error: 5.749074074074066
Mean Absolute Error: 0.994444444444444
Explained Variance Score: 0.9727005873715125
Mean Absolute Percentage Error: 0.011810368900646673


In [73]:
'''import joblib

# Load models
rf_hab = RandomForestRegressor()
rf_earth = RandomForestRegressor()
#earth_likeness_model = joblib.load('earth_likeness_model.pkl')'''


In [177]:
joblib.dump(rf_model_hab, 'habitability_model.pkl')
joblib.dump(rf_model_earth, 'earth_likeness_model.pkl')

['earth_likeness_model.pkl']

In [179]:
import joblib

# Load models
habitability_model = joblib.load('habitability_model.pkl')
earth_likeness_model = joblib.load('earth_likeness_model.pkl')

In [147]:
if hasattr(model_hab, 'feature_importances_'):
    print("Model is fitted.")
else:
    print("Model is not fitted.")


Model is fitted.
