In [146]:
# Import all necessary packages including astropy for constants in equations

# import packages
import numpy as np
import pandas as pd
import requests
import math
from astropy.constants import sigma_sb, L_sun
from bs4 import BeautifulSoup as BS

# Import visual packages
import matplotlib.pyplot as plt
import seaborn as sn
from matplotlib import cm

In [2]:
# Load the raw NASA Exoplanet Archive export into a DataFrame.
planets = pd.read_csv(filepath_or_buffer='NASA_planets.csv')

In [4]:
# Display the raw table (pandas truncates the rendering) to inspect its
# columns; note that the same planet name can appear in several rows.
planets

Unnamed: 0,pl_name,sy_snum,sy_pnum,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_orbsmax,pl_orbsmaxerr1,pl_orbsmaxerr2,...,st_masserr1,st_masserr2,st_masslim,rastr,ra,decstr,dec,sy_dist,sy_disterr1,sy_disterr2
0,11 Com b,2,1,326.03000,0.32,-0.32,0.0,1.29000,0.05000,-0.05000,...,0.30,-0.30,0.0,12h20m42.91s,185.178779,+17d47m35.71s,17.793252,93.1846,1.92380,-1.92380
1,11 Com b,2,1,,,,,1.21000,0.06000,-0.05000,...,0.40,-0.30,0.0,12h20m42.91s,185.178779,+17d47m35.71s,17.793252,93.1846,1.92380,-1.92380
2,11 UMi b,1,1,,,,,1.51000,0.06000,-0.05000,...,0.40,-0.30,0.0,15h17m05.90s,229.274595,+71d49m26.19s,71.823943,125.3210,1.97650,-1.97650
3,11 UMi b,1,1,516.21997,3.20,-3.20,0.0,1.53000,0.07000,-0.07000,...,0.69,-0.69,0.0,15h17m05.90s,229.274595,+71d49m26.19s,71.823943,125.3210,1.97650,-1.97650
4,11 UMi b,1,1,516.22000,3.25,-3.25,0.0,1.54000,0.07000,-0.07000,...,0.25,-0.25,0.0,15h17m05.90s,229.274595,+71d49m26.19s,71.823943,125.3210,1.97650,-1.97650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29362,ups And d,2,3,1278.10000,2.90,-2.90,0.0,2.53000,0.15000,-0.15000,...,0.16,-0.14,0.0,01h36m47.60s,24.198353,+41d24m13.73s,41.403815,13.4054,0.06350,-0.06290
29363,ups And d,2,3,1276.46000,0.57,-0.57,0.0,2.51329,0.00075,-0.00075,...,,,0.0,01h36m47.60s,24.198353,+41d24m13.73s,41.403815,13.4054,0.06350,-0.06290
29364,ups And d,2,3,1319.00000,18.00,-18.00,0.0,2.57000,,,...,,,,01h36m47.60s,24.198353,+41d24m13.73s,41.403815,13.4054,0.06350,-0.06290
29365,xi Aql b,1,1,,,,,0.58000,0.02000,-0.03000,...,0.20,-0.20,0.0,19h54m14.99s,298.562449,+08d27m39.98s,8.461105,56.1858,0.55975,-0.55975


In [7]:
# Collapse the raw table to one row per planet by averaging every numeric
# column over all rows that share the same `pl_name`.
# `numeric_only=True` is required: the raw table also holds text columns
# (e.g. `rastr`, `decstr`, `st_spectype`) and pandas >= 2.0 raises a TypeError
# when asked to average them instead of silently dropping them.
# Text columns are therefore absent from `final_planets` and must be
# re-attached separately if needed.
final_planets = planets.groupby('pl_name').mean(numeric_only=True).reset_index()

In [8]:
# Array of the distinct planet names present in the de-duplicated table.
pl_name = final_planets['pl_name'].unique()

In [9]:
# Attach the host star's spectral type to every planet in `final_planets`.
# For each planet we take the first non-null `st_spectype` reported for it in
# the raw `planets` table; planets with no reported spectral type get NaN.
#
# This is done with a single vectorized lookup instead of a Python loop with
# two full boolean scans per planet (quadratic in the table size), and without
# a bare `except:` that would also hide unrelated errors such as a misspelled
# column name.
first_spec_type = (
    planets.dropna(subset=['st_spectype'])   # keep only rows that report a type
           .drop_duplicates('pl_name')        # first such row for each planet
           .set_index('pl_name')['st_spectype']
)
# Names missing from the lookup (no spectral type reported) map to NaN.
final_planets['spec_type'] = final_planets['pl_name'].map(first_spec_type)
    

In [15]:
# Rows whose stellar effective temperature (`st_teff`) is still missing.
nan_stellar_t = final_planets.loc[final_planets['st_teff'].isna()]

In [17]:
# Lookup table: spectral-type label -> stellar surface temperature, used to
# fill in missing `st_teff` values (values taken from the NASA and Kyoto
# databases per the original note). The NaN key maps a missing spectral type
# to a missing temperature.
spectral_type = {
    'M5.5/M6': 2950,
    'M8.5': 2700,
    np.nan: np.nan,
    'K2 V': 4960,
    'M3.5': 3500,
    'K7': 4000,
    'L1.5': 2400,
    'M2.5': 3600,
    'M3.5 Ve': 3500,
    'M0 V': 3750,
    'M4.5 V': 3400,
    'M3': 3500,
    'G6 V': 5570,
    'K0': 5240,
    'G0': 6050,
    'G1 V': 5930,
    'G8 V': 5440,
    'G8': 5440,
    'G4 V': 5690,
    'G4 IV-V': 5700,
    'M2 V': 3600,
    'M0': 3750,
    'M0.5 Ve': 3800,
    'F5 V': 6700,
    'F7 V': 6400,
    'M': 3225,
    'K': 4620,
    'M2-M7': 3250,
    'K5': 4400,
    'G5 V': 5660,
    'M V': 3225,
}

In [18]:
# Fill missing stellar effective temperatures (`st_teff`) from the star's
# spectral type using the `spectral_type` lookup table.
#
# `Series.map` is used instead of `apply(lambda x: spectral_type[x])`:
#   * a spectral type that is not in the table yields NaN instead of aborting
#     the cell with a KeyError, and
#   * a missing spectral type yields NaN without depending on the NaN object
#     in the column being the very same object as the dict's `np.nan` key.
# Rows that are still NaN afterwards are left for a later, per-planet fill.
missing_teff = final_planets['st_teff'].isna()
final_planets.loc[missing_teff, 'st_teff'] = (
    final_planets.loc[missing_teff, 'spec_type'].map(spectral_type)
)

In [20]:
# Feature engineering: express the orbital period in years by dividing the
# period in days by 365.25.
DAYS_PER_YEAR = 365.25
final_planets['pl_orbper_yrs'] = final_planets['pl_orbper'].div(DAYS_PER_YEAR)

In [22]:
# Discard every planet (row) that has no orbital period; per the original
# note this removes 140 rows.
final_planets = final_planets.dropna(subset=['pl_orbper']).copy()

In [23]:
# Fill missing semi-major axes (`pl_orbsmax`) from the orbital period with
# Kepler's third law:  a^3 = M * P^2  (a in AU, P in years, M in solar masses).
#
# The previous version used a = P^(2/3), which silently assumes a host star of
# exactly one solar mass and therefore misplaces planets around lighter or
# heavier stars. The host mass is now included when the table provides it.
# NOTE(review): assumes `st_mass` is expressed in solar masses, as in the NASA
# Exoplanet Archive column definitions — confirm.
missing_sma = final_planets['pl_orbsmax'].isna()
period_yrs = final_planets.loc[missing_sma, 'pl_orbper_yrs']

if 'st_mass' in final_planets.columns:
    # Fall back to one solar mass where the stellar mass itself is unknown,
    # which reproduces the original estimate for those rows.
    host_mass = final_planets.loc[missing_sma, 'st_mass'].fillna(1.0)
else:
    host_mass = 1.0

final_planets.loc[missing_sma, 'pl_orbsmax'] = (host_mass * period_yrs ** 2) ** (1.0 / 3.0)

In [28]:
# Show (up to the first 76) names of planets whose host star still has no
# effective surface temperature.
final_planets.loc[final_planets['st_teff'].isna(), 'pl_name'].to_numpy()[:76]

array(['2MASS J19383260+4603591 b', 'CoRoT-20 c', 'DE CVn b', 'DP Leo b',
       'EPIC 201238110 b', 'EPIC 201497682 b', 'EPIC 201615463 c',
       'EPIC 201754305 d', 'EPIC 201833600 c', 'EPIC 206024342 b',
       'EPIC 206032309 b', 'EPIC 206042996 b', 'EPIC 206215704 b',
       'EPIC 206317286 b', 'EPIC 212297394 b', 'EPIC 212424622 b',
       'EPIC 212499991 b', 'EPIC 212587672 b', 'EPIC 220554210 c',
       'GJ 163 b', 'GJ 163 c', 'GJ 163 d', 'GJ 180 d', 'GJ 2056 b',
       'GJ 229 A c', 'GJ 3082 b', 'GJ 433 d', 'GJ 676 A c', 'GJ 687 c',
       'HD 102329 c', 'HD 114783 c', 'HD 116029 c', 'HD 156279 c',
       'HD 177565 b', 'HD 33142 c', 'HD 41004 A b', 'HD 47186 c',
       'HD 73526 c', 'HD 99706 c', 'HIP 4845 b', 'HU Aqr AB b',
       'HU Aqr AB c', 'HW Vir b', 'IC 4651 9122 b', 'K2-32 e',
       'Kepler-448 c', 'Kepler-47 d', 'Kepler-65 e', 'Kepler-82 f',
       'MOA-2009-BLG-266L b', 'MOA-2009-BLG-387L b',
       'MOA-2011-BLG-293L b', 'MXB 1658-298 b', 'OGLE-2005-BLG-390L b'

In [29]:
# Web-scrape the host-star table from the ExoKyoto database into `df`.
url = 'http://www.exoplanetkyoto.org/exohtml/A_All_HostStars.html'

# Use a timeout so a stalled server cannot hang the notebook, and fail loudly
# on an HTTP error instead of parsing an error page as if it were data.
resp = requests.get(url, timeout=30)
resp.raise_for_status()

bs = BS(resp.content, 'html.parser')
# All <td> cells of the first <table> on the page, in document order
# (`find_all` is the current name of the deprecated `findAll` alias).
x = bs.table.find_all('td')

# Regroup the flat list of cells into rows of 10 stripped text values each.
lst = [[cell.text.strip() for cell in x[i:i + 10]] for i in range(0, len(x), 10)]
if not lst:
    raise ValueError('No table cells found at ' + url)

# Build the frame once, after all rows are collected (it used to be rebuilt on
# every loop iteration). The first scraped row supplies the column names and
# is still present as row 0 of the frame.
df = pd.DataFrame(lst, columns=lst[0])

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [30]:
# Remove the row labelled 0 and use the 'No' column as the index.
df = df.drop(index=0).set_index('No')

In [32]:
# Per-planet lookup: planet name -> effective surface temperature of its host
# star, for the planets whose `st_teff` could not be filled any other way.
eff_surf_temp = {
    '2MASS J19383260+4603591 b': 33500.0,
    'CoRoT-20 c': 5880.0,
    'DE CVn b': 5745.0,
    'DP Leo b': 3500.0,
    'EPIC 201238110 b': 3587.0,
    'EPIC 201497682 b': 4670.35,
    'EPIC 201615463 c': 3300.0,
    'EPIC 201754305 d': 4761.0,
    'EPIC 201833600 c': 3300.0,
    'EPIC 206024342 b': 5800.0,
    'EPIC 206032309 b': 3225.0,
    'EPIC 206042996 b': 4800.0,
    'EPIC 206215704 b': 3225.0,
    'EPIC 206317286 b': 4800.0,
    'EPIC 212297394 b': 3700.0,
    'EPIC 212424622 b': 5776.0,
    'EPIC 212499991 b': 4730.0,
    'EPIC 212587672 b': 5776.0,
    'EPIC 220554210 c': 5776.0,
    'GJ 163 b': 3300.0,
    'GJ 163 c': 3300.0,
    'GJ 163 d': 3300.0,
    'GJ 180 d': 3371.0,
    'GJ 2056 b': 4730.0,
    'GJ 229 A c': 3564.0,
    'GJ 3082 b': 3500.0,
    'GJ 433 d': 3600.0,
    'GJ 676 A c': 3750.0,
    'GJ 687 c': 3350.0,
    'HD 102329 c': 4830.0,
    'HD 114783 c': 5105.0,
    'HD 116029 c': 4951.0,
    'HD 156279 c': 5453.0,
    'HD 177565 b': 5440.0,
    'HD 33142 c': 4978.0,
    'HD 41004 A b': 5035.0,
    'HD 47186 c': 5675.0,
    'HD 73526 c': 5590.0,
    'HD 99706 c': 4932.0,
    'HIP 4845 b': 3750.0,
    'HU Aqr AB b': 5500.0,
    'HU Aqr AB c': 5500.0,
    'HW Vir b': 5500.0,
    'IC 4651 9122 b': 3300.0,
    'K2-32 e': 5315.0,
    'Kepler-448 c': 6820.0,
    'Kepler-47 d': 5636.0,
    'Kepler-65 e': 6211.0,
    'Kepler-82 f': 5428.0,
    'MOA-2009-BLG-266L b': 3300.0,
    'MOA-2009-BLG-387L b': 3300.0,
    'MOA-2011-BLG-293L b': 3300.0,
    'MXB 1658-298 b': 3300.0,
    'OGLE-2005-BLG-390L b': 3300.0,
    'OGLE-2006-BLG-109L b': 3300.0,
    'OGLE-2006-BLG-109L c': 3300.0,
    'OGLE-2016-BLG-1190L b': 3300.0,
    'PSR B0329+54 b': 3300.0,
    'PSR B1257+12 b': 3300.0,
    'PSR B1257+12 c': 3300.0,
    'PSR B1257+12 d': 3300.0,
    'PSR J2322-2650 b': 3300.0,
    'SWEEPS-11 b': 3300.0,
    'SWEEPS-4 b': 3300.0,
    'UZ For b': 5500.0,
    'UZ For c': 5500.0,
    'V1298 Tau c': 4970.0,
    'V1298 Tau d': 4970.0,
    'V1298 Tau e': 4970.0,
    'WASP-8 c': 5600.0,
    'bet Pic c': 8100.0,
    'eps Eri b': 5084.0,
    'tau Cet e': 5344.0,
    'tau Cet f': 5344.0,
    'tau Cet g': 5344.0,
    'tau Cet h': 5344.0,
}
                 

In [33]:
# For every planet whose `st_teff` is still missing, look its temperature up
# by planet name in the `eff_surf_temp` dictionary. A name that is absent from
# the dictionary raises a KeyError.
teff_missing = final_planets['st_teff'].isna()
final_planets.loc[teff_missing, 'st_teff'] = (
    final_planets.loc[teff_missing, 'pl_name'].map(eff_surf_temp.__getitem__)
)

In [36]:
# Load the stellar-radius export (downloaded from the NASA Exoplanet Archive).
radi_df = pd.read_csv(filepath_or_buffer='stellar_radi.csv')

In [38]:
# Remove the error-bar and limit-flag columns for stellar radius and mass;
# they are not needed downstream.
radi_df = radi_df.drop(columns=[
    'st_raderr1', 'st_raderr2', 'st_radlim',
    'st_masserr1', 'st_masserr2', 'st_masslim',
])

In [39]:
# Collapse planets that appear in several rows into a single row each by
# averaging their numeric columns per `pl_name`.
# `numeric_only=True` keeps this working on pandas >= 2.0, which raises a
# TypeError if any non-numeric column is present instead of dropping it; when
# every remaining column is numeric the result is unchanged.
radi_df = radi_df.groupby('pl_name').mean(numeric_only=True).reset_index()

In [43]:
# Inner-join the stellar-radius table with the planet table on `pl_name`.
# Columns present in both frames receive the `_x` (radi_df) and `_y`
# (final_planets) suffixes.
HZ = radi_df.merge(final_planets, how='inner', on='pl_name')

In [45]:
# Keep only the columns needed for the habitable-zone work, in this order.
new_col = [
    'pl_name', 'pl_orbper_yrs', 'st_mass_x', 'sy_snum', 'sy_pnum',
    'pl_orbper', 'pl_orbsmax', 'pl_bmasse', 'pl_bmassj', 'st_teff',
    'st_mass_y', 'st_rad', 'ra', 'dec', 'sy_dist', 'spec_type',
]

HZ = HZ.loc[:, new_col]

In [112]:
# Distinct spectral-type labels, and the full per-row array of labels.
spec_column = HZ['spec_type']
spec_list = spec_column.unique()
spec_list_values = spec_column.values

4224

In [50]:
# Keep only the entries that are plain strings, discarding any non-string
# entry (such as the NaN that stands for a missing spectral type).
spec_list = list(filter(lambda label: type(label) is str, spec_list))

In [51]:
# Group the spectral-type labels by their leading capital letter:
# spec_ty maps 'A'..'Z' -> list of labels starting with that letter.
# Letters with no matching label are left out of the dictionary.
spec_ty = {}
for letter in map(chr, range(ord('A'), ord('Z') + 1)):
    matches = [label for label in spec_list if label[:1] == letter]
    if len(matches) > 0:
        spec_ty[letter] = matches

{'A': ['A8 V', 'A5', 'A', 'A2', 'A8', 'A7 V', 'A1 IV-V'],
 'B': ['B9.5-A0', 'B'],
 'F': ['F6 V',
  'F0 IV',
  'F9 V',
  'F8 IV',
  'F9',
  'F3 V',
  'F8 V',
  'F5 V',
  'F8',
  'F',
  'F V',
  'F5',
  'F8 IV/V',
  'F7 V',
  'F9 IV/V',
  'F6 IV',
  'F9 IV',
  'F4 V',
  'F7 IV',
  'F5 IV',
  'F2',
  'F6',
  'F7',
  'F1 V',
  'F4',
  'F6 IV-V'],
 'G': ['G8 III',
  'G2.5 V',
  'G6 III',
  'G3 IV',
  'G1 V',
  'G8 V',
  'G5 V',
  'G3 III',
  'G5 III',
  'G',
  'G0 V',
  'G2 V',
  'G9 V',
  'G3 V',
  'G5',
  'G2',
  'G8/9 IV',
  'G2 IV',
  'G0 VI',
  'G6 V',
  'G0',
  'G8 IV/V',
  'G4',
  'G3',
  'G8',
  'G V',
  'G9 III',
  'G5 IV',
  'G4 IV',
  'G7 III',
  'G7 V',
  'G4 V',
  'G2/G3 V',
  'G1 IV',
  'G6',
  'G5 IV/V',
  'G8 IV',
  'G0 IV',
  'G6 IV',
  'G3 IV-V',
  'G2/G3 IV/V',
  'G1 IV/V',
  'G3/G5 V',
  'G4 IV-V',
  'G1.5 V',
  'G8/K0',
  'G1-1.5 V',
  'G2/G3',
  'G7 IV/V',
  'G9/K0',
  'G9 IV',
  'G0/F9 V',
  'G1',
  'G9',
  'G7'],
 'K': ['K4 III',
  'K0 III',
  'K0 V',
  'K1 III',
  '

In [53]:
# Average stellar radius assumed for each spectral class letter; used to fill
# in stars whose radius is missing.
radi_spec = dict(A=2.0, B=7.0, F=1.4, G=1.1, K=0.9, L=0.83, M=0.5, W=3.2)

In [54]:
# Rows that have no spectral type.
# These rows are taken from `HZ` (not `final_planets`) because the labels
# derived from them are written back into `HZ` under the mask
# `HZ.spec_type.isna()`. Selecting from the same frame with the same mask
# guarantees one derived label per target row, in matching order; the merged
# `HZ` is not guaranteed to contain the same rows as `final_planets`.
nan_spec = HZ.loc[HZ['spec_type'].isna()]

In [55]:
# Derive a spectral class letter from the stellar temperature for the rows in
# `nan_spec`. Thresholds are checked from hottest to coolest and np.select
# takes the first one that matches; anything below 3700 falls through to 'M'.
teff_floor_by_class = [(10000, 'B'), (7500, 'A'), (6000, 'F'), (5200, 'G'), (3700, 'K')]

cond = [nan_spec['st_teff'] >= floor for floor, _ in teff_floor_by_class]
choices = [letter for _, letter in teff_floor_by_class]

input_spec = np.select(cond, choices, default='M')

In [56]:
# Write the temperature-derived class letters into the rows of HZ that have no
# spectral type. `input_spec` must hold exactly one label per such row, in HZ
# row order.
spec_missing = HZ['spec_type'].isna()
HZ.loc[spec_missing, 'spec_type'] = input_spec

In [57]:
# Fill each missing stellar radius with the average radius of the star's
# spectral class, matched on the leading letter of `spec_type`.
radius_missing = HZ['st_rad'].isna()
for class_letter, mean_radius in radi_spec.items():
    in_class = HZ['spec_type'].str.startswith(class_letter)
    HZ.loc[radius_missing & in_class, 'st_rad'] = mean_radius

In [59]:
# Numeric constants for the luminosity calculation: the Stefan-Boltzmann
# constant (as written in the original cell) and pi.
sb, pi = 5.670374419 * 10 ** -8, math.pi

In [60]:
# Convert the stellar radius to meters by scaling with 6.957e8.
SOLAR_RADIUS_M = 6.957 * 10 ** 8
HZ['st_rad_m'] = HZ['st_rad'] * SOLAR_RADIUS_M

In [62]:
# Luminosity of each host star from its radius and effective temperature:
# L = 4 * pi * R^2 * sigma * T^4, with sigma taken from astropy's `sigma_sb`.
surface_area = 4 * pi * HZ['st_rad_m'] ** 2
HZ['st_lum'] = surface_area * sigma_sb.value * HZ['st_teff'] ** 4

In [81]:
# Express each luminosity relative to the Sun by dividing by astropy's L_sun.
HZ['st_lum_sol'] = HZ['st_lum'].div(L_sun.value)

In [183]:
# Outer edge of the habitable zone: sqrt(luminosity in solar units / 0.53).
OUTER_EDGE_DIVISOR = 0.53
HZ['r_0'] = np.sqrt(HZ['st_lum_sol'].div(OUTER_EDGE_DIVISOR))

In [184]:
# Inner edge of the habitable zone: sqrt(luminosity in solar units / 1.1).
INNER_EDGE_DIVISOR = 1.1
HZ['r_1'] = np.sqrt(HZ['st_lum_sol'].div(INNER_EDGE_DIVISOR))

In [197]:
# Binary target: 1 when the planet's semi-major axis lies between the inner
# (r_1) and outer (r_0) habitable-zone edges, both inclusive; otherwise 0.
inside_zone = HZ['pl_orbsmax'].between(HZ['r_1'], HZ['r_0'])
HZ['hab_zone'] = inside_zone.astype(int)

In [212]:
# Remove `st_mass_y`, treated here as a duplicate of `st_mass_x`.
HZ = HZ.drop(columns='st_mass_y')

In [215]:
# Average stellar mass assumed per spectral class letter, used to replace
# missing stellar masses.
st_mass_nan = dict(M=0.3, K=0.8, G=1.1, B=18)

In [223]:
# Replace missing stellar masses with the average mass of the star's spectral
# class.
#
# `st_mass_nan` is keyed by the class letter ('M', 'K', 'G', 'B'), whereas
# `spec_type` may hold a full label such as 'G2 V'. The lookup therefore uses
# only the leading letter of the label; indexing the dict with the whole label
# raised a KeyError for anything but a bare class letter. Classes that are not
# in the table (and rows with no spectral type) are left as NaN instead of
# aborting the cell.
missing_mass = HZ['st_mass_x'].isna()
spec_class = HZ.loc[missing_mass, 'spec_type'].str[0]
HZ.loc[missing_mass, 'st_mass_x'] = spec_class.map(st_mass_nan)

In [231]:
# Persist the finished table for use in other notebooks. The row index is
# written as well (pandas default), so it shows up as the first column.
HZ.to_csv(path_or_buf='HZ_final.csv')