In [None]:
import re
import exodata
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import exodata.astroquantities as aq

In [None]:
def isNaN(x):
    """Return True when ``x`` cannot be converted to a float, False otherwise.

    This tests convertibility, not missingness: a real NaN
    (``float('nan')``) converts without error and therefore yields False.
    Use ``pd.isna`` / ``Series.isna`` to test for missing values.
    """
    try:
        float(x)
    except Exception:
        # Only ordinary conversion failures mean "not a number";
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return True
    return False

In [None]:
def getListOfNames(names):
    """Extract the quoted names from a repr-style string and join them with ", ".

    ``names`` is text such as ``"Star('11 com b')"`` or
    ``"[Planet('a'), Planet('b')]"``. For every ``('...')`` group the text
    between the quotes is collected.

    ex: "Star('11 com b')" -> '11 com b'
        "[Planet('a'), Planet('b')]" -> 'a, b'

    Returns an empty string when nothing can be extracted (input of two
    characters or fewer, or no parenthesised group).
    """
    temp = ""
    while len(names) > 2:
        open_idx = names.find('(')
        close_idx = names.find(')')
        if open_idx == -1 or close_idx == -1:
            # No further ('...') group. Without this guard the slice below
            # would leave `names` unchanged and the loop would never end.
            break
        start = open_idx + 2   # skip "('"
        end = close_idx - 1    # stop before "')"
        temp += names[start:end] if len(temp) == 0 else ", " + names[start:end]
        # Continue after the closing parenthesis; this always shortens `names`.
        names = names[end + 2 :]
    return temp

In [None]:
def findGLName(primary, alts):
    """Find a Gliese-style catalogue name and normalise its prefix.

    ``primary`` is checked first, then each entry of ``alts`` in order; the
    first name starting with one of "GL ", "Gliese ", "NN ", "WO " or "GJ "
    is used. The prefix is then rewritten: "GL" -> "Gl", "WO" -> "Wo",
    "Gliese" -> "GJ"; "NN" and "GJ" names are returned unchanged.

    ex: primary='11 com b', alts=['Gliese 234', 'HD137'] -> 'GJ 234'

    ``alts`` may be a list of names, a single name string, an empty string,
    or a missing value (None / NaN). Returns ``np.nan`` when no name matches.
    """
    prefixes = ("GL ", "Gliese ", "NN ", "WO ", "GJ ")

    # A missing altnames entry arrives as NaN/None; a bare string is one name.
    if isinstance(alts, str):
        alts = [alts]
    elif alts is None or isinstance(alts, float):
        alts = []

    name = ""
    for candidate in [primary] + list(alts):
        # Non-string entries (e.g. NaN) cannot carry a catalogue name.
        if isinstance(candidate, str) and candidate.startswith(prefixes):
            name = candidate
            break
    if name == "":
        return np.nan

    if name.startswith("GL"):
        name = "Gl" + name[2:]
    elif name.startswith("WO"):
        name = "Wo" + name[2:]
    elif name.startswith("Gliese"):
        name = "GJ" + name[6:]
    return name

In [None]:
def findOtherName(check, primary, alts):
    """Extract the catalogue number that follows the prefix ``check``.

    ``primary`` and then every entry of ``alts`` are examined. For each name
    starting with ``check``, the leading run of digits after the prefix (and
    any whitespace after it) is parsed. If several names yield a number, the
    last one wins.

    ex: check='HD ', primary='11 com b', alts=['Gliese 234', 'HD 137'] -> 137.0

    ``check`` may be given with or without its trailing space ("HIP " or
    "HIP"). ``alts`` may be a list of names, a single name string, an empty
    string, or a missing value (None / NaN).

    Returns the number as a float, or ``np.nan`` when no matching name with
    a number is found.
    """
    # A missing altnames entry arrives as NaN/None; a bare string is one name.
    if isinstance(alts, str):
        alts = [alts]
    elif alts is None or isinstance(alts, float):
        alts = []

    number = np.nan
    for candidate in [primary] + list(alts):
        if not (isinstance(candidate, str) and candidate.startswith(check)):
            continue
        # Drop exactly the prefix, then any separating whitespace, so no
        # digit is lost whether or not `check` already ends with a space.
        remainder = candidate[len(check):].strip()
        digits = re.match(r"\d+", remainder)
        if digits is not None:
            number = float(digits.group())
    return number

In [None]:
# Widen pandas' display limits so large frames are shown with less truncation.
for option, limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option, limit)

In [None]:
# Star catalogue CSV: a combination of telescope data for nearly 200k stars.
# Read from the working directory; later cells merge it with the exoplanet-host
# stars on its 'gl', 'hd', 'hr', 'hip' and 'proper' columns.
star_csv = pd.read_csv('hygdata_v3.csv')

In [None]:
# Load the most current data from the Open Exoplanet Catalogue.
# This downloads the catalogue on every run, so the number of systems, stars
# and planets can differ from one run to the next.
exocat = exodata.load_db_from_url('https://github.com/OpenExoplanetCatalogue/oec_gzip/raw/master/systems.xml.gz')

In [None]:
# Short aliases for the four catalogue branches used below.
# NOTE: `planets`, `stars` and `systems` are rebound to DataFrames further down,
# so the extraction cells must run before that point.
planets, systems, stars, binary = (
    exocat.planets,
    exocat.systems,
    exocat.stars,
    exocat.binaries,
)

In [None]:
# Sanity check: show the system object the first planet belongs to.
planets[0].system

In [None]:
# To move the information out of the XML tree and into DataFrames, each branch
# first gets its own dictionary (row number -> {column: value}).
sys_dict = {}
bn_dict = {}
star_dict = {}
pln_dict = {}

# The following cells iterate over each element of a branch and fill these
# dictionaries. The data is exposed both as attributes/methods on the objects
# and inside their `params` dictionary, which makes extraction messy.
# The module raises an error instead of returning nothing when a value is
# missing, so each read is wrapped in a try statement and NaN is stored on failure.

In [None]:
# Copy every catalogue system into sys_dict as one row dict per system.
# The module raises when a value is missing, so each read is guarded and a
# missing value is stored as NaN.

# (output column, attribute on the system object), in output order
system_fields = [
    ('right_ascension', 'ra'),
    ('altnames', 'altnames'),
    ('list', 'list'),
    ('declination', 'dec'),
    ('distance', 'd'),
    ('child_stars_binaries', 'stars'),
    ('epoch', 'epoch'),
    ('name', 'name'),
    ('all_children', 'children'),
    ('flags', 'flags'),
]

# Iterate over however many systems were loaded rather than a fixed 3307:
# the catalogue is downloaded on each run, so its size is not constant.
for i in range(len(systems)):
    row = {}
    for column, attr in system_fields:
        try:
            row[column] = getattr(systems[i], attr)
        except Exception:
            row[column] = np.nan
    sys_dict[i] = row

In [None]:
# Copy every catalogue binary into bn_dict as one row dict per binary.
# 'alt_names', 'flags' and 'system' are read unguarded (an error there stops
# the cell); every other value is guarded and stored as NaN when unavailable.

# (output column, key in the object's `params` dict)
binary_param_fields = [
    ('names', 'name'),
    ('list', 'list'),
    ('separation', 'separation'),
    ('position_angle', 'positionangle'),
    ('mass', 'mass'),
    ('magUltraviolet', 'magU'),
    ('magBlue', 'magB'),
    ('magH_nearinf', 'magH'),
    ('magInfared', 'magI'),
    ('magJ_nearinf', 'magJ'),
    ('magK_nearinf', 'magK'),
    ('magVisual', 'magV'),
    ('magL_nq_midinfared', 'magL'),
    ('magM_midinfared', 'magM'),
    ('magN_midinfared', 'magN'),
]

# (output column, attribute on the object)
binary_attr_fields = [
    ('distance', 'd'),
    ('periastron', 'periastron'),
    ('right_ascension', 'ra'),
    ('declination', 'dec'),
    ('inclination', 'i'),
    ('parent_obj', 'parent'),
    ('child_obj', 'children'),
    ('stars', 'stars'),
    ('period', 'P'),
    ('semi_major_axis', 'a'),
    ('eccentricity', 'e'),
    ('longitude', 'longitude'),
    ('ascending_node', 'ascendingnode'),
]

# Iterate over however many binaries were loaded rather than a fixed 202:
# the catalogue is downloaded on each run, so its size is not constant.
for i in range(len(binary)):
    entry = binary[i]
    row = {
        'alt_names': entry.params['altnames'],
        'flags': entry.flags,
        'system': entry.system,
    }
    for column, key in binary_param_fields:
        try:
            row[column] = entry.params[key]
        except Exception:
            row[column] = np.nan
    for column, attr in binary_attr_fields:
        try:
            row[column] = getattr(entry, attr)
        except Exception:
            row[column] = np.nan
    bn_dict[i] = row

In [None]:
# Inspect the extracted binaries (prints the whole dictionary).
bn_dict

In [None]:
# Copy every catalogue star into star_dict as one row dict per star.
# Optional values are guarded and stored as NaN when unavailable; 'proper',
# 'flags', 'system', 'radius' and 'age' are read unguarded. The HIP / HD / HR /
# Gliese identifiers are then derived from the star's name and alternate names.

# (output column, where to read it: 'param' -> obj.params[key], 'attr' -> getattr(obj, key), key)
star_fields = [
    ('spectral_type', 'param', 'spectraltype'),
    ('temp', 'param', 'temperature'),
    ('metallicity', 'param', 'metallicity'),
    ('altnames', 'param', 'altnames'),
    ('mass', 'param', 'mass'),
    ('magUltraviolet', 'param', 'magU'),
    ('magBlue', 'param', 'magB'),
    ('magH_nearinfared', 'param', 'magH'),
    ('magInfared', 'param', 'magI'),
    ('magJ_nearinfared', 'param', 'magJ'),
    ('magK_nearinfared', 'param', 'magK'),
    ('magVisual', 'param', 'magV'),
    ('magL_nq_midinfared', 'param', 'magL'),
    ('magM_midinfared', 'param', 'magM'),
    ('magN_midinfared', 'param', 'magN'),
    ('distance', 'attr', 'd'),
    ('periastron', 'param', 'periastron'),
    ('right_ascension', 'attr', 'ra'),
    ('declination', 'attr', 'dec'),
    ('parent_obj', 'attr', 'parent'),
    ('child_obj', 'attr', 'children'),
]

# Iterate over however many stars were loaded rather than a fixed 3505:
# the catalogue is downloaded on each run, so its size is not constant.
for i in range(len(stars)):
    star = stars[i]
    row = {}
    for column, kind, key in star_fields:
        try:
            row[column] = star.params[key] if kind == 'param' else getattr(star, key)
        except Exception:
            row[column] = np.nan
    # Type of each of the first nine children; NaN where the star has fewer.
    for slot in range(9):
        try:
            row['planet%dtype' % (slot + 1)] = star.children[slot].type()
        except Exception:
            row['planet%dtype' % (slot + 1)] = np.nan
    row['proper'] = star.name
    row['flags'] = star.flags
    row['system'] = star.system
    row['radius'] = star.R
    row['age'] = star.age
    row['hip'] = findOtherName("HIP ", row['proper'], row['altnames'])
    row['hd'] = findOtherName("HD ", row['proper'], row['altnames'])
    row['hr'] = findOtherName("HR ", row['proper'], row['altnames'])
    row['gl'] = findGLName(row['proper'], row['altnames'])
    star_dict[i] = row

In [None]:
# Copy the catalogue planets into pln_dict as one row dict per planet.
# The first group of values is read unguarded (an error there stops the cell);
# the optional values are guarded and stored as NaN when unavailable.

# Later cells still address planets by hard-coded position (range(0, 4499) and
# fixed iloc rows), so the 4499-row cap is kept; min() only stops this loop from
# running past the end when the downloaded catalogue holds fewer planets.
n_planets = min(4499, len(planets))

# (output column, reader). Each column appears once, so a missing 'periastron'
# can no longer overwrite an 'orbital_period' that was read successfully.
planet_optional_fields = [
    ('system', lambda p: getListOfNames(str(p.system))),
    ('name', lambda p: p.name),
    ('type', lambda p: p.type()),
    ('mass', lambda p: p.params['mass']),
    ('semi_major_axis', lambda p: p.params['semimajoraxis']),
    ('orbital_period', lambda p: p.params['period']),
    ('periastron', lambda p: p.params['periastron']),
    ('periastron_time', lambda p: p.params['periastrontime']),
    ('eccentricity', lambda p: p.params['eccentricity']),
    ('distance', lambda p: p.d),
    ('temp', lambda p: p.T),
    ('right_ascension', lambda p: p.ra),
    ('declination', lambda p: p.dec),
    ('parent_obj', lambda p: p.parent),
    ('star', lambda p: getListOfNames(str(p.star))),
]

for i in range(n_planets):
    planet = planets[i]
    row = {
        'flags': planet.flags,
        'radius': planet.R,
        'orbital_inclination': planet.i,
        'seperation': planet.seperation,
        'age': planet.age,
        'transit_time': planet.transittime,
        'longitude': planet.longitude,
        'ascending_node': planet.ascendingnode,
        'discovery_method': planet.discoveryMethod,
        'discovery_year': planet.discoveryYear,
        # NOTE(review): the trailing space in this key is kept as-is so the
        # resulting column name does not change.
        'description ': planet.description,
        'alt_names': planet.params['altnames'],
        'list': planet.params['list'],
    }
    for column, read in planet_optional_fields:
        try:
            row[column] = read(planet)
        except Exception:
            row[column] = np.nan
    pln_dict[i] = row

In [None]:
# Transform the dictionaries into DataFrames so they can be worked with in pandas.
# Each dictionary maps row number -> {column: value}, so at this point the row
# numbers end up as the columns; the next cell transposes the frames.
pdf = pd.DataFrame(pln_dict)
sdf = pd.DataFrame(star_dict)
bdf = pd.DataFrame(bn_dict)
sysdf = pd.DataFrame(sys_dict)

In [None]:
# Realign the frames so the fields are columns and each object is a row.
# bdf was built from bn_dict (binaries) and sysdf from sys_dict (systems); the
# later cells rely on that pairing ('child_obj'/'stars' on binaries,
# 'all_children'/'child_stars_binaries' on systems).
# NOTE: from here on `planets`, `stars` and `systems` are DataFrames, no longer
# the catalogue object lists used by the extraction cells above.
planets = pdf.transpose()
stars = sdf.transpose()
binaries = bdf.transpose()
systems = sysdf.transpose()

In [None]:
# Drop planet columns that were judged unusable after a quick look at the data.
unusable_planet_columns = [
    'age',
    'seperation',
    'longitude',
    'ascending_node',
    'periastron_time',
    'discovery_year',
]
planets = planets.drop(columns=unusable_planet_columns)

In [None]:
# Inspect the extracted system name of each planet.
planets['system']

In [None]:
# Strip the unit text from the planet measurements so they can be cast to
# numbers in the next cell. Note that str.strip treats its argument as a set of
# characters to remove from both ends, not as a literal suffix.
# ('system' and 'parent_obj' are not stripped here; names are extracted with
# getListOfNames instead.)
planet_unit_chars = {
    'radius': " R_j",
    'orbital_inclination': " deg",
    'transit_time': " JD",
    'mass': " M_j",
    'semi_major_axis': " au",
    'orbital_period': " d",
    'periastron': " deg",
    'distance': " pc",
    'temp': " K",
}
for column, unit_chars in planet_unit_chars.items():
    planets[column] = planets[column].str.strip(unit_chars)

In [None]:
# Cast the stripped planet measurements to float.
for column in [
    'radius',
    'orbital_period',
    'mass',
    'transit_time',
    'orbital_inclination',
    'periastron',
    'distance',
    'temp',
]:
    planets[column] = planets[column].astype(float)

In [None]:
# Take the last character of each planet name as that planet's letter.
# Vectorised over however many rows the frame has (no fixed row count), and a
# missing name yields a missing letter instead of raising.
planets['letter'] = planets['name'].str[-1:]

In [None]:
# Map inconsistent planet designations onto a small set of letters so that the
# column does not explode into hundreds of dummy variables later.
# Assigned back rather than replaced in place: an in-place replace on a column
# selection is not guaranteed to update the parent frame.
planets['letter'] = planets['letter'].replace(
    to_replace={'A': 'B', 'a': 'B', '1': 'B', '2': 'C', '3': 'D', '4': 'E', '8': 'I'}
)

In [None]:
# Fix individual rows with wonky planet letters.
# Each line takes the row at a fixed position and replaces every cell in that
# row whose value equals the key (presumably only the 'letter' cell matches --
# TODO confirm). The positions are hard-coded, so they are only valid for the
# catalogue snapshot they were looked up in.
planets.iloc[3766] = planets.iloc[3766].replace({'n': 'b'})
planets.iloc[4178] = planets.iloc[4178].replace({'o': 'h'})
planets.iloc[1669] = planets.iloc[1669].replace({'X': 'd'})
planets.iloc[4174] = planets.iloc[4174].replace({'r': 'j'})
planets.iloc[4170] = planets.iloc[4170].replace({'y': 'b'})
planets.iloc[4175] = planets.iloc[4175].replace({'n': 'f'})
planets.iloc[4171] = planets.iloc[4171].replace({'s': 'b'})
planets.iloc[4173] = planets.iloc[4173].replace({'s': 'd'})
planets.iloc[4176] = planets.iloc[4176].replace({'s': 'f'})
planets.iloc[3754] = planets.iloc[3754].replace({'s': 'c'})
# Finally store every letter as an upper-case string.
planets['letter'] = planets['letter'].astype(str)
planets['letter'] = planets['letter'].str.capitalize()

In [None]:
# Derive indicator columns from the catalogue 'list' text, then drop the text.
planets['confirmed_planet'] = planets['list'].str.contains('Confirmed planets')
planets['retracted'] = planets['list'].str.contains('Retracted planet candidate')
planets['binary_system'] = planets['list'].str.contains('binary systems')
# `columns=` instead of a positional axis, which newer pandas no longer accepts.
planets = planets.drop(columns='list')

In [None]:
# One-hot encode the discovery method (first category dropped as the baseline),
# attach the indicator columns by row index and drop the original text column.
dummies = pd.get_dummies(planets['discovery_method'], drop_first=True)
planets = planets.merge(dummies, left_index=True, right_index=True)
# `columns=` instead of a positional axis, which newer pandas no longer accepts.
planets = planets.drop(columns='discovery_method')

In [None]:
# Change the booleans to ints so those columns are no longer object dtype.
# The replacement is applied to every column of the frame.
planets = planets.replace({True: 1, False:0})

In [None]:
# Strip the unit text from the star measurements so they can be cast to floats
# later. str.strip removes any of the given characters from both ends.
star_unit_chars = {
    'radius': " R_s",
    'age': " Gyr",
    'mass': " M_s",
    'distance': " pc",
    'temp': " K",
}
for column, unit_chars in star_unit_chars.items():
    stars[column] = stars[column].str.strip(unit_chars)

In [None]:
# Values cannot be set on slices of the DataFrame, so the extracted names are
# collected in a dictionary and merged back into the original frame.
# The merge key is the row index, so reset_index() exposes it as an 'index'
# column that can be copied into the dictionary.
# NOTE: star_dict is re-initialised here; the raw extraction rows it held
# earlier are discarded. Re-running this cell adds another index column.
stars = stars.reset_index()
star_dict = {}

In [None]:
# Extract the names of each star's children, parent and system.
# Iterates over however many rows the frame has rather than a fixed 3505, so
# it neither runs past the end nor silently skips trailing rows.
# NOTE(review): getListOfNames is written for strings; confirm these cells hold
# the text form of the objects rather than the objects themselves.
for i in range(len(stars)):
    star_dict[i] = {
        'child_obj': getListOfNames(stars['child_obj'][i]),
        'parent_obj': getListOfNames(stars['parent_obj'][i]),
        'system': getListOfNames(stars['system'][i]),
        'index': stars['index'][i],
    }

In [None]:
# Turn the extracted names into a DataFrame (one row per star), drop the
# columns they replace so the merge creates no duplicates, and merge on the
# shared 'index' column.
star_dict = pd.DataFrame.from_dict(star_dict).transpose()
# `columns=` instead of a positional axis, which newer pandas no longer accepts.
stars = stars.drop(columns=['child_obj', 'system', 'parent_obj'])
stars = stars.merge(star_dict, on=['index'])

In [None]:
# Drop star columns with little to no information.
# `columns=` instead of a positional axis, which newer pandas no longer accepts.
stars = stars.drop(columns=['magL_nq_midinfared', 'magM_midinfared', 'magN_midinfared', 'periastron'])

In [None]:
# Recast the stripped star measurements as floats.
for column in ['radius', 'age', 'temp', 'mass', 'distance']:
    stars[column] = stars[column].astype(float)

In [None]:
# Goal: count how many planets of each type every star has, to see whether the
# planet type can be predicted from the star's features. The first eight
# planet slots are encoded here (per the original note, no system has more than
# eight planets, excluding Pluto which is still listed as a dwarf).
# `columns` maps the planet type labels to short column codes.
# NOTE(review): 'Hot Jupiter' is coded 'HE' and a later cell reads that code;
# confirm the code is intended.
columns = {'Cold Jupiter':'CJ', 'Cold Neptune':'CN', 'Cold Super-Earth':'CE', 'Hot Jupiter':'HE',
           'Hot Neptune':'HN', 'Hot Super-Earth':'HSE', 'None Jupiter':'JUP', 'None Neptune':'NEP',
           'None Super-Earth':'SE', 'Warm Jupiter':'WJ', 'Warm Neptune':'WN', 'Warm Super-Earth':'WSE'}

# One indicator frame per planet slot, with the type labels shortened.
dumms1, dumms2, dumms3, dumms4, dumms5, dumms6, dumms7, dumms8 = (
    pd.get_dummies(stars['planet%dtype' % slot]).rename(columns=columns)
    for slot in range(1, 9)
)

In [None]:
# Join the eight per-slot indicator frames side by side on the row index.
# A suffix is only applied to a column name that exists on BOTH sides of a
# merge; names unique to one side stay bare, and merges without `suffixes`
# fall back to pandas' default '_x' / '_y'. The resulting column names
# therefore depend on which planet types occur in which slot.
# NOTE(review): suffixes=('6', '6') gives both sides the same suffix, which
# produces duplicate column names -- confirm this is intended.
d1 = dumms1.merge(dumms2, left_index=True, right_index=True, suffixes=('1', '2'))
d2 = d1.merge(dumms3, left_index=True, right_index=True)
d3 = d2.merge(dumms4, left_index=True, right_index=True, suffixes=('3', '4'))
d4 = d3.merge(dumms5, left_index=True, right_index=True)
d5 = d4.merge(dumms6, left_index=True, right_index=True, suffixes=('6', '6'))
d6 = d5.merge(dumms7, left_index=True, right_index=True)
pln_types = d6.merge(dumms8, left_index=True, right_index=True, suffixes=('7', '8'))

In [None]:
# Store the indicators as floats and treat missing values as 0.
pln_types = pln_types.astype(float).fillna(0.0)

In [None]:
# Spot check: distribution of the 'WSE8' indicator column.
pln_types['WSE8'].value_counts()

In [None]:
# Collapse the per-slot indicator columns into one count per planet type.
# The column names on the right-hand side are the suffixed names produced by
# the merges above; they are hard-coded and so only match the catalogue
# snapshot they were written against (a missing name raises KeyError).
# NOTE(review): the 'CN', 'CE' and 'NEP' lines read the bare column of the same
# name while overwriting it, and the 'HE' line assigns the column to itself --
# confirm every slot column is included for each type.
pln_types['CJ'] = pln_types['CJ1'] + pln_types['CJ2'] + pln_types['CJ3'] + pln_types['CJ4'] + pln_types['CJ7'] + pln_types['CJ8'] 
pln_types['CN'] = pln_types['CN1'] + pln_types['CN2'] + pln_types['CN3'] + pln_types['CN4'] + pln_types['CN_x'] + pln_types['CN_y'] + pln_types['CN'] 
pln_types['CE'] = pln_types['CE1'] + pln_types['CE2'] + pln_types['CE3'] + pln_types['CE4'] + pln_types['CE'] 
pln_types['HE'] = pln_types['HE'] 
pln_types['HN'] = pln_types['HN1'] + pln_types['HN2'] 
pln_types['HSE'] = pln_types['HSE1'] + pln_types['HSE2'] + pln_types['HSE3'] 
pln_types['JUP'] = pln_types['JUP1'] + pln_types['JUP2'] + pln_types['JUP3']
pln_types['NEP'] = pln_types['NEP1'] + pln_types['NEP2'] + pln_types['NEP']
pln_types['SE'] = pln_types['SE1'] + pln_types['SE2'] + pln_types['SE3'] + pln_types['SE4']
pln_types['WJ'] = pln_types['WJ1'] + pln_types['WJ2'] + pln_types['WJ3'] + pln_types['WJ4'] 
pln_types['WN'] = pln_types['WN1'] + pln_types['WN2'] + pln_types['WN3'] + pln_types['WN4']
pln_types['WSE'] = pln_types['WSE1'] + pln_types['WSE2'] + pln_types['WSE3'] + pln_types['WSE4'] + pln_types['WSE7'] + pln_types['WSE8']

In [None]:
# Drop the per-slot indicator columns now that the per-type totals exist.
# The names are hard-coded to match the suffixed columns produced by the merges
# above; several slot-6 names appear twice in the list.
pln_types = pln_types.drop(columns=['CJ1', 'CN1', 'CE1', 'HN1', 'HSE1', 'JUP1', 'NEP1', 'SE1', 'WJ1', 'WN1', 'WSE1',
                     'CJ2', 'CN2', 'CE2', 'HN2', 'HSE2', 'JUP2', 'NEP2', 'SE2', 'WJ2', 'WN2', 'WSE2', 'CJ3', 
                     'CN3', 'CE3', 'HSE3', 'JUP3', 'SE3', 'WJ3', 'WN3', 'WSE3', 'CJ4', 'CN4', 'CE4', 
                     'HSE4', 'JUP4', 'SE4', 'WJ4', 'WN4', 'WSE4', 'CJ6', 'CN_x', 'CE6', 'SE6', 'WSE6', 'CJ6',
                     'CE6', 'SE6', 'WSE6', 'CJ7', 'CN_y', 'WSE7', 'CJ8', 'WSE8'])

In [None]:
# Attach the per-type planet counts to the stars, matching rows by index.
stars = stars.merge(pln_types, right_index=True, left_index=True)

In [None]:
# Replace the object-valued columns of the binaries with extracted names.
# reset_index() exposes the row index as an 'index' column used as merge key.
binaries = binaries.reset_index()
binary_dict = {}

for i in range(len(binaries)):
    binary_dict[i] = {
        'child_obj': getListOfNames(binaries['child_obj'][i]),
        'system': getListOfNames(binaries['system'][i]),
        'stars': getListOfNames(binaries['stars'][i]),
        'parent_obj': getListOfNames(binaries['parent_obj'][i]),
        'index': binaries['index'][i],
    }

binary_dict = pd.DataFrame.from_dict(binary_dict).transpose()
# Drop every column the merge brings back -- including 'parent_obj', which
# would otherwise come out as 'parent_obj_x' / 'parent_obj_y'. `columns=` is
# used instead of a positional axis, which newer pandas no longer accepts.
binaries = binaries.drop(columns=['child_obj', 'system', 'stars', 'parent_obj'])
binaries = binaries.merge(binary_dict, on=['index'])

In [None]:
# The catalogue 'list' text is not needed for binaries.
# `columns=` instead of a positional axis, which newer pandas no longer accepts.
binaries = binaries.drop(columns=['list'])

In [None]:
# Strip the unit characters from the binaries' measurements and cast to float.
for column, unit_chars in (('distance', " pc"), ('separation', " au")):
    binaries[column] = binaries[column].str.strip(unit_chars).astype(float)

In [None]:
# Replace the object-valued columns of the systems with extracted names.
# reset_index() exposes the row index as an 'index' column used as merge key.
systems = systems.reset_index()
system_dict = {}

# The loop variable must be the one used in the body: indexing with a
# left-over `i` from an earlier cell would write a single row over and over.
for i in range(len(systems)):
    system_dict[i] = {
        'all_children': getListOfNames(systems['all_children'][i]),
        'child_stars_binaries': getListOfNames(systems['child_stars_binaries'][i]),
        'index': systems['index'][i],
    }

system_dict = pd.DataFrame.from_dict(system_dict).transpose()
# `columns=` instead of a positional axis, which newer pandas no longer accepts.
systems = systems.drop(columns=['child_stars_binaries', 'all_children'])
systems = systems.merge(system_dict, on=['index'])

In [None]:
# Strip the unit characters from the system distance and cast it to float,
# then drop the columns that are not used further.
systems['distance'] = systems['distance'].str.strip(" pc")
systems['distance'] = systems['distance'].astype(float)
# `columns=` instead of a positional axis, which newer pandas no longer accepts.
systems = systems.drop(columns=['altnames', 'list', 'epoch'])

In [None]:
# Split the systems' right ascension and declination into three float columns
# each (<prefix>_degrees, <prefix>_minutes, <prefix>_seconds).
# A string value is cut at its first 'd', then 'm', then 's'; any value that is
# not exactly a `str` becomes NaN in all three columns.
angle_frames = {}
for source_column, prefix in (('right_ascension', 'ra'), ('declination', 'dec')):
    angles = systems[source_column]
    rows = {}
    for i in range(len(angles)):
        value = angles[i]
        fields = {}
        if type(value) is str:
            rest = value
            for unit, marker in (('degrees', 'd'), ('minutes', 'm'), ('seconds', 's')):
                cut = rest.find(marker)
                fields[prefix + '_' + unit] = float(rest[:cut])
                rest = rest[cut + 1 :]
        else:
            for unit in ('degrees', 'minutes', 'seconds'):
                fields[prefix + '_' + unit] = 'nan'
        rows[i] = fields
    # dtype=float turns the 'nan' placeholders into real NaN values.
    angle_frames[prefix] = pd.DataFrame.from_dict(rows, orient='index', dtype=float)

ra = None
dec = None
raDict = angle_frames['ra']
decDict = angle_frames['dec']

# Put the new columns in front of the existing ones and drop the raw strings.
systems = raDict.merge(systems, right_index=True, left_index=True)
systems = decDict.merge(systems, right_index=True, left_index=True)
systems = systems.drop(columns=['right_ascension', 'declination'])

In [None]:
# Split the stars' right ascension and declination into three float columns
# each (<prefix>_degrees, <prefix>_minutes, <prefix>_seconds), and count each
# star's children and names.
# A string value is cut at its first 'd', then 'm', then 's'; any value that is
# not exactly a `str` becomes NaN in all three columns.
angle_frames = {}
for source_column, prefix in (('right_ascension', 'ra'), ('declination', 'dec')):
    angles = stars[source_column]
    rows = {}
    for i in range(len(angles)):
        value = angles[i]
        fields = {}
        if type(value) is str:
            rest = value
            for unit, marker in (('degrees', 'd'), ('minutes', 'm'), ('seconds', 's')):
                cut = rest.find(marker)
                fields[prefix + '_' + unit] = float(rest[:cut])
                rest = rest[cut + 1 :]
        else:
            for unit in ('degrees', 'minutes', 'seconds'):
                fields[prefix + '_' + unit] = 'nan'
        rows[i] = fields
    # dtype=float turns the 'nan' placeholders into real NaN values.
    angle_frames[prefix] = pd.DataFrame.from_dict(rows, orient='index', dtype=float)

ra = None
dec = None
raDict = angle_frames['ra']
decDict = angle_frames['dec']

# Number of children per star: 0 for an empty entry, otherwise commas + 1.
child_rows = {}
for i in range(len(stars['child_obj'])):
    listed = stars['child_obj'][i]
    child_rows[i] = {'children': 0 if listed == '' else float(listed.count(',') + 1)}
child = None
childDict = pd.DataFrame.from_dict(child_rows, orient='index', dtype=float)

# Number of names per star: 1 for an empty entry, otherwise commas + 1.
name_rows = {}
for i in range(len(stars['altnames'])):
    listed = stars['altnames'][i]
    name_rows[i] = {'num_names': 1 if listed == '' else float(listed.count(',') + 1)}
num_names = None
nameDict = pd.DataFrame.from_dict(name_rows, orient='index', dtype=float)

# Prepend the derived columns to the star frame, matching rows by index.
for derived in (raDict, childDict, decDict, nameDict):
    stars = derived.merge(stars, right_index=True, left_index=True)

In [None]:
# Build 'all_names': the alternate names of each star split into a list.
# Strip the surrounding brackets from the alternate-names text.
stars['altnames'] = stars['altnames'].str.strip('[')
stars['altnames'] = stars['altnames'].str.strip(']')
# NOTE(review): the next three assignments to 'all_names' are overwritten by
# the two that follow, which start again from 'altnames'; as written the
# 'proper' name never reaches the final column -- confirm intent.
stars['all_names'] = stars['proper'] + stars['altnames'].astype(str)
stars['all_names'] = "'" + stars['all_names']
stars['all_names'] = stars['all_names'].str.split("'")
stars['all_names'] = stars['altnames'].str.strip('[')
stars['all_names'] = stars['altnames'].str.strip(']')
# NOTE(review): {',', ''} is a set literal, not a {old: new} mapping -- confirm
# what replacement was intended here.
stars['all_names'] = stars['all_names'].replace({',', ''})
stars['all_names'] = stars['all_names'].str.split(',')

In [None]:
# Drop the parent_obj column.  `axis` is passed by keyword because the
# positional form `drop(labels, 1)` is deprecated and rejected by pandas >= 2.0.
stars = stars.drop(['parent_obj'], axis=1)

In [None]:
# Subsets of `stars` that carry an identifier in each catalogue column:
# positive number for hip / hd / hr, any non-null value for gl.
has_gl = stars['gl'].notnull()
GL = stars[has_gl]
HR = stars[stars['hr'].gt(0)]
HD = stars[stars['hd'].gt(0)]
HIP = stars[stars['hip'].gt(0)]

In [None]:
# Inner-join each identifier subset with star_csv on the matching identifier
# column, plus one join of the full `stars` frame on the proper name.
# NOTE(review): pandas matches null keys to each other in a merge; confirm
# that 'proper' is not null on both sides, otherwise that join can blow up.
HIP_csv = pd.merge(HIP, star_csv, on='hip')
HD_csv = pd.merge(HD, star_csv, on='hd')
HR_csv = pd.merge(HR, star_csv, on='hr')
GL_csv = pd.merge(GL, star_csv, on='gl')
stars_csv = pd.merge(stars, star_csv, on='proper')

In [None]:
# Stack all per-identifier matches into one frame.  A single pd.concat replaces
# the chain of DataFrame.append calls: append was removed in pandas 2.0 and
# copied the growing frame on every call.  The row order (HIP, GL, HD, proper,
# HR) is unchanged, which matters because drop_duplicates keeps the first hit.
HIP_csv = pd.concat([HIP_csv, GL_csv, HD_csv, stars_csv, HR_csv])

# all_names holds lists, which are unhashable; compare their string form.
HIP_csv['all_names'] = HIP_csv['all_names'].astype(str)
HIP_csv = HIP_csv.drop_duplicates('all_names')

In [None]:
# Build "<flam> <bayer> <con>" as a single string per row.
# flam: the string form loses its last two characters (presumably the '.0' of
# a float rendering — TODO confirm); a missing value renders as 'nan', which
# that slice shortens to 'n', hence the replace('n', '').
flam_part = star_csv['flam'].astype(str).str[:-2].replace('n', '')
# bayer / con: missing values render as 'nan' and are blanked out.
bayer_part = star_csv['bayer'].astype(str).replace('nan', '')
con_part = star_csv['con'].astype(str).replace('nan', '')
star_csv['other_names'] = flam_part + ' ' + bayer_part + ' ' + con_part

In [None]:
# Trim the leading/trailing whitespace of the assembled name strings.
star_csv['other_names'] = star_csv['other_names'].str.strip()

In [None]:
# other_name = star_csv.head(1).add(stars.loc[1])

# for i in range(len(stars['all_names'])):
#     for j in range(len(stars['all_names'][i])):
#         for k in range(len(star_csv['other_names'])): 
#             if len(star_csv['other_names'][k]) <= 3:
#                 continue
#             else:
#                 if str(star_csv['bayer'][k]) in stars['all_names'][i][j]:
#                     to_app = pd.concat(objs=[star_csv.loc[k], stars.loc[i]], axis=1)
#                     to_app[k] = to_app[k].fillna(to_app[i])
#                     to_app = to_app.drop(i, axis=1)
#                     to_app = to_app.transpose()
#                     other_name = other_name.append(to_app)
#                     break

In [None]:
# Reload the name matches from disk instead of recomputing them.  The CSV was
# presumably produced by the commented-out matching loop plus the disabled
# to_csv call below — TODO confirm.
# NOTE(review): the file name is spelled 'vauge_names.csv'; keep the writer
# and the reader in sync if it is ever renamed.
# other_name.to_csv('vauge_names.csv')
other_name = pd.read_csv('vauge_names.csv')

In [None]:
# Put the identifier-based matches first, then the name-based matches, and
# keep one row per 'id' (the first occurrence wins, so identifier matches take
# precedence).  pd.concat replaces DataFrame.append, removed in pandas 2.0.
other_name = pd.concat([HIP_csv, other_name])
other_name = other_name.drop_duplicates(subset='id')

In [None]:
# Recent estimates suggest roughly 1 in 4 sun-like stars has planets.  Add a
# reproducible random sample of 2000 rows from star_csv to the matched stars;
# the sample size allows for the rows that are lost later when null values
# are dropped.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([other_name, star_csv.sample(2000, random_state=42)])

In [None]:
# Columns whose missing values are replaced by 0.0.
# NOTE(review): these look like per-category counts that only exist for the
# matched rows, so the sampled rows have nulls there — confirm.
zero_fill_columns = ['HE', 'NEP', 'CE', 'CN', 'CJ', 'HN', 'HSE', 'JUP', 'SE', 'WJ', 'WN', 'WSE', 'children']
data = data.assign(**{column: data[column].fillna(0.0) for column in zero_fill_columns})

In [None]:
# Keep only the columns used from here on, then discard every row that still
# has a null in any of them.
kept_columns = [
    'children', 'ra', 'dec', 'dist', 'pmra', 'pmdec', 'rv', 'mag', 'absmag',
    'spect', 'ci', 'x', 'y', 'z', 'vx', 'vy', 'vz', 'rarad', 'decrad',
    'pmrarad', 'pmdecrad', 'comp_primary', 'lum',
    'HE', 'NEP', 'CE', 'CN', 'CJ', 'HN', 'HSE', 'JUP', 'SE', 'WJ', 'WN', 'WSE',
]
data = data[kept_columns].dropna()

In [None]:
# Cut the 'spect' string into fixed-position pieces; a missing piece becomes
# the literal string 'nan'.
#   temp_class: first character      heat_class: second character
#   harv_class: first two characters lum_class:  everything after those two
spect_pieces = (
    ('temp_class', slice(None, 1)),
    ('heat_class', slice(1, 2)),
    ('harv_class', slice(None, 2)),
    ('lum_class', slice(2, None)),
)
for column, piece in spect_pieces:
    data[column] = data['spect'].str[piece].fillna('nan')

In [None]:
# reset_index keeps the old index as an 'index' column and gives the frame a
# fresh 0..n-1 index.
data = data.reset_index()
# One-hot encode the three class columns one after another.  The resulting
# columns are named '<prefix>_<value>', i.e. '_y_*', '_x_*' and '_*'.
for class_column, dummy_prefix in (('temp_class', '_y'), ('heat_class', '_x'), ('lum_class', '')):
    data = pd.get_dummies(data, columns=[class_column], prefix=dummy_prefix)

In [None]:
# Collapse the one-hot lum_class dummies into one indicator per luminosity
# class (I ... VI).  A dummy naming two classes (e.g. '_Ib/II') feeds both.
# Every dummy is added at most once per feature: the lum_class dummies are
# mutually exclusive per row, so listing one twice (as '_Ib' in I and '_V+...'
# in V were before) turned the indicator into 2 for those rows.
data['I'] = data['_.5Ia'] + data['_.5Ib'] + data['_/B8Ib'] + data['_Ia'] + data['_Ib'] + data['_Ib/II'] + data['_Ib+...']
data['I'] = data['_Iab:'] + data['I'] + data['_-G2Ie'] + data['_Ia0:'] + data['_Ib-II']

data['II'] = data['_Ib/II'] + data['_Ib-II'] + data['_/B9II/III'] + data['_/K0II'] + data['_II'] + data['_II-III'] + data['_II/III'] + data['_IICNp...']
data['II'] = data['_IIb'] + data['II']

data['III'] = data['_.5III'] + data['_.5III:'] + data['_/A3III'] + data['_/A8III'] + data['_/B8III'] + data['_/B9II/III'] + data['_/B9III/IV']
data['III'] = data['III'] + data['_/G5III'] + data['_/G8III'] + data['_/G8III:'] + data['_/K0III'] + data['_/K0III:'] + data['_/K1III']
data['III'] = data['III'] + data['_/K1III+..'] + data['_/K2III'] + data['_/K3III'] + data['_/K3III:'] + data['_/K4III'] + data['_/K4III:'] + data['_/K5III']
data['III'] = data['III'] + data['_/M0III'] + data['_/M1III'] + data['_/M2III'] + data['_/M3III'] + data['_:III:'] + data['_III+...'] + data['_II-III']
data['III'] = data['III'] + data['_II/IIICNV:'] + data['_III'] + data['_III + (F)'] + data['_III-IV'] + data['_III...'] + data['_III/IV'] + data['_III/IVCN.']
data['III'] = data['III'] + data['_III:'] + data['_IIICN...'] + data['_IIICNII'] + data['_IIICNp...'] + data['_IIIb'] + data['_IIIp'] + data['_IIIvar']

data['IV'] = data['_.5IV'] + data['_.5IV-V'] + data['_/B9III/IV'] + data['_/A3IV'] + data['_/A3IV/V'] + data['_/A7IV'] + data['_/F2IV'] + data['_/F2IV/V']
data['IV'] = data['IV'] + data['_/F3IV'] + data['_/F5IV'] + data['_/F7IV/V'] + data['_/K0IV/V'] + data['_/K1IV'] + data['_/K1IV/V:'] + data['_III-IV']
data['IV'] = data['IV'] + data['_III/IV'] + data['_III/IVCN.'] + data['_IV'] + data['_IV-V'] + data['_IV...'] + data['_IV/V'] + data['_IV: (+F/G)']
data['IV'] = data['IV'] + data['_IV:pe...'] + data['_IVn'] + data['_IVne+...']

data['V'] = data['_ V'] + data['_.5IV-V'] + data['_.5V'] + data['_.5Ve'] + data['_.5Vn'] + data['_/A1V'] + data['_/A2V'] + data['_/A3IV/V']
data['V'] = data['V'] + data['_/A3V'] + data['_/A3V+...'] + data['_/B3V'] + data['_/B8V'] + data['_/B9.5V'] + data['_/B9V']
data['V'] = data['V'] + data['_/F2IV/V'] + data['_/F3V'] + data['_/F5V'] + data['_/F6V'] + data['_/F7IV/V'] + data['_/F7V'] + data['_/F8V']
data['V'] = data['V'] + data['_/G0V'] + data['_/G1V'] + data['_/G2V'] + data['_/G3V'] + data['_/G5V'] + data['_/G6V'] + data['_/G8V']
data['V'] = data['V'] + data['_/K0IV/V'] + data['_/K0V'] + data['_/K1V'] + data['_/K3V'] + data['_/M2V'] + data['_:V...'] + data['_IV-V'] + data['_V']
data['V'] = data['V'] + data['_V + G/K'] + data['_V+...'] + data['_V-VI'] + data['_V...'] + data['_V:'] + data['_V:n'] + data['_V:pe'] + data['_VCN...']
data['V'] = data['V'] + data['_Ve'] + data['_Ve+...'] + data['_Vn'] + data['_Vne'] + data['_Vp'] + data['_Vw...']

data['VI'] = data['_V-VI']

# Indicators for the marker characters found in lum_class: '...', ':' and '+'.
# Each lum_class dummy is added at most once per feature.  The dummies are
# mutually exclusive per row, so the repeated terms in the earlier version
# ('_:+...', '_:w...', '_IV...', '_IV:pe...', '_Ib+...') gave those rows a
# value of 2 instead of 1.
# NOTE(review): a few dummies whose label contains a marker are not listed
# (e.g. '_IIICNp...' for '...'); left as-is, confirm whether that is intended.
data['...'] = data['_+...'] + data['_..'] + data['_...'] + data['_w...'] + data['_sp...'] + data['_V...'] + data['_p...'] + data['_m...']
data['...'] = data['...'] + data['_Vw...'] + data['_Ve+...'] + data['_IV:pe...'] + data['_IV...'] + data['_III...'] + data['_:+...']
data['...'] = data['...'] + data['_IICNp...'] + data['_III+...'] + data['_/K0p...'] + data['_/G0Vs...'] + data['_Ib+...']
data['...'] = data['...'] + data['_:w...'] + data['_:V...'] + data['_:III:+...'] + data['_/K1III+..'] + data['_/G5Vw...']
data['...'] = data['...'] + data['_/A3V+...'] + data['_/G8w...'] + data['_IIICN...'] + data['_IIIp...']
data['...'] = data['...'] + data['_IVne+...'] + data['_V+...'] + data['_VCN...']

data[':'] = data['_.5III:'] + data['_/G8III:'] + data['_/K0III:'] + data['_IV:pe...'] + data['_:+...'] + data['_:III:+...'] + data['_:w...'] + data['_:V...']
data[':'] = data[':'] + data['_/K1IV/V:'] + data['_/K3III:'] + data['_/K4III:'] + data['_:'] + data['_II/IIICNV:'] + data['_III:'] + data['_:III:']
data[':'] = data[':'] + data['_IV: (+F/G)'] + data['_Ia0:'] + data['_Iab:'] + data['_O:'] + data['_V:'] + data['_V:n'] + data['_V:pe']

data['+'] = data['_+...'] + data['_/A3V+...'] + data['_/K0V + A/F'] + data['_Ve+...'] + data['_:III:+...'] + data['_Ib+...']
data['+'] = data['+'] + data['_III+...'] + data['_:+...'] + data['_/K1III+..'] + data['_IVne+...'] + data['_V + G/K']
data['+'] = data['+'] + data['_V+...']

def _addColumns(frame, labels):
    """Return frame[labels[0]] + frame[labels[1]] + ... added left to right."""
    total = frame[labels[0]]
    for label in labels[1:]:
        total = total + frame[label]
    return total


# Letter features: each new column is the sum of the listed dummy columns
# (temp_class '_y_*', heat_class '_x_*' and lum_class '_*').  The tuple order
# is the order in which the new columns are appended to `data`.
letter_features = (
    ('A', ['_y_A', '_x_A', '_/A1V', '_/A2V', '_/A3III', '_/A3IV', '_/A3IV/V', '_/A3V',
           '_/A7IV', '_/A8III']),
    ('B', ['_/B3V', '_/B8III', '_/B8Ib', '_/B8V', '_/B9.5V', '_/B9II/III', '_/B9III/IV',
           '_/B9V', '_y_B']),
    ('C', ['_y_C', '_x_C']),
    ('E', ['_-G2Ie', '_.5Ve']),
    ('F', ['_/F2IV', '_/F2IV/V', '_/F3', '_/F3IV', '_/F3V', '_/F5IV', '_/F5V', '_/F6V',
           '_/F7IV/V', '_/F7V', '_/F8V', '_y_F']),
    ('G', ['_y_G', '_-G2Ie', '_/G0V', '_/G1V', '_/G2V', '_/G3V', '_/G5III', '_/G5V', '_/G5Vw...',
           '_/G6V', '_/G8III', '_/G8V', '_/G8w...', '_G2']),
    ('K', ['_y_K', '_x_K', '_/K0II', '_/K0III', '_/K0III:', '_/K0IV/V', '_/K0V', '_/K0V + A/F',
           '_/K0p...', '_/K1III', '_/K1III+..', '_/K1IV', '_/K1IV/V:', '_/K1V', '_/K2III',
           '_/K3III', '_/K3III:', '_/K3V', '_/K4', '_/K4III', '_/K4III:', '_/K5III', '_y_k']),
    ('N', ['_y_N', '_x_N']),
    ('M', ['_y_M', '_x_M', '_/M0III', '_/M1III', '_/M2III', '_/M2V', '_/M3III', '_e-M7e']),
    ('RD', ['_y_R', '_x_d']),
    ('S', ['_y_s', '_x_s']),
    ('W', ['_y_W', '_x_w']),
    ('O', ['_y_O', '_/O8', '_O:']),
    ('D', ['_y_D']),
    ('n', ['_x_n', '_.5Vn', '_IVn', '_IVne+...', '_V:n', '_Vn', '_Vne', '_npe']),
    ('e', ['_x_e', '_.5e', '_IV:pe...', '_IVne+...', '_V:pe', '_Ve', '_Ve+...', '_Vne', '_e',
           '_e-M7e', '_npe']),
)
for feature, labels in letter_features:
    data[feature] = _addColumns(data, labels)

# Digit features '0'..'9': the heat_class dummy for that digit plus every
# lum_class dummy whose label contains the digit.
# Fixes relative to the earlier version of this cell:
#   * '8' restarted from data['_x_8'] on its second line, discarding the
#     '_/A8III' ... '_/G8III:' terms; it now accumulates onto data['8'].
#   * '_/G2V' was summed into '1' and '3' but not '2'; it now feeds '2' only,
#     and '3' receives '_/G3V' instead.
data['0'] = data['_x_0'] + data['_/G0V'] + data['_/G0Vs...'] + data['_/K0II'] + data['_/K0III'] + data['_/K0III:'] + data['_/K0IV/V'] + data['_/K0V']
data['0'] = data['0'] + data['_/K0V + A/F'] + data['_/K0p...'] + data['_/M0III'] + data['_0'] + data['_Ia0:']
data['1'] = data['_x_1'] + data['_/A1V'] + data['_/G1V'] + data['_/K1III'] + data['_/K1III+..'] + data['_/K1IV'] + data['_/K1IV/V:']
data['1'] = data['1'] + data['_/K1V'] + data['_/M1III']
data['2'] = data['_x_2'] + data['_-G2Ie'] + data['_/A2V'] + data['_/M2III'] + data['_/F2IV'] + data['_/F2IV/V'] + data['_/K2III'] + data['_/M2V'] + data['_G2']
data['2'] = data['2'] + data['_/G2V']
data['3'] = data['_x_3'] + data['_/A3III'] + data['_/A3IV'] + data['_/A3IV/V'] + data['_/A3V'] + data['_/A3V+...'] + data['_/B3V'] + data['_/F3']
data['3'] = data['3'] + data['_/F3IV'] + data['_/F3V'] + data['_/G3V'] + data['_/K3III'] + data['_/K3III:'] + data['_/K3V'] + data['_/M3III']
data['3'] = data['3'] + data['_3'] + data['_3   :']
data['4'] = data['_x_4'] + data['_/K4'] + data['_/K4III'] + data['_/K4III:']
data['5'] = data['_x_5'] + data['_/F5IV'] + data['_/F5V'] + data['_/G5III'] + data['_/G5V'] + data['_/G5Vw...'] + data['_/K5III'] + data['_5']
data['6'] = data['_x_6'] + data['_/F6V'] + data['_/G6V'] + data['_6'] + data['_6 (SB1)']
data['7'] = data['_x_7'] + data['_/A7IV'] + data['_/F7IV/V'] + data['_/F7V'] + data['_7'] + data['_e-M7e']
data['8'] = data['_x_8'] + data['_/A8III'] + data['_/B8III'] + data['_/B8Ib'] + data['_/B8V'] + data['_/F8V'] + data['_/G8III'] + data['_/G8III:']
data['8'] = data['8'] + data['_/G8V'] + data['_/G8w...'] + data['_/O8'] + data['_8']
data['9'] = data['_x_9'] + data['_/B9.5V'] + data['_/B9II/III'] + data['_/B9III/IV'] + data['_/B9V'] + data['_9'] + data['_9?']

In [None]:
# Remove the raw one-hot dummy columns and the helper columns so that only
# the base columns and the aggregated features remain.

# Helper columns: the class string, the raw spectral type and the old index.
helper_columns = ['harv_class', 'spect', 'index']

# temp_class dummies (prefix '_y').
temp_dummies = [
    '_y_A', '_y_B', '_y_C', '_y_D', '_y_F', '_y_G', '_y_K', '_y_M', '_y_N', '_y_O',
    '_y_R', '_y_W', '_y_d', '_y_k', '_y_m', '_y_s',
]

# heat_class dummies (prefix '_x').
heat_dummies = [
    '_x_', '_x_-', '_x_.', '_x_0', '_x_1', '_x_2', '_x_3', '_x_4', '_x_5', '_x_6',
    '_x_7', '_x_8', '_x_9', '_x_:', '_x_A', '_x_C', '_x_K', '_x_M', '_x_N', '_x_c',
    '_x_d', '_x_e', '_x_m', '_x_n', '_x_p', '_x_s', '_x_w',
]

# lum_class dummies (empty prefix, so the names start with a bare '_').
lum_dummies = [
    '_', '_ Si', '_ V', '_+...',
    '_-G2Ie', '_..', '_...', '_.5', '_.5III', '_.5III:', '_.5IV', '_.5IV-V', '_.5Ia', '_.5Ib', '_.5V',
    '_.5Ve', '_.5Vn', '_.5e', '_/A1V', '_/A2V', '_/A3III', '_/A3IV', '_/A3IV/V', '_/A3V', '_/A3V+...',
    '_/A7IV', '_/A8III', '_/B3V', '_/B8III', '_/B8Ib', '_/B8V', '_/B9.5V', '_/B9II/III', '_/B9III/IV',
    '_/B9V', '_/F2IV', '_/F2IV/V', '_/F3', '_/F3IV', '_/F3V', '_/F5IV', '_/F5V', '_/F6V', '_/F7IV/V',
    '_/F7V', '_/F8V', '_/G0V', '_/G0Vs...', '_/G1V', '_/G2V', '_/G3V', '_/G5III', '_/G5V', '_/G5Vw...',
    '_/G6V', '_/G8III', '_/G8III:', '_/G8V', '_/G8w...', '_/K0II', '_/K0III', '_/K0III:', '_/K0IV/V', '_/K0V',
    '_/K0V + A/F', '_/K0p...', '_/K1III', '_/K1III+..', '_/K1IV', '_/K1IV/V:', '_/K1V', '_/K2III', '_/K3III',
    '_/K3III:', '_/K3V', '_/K4', '_/K4III', '_/K4III:', '_/K5III', '_/M0III', '_/M1III', '_/M2III', '_/M2V',
    '_/M3III', '_/O8', '_0', '_3', '_3   :', '_5', '_6', '_6 (SB1)', '_7', '_8', '_9', '_9?', '_:', '_:+...',
    '_:III:', '_:III:+...', '_:V...', '_:w...', '_G2', '_II', '_II-III', '_II/III', '_II/IIICNV:', '_IICNp...',
    '_III', '_III + (F)', '_III+...', '_III-IV', '_III...', '_III/IV', '_III/IVCN.', '_III:', '_IIICN...', '_IIICNII',
    '_IIICNp...', '_IIIb', '_IIIp', '_IIIp...', '_IIIvar', '_IIb', '_IV', '_IV-V', '_IV...', '_IV/V', '_IV: (+F/G)',
    '_IV:pe...', '_IVn', '_IVne+...', '_Ia', '_Ia0:', '_Iab:', '_Ib', '_Ib+...', '_Ib-II', '_Ib/II', '_O:', '_Sv',
    '_V', '_V + G/K', '_V+...', '_V-VI', '_V...', '_V:', '_V:n', '_V:pe', '_VCN...', '_Ve', '_Ve+...', '_Vn', '_Vne',
    '_Vp', '_Vs', '_Vw...', '_e', '_e-M7e', '_m', '_m...', '_npe', '_p', '_p...', '_psh', '_sp...', '_w...', '_wp',
]

data = data.drop(labels=helper_columns + temp_dummies + heat_dummies + lum_dummies, axis=1)

In [None]:
# Cast every base numeric column to float, one column at a time.
float_columns = (
    'ra', 'dec', 'dist', 'pmra', 'pmdec', 'rv', 'mag', 'absmag',
    'x', 'y', 'z', 'vx', 'vz', 'vy',
    'rarad', 'decrad', 'pmrarad', 'pmdecrad',
    'comp_primary', 'lum', 'ci',
)
for column in float_columns:
    data[column] = data[column].astype(float)

In [None]:
# Write the cleaned feature table to 'scrubbed.csv' in the working directory.
# No index argument is passed, so the frame's index is written as the first
# column of the file.
data.to_csv('scrubbed.csv')