# KNN

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
HYG = pd.read_csv("hygdata_v3.csv")

In [3]:
HYG.head()

Unnamed: 0,id,hip,hd,hr,gl,bf,proper,ra,dec,dist,...,bayer,flam,con,comp,comp_primary,base,lum,var,var_min,var_max
0,0,,,,,,Sol,0.0,0.0,0.0,...,,,,1,0,,1.0,,,
1,1,1.0,224700.0,,,,,6e-05,1.089009,219.7802,...,,,Psc,1,1,,9.63829,,,
2,2,2.0,224690.0,,,,,0.000283,-19.49884,47.9616,...,,,Cet,1,2,,0.392283,,,
3,3,3.0,224699.0,,,,,0.000335,38.859279,442.4779,...,,,And,1,3,,386.901132,,,
4,4,4.0,224707.0,,,,,0.000569,-51.893546,134.2282,...,,,Phe,1,4,,9.366989,,,


# DATA Cleaning

We will first drop all columns that are completely irrelevant to making Morgan-Keegan spectral class prediction. With reference to the <a href='https://github.com/astronexus/HYG-Database'>data dictionary</a>, these columns are the following:<br>
1. `id`: the database primary key.<br>
2. `hip`: the star's ID in the Hipparcos catalog, if known.<br>
3. `hd`: the star's ID in the Henry Draper catalog, if known.<br>
4. `hr`: the star's ID in the Harvard Revised catalog, which is the same as its number in the Yale Bright Star Catalog.<br>
5. `gl`: the star's ID in the third edition of the Gliese Catalog of Nearby Stars.<br>
6. `bf`: the Bayer / Flamsteed designation, primarily from the Fifth Edition of the Yale Bright Star Catalog. This is a combination of the two designations. The Flamsteed number, if present, is given first; then a three-letter abbreviation for the Bayer Greek letter; the Bayer superscript number, if present; and finally, the three-letter constellation abbreviation. Thus Alpha Andromedae has the field value "21Alp And", and Kappa1 Sculptoris (no Flamsteed number) has "Kap1Scl".<br>
7. (`ra`, `dec`): the star's right ascension and declination, for epoch and equinox 2000.0.<br>
8. `proper`: a common name for the star, such as "Barnard's Star" or "Sirius". I have taken these names primarily from the Hipparcos project's web site, which lists representative names for the 150 brightest stars and many of the 150 closest stars. I have added a few names to this list. Most of the additions are designations from catalogs mostly now forgotten (e.g., Lalande, Groombridge, and Gould ["G."]) except for certain nearby stars which are still best known by these designations.<br>
9. (`pmra`, `pmdec`): the star's proper motion in right ascension and declination, in milliarcseconds per year.<br>
10. `bayer`: the Bayer designation as a distinct value.<br>
11. `flam`: the Flamsteed number as a distinct value.<br>
12. `con`: the standard constellation abbreviation.<br>
13. `comp`: ID of companion star.<br>
14. `comp_primary`: ID of primary star for this component.<br>
15. `base`: catalog ID or name for this multi-star system. Currently only used for Gliese stars.<br>
16. `var`: star's standard variable star designation, when known.<br>
17. (`var_min`, `var_max`): star's approximate magnitude range, for variables. This value is based on the Hp magnitudes for the range in the original Hipparcos catalog, adjusted to the V magnitude scale to match the "mag" field.

In [4]:
HYG = HYG.drop(columns = ["id", "hip", "hd", "hr", "gl", "bf", "ra", "dec", "proper", "pmra", "pmdec",
                          "bayer", "flam", "con", "comp", "comp_primary", "base", "var", "var_min", "var_max"])
HYG.head(10)

Unnamed: 0,dist,rv,mag,absmag,spect,ci,x,y,z,vx,vy,vz,rarad,decrad,pmrarad,pmdecrad,lum
0,0.0,0.0,-26.7,4.85,G2V,0.656,5e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,219.7802,0.0,9.1,2.39,F5,0.482,219.740502,0.003449,4.177065,4e-08,-6e-06,-2e-06,1.6e-05,0.019007,-2.521031e-08,-9.114497e-09,9.63829
2,47.9616,0.0,9.27,5.866,K3V,0.999,45.210918,0.003365,-16.008996,-7e-08,4.2e-05,-2e-07,7.4e-05,-0.340319,8.785309e-07,-4.508767e-09,0.392283
3,442.4779,0.0,6.61,-1.619,B9,-0.019,344.552785,0.030213,277.614965,3.92e-06,1.1e-05,-4.86e-06,8.8e-05,0.678222,2.540424e-08,-1.410808e-08,386.901132
4,134.2282,0.0,8.06,2.421,F0V,0.37,82.835513,0.012476,-105.61954,8e-08,4.1e-05,6e-08,0.000149,-0.905713,3.047054e-07,7.75701e-10,9.366989
5,257.732,0.0,8.55,1.494,G8III,0.902,195.714261,0.034068,-167.695291,7.37e-06,3e-06,8.61e-06,0.000174,-0.70845,1.226579e-08,4.39726e-08,21.998851
6,55.0358,0.0,12.31,8.607,M0V:,1.336,54.905296,0.017912,3.787796,2.2e-07,6e-05,-3.42e-06,0.000326,0.068879,1.097085e-06,-6.225008e-08,0.03142
7,57.8704,0.0,9.64,5.828,G0,0.74,54.367897,0.020886,19.827115,1.932e-05,-5.8e-05,-5.292e-05,0.000385,0.349696,-1.008994e-06,-9.734574e-07,0.406256
8,200.8032,-31.0,9.05,2.536,M6e-M8.5e Tc,1.102,180.654532,0.086213,87.668389,-2.613e-05,1.9e-05,-1.88e-05,0.000477,0.451804,9.255093e-08,-2.744045e-08,8.425584
9,420.1681,0.0,8.59,0.473,G5,1.067,337.379614,0.207994,250.431996,-1.021e-05,-1.3e-05,1.377e-05,0.000617,0.638545,-3.054326e-08,4.082131e-08,56.337815


In [5]:
print(HYG.shape)

(119614, 17)


In [6]:
HYG.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119614 entries, 0 to 119613
Data columns (total 17 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   dist      119614 non-null  float64
 1   rv        119614 non-null  float64
 2   mag       119614 non-null  float64
 3   absmag    119614 non-null  float64
 4   spect     116564 non-null  object 
 5   ci        117732 non-null  float64
 6   x         119614 non-null  float64
 7   y         119614 non-null  float64
 8   z         119614 non-null  float64
 9   vx        119614 non-null  float64
 10  vy        119614 non-null  float64
 11  vz        119614 non-null  float64
 12  rarad     119614 non-null  float64
 13  decrad    119614 non-null  float64
 14  pmrarad   119614 non-null  float64
 15  pmdecrad  119614 non-null  float64
 16  lum       119614 non-null  float64
dtypes: float64(16), object(1)
memory usage: 15.5+ MB


## Handling and Imputing Missing Values

From the information presented above, we noticed the missing values `NaN` are mainly found in the `spect` and `ci` columns.<br>
Since we cannot make up for missing `spec` information (it is what we aim to predict in the first place), we will simply handle it by dropping any row (star) whose `spec` is empty.<br>
Also, we came to realize that our `spec` has spectral types other than the 7 outlined in the MK naming hierarchy (because it was originally recorded using a separate naming system). While some classes can be translated to the MK system, others cannot. Overall, due to lack of documentation and domain knowledge, we will try our best to carry out this translation.

In [7]:
missing_spec_index = HYG[HYG['spect'].isnull()].index
HYG = HYG.drop(labels = missing_spec_index)
HYG.head(10)

Unnamed: 0,dist,rv,mag,absmag,spect,ci,x,y,z,vx,vy,vz,rarad,decrad,pmrarad,pmdecrad,lum
0,0.0,0.0,-26.7,4.85,G2V,0.656,5e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,219.7802,0.0,9.1,2.39,F5,0.482,219.740502,0.003449,4.177065,4e-08,-6e-06,-2e-06,1.6e-05,0.019007,-2.521031e-08,-9.114497e-09,9.63829
2,47.9616,0.0,9.27,5.866,K3V,0.999,45.210918,0.003365,-16.008996,-7e-08,4.2e-05,-2e-07,7.4e-05,-0.340319,8.785309e-07,-4.508767e-09,0.392283
3,442.4779,0.0,6.61,-1.619,B9,-0.019,344.552785,0.030213,277.614965,3.92e-06,1.1e-05,-4.86e-06,8.8e-05,0.678222,2.540424e-08,-1.410808e-08,386.901132
4,134.2282,0.0,8.06,2.421,F0V,0.37,82.835513,0.012476,-105.61954,8e-08,4.1e-05,6e-08,0.000149,-0.905713,3.047054e-07,7.75701e-10,9.366989
5,257.732,0.0,8.55,1.494,G8III,0.902,195.714261,0.034068,-167.695291,7.37e-06,3e-06,8.61e-06,0.000174,-0.70845,1.226579e-08,4.39726e-08,21.998851
6,55.0358,0.0,12.31,8.607,M0V:,1.336,54.905296,0.017912,3.787796,2.2e-07,6e-05,-3.42e-06,0.000326,0.068879,1.097085e-06,-6.225008e-08,0.03142
7,57.8704,0.0,9.64,5.828,G0,0.74,54.367897,0.020886,19.827115,1.932e-05,-5.8e-05,-5.292e-05,0.000385,0.349696,-1.008994e-06,-9.734574e-07,0.406256
8,200.8032,-31.0,9.05,2.536,M6e-M8.5e Tc,1.102,180.654532,0.086213,87.668389,-2.613e-05,1.9e-05,-1.88e-05,0.000477,0.451804,9.255093e-08,-2.744045e-08,8.425584
9,420.1681,0.0,8.59,0.473,G5,1.067,337.379614,0.207994,250.431996,-1.021e-05,-1.3e-05,1.377e-05,0.000617,0.638545,-3.054326e-08,4.082131e-08,56.337815


In [8]:
print(HYG.shape)

(116564, 17)


In [9]:
def find_cap_character(string):
    return re.findall('[A-Z]', string)

In [10]:
cap_chars = HYG['spect'].apply(find_cap_character)
cap_chars

0         [G, V]
1            [F]
2         [K, V]
3            [B]
4         [F, V]
           ...  
119609        []
119610        []
119611       [M]
119612       [M]
119613    [D, A]
Name: spect, Length: 116564, dtype: object

In [11]:
MK_star_types = ['O', 'B', 'A', 'F', 'G', 'K', 'M']

In [12]:
first_cap_char = [cap_char_list[0] if len(cap_char_list) >= 1 else np.NaN for cap_char_list in cap_chars]
first_cap_char[:10]

['G', 'F', 'K', 'B', 'F', 'G', 'M', 'G', 'M', 'G']

In [13]:
naive_MK_spect = [cap_char if cap_char in MK_star_types else np.NaN for cap_char in first_cap_char]
naive_MK_spect[:10]

['G', 'F', 'K', 'B', 'F', 'G', 'M', 'G', 'M', 'G']

In [14]:
HYG['spect'] = naive_MK_spect
missing_spec_index_1 = HYG[HYG['spect'].isnull()].index
HYG = HYG.drop(labels = missing_spec_index_1)
HYG.head()

Unnamed: 0,dist,rv,mag,absmag,spect,ci,x,y,z,vx,vy,vz,rarad,decrad,pmrarad,pmdecrad,lum
0,0.0,0.0,-26.7,4.85,G,0.656,5e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,219.7802,0.0,9.1,2.39,F,0.482,219.740502,0.003449,4.177065,4e-08,-6e-06,-2e-06,1.6e-05,0.019007,-2.521031e-08,-9.114497e-09,9.63829
2,47.9616,0.0,9.27,5.866,K,0.999,45.210918,0.003365,-16.008996,-7e-08,4.2e-05,-2e-07,7.4e-05,-0.340319,8.785309e-07,-4.508767e-09,0.392283
3,442.4779,0.0,6.61,-1.619,B,-0.019,344.552785,0.030213,277.614965,3.92e-06,1.1e-05,-4.86e-06,8.8e-05,0.678222,2.540424e-08,-1.410808e-08,386.901132
4,134.2282,0.0,8.06,2.421,F,0.37,82.835513,0.012476,-105.61954,8e-08,4.1e-05,6e-08,0.000149,-0.905713,3.047054e-07,7.75701e-10,9.366989


In [15]:
print(HYG.shape)

(115276, 17)


In [16]:
HYG['ci']

0         0.656
1         0.482
2         0.999
3        -0.019
4         0.370
          ...  
119603    1.500
119605      NaN
119606      NaN
119611    1.640
119612      NaN
Name: ci, Length: 115276, dtype: float64

On the other hand, since `ci` is a column that contains continuous values, we can handle the missing values within by imputing them with the mean color index of each spectral group.

In [17]:
def mean_impute(series: pd.Series):
    return series.fillna(series.mean())

In [18]:
HYG.groupby('spect')['ci'].transform(mean_impute)

0         0.656000
1         0.482000
2         0.999000
3        -0.019000
4         0.370000
            ...   
119603    1.500000
119605    1.555152
119606    1.555152
119611    1.640000
119612    1.555152
Name: ci, Length: 115276, dtype: float64

In [19]:
HYG['ci'] = HYG.groupby('spect')['ci'].transform(mean_impute)
HYG.head(10)

Unnamed: 0,dist,rv,mag,absmag,spect,ci,x,y,z,vx,vy,vz,rarad,decrad,pmrarad,pmdecrad,lum
0,0.0,0.0,-26.7,4.85,G,0.656,5e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,219.7802,0.0,9.1,2.39,F,0.482,219.740502,0.003449,4.177065,4e-08,-6e-06,-2e-06,1.6e-05,0.019007,-2.521031e-08,-9.114497e-09,9.63829
2,47.9616,0.0,9.27,5.866,K,0.999,45.210918,0.003365,-16.008996,-7e-08,4.2e-05,-2e-07,7.4e-05,-0.340319,8.785309e-07,-4.508767e-09,0.392283
3,442.4779,0.0,6.61,-1.619,B,-0.019,344.552785,0.030213,277.614965,3.92e-06,1.1e-05,-4.86e-06,8.8e-05,0.678222,2.540424e-08,-1.410808e-08,386.901132
4,134.2282,0.0,8.06,2.421,F,0.37,82.835513,0.012476,-105.61954,8e-08,4.1e-05,6e-08,0.000149,-0.905713,3.047054e-07,7.75701e-10,9.366989
5,257.732,0.0,8.55,1.494,G,0.902,195.714261,0.034068,-167.695291,7.37e-06,3e-06,8.61e-06,0.000174,-0.70845,1.226579e-08,4.39726e-08,21.998851
6,55.0358,0.0,12.31,8.607,M,1.336,54.905296,0.017912,3.787796,2.2e-07,6e-05,-3.42e-06,0.000326,0.068879,1.097085e-06,-6.225008e-08,0.03142
7,57.8704,0.0,9.64,5.828,G,0.74,54.367897,0.020886,19.827115,1.932e-05,-5.8e-05,-5.292e-05,0.000385,0.349696,-1.008994e-06,-9.734574e-07,0.406256
8,200.8032,-31.0,9.05,2.536,M,1.102,180.654532,0.086213,87.668389,-2.613e-05,1.9e-05,-1.88e-05,0.000477,0.451804,9.255093e-08,-2.744045e-08,8.425584
9,420.1681,0.0,8.59,0.473,G,1.067,337.379614,0.207994,250.431996,-1.021e-05,-1.3e-05,1.377e-05,0.000617,0.638545,-3.054326e-08,4.082131e-08,56.337815


In [20]:
HYG.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115276 entries, 0 to 119612
Data columns (total 17 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   dist      115276 non-null  float64
 1   rv        115276 non-null  float64
 2   mag       115276 non-null  float64
 3   absmag    115276 non-null  float64
 4   spect     115276 non-null  object 
 5   ci        115276 non-null  float64
 6   x         115276 non-null  float64
 7   y         115276 non-null  float64
 8   z         115276 non-null  float64
 9   vx        115276 non-null  float64
 10  vy        115276 non-null  float64
 11  vz        115276 non-null  float64
 12  rarad     115276 non-null  float64
 13  decrad    115276 non-null  float64
 14  pmrarad   115276 non-null  float64
 15  pmdecrad  115276 non-null  float64
 16  lum       115276 non-null  float64
dtypes: float64(16), object(1)
memory usage: 15.8+ MB


As we can now see, we reduced our number of observations down to 115276 observations, but without any missing value across any column/feature.

In [21]:
y = HYG["spect"]
X = HYG.drop(columns = ["spect", "dist"])

# KNN

In [22]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
model = KNeighborsClassifier(n_neighbors = 5)
scores_knn = cross_val_score(model, X, y, scoring = 'accuracy', cv = kf)

In [23]:
scores_knn

array([0.41269084, 0.41440035, 0.41418347, 0.41331598, 0.41344611])

# SVC

In [None]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
clf = make_pipeline(StandardScaler(), SVC(gamma = 'auto'))
scores_svc = cross_val_score(clf, X, y, scoring = 'accuracy', cv = kf)

In [None]:
scores_svc