In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Preprocessing

In [2]:
df = pd.read_csv("Dataset/Star99999_raw.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,0,9.10,3.54,1.39,0.482,F5
1,1,9.27,21.90,3.10,0.999,K3V
2,2,6.61,2.81,0.63,-0.019,B9
3,3,8.06,7.75,0.97,0.370,F0V
4,4,8.55,2.87,1.11,0.902,G8III
...,...,...,...,...,...,...
99994,99994,8.72,3.07,0.87,0.097,B3
99995,99995,9.25,,,0.131,A1V
99996,99996,8.08,1.07,0.68,1.094,G5
99997,99997,6.98,2.97,0.76,-0.143,B1.5V


In [4]:
df.columns

Index(['Unnamed: 0', 'Vmag', 'Plx', 'e_Plx', 'B-V', 'SpType'], dtype='object')

In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,99999.0
mean,49999.0
std,28867.369122
min,0.0
25%,24999.5
50%,49999.0
75%,74998.5
max,99998.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  99999 non-null  int64 
 1   Vmag        99999 non-null  object
 2   Plx         99999 non-null  object
 3   e_Plx       99999 non-null  object
 4   B-V         99999 non-null  object
 5   SpType      97377 non-null  object
dtypes: int64(1), object(5)
memory usage: 4.6+ MB


Data Cleaning

In [7]:
# Convert Columns data type to float values
df["Vmag"] = pd.to_numeric(df["Vmag"], downcast="float", errors='coerce')
df["Plx"] = pd.to_numeric(df["Plx"], downcast="float", errors='coerce')
df["e_Plx"] = pd.to_numeric(df["e_Plx"], downcast="float", errors='coerce')
df["B-V"] = pd.to_numeric(df["B-V"], downcast="float", errors='coerce')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  99999 non-null  int64  
 1   Vmag        99998 non-null  float32
 2   Plx         99768 non-null  float32
 3   e_Plx       99768 non-null  float32
 4   B-V         98871 non-null  float32
 5   SpType      97377 non-null  object 
dtypes: float32(4), int64(1), object(1)
memory usage: 3.1+ MB


In [9]:
df.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
count,99999.0,99998.0,99768.0,99768.0,98871.0,97377
unique,,,,,,3756
top,,,,,,K0
freq,,,,,,7355
mean,49999.0,8.369723,7.212443,1.365389,0.704728,
std,28867.369122,1.313881,11.349038,1.816845,0.489686,
min,0.0,-1.44,-54.950001,0.38,-0.4,
25%,24999.5,7.64,2.51,0.88,0.348,
50%,49999.0,8.44,4.63,1.1,0.612,
75%,74998.5,9.14,8.41,1.39,1.075,


Check Missing Data

In [10]:
# get the number of missing data points per column
missing_values_count = df.isnull().sum()

missing_values_count

Unnamed: 0       0
Vmag             1
Plx            231
e_Plx          231
B-V           1128
SpType        2622
dtype: int64

In [11]:
# how many total missing values do we have?
total_cells = np.product(df.shape)
total_missing = missing_values_count.sum()

# percentage of data that is missing
percent_missing = (total_missing/total_cells)
print("Percentage Missing:", "{:.2%}".format(percent_missing))

AttributeError: module 'numpy' has no attribute 'product'

Dropping Missing Data

In [None]:
# remove all the rows that contain a missing value
# better to store it into a new variable to avoid confusion
df_dropped = df.dropna() 

df_dropped

In [None]:
# just how much rows did we drop?
dropped_rows_count = df.shape[0]-df_na_dropped.shape[0]
print("Rows we dropped from original dataset: %d \n" % dropped_rows_count)

# Percentage we dropped
percent_dropped = dropped_rows_count/df.shape[0]
print("Percentage Loss:", "{:.2%}".format(percent_dropped))

In [None]:
df_na_dropped.describe()

Droping Unwanted Column

In [None]:
#The best way to do this in pandas is to use drop:
df_na_dropped = df_na_dropped.drop('Unnamed: 0', axis=1)

In [None]:
df_na_dropped.describe()

In [None]:
df_na_dropped.info()

In [None]:
df_na_dropped_reindex = df_na_dropped.reset_index(drop=True)

In [None]:
df_na_dropped_reindex.info()

In [None]:
#Optional - Save our progress
df_na_dropped_reindex.to_csv("Star99999_na_dropped.csv", index=False)