Machine Learning Astronomy Related Project



In [69]:
import kagglehub 

path = kagglehub.dataset_download("datascientist97/astronomical-data")
print(f'path to dataset files {path}')


path to dataset files /home/codespace/.cache/kagglehub/datasets/datascientist97/astronomical-data/versions/1


In [70]:
import pandas as pd
import os 

print(os.listdir(path))   #this command is to retrieve the exact file name cotaining the data

['cleaned_star_data.csv']


In [71]:
file_path = os.path.join(path, 'cleaned_star_data.csv')      #creating the file path so i can use pandas to open it
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,,,,,,,
1,3042.0,0.0005,0.1542,16.6,0.0,Red,M
2,2600.0,0.0003,0.102,18.7,0.0,Red,M
3,2800.0,0.0002,,16.65,0.0,Red,M
4,1939.0,0.000138,0.103,20.06,0.0,Red,M
5,2840.0,,0.11,16.98,0.0,Red,M
6,2637.0,0.00073,0.127,17.22,0.0,Red,M
7,2600.0,0.0004,0.096,17.4,0.0,Red,M
8,2650.0,0.00069,0.11,17.45,0.0,Red,M
9,2700.0,0.00018,0.13,16.05,0.0,Red,M


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         239 non-null    object 
 1   Luminosity(L/Lo)        239 non-null    object 
 2   Radius(R/Ro)            239 non-null    object 
 3   Absolute magnitude(Mv)  239 non-null    object 
 4   Star type               239 non-null    float64
 5   Star color              239 non-null    object 
 6   Spectral Class          239 non-null    object 
dtypes: float64(1), object(6)
memory usage: 13.3+ KB


In [73]:
df.describe()

Unnamed: 0,Star type
count,239.0
mean,2.51046
std,1.70728
min,0.0
25%,1.0
50%,3.0
75%,4.0
max,5.0


The Data I found in Kaggle doesn't seem to be cleaned or organized, there are even some missing ones, so I will try to clean it so I can use it without having trouble with biases.


In [74]:
#first, I will check for duplicated rows

index_list=df[df.duplicated(keep=False)].index.tolist()
if not index_list : 
    print('list is empty, there are no duplicated rows')
else :
    print(j for j in index_list)  
    

list is empty, there are no duplicated rows


It appears that there are no duplicated rows ! 

In [75]:

#df.duplicated(keep=False) return a series of True (for duplicated rows) and/or False (for unique rows).
# the df[df.duplicated(keep=False)] keeps the case where df.duplicated(keep=False) is true, hence duplicated.

df[df.duplicated(keep=False)]

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class


In [76]:
#Checking for missing values
df.isnull().sum()

Temperature (K)           1
Luminosity(L/Lo)          1
Radius(R/Ro)              1
Absolute magnitude(Mv)    1
Star type                 1
Star color                1
Spectral Class            1
dtype: int64

It appears that each column has exactly one missing value 


In [77]:
new_df = df.drop(index=0) #deleting row with index = 0 because it contains missing values
new_df

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
1,3042,0.0005,0.1542,16.6,0.0,Red,M
2,2600,0.0003,0.102,18.7,0.0,Red,M
3,2800,0.0002,,16.65,0.0,Red,M
4,1939,0.000138,0.103,20.06,0.0,Red,M
5,2840,,0.11,16.98,0.0,Red,M
...,...,...,...,...,...,...,...
235,38940,374830,1356,-9.93,5.0,Blue,O
236,30839,834042,1194,-10.63,5.0,Blue,O
237,8829,537493,1423,-10.73,5.0,White,A
238,9235,404940,1112,-11.23,5.0,White,A


In [78]:
#Since I am used to index 0 being first and not 1, I am going to use .reset_index

new_df.reset_index()

Unnamed: 0,index,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,1,3042,0.0005,0.1542,16.6,0.0,Red,M
1,2,2600,0.0003,0.102,18.7,0.0,Red,M
2,3,2800,0.0002,,16.65,0.0,Red,M
3,4,1939,0.000138,0.103,20.06,0.0,Red,M
4,5,2840,,0.11,16.98,0.0,Red,M
...,...,...,...,...,...,...,...,...
234,235,38940,374830,1356,-9.93,5.0,Blue,O
235,236,30839,834042,1194,-10.63,5.0,Blue,O
236,237,8829,537493,1423,-10.73,5.0,White,A
237,238,9235,404940,1112,-11.23,5.0,White,A


In [79]:
new_df.isnull().sum()

Temperature (K)           0
Luminosity(L/Lo)          0
Radius(R/Ro)              0
Absolute magnitude(Mv)    0
Star type                 0
Star color                0
Spectral Class            0
dtype: int64

I no longer have NaN values, but I still notice some empty columns, so I will work on that now.

In [80]:
print((df==" ").sum())   #I specified the space " " to retrieve empty columns

Temperature (K)           2
Luminosity(L/Lo)          6
Radius(R/Ro)              7
Absolute magnitude(Mv)    5
Star type                 0
Star color                6
Spectral Class            2
dtype: int64


Now, I will replace the empty spaces with NaN to facilitate the process of replacing them. For more information please visit this page https://sparkbyexamples.com/pandas/pandas-replace-blank-values-with-nan/#:~:text=Pandas%20Replace%20Blank%20Values%20with,case%20of%20the%20specified%20value.

In [81]:
newest_df = new_df.mask(new_df == " ")

#now checking if missing values are detected
newest_df.isnull().sum() 

Temperature (K)           2
Luminosity(L/Lo)          6
Radius(R/Ro)              7
Absolute magnitude(Mv)    5
Star type                 0
Star color                6
Spectral Class            2
dtype: int64

In [82]:
#I want to see all the rows with missing values
rows_with_nan = newest_df[newest_df.isna().any(axis=1)]
rows_with_nan

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
3,2800.0,0.0002,,16.65,0.0,Red,M
5,2840.0,,0.11,16.98,0.0,Red,M
10,3600.0,,0.51,10.69,1.0,Red,M
18,3192.0,,0.1967,13.53,1.0,Red,M
23,8500.0,0.0005,0.01,14.5,2.0,,A
31,30000.0,28840.0,6.3,-4.2,3.0,,B
37,6380.0,1.35,0.98,,3.0,,F
44,3008.0,,25.0,-6.0,4.0,Red,M
51,3750.0,283000.0,1260.0,,5.0,Red,M
58,3752.0,209000.0,955.0,,5.0,Red,


In [83]:
#I will create a new Dataframe with Temperature and spectral class columns of the previous one
Temp_Spec_class_df = newest_df[['Temperature (K)', 'Spectral Class', 'Absolute magnitude(Mv)', 'Star color']]
Temp_Spec_class_df

Unnamed: 0,Temperature (K),Spectral Class,Absolute magnitude(Mv),Star color
1,3042,M,16.6,Red
2,2600,M,18.7,Red
3,2800,M,16.65,Red
4,1939,M,20.06,Red
5,2840,M,16.98,Red
...,...,...,...,...
235,38940,O,-9.93,Blue
236,30839,O,-10.63,Blue
237,8829,A,-10.73,White
238,9235,A,-11.23,White


In [84]:
#Next, I'll drop the rows wit empty values
Temp_Spec_class_df.dropna(inplace=True)
Temp_Spec_class_df.reset_index()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Temp_Spec_class_df.dropna(inplace=True)


Unnamed: 0,index,Temperature (K),Spectral Class,Absolute magnitude(Mv),Star color
0,1,3042,M,16.6,Red
1,2,2600,M,18.7,Red
2,3,2800,M,16.65,Red
3,4,1939,M,20.06,Red
4,5,2840,M,16.98,Red
...,...,...,...,...,...
221,235,38940,O,-9.93,Blue
222,236,30839,O,-10.63,Blue
223,237,8829,A,-10.73,White
224,238,9235,A,-11.23,White


In [99]:
#since the spectral class and star color are not numerical but I will need them to train my model, I will encode them
pd.get_dummies(Temp_Spec_class_df['Spectral Class'])

Unnamed: 0,A,B,F,G,K,M,O
1,False,False,False,False,False,True,False
2,False,False,False,False,False,True,False
3,False,False,False,False,False,True,False
4,False,False,False,False,False,True,False
5,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...
235,False,False,False,False,False,False,True
236,False,False,False,False,False,False,True
237,True,False,False,False,False,False,False
238,True,False,False,False,False,False,False


In [88]:
pd.get_dummies(Temp_Spec_class_df['Star color'])

Unnamed: 0,Blue,Blue-White,Red,White,Yellow-White
1,False,False,True,False,False
2,False,False,True,False,False
3,False,False,True,False,False
4,False,False,True,False,False
5,False,False,True,False,False
...,...,...,...,...,...
235,True,False,False,False,False
236,True,False,False,False,False
237,False,False,False,True,False
238,False,False,False,True,False


In [94]:
cat_X = pd.get_dummies(Temp_Spec_class_df, columns= ['Spectral Class', 'Star color']) 
num_X = Temp_Spec_class_df[['Absolute magnitude(Mv)']]
X= pd.concat([cat_X, num_X], axis=1)
y = Temp_Spec_class_df['Temperature (K)']


In [95]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [97]:
pipeline = make_pipeline(PolynomialFeatures(2), LinearRegression())
pipeline.fit(X_train,y_train)

In [98]:
import numpy as np

pred = pipeline.predict(X_test)
mse = np.sqrt(mean_squared_error(y_test,pred))
print(f'Mean error: {mse:3.3} ({mse/np.mean(pred)*100:3.3}%)')


Mean error: 1.23e+02 (1.2%)
