In [19]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plot
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
import os
from collections import Counter
from sklearn.cluster import KMeans
import astropy.units as u
from astropy.coordinates import SkyCoord, Galactic, ICRS
from sklearn.model_selection import cross_val_score, StratifiedKFold

#Data file
file = os.chdir("/home/abhi/Desktop/eROSITA/")

#Importing the data and model parameters
data = pd.read_csv("e-ROSITA NEW RESULTS - EROSITA+GAIA (1).csv")
data2 = pd.read_csv("parameters.txt").to_numpy()
#.reshape(-1,1)
lum_x = data["Luminosity (ergs/sec)"].dropna().to_numpy().reshape(-1,1)
dist = data["Distance (pc)"].dropna().to_numpy().reshape(-1,1)
sr_class = data["New source class"].dropna().to_numpy()
bp_rp = data["bp_rp"].dropna().to_numpy().reshape(-1,1)
Absolute_G = data["Absolute G"].dropna().to_numpy().reshape(-1,1)
Temp = data["Temp (Kelvin)"].dropna().to_numpy().reshape(-1,1)
lum_o = data["luminosity_GAIA (ergs/sec)"].dropna().to_numpy().reshape(-1,1)
radius = data["RADIUS"].to_numpy().reshape(-1,1)

#Checking the shapes
print("lum_x",np.shape(lum_x))
print("lum_o",np.shape(lum_o))
print("dist",np.shape(dist))
print("bp-rp",np.shape(bp_rp))
print("Temp", np.shape(Temp))
print("radius", np.shape(radius))
print("Absolute_G", np.shape(Absolute_G))

#Changing the elements to nan
lum_o[lum_o == "--"] = np.nan
Temp[Temp == "--"] = np.nan
bp_rp[bp_rp == "--"] = np.nan
radius[radius == "--"] = np.nan

#Checking the length of the array
print(len(radius))
print(len(dist))
print(len(Temp))

#Checking the source class
count = Counter(sr_class)
print(count)

#Class
class_1 = ['SS_IB', 'IB', 'YSO', 'Not_known', 'CV', 'LPV','XRB']

#Replacing sources by their numbers
sr_class_new = sr_class

sr_class_new[sr_class_new  == "SS_IB"] = 0
sr_class_new[sr_class_new  == "IB"] = 1
sr_class_new[sr_class_new  == "YSO"] = 2
sr_class_new[sr_class_new  == "Not_known"] = 3
sr_class_new[sr_class_new  == "CV"] = 4
sr_class_new[sr_class_new  == "LPV"] = 5
sr_class_new[sr_class_new  == "XRB"] = 6

# Random Forest classification
rf_cl = RandomForestClassifier(n_estimators = 250, class_weight="balanced", random_state=42)
target = np.array(sr_class_new, dtype=int)

#Data and Standardization
data_1 = np.column_stack((lum_x,dist,Absolute_G,bp_rp,lum_o,Temp,radius))#,bp_rp,Absolute_G,lum_o,Temp,radius,a1,a2,Ep,galac_coord_LA,galac_coord_LG))#,o_period)) = how to adjust the nan vals
scalar = RobustScaler()
data_1_scaled = scalar.fit_transform(data_1)
# data_1_scaled_clean = data_1_scaled[~np.isnan(data_1_scaled).any(axis=1)]
# target_clean = target[~np.isnan(data_1_scaled).any(axis=1)]

#Test_Train Data Set
X_train, X_test, y_train, y_test = train_test_split(data_1_scaled, target, test_size=0.25, random_state=42)

#RF classification
y_rf_cl = rf_cl.fit(X_train,y_train)
print(y_rf_cl)

lum_x (2916, 1)
lum_o (2916, 1)
dist (2916, 1)
bp-rp (2916, 1)
Temp (2916, 1)
radius (2916, 1)
Absolute_G (2916, 1)
2916
2916
2916
Counter({'SS_IB': 1671, 'IB': 497, 'YSO': 376, 'Not_known': 261, 'CV': 78, 'LPV': 25, 'XRB': 8})
RandomForestClassifier(class_weight='balanced', n_estimators=250,
                       random_state=42)


In [20]:
y_rf_cl

In [None]:
#Predict
lum_x_ = 1.18e31
dist_ = 114.15
bp_rp_ = 0.98
Absolute_G_ = 3.34
lum_o_ = 3.564
Temp_ = 5366
radius_ = 2.18

source = np.array([[lum_x_,dist_,bp_rp_,Absolute_G_,lum_o_,Temp_,radius_]])   #,bp_rp_,Absolute_G_,lum_o_,Temp_,radius_,galac_coord_LA_,galac_coord_LG_]])
source_scaled = scalar.transform(source)

result = y_rf_cl.predict(source_scaled)
print("The new source belongs to the class:",class_1[int(result[0])])

The new source belongs to the class: IB


In [36]:
#Predict
lum_x_ = 2.16e31
dist_ = 243.14
bp_rp_ = 1.246
Absolute_G_ = 0.391
lum_o_ = 3.564
Temp_ = 4779
radius_ = 11.35

source = np.array([[lum_x_,dist_,bp_rp_,Absolute_G_,lum_o_,Temp_,radius_]])   #,bp_rp_,Absolute_G_,lum_o_,Temp_,radius_,galac_coord_LA_,galac_coord_LG_]])
source_scaled = scalar.transform(source)

result = y_rf_cl.predict(source_scaled)
print("The new source belongs to the class:",class_1[int(result[0])])

The new source belongs to the class: SS_IB


In [32]:
y_pred = rf_cl.predict(X_test) 
cm = confusion_matrix(y_test, y_pred)

report = classification_report(y_test, y_pred)
print("Confusion Matrix is as follows:")
print(cm)

Confusion Matrix is as follows:
[[336  33  19   8   3   3   0]
 [ 98  26   4   4   0   1   0]
 [ 51   2  49   0   0   0   0]
 [ 46   1   5  16   1   0   0]
 [  2   0   0   2  13   0   0]
 [  2   1   0   0   0   2   0]
 [  1   0   0   0   0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
class_1 = ['SS_IB', 'IB', 'YSO', 'Not_known', 'CV', 'LPV','XRB']
print(classification_report(y_test, y_pred, target_names=class_1))

              precision    recall  f1-score   support

       SS_IB       0.63      0.84      0.72       402
          IB       0.41      0.20      0.27       133
         YSO       0.64      0.48      0.55       102
   Not_known       0.53      0.23      0.32        69
          CV       0.76      0.76      0.76        17
         LPV       0.33      0.40      0.36         5
         XRB       0.00      0.00      0.00         1

    accuracy                           0.61       729
   macro avg       0.47      0.42      0.43       729
weighted avg       0.58      0.61      0.57       729



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
#Using cross_validation 
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate model with accuracy scoring
scores = cross_val_score(rf_cl,data_1_scaled,target, cv=cv, scoring='accuracy')

# Print results
print("Cross-validation scores:", scores)
print("Mean accuracy:", np.mean(scores))

Cross-validation scores: [0.61130137 0.61578045 0.62264151 0.60377358 0.61578045]
Mean accuracy: 0.6138554712281773
