In [35]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
raw_pdf = pd.read_csv('star_classification.csv')
raw_pdf

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.237680e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.237679e+18,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,7778,301,2,581,1.055431e+19,GALAXY,0.000000,9374,57749,438
99996,1.237679e+18,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,7917,301,1,289,8.586351e+18,GALAXY,0.404895,7626,56934,866
99997,1.237668e+18,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,5314,301,4,308,3.112008e+18,GALAXY,0.143366,2764,54535,74
99998,1.237661e+18,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,3650,301,4,131,7.601080e+18,GALAXY,0.455040,6751,56368,470


In [3]:
data = raw_pdf.copy()

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   obj_ID       100000 non-null  float64
 1   alpha        100000 non-null  float64
 2   delta        100000 non-null  float64
 3   u            100000 non-null  float64
 4   g            100000 non-null  float64
 5   r            100000 non-null  float64
 6   i            100000 non-null  float64
 7   z            100000 non-null  float64
 8   run_ID       100000 non-null  int64  
 9   rerun_ID     100000 non-null  int64  
 10  cam_col      100000 non-null  int64  
 11  field_ID     100000 non-null  int64  
 12  spec_obj_ID  100000 non-null  float64
 13  class        100000 non-null  object 
 14  redshift     100000 non-null  float64
 15  plate        100000 non-null  int64  
 16  MJD          100000 non-null  int64  
 17  fiber_ID     100000 non-null  int64  
dtypes: float64(10), int64(7),

# Preprocessing Time

In [6]:
# Drop column yang tidak penting, sekedar ID untuk setup observasi tpi gk terllau memengaruhi data result.
data = data.drop(['obj_ID', 'run_ID', 'rerun_ID', 'cam_col', 'field_ID', 'spec_obj_ID', 'fiber_ID', 'plate', 'MJD'], axis = 1)

In [7]:
data

Unnamed: 0,alpha,delta,u,g,r,i,z,class,redshift
0,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,GALAXY,0.634794
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,GALAXY,0.779136
2,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,GALAXY,0.644195
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,GALAXY,0.932346
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,GALAXY,0.116123
...,...,...,...,...,...,...,...,...,...
99995,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,GALAXY,0.000000
99996,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,GALAXY,0.404895
99997,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,GALAXY,0.143366
99998,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,GALAXY,0.455040


In [8]:
# Konversi class jadi label/dummy value
data['class'].unique()

array(['GALAXY', 'QSO', 'STAR'], dtype=object)

In [9]:
data['class'] = data['class'].map({'GALAXY':0,'QSO':1, 'STAR':2})

In [10]:
data

Unnamed: 0,alpha,delta,u,g,r,i,z,class,redshift
0,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,0,0.634794
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,0,0.779136
2,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,0,0.644195
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,0,0.932346
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,0,0.116123
...,...,...,...,...,...,...,...,...,...
99995,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,0,0.000000
99996,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,0,0.404895
99997,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,0,0.143366
99998,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,0,0.455040


In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self,columns):
        self.scaler = StandardScaler()
        self.columns = columns       

    def fit(self, X):       
        self.scaler.fit(X[self.columns])       
        return self

    def transform(self, X):       
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns = self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1) [init_col_order]

In [12]:
#Biar gampang, di-reporder datanya
col_name_reorder = ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift', 'class']
data = data[col_name_reorder]
data

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift,class
0,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,0.634794,0
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,0.779136,0
2,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,0.644195,0
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,0.932346,0
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,0.116123,0
...,...,...,...,...,...,...,...,...,...
99995,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,0.000000,0
99996,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,0.404895,0
99997,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,0.143366,0
99998,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,0.455040,0


In [13]:
#Normalisasi data
data2 = data.copy()
unscl = data2.iloc[:,:-1]
unscl

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift
0,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,0.634794
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,0.779136
2,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,0.644195
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,0.932346
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,0.116123
...,...,...,...,...,...,...,...,...
99995,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,0.000000
99996,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,0.404895
99997,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,0.143366
99998,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,0.455040


In [14]:
tgt = data2['class']
tgt

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: class, Length: 100000, dtype: int64

In [15]:
col_omit = []
col_scale = [x for x in unscl.columns.values if x not in col_omit]
abs_scaler = CustomScaler(col_scale)
abs_scaler.fit(unscl)

In [16]:
scaled = abs_scaler.transform(unscl)

In [17]:
scaled

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift
0,-0.434604,0.425529,0.059755,0.054926,0.403962,0.046007,0.003937,0.079557
1,-0.339921,0.363402,0.088045,0.072456,1.584406,1.185097,0.092835,0.277096
2,-0.367251,0.582713,0.103327,0.067165,0.519745,0.150019,0.008808,0.092423
3,1.669523,-1.249105,0.004921,0.102210,1.059904,0.807610,0.018321,0.486770
4,1.737310,-0.150242,-0.080055,-0.092948,-1.697421,-1.767887,-0.098468,-0.630267
...,...,...,...,...,...,...,...,...
99995,-1.430113,-1.360650,0.005890,0.076991,1.217564,1.263237,0.065144,-0.789186
99996,-1.535053,-0.220744,0.022371,0.058422,0.433626,0.382696,0.023527,-0.235069
99997,0.486606,-0.429360,-0.025538,-0.039730,-0.777184,-0.793290,-0.041496,-0.592984
99998,0.358952,1.146631,0.106075,0.034840,0.144547,-0.007005,-0.001386,-0.166444


In [18]:
#Split
from sklearn.model_selection import train_test_split as tts

In [19]:
# Split Train / Valid+Test
x_train, x_valid_test, y_train, y_valid_test = tts(scaled, tgt, train_size = 0.7, shuffle=True, random_state = 42)

In [20]:
# Split Valid / Test
x_valid, x_test, y_valid, y_test = tts(scaled, tgt, test_size = 0.333, shuffle=True, random_state = 42)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define a neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),  # Assuming your input has 15 features
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # For 3 output classes: galaxy, star, quasar
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Use this loss since labels are integers
              metrics=['accuracy'])

# Display the model architecture
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                576       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 3)                 99        
                                                                 
Total params: 2,755
Trainable params: 2,755
Non-trainable params: 0
_________________________________________________________________


In [22]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=1,
    verbose=0,
    restore_best_weights=True
)

In [23]:
history = model.fit(x_train, y_train,
                    epochs=10,              # You can adjust the number of epochs
                    validation_data=(x_valid, y_valid),  # Validation data to monitor the model performance
                    batch_size=32,           # Adjust the batch size if needed
                    callbacks=[early_stopping],
                    verbose=2)               # Verbose 2 gives cleaner output

Epoch 1/10
2188/2188 - 38s - loss: 0.2369 - accuracy: 0.9224 - val_loss: 0.1406 - val_accuracy: 0.9623 - 38s/epoch - 17ms/step
Epoch 2/10
2188/2188 - 38s - loss: 0.1362 - accuracy: 0.9597 - val_loss: 0.1344 - val_accuracy: 0.9573 - 38s/epoch - 17ms/step
Epoch 3/10
2188/2188 - 34s - loss: 0.1258 - accuracy: 0.9623 - val_loss: 0.1165 - val_accuracy: 0.9657 - 34s/epoch - 15ms/step
Epoch 4/10
2188/2188 - 48s - loss: 0.1205 - accuracy: 0.9639 - val_loss: 0.1178 - val_accuracy: 0.9652 - 48s/epoch - 22ms/step


In [24]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_acc}")

Test accuracy: 0.9636636972427368


# Test to create pred & actual table

In [40]:
# Make predictions on the test set
y_pred_prob = model.predict(x_test)

# Convert probabilities to class predictions (argmax returns the index of the highest probability)
y_pred = y_pred_prob.argmax(axis=1)



In [46]:
print(f'Akurasi: {accuracy_score(y_test,y_pred)*100:.2f}%')
print(classification_report(y_test, y_pred))
# 'GALAXY':0,'QSO':1, 'STAR':2

Akurasi: 96.37%
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     19807
           1       0.94      0.91      0.93      6336
           2       0.96      1.00      0.98      7157

    accuracy                           0.96     33300
   macro avg       0.96      0.96      0.96     33300
weighted avg       0.96      0.96      0.96     33300

