In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    confusion_matrix
)


In [2]:
# Read the raw dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="star_classification.csv")
df.head(5)

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


In [3]:
# Columns excluded from the feature set (ID-style columns, judging by their
# names -- confirm against the dataset documentation).
cols_to_drop = [
    "obj_ID",
    "run_ID",
    "rerun_ID",
    "cam_col",
    "field_ID",
    "spec_obj_ID",
]

# Reassign instead of mutating in place, and tolerate columns that are already
# gone: the original `inplace=True` drop raised KeyError when this cell was
# re-run on the same kernel. Trade-off: a misspelled name in `cols_to_drop`
# is now skipped silently rather than reported.
df = df.drop(columns=cols_to_drop, errors="ignore")

In [4]:
# Learn the label -> integer mapping from the target column, then overwrite
# the column with its integer codes. `le` keeps the mapping (le.classes_).
le = LabelEncoder().fit(df["class"])
df["class"] = le.transform(df["class"])

# Number of rows per encoded class.
df["class"].value_counts()

class
0    59445
2    21594
1    18961
Name: count, dtype: int64

In [5]:
# Separate the encoded target from the remaining feature columns.
y = df["class"]
X = df.drop(columns="class")

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits so no test-set statistics are used in fitting.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; ``predict_proba`` is
        used when present.
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for every metric.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1 Score"``,
        ``"MCC"`` and ``"AUC"``. Precision, recall and F1 are macro-averaged.
        ``"AUC"`` is the macro one-vs-rest ROC AUC computed from predicted
        probabilities, or ``None`` when the model has no ``predict_proba``.

    Notes
    -----
    The model is fitted in place, so the caller's object is trained on return.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # AUC needs class probabilities; leave it as None for models without them.
    auc = None
    if hasattr(model, "predict_proba"):
        probabilities = model.predict_proba(X_test)
        auc = roc_auc_score(
            y_test, probabilities, multi_class="ovr", average="macro"
        )

    scores = {"Accuracy": accuracy_score(y_test, predictions)}

    # The three macro-averaged metrics share one call signature.
    macro_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, metric in macro_metrics:
        scores[label] = metric(y_test, predictions, average="macro")

    scores["MCC"] = matthews_corrcoef(y_test, predictions)
    scores["AUC"] = auc
    return scores

In [8]:
# Logistic regression on the standardized features, with the solver's
# iteration cap set to 1000.
log_reg = LogisticRegression(max_iter=1000)

log_reg_results = evaluate_model(
    model=log_reg,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(log_reg_results)

{'Accuracy': 0.95715, 'Precision': 0.9545056600545424, 'Recall': 0.9482366923260472, 'F1 Score': 0.9506119180185769, 'MCC': 0.9241273964882151, 'AUC': 0.9879519425351043}


In [9]:
# Decision tree with a fixed seed so repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=42)

dt_results = evaluate_model(
    model=dt,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(dt_results)

{'Accuracy': 0.96705, 'Precision': 0.9612024901530763, 'Recall': 0.9630809306430521, 'F1 Score': 0.9621315014669394, 'MCC': 0.9416641659180381, 'AUC': 0.9715560380291083}


In [10]:
# k-nearest neighbours with k = 5, evaluated on the standardized features.
knn = KNeighborsClassifier(n_neighbors=5)

knn_results = evaluate_model(
    model=knn,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(knn_results)


{'Accuracy': 0.9271, 'Precision': 0.9347363240581684, 'Recall': 0.9035133311849862, 'F1 Score': 0.9179232890269408, 'MCC': 0.8693162106120901, 'AUC': 0.9661792392808278}


In [11]:
# Gaussian naive Bayes with default settings.
nb = GaussianNB()

nb_results = evaluate_model(
    model=nb,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(nb_results)


{'Accuracy': 0.72555, 'Precision': 0.7873222352981518, 'Recall': 0.6403123218749527, 'F1 Score': 0.6025634285623344, 'MCC': 0.5069614238080791, 'AUC': 0.9314195008857832}


In [12]:
# Random forest of 100 trees, seeded for repeatable results.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf_results = evaluate_model(
    model=rf,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(rf_results)


{'Accuracy': 0.98065, 'Precision': 0.9809188693558105, 'Recall': 0.9741658611531693, 'F1 Score': 0.9774373027750212, 'MCC': 0.9656248720303633, 'AUC': 0.9949749960490122}


In [15]:
# XGBoost classifier with the multi-class soft-probability objective.
xgb = XGBClassifier(
    objective="multi:softprob",
    # Take the class count from the training labels rather than hard-coding 3,
    # so this cell stays correct if the label set changes.
    num_class=y_train.nunique(),
    eval_metric="mlogloss",
    # Explicit seed, matching the decision tree and random forest cells.
    random_state=42,
)

xgb_results = evaluate_model(
    xgb,
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test
)

print(xgb_results)


{'Accuracy': 0.9772, 'Precision': 0.9762126797589317, 'Recall': 0.9714557659861628, 'F1 Score': 0.9737477515921135, 'MCC': 0.9595107382334537, 'AUC': 0.9958047927772221}


In [14]:
# Comparison table: one row per model, one column per metric.
# Every model cell must have been run first so the *_results dicts exist.
results_df = pd.DataFrame(
    [
        {"Model": model_name, **model_metrics}
        for model_name, model_metrics in (
            ("Logistic Regression", log_reg_results),
            ("Decision Tree", dt_results),
            ("KNN", knn_results),
            ("Naive Bayes", nb_results),
            ("Random Forest", rf_results),
            ("XGBoost", xgb_results),
        )
    ]
)

print(results_df)


                 Model  Accuracy  Precision    Recall  F1 Score       MCC  \
0  Logistic Regression   0.95715   0.954506  0.948237  0.950612  0.924127   
1        Decision Tree   0.96705   0.961202  0.963081  0.962132  0.941664   
2                  KNN   0.92710   0.934736  0.903513  0.917923  0.869316   
3          Naive Bayes   0.72555   0.787322  0.640312  0.602563  0.506961   
4        Random Forest   0.98065   0.980919  0.974166  0.977437  0.965625   
5              XGBoost   0.97720   0.976213  0.971456  0.973748  0.959511   

        AUC  
0  0.987952  
1  0.971556  
2  0.966179  
3  0.931420  
4  0.994975  
5  0.995805  


In [16]:
import joblib
import os


In [17]:
# Create folder to store models
os.makedirs("models", exist_ok=True)

# Everything needed to reproduce predictions outside this notebook, keyed by
# the file name it is written to under models/.
artifacts = {
    # Fitted classifiers
    "logistic_regression.pkl": log_reg,
    "decision_tree.pkl": dt,
    "knn.pkl": knn,
    "naive_bayes.pkl": nb,
    "random_forest.pkl": rf,
    "xgboost.pkl": xgb,
    # Scaler fitted on the training split; the models were trained on its
    # output, so new inputs must go through the same transformation.
    "scaler.pkl": scaler,
    # Label encoder: the models predict integer codes, and this object holds
    # the mapping back to the original class names. It was previously not
    # saved, leaving predictions from the stored models undecodable.
    "label_encoder.pkl": le,
}

for artifact_filename, artifact in artifacts.items():
    joblib.dump(artifact, f"models/{artifact_filename}")

print("All models and scaler saved successfully.")


All models and scaler saved successfully.
