In [2]:
# ================================
# 1. Import Libraries
# ================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

# ================================
# 2. Load & Data
# ================================
# Replace 'stellar.csv' with your dataset path
df = pd.read_csv("D:\Praveer MTech\Course\Assignment\ML\Assignment 2\star_classification.csv")

In [4]:
#Clean the data
# Drop duplicates
df.drop_duplicates(inplace=True)

# Handle missing values (simple strategy: drop rows with NA)
df.dropna(inplace=True)

# Drop irrelevant identifiers (unique IDs that donâ€™t help classification)
id_columns = ["run_ID", "rerun_ID", "cam_col", "field_ID",
              "spec_obj_ID", "plate", "MJD", "fiber_ID"]
df.drop(columns=id_columns, inplace=True)


In [5]:
#Pre-processing
# Encode target labels
label_enc = LabelEncoder()
df["class"] = label_enc.fit_transform(df["class"])  # galaxy=0, star=1, quasar=2 (example)

X = df.drop(columns=["class"])
y = df["class"]

# Normalize features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Feature selection (ANOVA F-test)
selector = SelectKBest(score_func=f_classif, k="all")
X_selected = selector.fit_transform(X_scaled, y)

# Show feature importance scores
feature_scores = pd.DataFrame({
    "Feature": X.columns,
    "Score": selector.scores_
}).sort_values(by="Score", ascending=False)

print("Feature importance ranking:\n", feature_scores)

Feature importance ranking:
     Feature         Score
8  redshift  83429.418967
6         i   8282.343545
5         r   4584.533364
2     delta    217.588357
0    obj_ID    122.542455
7         z     32.328308
3         u     30.445339
4         g     25.962524
1     alpha     21.948822


In [6]:
#Train / Test data split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
#Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
}

In [8]:
#Train, predict and evaluate
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # For multiclass AUC, use 'ovr'
    try:
        y_prob = model.predict_proba(X_test)
        auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    except:
        auc = None
    
    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": auc,
        "Precision": precision_score(y_test, y_pred, average="weighted"),
        "Recall": recall_score(y_test, y_pred, average="weighted"),
        "F1": f1_score(y_test, y_pred, average="weighted"),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }
    results.append(metrics)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# After training and collecting results
results_df = pd.DataFrame(results)

# Option 1: Print the whole table (already in your code)
print("\nModel Performance Comparison:\n", results_df)

# Option 2: Print metrics for each model one by one
for idx, row in results_df.iterrows():
    print(f"\n===== {row['Model']} =====")
    print(f"Accuracy : {row['Accuracy']:.4f}")
    if row['AUC'] is not None:
        print(f"AUC      : {row['AUC']:.4f}")
    else:
        print("AUC      : Not available")
    print(f"Precision: {row['Precision']:.4f}")
    print(f"Recall   : {row['Recall']:.4f}")
    print(f"F1 Score : {row['F1']:.4f}")
    print(f"MCC      : {row['MCC']:.4f}")


Model Performance Comparison:
                  Model  Accuracy       AUC  Precision   Recall        F1  \
0  Logistic Regression   0.92840  0.977709   0.929428  0.92840  0.927847   
1        Decision Tree   0.96630  0.970101   0.966317  0.96630  0.966308   
2                  KNN   0.92690  0.967534   0.927824  0.92690  0.926607   
3          Naive Bayes   0.74315  0.938585   0.790269  0.74315  0.690546   
4        Random Forest   0.97990  0.995141   0.979807  0.97990  0.979773   
5              XGBoost   0.97580  0.995612   0.975695  0.97580  0.975688   

        MCC  
0  0.872008  
1  0.940246  
2  0.869237  
3  0.537239  
4  0.964289  
5  0.957009  

===== Logistic Regression =====
Accuracy : 0.9284
AUC      : 0.9777
Precision: 0.9294
Recall   : 0.9284
F1 Score : 0.9278
MCC      : 0.8720

===== Decision Tree =====
Accuracy : 0.9663
AUC      : 0.9701
Precision: 0.9663
Recall   : 0.9663
F1 Score : 0.9663
MCC      : 0.9402

===== KNN =====
Accuracy : 0.9269
AUC      : 0.9675
Precisio