In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/SDSS_ML_P_SI_ZOHO/sdss_100k_galaxy_form_burst.csv'
data_sdss_raw = pd.read_csv(file_path, sep=",", skiprows=1, low_memory=False)

# Inspect the data
print(data_sdss_raw.head())
print(data_sdss_raw.info())
print(data_sdss_raw.isnull().sum())
print(data_sdss_raw.describe())

                 objid            specobjid          ra       dec           u  \
0  1237646587710669400  8175185722644649984   82.038679  0.847177    21.73818   
1  1237646588247540577  8175186822156277760   82.138894  1.063072    20.66761   
2  1237646588247540758  8175187097034184704   82.028510  1.104003    23.63531   
3  1237648702973083853   332152325571373056  198.544469 -1.097059    20.12374   
4  1237648702973149350   332154249716721664  198.706864 -1.046217 -9999.00000   

            g         r         i         z  modelFlux_u  ...  psfMag_z  \
0    20.26633  19.32409  18.64037  18.23833     2.007378  ...  19.43575   
1    19.32016  18.67888  18.24693  18.04122     5.403369  ...  18.85012   
2    21.19671  19.92297  19.31443  18.68396     0.295693  ...  19.42235   
3    18.41520  17.47202  17.05297  16.72423     8.920645  ...  18.03204   
4 -9999.00000  18.37762  18.13383  17.78497     0.000000  ...  19.02880   

       expAB_u      expAB_g   expAB_r   expAB_i   expAB_z   cl

In [None]:
import numpy as np

data_sdss_replace = data_sdss_raw.copy()

# Replace -9999 with NaN
data_sdss_replace = data_sdss_replace.replace(-9999, np.nan)

# Verify
print(data_sdss_replace[['u','g','z','expAB_u']].head(10))
print(data_sdss_replace.head())
print(data_sdss_replace.info())
print(data_sdss_replace.isnull().sum())

          u         g         z   expAB_u
0  21.73818  20.26633  18.23833  0.099951
1  20.66761  19.32016  18.04122  0.366549
2  23.63531  21.19671  18.68396  0.050000
3  20.12374  18.41520  16.72423  0.310763
4       NaN       NaN  17.78497       NaN
5  19.47473  18.18575  16.89580  0.754158
6  20.24418  18.62780  16.97725  0.273089
7  18.46020  17.27620  16.04085  0.849475
8  18.70091  17.50793  16.54133  0.419439
9  19.88486  18.45146  16.73163  0.699929
                 objid            specobjid          ra       dec         u  \
0  1237646587710669400  8175185722644649984   82.038679  0.847177  21.73818   
1  1237646588247540577  8175186822156277760   82.138894  1.063072  20.66761   
2  1237646588247540758  8175187097034184704   82.028510  1.104003  23.63531   
3  1237648702973083853   332152325571373056  198.544469 -1.097059  20.12374   
4  1237648702973149350   332154249716721664  198.706864 -1.046217       NaN   

          g         r         i         z  modelFlux_u  ...  ps

In [None]:
data_sdss_impute = data_sdss_replace.copy()

# Import library
from sklearn.impute import SimpleImputer

# Select numeric columns
numeric_cols = data_sdss_impute.select_dtypes(include=np.number).columns

# Median imputer
median_imputer = SimpleImputer(strategy='median')
data_sdss_impute[numeric_cols] = median_imputer.fit_transform(data_sdss_impute[numeric_cols])

# Verify no more missing values
print(data_sdss_impute.head())
print(data_sdss_impute.info())
print(data_sdss_impute.isnull().sum())


          objid     specobjid          ra       dec         u         g  \
0  1.237647e+18  8.175186e+18   82.038679  0.847177  21.73818  20.26633   
1  1.237647e+18  8.175187e+18   82.138894  1.063072  20.66761  19.32016   
2  1.237647e+18  8.175187e+18   82.028510  1.104003  23.63531  21.19671   
3  1.237649e+18  3.321523e+17  198.544469 -1.097059  20.12374  18.41520   
4  1.237649e+18  3.321542e+17  198.706864 -1.046217  19.34985  18.07271   

          r         i         z  modelFlux_u  ...  psfMag_z   expAB_u  \
0  19.32409  18.64037  18.23833     2.007378  ...  19.43575  0.099951   
1  18.67888  18.24693  18.04122     5.403369  ...  18.85012  0.366549   
2  19.92297  19.31443  18.68396     0.295693  ...  19.42235  0.050000   
3  17.47202  17.05297  16.72423     8.920645  ...  18.03204  0.310763   
4  18.37762  18.13383  17.78497     0.000000  ...  19.02880  0.508736   

    expAB_g   expAB_r   expAB_i   expAB_z   class     subclass  redshift  \
0  0.311864  0.289370  0.270588  0

In [None]:
data_sdss_nonulls = data_sdss_replace.copy()

# Drop all rows with any missing values
data_sdss_nonulls = data_sdss_nonulls.dropna()

# Verify the result
print(data_sdss_nonulls.head())
print(data_sdss_nonulls.info())
print(data_sdss_nonulls.isnull().sum())
print(data_sdss_nonulls.describe())

                 objid            specobjid          ra       dec         u  \
0  1237646587710669400  8175185722644649984   82.038679  0.847177  21.73818   
1  1237646588247540577  8175186822156277760   82.138894  1.063072  20.66761   
2  1237646588247540758  8175187097034184704   82.028510  1.104003  23.63531   
3  1237648702973083853   332152325571373056  198.544469 -1.097059  20.12374   
5  1237648702973149360   332153425083000832  198.720675 -1.083876  19.47473   

          g         r         i         z  modelFlux_u  ...  psfMag_z  \
0  20.26633  19.32409  18.64037  18.23833     2.007378  ...  19.43575   
1  19.32016  18.67888  18.24693  18.04122     5.403369  ...  18.85012   
2  21.19671  19.92297  19.31443  18.68396     0.295693  ...  19.42235   
3  18.41520  17.47202  17.05297  16.72423     8.920645  ...  18.03204   
5  18.18575  17.52763  17.14837  16.89580    16.220930  ...  18.23220   

    expAB_u   expAB_g   expAB_r   expAB_i   expAB_z   class     subclass  \
0  0.09995

In [None]:
# Unique values in 'class'
print("Class column unique values:")
print(data_sdss_nonulls['class'].unique())

# Unique values in 'subclass'
print("\nSubclass column unique values:")
print(data_sdss_nonulls['subclass'].unique())


Class column unique values:
['GALAXY']

Subclass column unique values:
['STARFORMING' 'STARBURST']


In [None]:
# Encode 'subclass' column: STARFORMING -> 0, STARBURST -> 1
data_sdss_nonulls['subclass'] = data_sdss_nonulls['subclass'].map({'STARFORMING': 0, 'STARBURST': 1})

# Verify transformation
print(data_sdss_nonulls['subclass'].value_counts())


subclass
0    73518
1    23960
Name: count, dtype: int64


In [None]:
import numpy as np

# Make a copy
data_sdss_iqr = data_sdss_nonulls.copy()

# Identify numeric columns except ids and target
numeric_cols = data_sdss_iqr.select_dtypes(include=[np.number]).columns

cols_to_exclude = ['objid', 'specobjid', 'subclass']  # do not clip these
numeric_cols = [col for col in numeric_cols if col not in cols_to_exclude]

# Apply IQR clipping
for col in numeric_cols:
    Q1 = data_sdss_iqr[col].quantile(0.25)
    Q3 = data_sdss_iqr[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Clip
    data_sdss_iqr[col] = np.clip(data_sdss_iqr[col], lower_bound, upper_bound)

print("IQR outlier clipping completed.")


IQR outlier clipping completed.


In [None]:
print(data_sdss_iqr.head())
print(data_sdss_iqr.info())
print(data_sdss_iqr.isnull().sum())
print(data_sdss_iqr.describe())

                 objid            specobjid          ra       dec          u  \
0  1237646587710669400  8175185722644649984   82.038679  0.847177  21.738180   
1  1237646588247540577  8175186822156277760   82.138894  1.063072  20.667610   
2  1237646588247540758  8175187097034184704   82.028510  1.104003  21.888454   
3  1237648702973083853   332152325571373056  198.544469 -1.097059  20.123740   
5  1237648702973149360   332153425083000832  198.720675 -1.083876  19.474730   

           g          r          i         z  modelFlux_u  ...  psfMag_z  \
0  20.217987  19.314826  18.640370  18.23833     2.007378  ...  19.43575   
1  19.320160  18.678880  18.246930  18.04122     5.403369  ...  18.85012   
2  20.217987  19.314826  19.033301  18.68396     0.295693  ...  19.42235   
3  18.415200  17.472020  17.052970  16.72423     8.920645  ...  18.03204   
5  18.185750  17.527630  17.148370  16.89580    16.220930  ...  18.23220   

    expAB_u   expAB_g   expAB_r   expAB_i   expAB_z   class  s

In [None]:
data_sdss_iqr['u_g'] = data_sdss_iqr['u'] - data_sdss_iqr['g']
data_sdss_iqr['g_r'] = data_sdss_iqr['g'] - data_sdss_iqr['r']
data_sdss_iqr['r_i'] = data_sdss_iqr['r'] - data_sdss_iqr['i']
data_sdss_iqr['i_z'] = data_sdss_iqr['i'] - data_sdss_iqr['z']


In [None]:
for band in ['u', 'g', 'r', 'i', 'z']:
    data_sdss_iqr[f'compact_{band}'] = data_sdss_iqr[f'psfMag_{band}'] - data_sdss_iqr[f'modelFlux_{band}']


In [None]:
data_sdss_iqr['flux_u_g'] = data_sdss_iqr['modelFlux_u'] / (data_sdss_iqr['modelFlux_g'] + 1e-5)
data_sdss_iqr['flux_g_r'] = data_sdss_iqr['modelFlux_g'] / (data_sdss_iqr['modelFlux_r'] + 1e-5)
data_sdss_iqr['flux_r_i'] = data_sdss_iqr['modelFlux_r'] / (data_sdss_iqr['modelFlux_i'] + 1e-5)
data_sdss_iqr['flux_i_z'] = data_sdss_iqr['modelFlux_i'] / (data_sdss_iqr['modelFlux_z'] + 1e-5)


In [None]:
# Display first 5 rows of the newly computed features
computed_features = ['u_g', 'g_r', 'r_i', 'i_z',
                     'compact_u', 'compact_g', 'compact_r', 'compact_i', 'compact_z',
                     'flux_u_g', 'flux_g_r', 'flux_r_i', 'flux_i_z']

print(data_sdss_iqr[computed_features].head())
print(data_sdss_iqr.head())
print(data_sdss_iqr.info())

        u_g       g_r       r_i       i_z  compact_u  compact_g  compact_r  \
0  1.520193  0.903161  0.674456  0.402040  20.578932  13.841280    2.11619   
1  1.347450  0.641280  0.431950  0.205710  15.909471   1.534370  -14.09173   
2  1.670466  0.903161  0.281525  0.349341  22.971974  18.513746    9.92772   
3  1.708540  0.943180  0.419050  0.328740  12.428735 -23.286420  -83.83370   
5  1.288980  0.658120  0.379260  0.252570   4.491130 -33.509270  -78.56381   

   compact_i  compact_z  flux_u_g  flux_g_r  flux_r_i  flux_i_z  
0  -14.90529  -31.21386  0.256578  0.419817  0.532729  0.690662  
1  -31.06720  -41.88613  0.288894  0.553969  0.671767  0.827512  
2    1.20595  -14.16737  0.089093  0.309200  0.570909  0.559735  
3 -132.55392 -186.28406  0.207241  0.419498  0.679795  0.738770  
5 -119.79579 -156.21850  0.305055  0.545443  0.705178  0.792459  
                 objid            specobjid          ra       dec          u  \
0  1237646587710669400  8175185722644649984   82.038679

In [None]:
cluster_features = ['u_g', 'g_r', 'r_i', 'i_z',
                    'compact_u', 'compact_g', 'compact_r', 'compact_i', 'compact_z',
                    'flux_u_g', 'flux_g_r', 'flux_r_i', 'flux_i_z']

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_cluster_scaled = scaler.fit_transform(data_sdss_iqr[cluster_features])

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_cluster_scaled)

# Add cluster labels to main dataframe
data_sdss_iqr['AGN_cluster'] = clusters


In [None]:
data_sdss_iqr['AGN_label'] = data_sdss_iqr['AGN_cluster'].map({0: 'Non-AGN', 1: 'AGN'})
print(data_sdss_iqr['AGN_label'].value_counts())
print(data_sdss_iqr.head())
print(data_sdss_iqr.info())

AGN_label
AGN        73717
Non-AGN    23761
Name: count, dtype: int64
                 objid            specobjid          ra       dec          u  \
0  1237646587710669400  8175185722644649984   82.038679  0.847177  21.738180   
1  1237646588247540577  8175186822156277760   82.138894  1.063072  20.667610   
2  1237646588247540758  8175187097034184704   82.028510  1.104003  21.888454   
3  1237648702973083853   332152325571373056  198.544469 -1.097059  20.123740   
5  1237648702973149360   332153425083000832  198.720675 -1.083876  19.474730   

           g          r          i         z  modelFlux_u  ...  compact_g  \
0  20.217987  19.314826  18.640370  18.23833     2.007378  ...  13.841280   
1  19.320160  18.678880  18.246930  18.04122     5.403369  ...   1.534370   
2  20.217987  19.314826  19.033301  18.68396     0.295693  ...  18.513746   
3  18.415200  17.472020  17.052970  16.72423     8.920645  ... -23.286420   
5  18.185750  17.527630  17.148370  16.89580    16.220930  ... -

In [None]:
raw_features = ['u', 'g', 'r', 'i', 'z', 'psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z',
                'modelFlux_u', 'modelFlux_g', 'modelFlux_r', 'modelFlux_i', 'modelFlux_z']

computed_features = ['u_g', 'g_r', 'r_i', 'i_z',
                     'compact_u', 'compact_g', 'compact_r', 'compact_i', 'compact_z',
                     'flux_u_g', 'flux_g_r', 'flux_r_i', 'flux_i_z']

all_features = raw_features + computed_features

In [None]:
from sklearn.model_selection import train_test_split

X = data_sdss_iqr[all_features]
y = data_sdss_iqr['AGN_label'].map({'Non-AGN':0, 'AGN':1})

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y)


In [None]:
print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("Y_train counts:\n", y_train.value_counts())
print("Y_test counts:\n", y_test.value_counts())


X_train: (77982, 28) X_test: (19496, 28)
Y_train counts:
 AGN_label
1    58973
0    19009
Name: count, dtype: int64
Y_test counts:
 AGN_label
1    14744
0     4752
Name: count, dtype: int64


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # use same scaler


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Use class_weight='balanced' to handle imbalance
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
logreg.fit(X_train_scaled, y_train)

# Predictions
y_pred = logreg.predict(X_test_scaled)
# Overall Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Overall Accuracy:", round(accuracy, 4))

# Precision, Recall, F1 per class
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)
print("Precision per class:", precision)
print("Recall per class:", recall)
print("F1-score per class:", f1)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification report
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)

Overall Accuracy: 0.9973
Precision per class: [0.98917569 1.        ]
Recall per class: [1.         0.99647314]
F1-score per class: [0.99455839 0.99823346]
Confusion Matrix:
 [[ 4752     0]
 [   52 14692]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4752
           1       1.00      1.00      1.00     14744

    accuracy                           1.00     19496
   macro avg       0.99      1.00      1.00     19496
weighted avg       1.00      1.00      1.00     19496



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Initialize the Random Forest
rf = RandomForestClassifier(
    n_estimators=200,       # number of trees
    class_weight='balanced',# handle class imbalance
    random_state=42,
    n_jobs=-1               # use all cores
)

# Fit on training data
rf.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf.predict(X_test_scaled)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Overall Accuracy: {accuracy:.4f}")
print(f"Precision per class: {precision}")
print(f"Recall per class: {recall}")
print(f"F1-score per class: {f1}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:\n", report)


Overall Accuracy: 0.9974
Precision per class: [0.99328577 0.998778  ]
Recall per class: [0.99621212 0.99782963]
F1-score per class: [0.9947468  0.99830359]
Confusion Matrix:
[[ 4734    18]
 [   32 14712]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4752
           1       1.00      1.00      1.00     14744

    accuracy                           1.00     19496
   macro avg       1.00      1.00      1.00     19496
weighted avg       1.00      1.00      1.00     19496



In [None]:
from xgboost import XGBClassifier

# Initialize XGBoost
xgb = XGBClassifier(
    n_estimators=200,      # number of trees
    learning_rate=0.1,
    max_depth=5,
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),  # handle imbalance
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Fit on training data
xgb.fit(X_train_scaled, y_train)

# Predictions
y_pred = xgb.predict(X_test_scaled)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Overall Accuracy: {accuracy:.4f}")
print(f"Precision per class: {precision}")
print(f"Recall per class: {recall}")
print(f"F1-score per class: {f1}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:\n", report)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Overall Accuracy: 0.9974
Precision per class: [0.9916353  0.99932038]
Recall per class: [0.99789562 0.99728703]
F1-score per class: [0.99475561 0.99830267]
Confusion Matrix:
[[ 4742    10]
 [   40 14704]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4752
           1       1.00      1.00      1.00     14744

    accuracy                           1.00     19496
   macro avg       1.00      1.00      1.00     19496
weighted avg       1.00      1.00      1.00     19496



In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np

# ------------------------------
# 1️ Stratified CV
# ------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ------------------------------
# 2️ Models
# ------------------------------
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, class_weight='balanced', random_state=42, n_jobs=-1
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
        n_jobs=-1
    )
}

# ------------------------------
# 3️ Function to compute CV metrics
# ------------------------------
def cv_metrics(model, X, y, cv):
    scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']
    results = {}

    for metric in scoring_metrics:
        scores = cross_val_score(model, X, y, cv=cv, scoring=metric, n_jobs=-1)
        results[metric] = (scores.mean(), scores.std())

    return results

# ------------------------------
# 4️ Run CV for each model
# ------------------------------
for name, model in models.items():
    scores = cv_metrics(model, X_train_scaled, y_train, cv)
    print(f"\n{name} CV Performance:")
    print(f"Accuracy : {scores['accuracy'][0]:.4f} ± {scores['accuracy'][1]:.4f}")
    print(f"Precision: {scores['precision'][0]:.4f} ± {scores['precision'][1]:.4f}")
    print(f"Recall   : {scores['recall'][0]:.4f} ± {scores['recall'][1]:.4f}")
    print(f"F1-score : {scores['f1'][0]:.4f} ± {scores['f1'][1]:.4f}")



Logistic Regression CV Performance:
Accuracy : 0.9973 ± 0.0005
Precision: 1.0000 ± 0.0000
Recall   : 0.9965 ± 0.0006
F1-score : 0.9982 ± 0.0003

Random Forest CV Performance:
Accuracy : 0.9964 ± 0.0002
Precision: 0.9975 ± 0.0003
Recall   : 0.9977 ± 0.0003
F1-score : 0.9976 ± 0.0001

XGBoost CV Performance:
Accuracy : 0.9973 ± 0.0004
Precision: 0.9989 ± 0.0002
Recall   : 0.9974 ± 0.0005
F1-score : 0.9982 ± 0.0002


In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Stratified 5-fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base XGB model
xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    n_jobs=-1
)

# Hyperparameter distribution
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.3, 0.5],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [1, 1.5, 2.0]
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,  # try 30 random combinations
    scoring='f1',
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit
random_search.fit(X_train_scaled, y_train)

# Best params
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate best model
best_xgb = random_search.best_estimator_

metrics = ['accuracy', 'precision', 'recall', 'f1']
for metric in metrics:
    scores = cross_val_score(best_xgb, X_train_scaled, y_train, cv=cv, scoring=metric, n_jobs=-1)
    print(f"{metric.capitalize()} : {scores.mean():.4f} ± {scores.std():.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Best Hyperparameters: {'subsample': 0.7, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Accuracy : 0.9978 ± 0.0002
Precision : 0.9985 ± 0.0002
Recall : 0.9986 ± 0.0003
F1 : 0.9986 ± 0.0001


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib  # for saving the model

# ------------------------------
# 1️ Best Hyperparameters from Randomized Search
# ------------------------------
best_params = {
    'n_estimators': 300,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 0.7,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_alpha': 0,
    'reg_lambda': 1
}

# Compute scale_pos_weight for class imbalance
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# ------------------------------
# 2️ Initialize the final model
# ------------------------------
final_xgb = XGBClassifier(
    **best_params,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1
)

# ------------------------------
# 3️ Train on full training data
# ------------------------------
final_xgb.fit(X_train_scaled, y_train)

# ------------------------------
# 4️ Evaluate on test set
# ------------------------------
y_pred = final_xgb.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Overall Accuracy: {accuracy:.4f}")
print(f"Precision per class: {precision}")
print(f"Recall per class: {recall}")
print(f"F1-score per class: {f1}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:\n", report)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Overall Accuracy: 0.9979
Precision per class: [0.99247334 0.99966016]
Recall per class: [0.99894781 0.99755833]
F1-score per class: [0.99570005 0.99860814]
Confusion Matrix:
[[ 4747     5]
 [   36 14708]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      4752
           1       1.00      1.00      1.00     14744

    accuracy                           1.00     19496
   macro avg       1.00      1.00      1.00     19496
weighted avg       1.00      1.00      1.00     19496



In [None]:
import joblib
import os


model_path = '/content/drive/MyDrive/Stable_XAI_Saved_models/'
model_name = 'SDSS_S3_USC_XGB.pkl'

os.makedirs(model_path, exist_ok=True)

full_model_path = os.path.join(model_path, model_name)


joblib.dump(final_xgb, full_model_path)
print(f" Model saved as '{full_model_path}'")


✅ Model saved as '/content/drive/MyDrive/Stable_XAI_Saved_models/SDSS_S3_USC_XGB.pkl'
