In [1]:
!pip install xgboost

[0m

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_rows', None)

In [3]:
import os
print(os.getcwd())

/root/miniconda3/envs/exoplanets_with_ai/main/notebooks


In [4]:
data = pd.read_csv("/root/miniconda3/envs/exoplanets_with_ai/main/data/TESS cleaned.csv", index_col="rowid")

In [5]:
data.head()

Unnamed: 0_level_0,pl_pnum,ra,dec,st_pmra,st_pmdec,pl_tranmid,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_dist,st_teff,st_rad,class
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,112.36,-12.7,-5.96,-0.08,2459229.63,2.17,2.02,656.89,5.82,22601.95,3127.2,9.6,485.74,10249.0,2.17,False Alarm
2,1,122.58,-5.51,-4.96,-15.55,2459987.95,1.93,3.17,1286.0,11.22,44464.5,4045.0,9.42,295.86,7070.0,2.01,Possible Candidate
3,1,104.73,-10.58,-1.46,-2.25,2459224.69,1.87,1.41,1500.0,23.75,2860.61,2037.0,9.3,943.11,8924.0,5.73,False Alarm
4,1,110.56,-25.21,-0.94,1.64,2458493.4,2.74,3.17,383.41,,1177.36,1631.0,9.3,7728.17,5388.5,,False Alarm
5,1,122.18,-48.8,-4.5,9.35,2459987.05,3.57,3.37,755.0,11.31,54679.3,4260.0,9.14,356.44,9219.0,2.15,False Alarm


### Train-Test split

In [6]:
X, y = data.drop("class", axis=1), data[["class"]]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

## Encoding of the target

In [7]:
# ordinal Encoding
encoder = OrdinalEncoder(categories=[['False Alarm', 'Possible Candidate', 'Confirmed Planet']])
y_train_encoding = encoder.fit_transform(y_train)
y_train_encoding = y_train_encoding.astype("int32")
y_test_encoding = encoder.transform(y_test)
y_test_encoding = y_test_encoding.astype("int32")

## Fix NaN

In [8]:
imputer = KNNImputer(n_neighbors=6)
X_train_impute = imputer.fit_transform(X_train)
X_test_impute = imputer.transform(X_test)

## Fixing Imbalance Data (SMOTE)

In [10]:
type(y_train_encoding)

numpy.ndarray

In [13]:
np.unique(y_train_encoding, return_counts=True)

(array([0, 1, 2], dtype=int32), array([1007, 4037,  936]))

In [14]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_impute, y_train_encoding)

In [15]:
np.unique(y_train_resampled, return_counts=True)

(array([0, 1, 2], dtype=int32), array([4037, 4037, 4037]))

## Scaling

In [22]:
# Standard Scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test_impute)

In [19]:
# Robust Scaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test_impute)

## Modeling

### Decision Tree

In [None]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train_scaled, y_train_resampled)
y_pred = tree_clf.predict(X_test_scaled)

In [None]:
# Metrics y_test_encoding
accuracy  = accuracy_score(y_test_encoding, y_pred)
precision = precision_score(y_test_encoding, y_pred, average="weighted")  # for binary classification
recall    = recall_score(y_test_encoding, y_pred, average="weighted")
f1        = f1_score(y_test_encoding, y_pred, average="weighted")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Full classification report
print("\nClassification Report:\n", classification_report(y_test_encoding, y_pred))

### Random Forest

In [None]:
rf = RandomForestClassifier(
    n_estimators=100,     # number of trees
    max_depth=None,       # let it grow fully
    random_state=42,
    class_weight="balanced"   # helps handle imbalance too
)
rf.fit(X_train_scaled, y_train_resampled)
y_pred = rf.predict(X_test_scaled)

In [None]:
# Metrics
accuracy  = accuracy_score(y_test_encoding, y_pred)
precision = precision_score(y_test_encoding, y_pred, average="weighted")  # for binary classification
recall    = recall_score(y_test_encoding, y_pred, average="weighted")
f1        = f1_score(y_test_encoding, y_pred, average="weighted")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Full classification report
print("\nClassification Report:\n", classification_report(y_test_encoding, y_pred))

### XGBoost

In [23]:
xgb_model = xgb.XGBClassifier(
    n_estimators=5000,        # number of trees
    learning_rate=0.02,       # step size shrinkage
    max_depth=10,             # max depth of each tree
    subsample=0.8,           # subsample ratio for training
    colsample_bytree=0.8,    # subsample ratio for features
    random_state=42,
    objective='multi:softmax',   # for multiclass classification
    num_class=4,                 # your number of classes
    eval_metric='mlogloss'
)

xgb_model.fit(X_train_scaled, y_train_resampled)
y_pred = xgb_model.predict(X_test_scaled)

In [24]:
# Metrics
accuracy  = accuracy_score(y_test_encoding, y_pred)
precision = precision_score(y_test_encoding, y_pred, average="weighted")  # for binary classification
recall    = recall_score(y_test_encoding, y_pred, average="weighted")
f1        = f1_score(y_test_encoding, y_pred, average="weighted")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Full classification report
print("\nClassification Report:\n", classification_report(y_test_encoding, y_pred))

Accuracy: 0.7372994652406417
Precision: 0.7323937188359892
Recall: 0.7372994652406417
F1 Score: 0.7344032539781833

Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.49      0.52       263
           1       0.81      0.84      0.82       991
           2       0.60      0.60      0.60       242

    accuracy                           0.74      1496
   macro avg       0.65      0.64      0.65      1496
weighted avg       0.73      0.74      0.73      1496

