## Supervised Learning Models
### Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score
# Load the exoplanet table used by every section below.
# NOTE(review): absolute, user-specific Windows path — adjust before running
# on another machine.
data_path = r"C:\Users\madha\Downloads\12310219-PA\nasa_exoplanets.csv"
df_raw = pd.read_csv(data_path, sep=",")

### =======================================
### PLANET DISCOVERY METHOD CLASSIFICATION
### =======================================
#### Feature Selection

In [None]:
# Candidate predictor columns for the discovery-method classifier.
features_A = [
    'pl_orbper',
    'pl_orbsmax',
    'pl_rade',
    'pl_bmasse',
    'st_teff',
    'st_mass',
    'st_rad',
    'sy_dist',
]

# Keep only the candidates that exist in the loaded table. The check has to
# run against df_raw: `df` is not created until the next statement, so
# referencing it here raised NameError on a clean top-to-bottom run.
features_A = [f for f in features_A if f in df_raw.columns]

# Working copy restricted to the predictors plus the raw target column.
df = df_raw[features_A + ["discoverymethod"]].copy()
print("Initial shape:", df.shape)

#### Handle Missing Values

In [None]:
# Keep rows that have at most two missing predictors. The count is restricted
# to the feature columns via `subset`; without it the target column also
# counted toward the non-null quota, letting rows with three missing
# predictors slip through.
df = df.dropna(subset=features_A, thresh=len(features_A) - 2)

# Fill the remaining gaps in each predictor with that column's median.
# NOTE(review): the medians are computed on the full table, before the
# train/test split further down, so test rows influence the imputed values.
for col in features_A:
    df[col] = df[col].fillna(df[col].median())

print("After cleaning shape:", df.shape)

#### Target Engineering

In [None]:
def simplify_discovery_method(method):
    """Collapse a raw discovery-method label into a coarse category.

    The input is converted with ``str()`` and lower-cased before matching, so
    matching is case-insensitive and non-string values (e.g. NaN, None) are
    matched by their string form. No whitespace trimming is performed.

    Returns:
        'Transit', 'Radial Velocity', 'Microlensing', 'Imaging' or
        'Astrometry' when the lower-cased label equals that name exactly;
        'Timing' when the label contains the substring "timing";
        'Other' for everything else.
    """
    # Labels that must match exactly (after lower-casing).
    exact_labels = {
        'transit': 'Transit',
        'radial velocity': 'Radial Velocity',
        'microlensing': 'Microlensing',
        'imaging': 'Imaging',
        'astrometry': 'Astrometry',
    }

    key = str(method).lower()
    label = exact_labels.get(key)
    if label is not None:
        return label

    # None of the exact keys contain "timing", so checking the substring
    # after the exact lookup gives the same result as the original chain.
    return 'Timing' if 'timing' in key else 'Other'

# Derive the simplified target column from the raw discovery-method labels.
raw_methods = df["discoverymethod"]
df["discovery_simple"] = raw_methods.map(simplify_discovery_method)

# Show how many rows fall into each simplified class.
print(df["discovery_simple"].value_counts())


#### Reduce Class Imbalance

In [None]:
# Count rows per simplified class, then keep only the classes that have at
# least 100 rows; rows belonging to rarer classes are discarded.
class_counts = df["discovery_simple"].value_counts()
valid_classes = class_counts.loc[class_counts.ge(100)].index
keep_mask = df["discovery_simple"].isin(valid_classes)
df = df.loc[keep_mask]

print(df["discovery_simple"].value_counts())

### Encoding and Train/Test Split

In [None]:
# Predictor matrix for the discovery-method task.
X = df.loc[:, features_A]

# Encode the simplified class names as integers; `le` is kept so the codes
# can be mapped back to names later.
le = LabelEncoder()
y = le.fit_transform(df['discovery_simple'])

# 70/30 split, stratified on the encoded target, with a fixed seed.
split_options = {"test_size": 0.3, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

### Train Multiple Models (Using Pipelines)

In [None]:
# Candidate classifiers for the discovery-method task, keyed by display name.
# Every pipeline names its estimator step "model" so it can be retrieved
# uniformly through `named_steps["model"]`.
models_A = {}

# Logistic regression and KNN get a standardization step in front.
models_A["Logistic Regression"] = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced")),
])

models_A["K-Nearest Neighbors"] = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier(n_neighbors=7, weights='distance')),
])

# The decision tree is wrapped in a single-step pipeline with no scaler.
models_A["Decision Tree"] = Pipeline(steps=[
    ("model", DecisionTreeClassifier(
        max_depth=5,
        class_weight="balanced",
        random_state=42,
    )),
])

# Filled by the evaluation loop: model name -> macro-averaged F1.
results_A = {}

### Detailed Evaluation

In [None]:
# Fit each pipeline on the training split and record its macro-averaged F1
# on the test split.
for label, pipeline in models_A.items():
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    results_A[label] = f1_score(y_test, y_pred, average="macro")

# Bare expression so the notebook displays the score table.
results_A

In [None]:
# Per-class precision/recall/F1 for the fitted decision-tree pipeline.
dt_model = models_A["Decision Tree"]
y_pred = dt_model.predict(X_test)

# Report rows are labelled with the original class names instead of the
# integer codes produced by the label encoder. `labels` is passed explicitly
# so the rows always line up with `le.classes_`.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

In [None]:
# Row-normalized confusion matrix for the decision-tree predictions held in
# `y_pred` (each row sums to 1 over the predicted classes).
class_labels = le.classes_
cm = confusion_matrix(
    y_test,
    y_pred,
    labels=np.arange(len(class_labels)),  # pin row/column order to the encoder
    normalize="true",
)

# Tick labels show class names; without them the axes displayed only the
# encoder's integer codes.
sns.heatmap(
    cm,
    annot=True,
    fmt=".2f",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Normalized Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

#### Feature Importance

In [None]:
# Pull the fitted tree estimator out of its pipeline.
dt_A = models_A["Decision Tree"].named_steps["model"]

# Pair each predictor with the tree's importance score, least important first.
importance_A = dt_A.feature_importances_
fi_A = pd.DataFrame({"Feature": features_A, "Importance": importance_A})
fi_A = fi_A.sort_values("Importance")
print(fi_A)

In [None]:
# Lollipop chart: one horizontal stem per feature, capped with a marker at
# its importance value.
feature_names = fi_A["Feature"]
importances = fi_A["Importance"]

plt.figure(figsize=(7, 4))
plt.hlines(y=feature_names, xmin=0, xmax=importances)
plt.plot(importances, feature_names, "o")
plt.xlabel("Importance")
plt.title("Feature Importance (Discovery Method)")
plt.show()

#### Domain Validation

In [None]:
# Distribution of pl_rade for each raw discovery method in the filtered data.
plt.figure(figsize=(6, 4))
boxplot_options = {"data": df, "x": "discoverymethod", "y": "pl_rade"}
sns.boxplot(**boxplot_options)
plt.title("Planet Radius vs Discovery Method")
plt.show()

### =========================================
###        TEMPERATURE CLASSIFICATION
### =========================================
#### Feature Selection & Cleaning

In [None]:
# Predictor columns for the temperature classifier.
features_B = [
    "pl_orbper",
    "pl_rade",
    "pl_bmasse",
    "st_teff",
    "st_mass",
]

# Working copy with the predictors plus pl_eqt, from which the class label
# is derived in the next step.
df_B = df_raw[features_B + ["pl_eqt"]].copy()

# Rows without pl_eqt have no known label. Median-filling the target would
# hand every such row the same fabricated class, so drop them instead.
df_B = df_B.dropna(subset=["pl_eqt"])

# Fill missing predictor values with the column median (simple &
# beginner-friendly).
for col in features_B:
    df_B[col] = df_B[col].fillna(df_B[col].median())

#### Create Temperature Classes

In [None]:
def temperature_class(temp):
    """Return "Non-Hot" when ``temp <= 300``, otherwise "Hot".

    Any value for which the comparison is false — including NaN — is
    labelled "Hot".
    """
    return "Non-Hot" if temp <= 300 else "Hot"

# Derive the class label from pl_eqt, then remove pl_eqt in place so it
# cannot be used as a predictor.
df_B["temp_class"] = df_B["pl_eqt"].map(temperature_class)
del df_B["pl_eqt"]

# Class balance of the derived target.
print(df_B["temp_class"].value_counts())


#### Encode Target

In [None]:
# Replace the string class labels with integer codes. The fitted encoder is
# kept in `le_temp` so the codes can be mapped back to names later.
le_temp = LabelEncoder().fit(df_B["temp_class"])
df_B["temp_class"] = le_temp.transform(df_B["temp_class"])

#### Train/Test Split

In [None]:
# Predictors and encoded target for the temperature task.
X, y = df_B[features_B], df_B["temp_class"]

# 80/20 split, stratified on the target, with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#### Define Models

In [None]:
# Candidate classifiers for the temperature task, keyed by display name.
# Every pipeline names its estimator step "model".
models_B = {}

# Logistic regression and Gaussian naive Bayes get a standardization step.
models_B["Logistic Regression"] = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced")),
])

models_B["Naive Bayes"] = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", GaussianNB()),
])

# The decision tree is wrapped in a single-step pipeline with no scaler.
models_B["Decision Tree"] = Pipeline(steps=[
    ("model", DecisionTreeClassifier(
        max_depth=5,
        class_weight="balanced",
        random_state=42,
    )),
])

#### Train & Evaluate

In [None]:
# Model name -> macro-averaged F1 on the test split.
results_B = {}

# Fit each pipeline on the training split, then score it on the test split.
for label, pipeline in models_B.items():
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    results_B[label] = f1_score(y_test, y_pred, average="macro")

# Bare expression so the notebook displays the score table.
results_B

#### Detailed Evaluation

In [None]:
# Per-class precision/recall/F1 for the fitted decision-tree pipeline.
dt_temp = models_B["Decision Tree"]
y_pred_dt = dt_temp.predict(X_test)

# Report rows are labelled with the class names instead of the encoder's
# integer codes, so it is clear which row is which class. `labels` is passed
# explicitly so the rows always line up with `le_temp.classes_`.
print(classification_report(
    y_test,
    y_pred_dt,
    labels=np.arange(len(le_temp.classes_)),
    target_names=le_temp.classes_,
))

In [None]:
# Row-normalized confusion matrix for the decision-tree predictions
# (each row sums to 1 over the predicted classes).
temp_labels = le_temp.classes_
cm = confusion_matrix(
    y_test,
    y_pred_dt,
    labels=np.arange(len(temp_labels)),  # pin row/column order to the encoder
    normalize="true",
)

plt.figure(figsize=(6, 5))
# Tick labels show class names; without them the axes displayed only the
# encoder's integer codes.
sns.heatmap(
    cm,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=temp_labels,
    yticklabels=temp_labels,
)
plt.title("Temperature Classification – Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

#### Feature Importance

In [None]:
# Pull the fitted tree estimator out of its pipeline.
dt_B = models_B["Decision Tree"].named_steps["model"]

# Pair each predictor with the tree's importance score, least important first.
importance_B = dt_B.feature_importances_
fi_B = pd.DataFrame({"Feature": features_B, "Importance": importance_B})
fi_B = fi_B.sort_values("Importance")

print(fi_B)

In [None]:
# Rank features from most to least important and accumulate their scores.
# The index is reset so row i is the i-th ranked feature.
fi_sorted = fi_B.sort_values("Importance", ascending=False).reset_index(drop=True)
fi_sorted["Cumulative"] = fi_sorted["Importance"].cumsum()

# Explicit x positions: plotting the Series alone makes matplotlib use the
# Series index as x, which after sorting was the shuffled original row
# labels — points then no longer matched the tick labels set below.
positions = range(len(fi_sorted))

plt.figure(figsize=(6,4))
plt.plot(
    positions,
    fi_sorted["Cumulative"],
    marker="o"
)

# Rotate the labels so that right-alignment anchors each one at its tick.
plt.xticks(positions, fi_sorted["Feature"], rotation=45, ha="right")
plt.ylabel("Cumulative Importance")
plt.title("Cumulative Feature Importance — Temperature Classification")

plt.grid(axis="y", linestyle="--", alpha=0.4)
plt.tight_layout()
plt.show()

#### Domain Validation

In [None]:
# Plotting copy of df_B with the integer class codes mapped back to names.
df_temp_plot = df_B.assign(
    temp_label=le_temp.inverse_transform(df_B["temp_class"])
)

# Distribution of st_teff within each temperature class.
violin_options = {
    "x": "temp_label",
    "y": "st_teff",
    "data": df_temp_plot,
    "inner": "quartile",
    "cut": 0,
}
plt.figure(figsize=(6, 4))
sns.violinplot(**violin_options)

plt.xlabel("Planet Temperature Class")
plt.ylabel("Stellar Effective Temperature (K)")
plt.title("Stellar Temperature Distribution by Planet Temperature Class", pad=8)

sns.despine()
plt.tight_layout()
plt.show()


## MODEL COMPARISON
#### Prediction Probabilities

In [None]:
# Logistic Regression probabilities
# Fitted pipelines for the temperature task.
# NOTE: `dt_B` is rebound here to the whole pipeline; the feature-importance
# cell binds the same name to the bare tree estimator.
lr_B = models_B["Logistic Regression"]
nb_B = models_B["Naive Bayes"]
dt_B = models_B["Decision Tree"]

# Class-probability matrices on the test split, one per model.
lr_probs_B, nb_probs_B, dt_probs_B = (
    fitted.predict_proba(X_test) for fitted in (lr_B, nb_B, dt_B)
)

# Confidence score = highest class probability for each test row.
lr_conf_B, nb_conf_B, dt_conf_B = (
    probs.max(axis=1) for probs in (lr_probs_B, nb_probs_B, dt_probs_B)
)

In [None]:
# Overlay the density of per-row confidence scores for the three models.
confidence_curves = (
    (lr_conf_B, "Logistic Regression"),
    (nb_conf_B, "Naive Bayes"),
    (dt_conf_B, "Decision Tree"),
)

plt.figure(figsize=(6, 4))
for scores, curve_label in confidence_curves:
    sns.kdeplot(scores, label=curve_label, fill=True)

plt.xlabel("Prediction Confidence")
plt.title("Model Confidence Comparison")
plt.legend()
plt.show()
