In [220]:
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.calibration import LabelEncoder

In [221]:
# -------------------------------
# 1. Load dataset
# -------------------------------
# A forward slash is used as the separator: the previous 'Data\k2pandc final.csv'
# literal only worked because "\k" is not a recognised escape sequence (which
# newer Python versions warn about) and because Windows accepts backslashes.
# Forward slashes are accepted on Windows, Linux and macOS alike.
df = pd.read_csv('Data/k2pandc final.csv')

# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4004 entries, 0 to 4003
Data columns (total 95 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   loc_rowid        4004 non-null   int64  
 1   pl_name          4004 non-null   object 
 2   hostname         4004 non-null   object 
 3   default_flag     4004 non-null   int64  
 4   disposition      4004 non-null   object 
 5   disp_refname     4004 non-null   object 
 6   sy_snum          4004 non-null   int64  
 7   sy_pnum          4004 non-null   int64  
 8   discoverymethod  4004 non-null   object 
 9   disc_year        4004 non-null   int64  
 10  disc_facility    4004 non-null   object 
 11  soltype          4004 non-null   object 
 12  pl_controv_flag  4004 non-null   int64  
 13  pl_refname       4004 non-null   object 
 14  pl_orbper        3960 non-null   float64
 15  pl_orbpererr1    3071 non-null   float64
 16  pl_orbpererr2    3071 non-null   float64
 17  pl_orbperlim  

In [222]:
# -------------------------------
# 2. Drop identifier, reference and unused measurement columns
# -------------------------------
# The columns are listed group by group, in the order they are removed.
drop_groups = [
    # row id, names, *_refname and date-like columns (judging by their names)
    ['loc_rowid', 'pl_name', 'hostname', 'disp_refname', 'pl_refname', 'st_refname',
     'sy_refname', 'rowupdate', 'pl_pubdate', 'releasedate'],
    # pl_* quantities removed together with their err1/err2/lim companions
    ['pl_orbsmax', 'pl_orbsmaxerr1', 'pl_orbsmaxerr2', 'pl_orbsmaxlim',
     'pl_bmasse', 'pl_bmasseerr1', 'pl_bmasseerr2', 'pl_bmasselim',
     'pl_bmassj', 'pl_bmassjerr1', 'pl_bmassjerr2', 'pl_bmassjlim', 'pl_bmassprov',
     'pl_orbeccen', 'pl_orbeccenerr1', 'pl_orbeccenerr2', 'pl_orbeccenlim',
     'pl_insol', 'pl_insolerr1', 'pl_insolerr2', 'pl_insollim',
     'pl_eqt', 'pl_eqterr1', 'pl_eqterr2', 'pl_eqtlim'],
    # st_* quantities removed together with their companions
    ['st_spectype', 'st_mass', 'st_masserr1', 'st_masserr2', 'st_masslim',
     'st_met', 'st_meterr1', 'st_meterr2', 'st_metlim', 'st_metratio'],
    # pl_radj and its companions
    ['pl_radj', 'pl_radjerr1', 'pl_radjerr2', 'pl_radjlim'],
    # err/lim companions only: the matching base columns (pl_orbper, st_teff,
    # st_rad, sy_vmag, sy_kmag, sy_gaiamag) are not listed here and are kept
    ['pl_orbpererr1', 'pl_orbpererr2', 'pl_orbperlim',
     'st_tefferr1', 'st_tefferr2', 'st_tefflim',
     'st_raderr1', 'st_raderr2', 'st_radlim',
     'sy_vmagerr1', 'sy_vmagerr2', 'sy_kmagerr1', 'sy_kmagerr2',
     'sy_gaiamagerr1', 'sy_gaiamagerr2'],
]

# Flatten the groups into the single list that DataFrame.drop expects
cols_to_drop = [name for group in drop_groups for name in group]
df = df.drop(columns=cols_to_drop)

In [223]:
# -------------------------------
# 2.1 Handle missing values
# -------------------------------
# Share of missing entries per column, expressed in percent
missing_pct = df.isna().mean() * 100

# Columns that are more than half empty are removed outright
cols_to_drop = missing_pct.loc[missing_pct > 50].index
df = df.drop(columns=cols_to_drop)

# Re-derive the numeric / categorical column sets from the reduced frame
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns

# Numeric gaps are filled with the column median
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)

# Categorical gaps are filled with the most frequent value; 'Unknown' is the
# fallback for a column that has no mode at all (i.e. it is entirely missing)
for col in cat_cols:
    modes = df[col].mode()
    fill_value = 'Unknown' if modes.empty else modes.iloc[0]
    df[col] = df[col].fillna(fill_value)

In [224]:
# Re-inspect the frame after the column drops and the imputation performed above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4004 entries, 0 to 4003
Data columns (total 31 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   default_flag     4004 non-null   int64  
 1   disposition      4004 non-null   object 
 2   sy_snum          4004 non-null   int64  
 3   sy_pnum          4004 non-null   int64  
 4   discoverymethod  4004 non-null   object 
 5   disc_year        4004 non-null   int64  
 6   disc_facility    4004 non-null   object 
 7   soltype          4004 non-null   object 
 8   pl_controv_flag  4004 non-null   int64  
 9   pl_orbper        4004 non-null   float64
 10  pl_rade          4004 non-null   float64
 11  pl_radeerr1      4004 non-null   float64
 12  pl_radeerr2      4004 non-null   float64
 13  pl_radelim       4004 non-null   float64
 14  ttv_flag         4004 non-null   int64  
 15  st_teff          4004 non-null   float64
 16  st_rad           4004 non-null   float64
 17  st_logg       

In [225]:
# -------------------------------
# 3. Scale numerical features
# -------------------------------
# NOTE(review): the scaler is fitted on every row of `df`, i.e. before any
# train/test split, so test-row statistics take part in the standardisation.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Fit once, then write the standardised values into a copy so `df` stays untouched
scaler = StandardScaler().fit(df[num_cols])
df_scaled = df.copy()
df_scaled[num_cols] = scaler.transform(df[num_cols])

In [226]:
# -------------------------------
# 4. Encode categorical features
# -------------------------------
df_encoded = df_scaled.copy()
cat_cols = df_encoded.select_dtypes(include=['object', 'category', 'bool']).columns

# One LabelEncoder per categorical column, kept in le_dict so the integer codes
# can be mapped back to the original labels later
le_dict = {col: LabelEncoder() for col in cat_cols}

for col, encoder in le_dict.items():
    # Any remaining NaN is given its own 'NaN_Label' category before encoding
    filled = df_encoded[col].fillna('NaN_Label')
    df_encoded[col] = encoder.fit_transform(filled)

In [227]:
# -------------------------------
# 5. Feature selection (correlation + importance)
# -------------------------------
from sklearn.ensemble import RandomForestClassifier

X = df_encoded.drop(columns=['disposition'])
y = df_encoded['disposition']

# Categorical columns were label-encoded in the previous step, so get_dummies
# only changes X if an object/category column is still present.
X = pd.get_dummies(X)
X_train, _, y_train, _ = train_test_split(X, y, random_state=42)

# Correlation: absolute correlation of every column with the encoded target.
# NOTE(review): this is computed on all rows of df_encoded, not only on the
# training split used for the random forest below.
corr_with_target = df_encoded.corr(numeric_only=True)['disposition'].drop('disposition').abs()
top_corr = corr_with_target.sort_values(ascending=False).head(7).index.tolist()

# Random Forest importance, fitted on the training split only
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
top_importance = importances.head(7).index.tolist()

# Combine both rankings without duplicates. dict.fromkeys keeps first-seen
# order, so the feature order is identical on every run; list(set(...)) ordered
# the names by their randomised string hashes, which changed the column order
# (and therefore the model input layout) between kernel restarts.
selected_features = list(dict.fromkeys(top_corr + top_importance))


In [228]:
# -------------------------------
# 6. Build preprocessed dataframe
# -------------------------------
# Feature matrix restricted to the columns picked by the selection step
X = df_encoded.drop(columns=['disposition'])
valuable_features_df = X.loc[:, selected_features]

# assign() returns a new frame, so the target column is appended without
# modifying valuable_features_df
preprocessed_df = valuable_features_df.assign(disposition=df_encoded['disposition'])


In [229]:
# Summarise the selected-feature frame (selected features plus the 'disposition' target)
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4004 entries, 0 to 4003
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pl_radeerr2   4004 non-null   float64
 1   sy_vmag       4004 non-null   float64
 2   sy_pnum       4004 non-null   float64
 3   dec           4004 non-null   float64
 4   soltype       4004 non-null   int32  
 5   sy_kmag       4004 non-null   float64
 6   sy_disterr1   4004 non-null   float64
 7   sy_disterr2   4004 non-null   float64
 8   default_flag  4004 non-null   float64
 9   disc_year     4004 non-null   float64
 10  sy_gaiamag    4004 non-null   float64
 11  disposition   4004 non-null   int32  
dtypes: float64(10), int32(2)
memory usage: 344.2 KB


In [230]:
# Class balance of the label-encoded target before any class is removed
preprocessed_df['disposition'].value_counts()

disposition
1    2315
0    1374
2     293
3      22
Name: count, dtype: int64

In [231]:
# Remove the rows whose encoded target is 3 and renumber the index from 0.
# The original label behind code 3 can be read from le_dict['disposition'].classes_.
keep_mask = preprocessed_df['disposition'].ne(3)
preprocessed_df = preprocessed_df.loc[keep_mask].reset_index(drop=True)

In [232]:
# Re-check the class counts after the rows with encoded class 3 were filtered out
preprocessed_df['disposition'].value_counts()

disposition
1    2315
0    1374
2     293
Name: count, dtype: int64

In [233]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix from the filtered frame
y = preprocessed_df['disposition']
X = preprocessed_df.drop(columns=['disposition'])

# 75/25 split, stratified on the target so each class keeps its proportion
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [234]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# compute_class_weight is fed NumPy labels; y_train may be a tensor or an array-like
y_train_np = y_train.cpu().numpy() if isinstance(y_train, torch.Tensor) else np.array(y_train)

# Distinct labels present in the training split
classes = np.unique(y_train_np)

# 'balanced' weights, then moved to the training device as float32
balanced_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_np)
class_weights = torch.tensor(balanced_weights, dtype=torch.float32).to(device)

print("Classes:", classes)
print("Class Weights:", class_weights)


# ---- Focal Loss with class weights ----
class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    The per-sample loss is ``alpha[t] * (1 - p_t) ** gamma * (-log p_t)``, where
    ``p_t`` is the softmax probability assigned to the target class ``t``.

    Args:
        gamma: Focusing exponent; ``0`` reduces the loss to (weighted) cross-entropy.
        alpha: Optional per-class weights (tensor or sequence, one entry per
            class). ``None`` disables class weighting.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample losses.
    """

    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction
        if alpha is not None:
            alpha = torch.as_tensor(alpha, dtype=torch.float32)
        # Registered as a buffer so that `.to(device)` on the module moves the
        # weights as well; `self.alpha` stays readable and may be None.
        self.register_buffer('alpha', alpha)

    def forward(self, inputs, targets):
        """Return the focal loss of logits `inputs` against class indices `targets`."""
        # p_t must come from the UNWEIGHTED cross-entropy. With class weights
        # folded in, exp(-ce) equals p_t ** alpha_t rather than p_t, which
        # distorts the (1 - p_t) ** gamma modulation for every weight != 1.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # probability of the true class

        if self.alpha is None:
            weighted_ce = ce_loss
        else:
            class_weight = self.alpha.to(device=inputs.device, dtype=inputs.dtype)
            # Equals alpha[t] * (-log p_t) for every sample
            weighted_ce = F.cross_entropy(
                inputs, targets, reduction='none', weight=class_weight
            )

        focal_loss = (1 - pt) ** self.gamma * weighted_ce

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss


# ---- Build the loss: focal loss (gamma = 2) weighted by the class weights ----
# NOTE: the training cell later rebinds `loss_fn` to nn.CrossEntropyLoss(), so
# this object is only used for training if that reassignment is removed.
loss_fn = FocalLoss(gamma=2.0, alpha=class_weights)


Classes: [0 1 2]
Class Weights: tensor([0.9663, 0.5733, 4.5242], device='cuda:0')


In [235]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.preprocessing import label_binarize


def _as_tensor(data, dtype):
    """Return `data` as a tensor of `dtype`.

    Accepts either a pandas object (converted through `.to_numpy()`) or an
    existing tensor, so re-running this cell after the split variables have
    already been converted no longer fails on a missing `.to_numpy()`.
    """
    if isinstance(data, torch.Tensor):
        return data.to(dtype=dtype)
    return torch.tensor(data.to_numpy(), dtype=dtype)


# Features as float32, labels as int64 (the dtype cross-entropy expects for class indices)
X_train = _as_tensor(X_train, torch.float32)
y_train = _as_tensor(y_train, torch.long)
X_test = _as_tensor(X_test, torch.float32)
y_test = _as_tensor(y_test, torch.long)

# Create datasets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Data loaders: only the training data is shuffled
batch_size = 32  # Adjust based on your data size and memory
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Inverse-frequency class weights, normalised to sum to 1.
# NOTE(review): this tensor stays on the CPU and `class_weights` is recomputed
# with sklearn's 'balanced' scheme in a later cell before the focal loss is
# rebuilt, so this value is not the one that ends up in a loss function.
class_counts = np.bincount(y_train.numpy())
class_weights = 1. / class_counts
class_weights = torch.tensor(class_weights / class_weights.sum(), dtype=torch.float32)  # Normalize

In [236]:
# (rows, features) of the training tensor; the training cell reads shape[1] as the model's input_size
X_train.shape

torch.Size([2986, 11])

In [237]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Linear -> LayerNorm -> SiLU -> Dropout with an additive skip connection.

    The skip path is the identity when `in_features == out_features`; otherwise
    it is a learned linear projection to `out_features` so both branches can be
    summed.
    """

    def __init__(self, in_features, out_features, dropout_rate=0.3):
        super().__init__()
        self.fc = nn.Linear(in_features, out_features)
        self.ln = nn.LayerNorm(out_features)
        self.dropout = nn.Dropout(dropout_rate)

        # Skip branch: project only when the width changes
        if in_features == out_features:
            self.proj = nn.Identity()
        else:
            self.proj = nn.Linear(in_features, out_features)

    def forward(self, x):
        shortcut = self.proj(x)

        hidden = self.fc(x)
        hidden = self.ln(hidden)
        hidden = F.silu(hidden)
        hidden = self.dropout(hidden)

        return hidden + shortcut


class ResidualMLP(nn.Module):
    """Residual MLP classifier that returns raw logits.

    Layout: Linear(input_size, 256) + SiLU, then four ResidualBlocks
    (256->256, 256->128, 128->128, 128->64), Dropout(0.2) and a final
    Linear(64, num_classes).
    """

    def __init__(self, input_size=16, num_classes=3):
        super().__init__()
        # Entry layer lifts the raw features to the first hidden width
        self.entry = nn.Linear(input_size, 256)

        # Residual stack; the width is halved at blocks 2 and 4
        self.res_block1 = ResidualBlock(256, 256)
        self.res_block2 = ResidualBlock(256, 128)
        self.res_block3 = ResidualBlock(128, 128)
        self.res_block4 = ResidualBlock(128, 64)

        self.dropout_final = nn.Dropout(0.2)
        self.fc_out = nn.Linear(64, num_classes)

    def forward(self, x):
        hidden = F.silu(self.entry(x))

        residual_stack = (
            self.res_block1,
            self.res_block2,
            self.res_block3,
            self.res_block4,
        )
        for block in residual_stack:
            hidden = block(hidden)

        # No softmax here: the output is the logits
        return self.fc_out(self.dropout_final(hidden))


In [238]:
# y_train was converted to a torch tensor in the tensor/DataLoader cell;
# np.unique returns its sorted distinct labels as a NumPy array.
classes = np.unique(y_train)  # unique class labels
print("Classes:", classes)

Classes: [0 1 2]


In [239]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight

# compute_class_weight is fed NumPy labels; y_train may be a tensor or an array-like
y_train_np = y_train.cpu().numpy() if isinstance(y_train, torch.Tensor) else np.array(y_train)

# Distinct labels present in the training split
classes = np.unique(y_train_np)

# 'balanced' weights, then moved to `device` (defined in an earlier cell) as float32
balanced_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_np)
class_weights = torch.tensor(balanced_weights, dtype=torch.float32).to(device)

print("Classes:", classes)
print("Class Weights:", class_weights)


# ---- Focal Loss with class weights ----
class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    The per-sample loss is ``alpha[t] * (1 - p_t) ** gamma * (-log p_t)``, where
    ``p_t`` is the softmax probability assigned to the target class ``t``.

    Args:
        gamma: Focusing exponent; ``0`` reduces the loss to (weighted) cross-entropy.
        alpha: Optional per-class weights (tensor or sequence, one entry per
            class). ``None`` disables class weighting.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample losses.
    """

    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction
        if alpha is not None:
            alpha = torch.as_tensor(alpha, dtype=torch.float32)
        # Registered as a buffer so that `.to(device)` on the module moves the
        # weights as well; `self.alpha` stays readable and may be None.
        self.register_buffer('alpha', alpha)

    def forward(self, inputs, targets):
        """Return the focal loss of logits `inputs` against class indices `targets`."""
        # p_t must come from the UNWEIGHTED cross-entropy. With class weights
        # folded in, exp(-ce) equals p_t ** alpha_t rather than p_t, which
        # distorts the (1 - p_t) ** gamma modulation for every weight != 1.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # probability of the true class

        if self.alpha is None:
            weighted_ce = ce_loss
        else:
            class_weight = self.alpha.to(device=inputs.device, dtype=inputs.dtype)
            # Equals alpha[t] * (-log p_t) for every sample
            weighted_ce = F.cross_entropy(
                inputs, targets, reduction='none', weight=class_weight
            )

        focal_loss = (1 - pt) ** self.gamma * weighted_ce

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss


# ---- Build the loss: focal loss (gamma = 2) weighted by the class weights ----
# NOTE: the training cell later rebinds `loss_fn` to nn.CrossEntropyLoss(), so
# this object is only used for training if that reassignment is removed.
loss_fn = FocalLoss(gamma=2.0, alpha=class_weights)


Classes: [0 1 2]
Class Weights: tensor([0.9663, 0.5733, 4.5242], device='cuda:0')


In [240]:
# Report whether CUDA is usable and, if so, which device torch currently selects
cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)

if cuda_ready:
    gpu_facts = (
        ("GPU Name:", torch.cuda.get_device_name(0)),
        ("Number of GPUs:", torch.cuda.device_count()),
        ("Current Device Index:", torch.cuda.current_device()),
    )
    for label, value in gpu_facts:
        print(label, value)

CUDA Available: True
GPU Name: NVIDIA GeForce RTX 4070 Laptop GPU
Number of GPUs: 1
Current Device Index: 0


In [241]:
# Display the feature frame built in the train/test split cell (preprocessed_df without 'disposition')
X

Unnamed: 0,pl_radeerr2,sy_vmag,sy_pnum,dec,soltype,sy_kmag,sy_disterr1,sy_disterr2,default_flag,disc_year,sy_gaiamag
0,0.152988,-1.229216,-0.091546,1.274274,1,-0.953914,-0.164254,0.188935,-0.906452,-0.761754,-1.142112
1,0.157520,-1.229216,-0.091546,1.274274,1,-0.953914,-0.164254,0.188935,-0.906452,-0.761754,-1.142112
2,0.157353,-1.229216,-0.091546,1.274274,1,-0.953914,-0.164254,0.188935,1.103202,-0.761754,-1.142112
3,0.159915,-0.764653,-0.815954,-0.498382,0,-1.059170,-0.170415,0.197710,1.103202,0.222931,-0.831989
4,0.156657,-0.764653,-0.815954,-0.498382,0,-1.059170,-0.170415,0.197710,-0.906452,0.222931,-0.831989
...,...,...,...,...,...,...,...,...,...,...,...
3977,-0.719784,-1.297472,-0.091546,0.348257,0,-1.405518,-0.164471,0.189295,-0.906452,-0.761754,-1.281380
3978,0.160793,-1.535574,-0.091546,-0.489871,1,-2.199202,-0.173274,0.201801,1.103202,0.222931,-1.702106
3979,0.160474,-1.535574,-0.091546,-0.489871,1,-2.199202,-0.173274,0.201801,-0.906452,0.222931,-1.702106
3980,0.160793,-1.535574,-0.091546,-0.489871,1,-2.199202,-0.173274,0.201801,-0.906452,0.222931,-1.702106


In [242]:
import torch
import torch.nn as nn
import numpy as np


def _run_epoch(model, loader, loss_fn, device, optimizer=None):
    """Run one full pass over `loader` and return ``(mean batch loss, accuracy %)``.

    When `optimizer` is given the model is put in train mode and updated after
    every batch; otherwise it is put in eval mode and gradients are disabled.
    The loss is averaged over batches (not samples).
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()
    loss_sum, correct, total = 0.0, 0, 0

    with torch.set_grad_enabled(training):
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(batch_X)
            loss = loss_fn(outputs, batch_y)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            predicted = torch.max(outputs, 1).indices
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    return loss_sum / len(loader), 100 * correct / total


# Seed torch's global RNG so weight initialisation, dropout and DataLoader
# shuffling start from the same state on every run (same seed value as the
# random_state used for the sklearn splits). CUDA kernels may still introduce
# small run-to-run differences.
torch.manual_seed(42)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model
input_size = X_train.shape[1]  # number of features
model = ResidualMLP(input_size=input_size, num_classes=3).to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.005, weight_decay=1e-5)

# Learning rate scheduler: multiply the LR by 0.8 after 6 epochs without a better accuracy
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.8, patience=6, min_lr=1e-6
)

# Early stopping setup
patience = 40
best_accuracy = 0.0
epochs_no_improve = 0
best_model_path = "K2_best_residual_mlp_model.pth"

# Loss function
# NOTE(review): this replaces the class-weighted FocalLoss built in earlier
# cells with an unweighted cross-entropy, so the class weights computed there
# have no effect on training. Confirm that this is intended.
loss_fn = nn.CrossEntropyLoss()

# Training parameters
epochs = 500

# NOTE(review): test_loader drives the LR schedule, the checkpoint selection and
# early stopping, so the reported best test accuracy is optimistically biased;
# an unbiased estimate needs a separate validation split.
for epoch in range(epochs):
    # --- Training ---
    avg_train_loss, train_accuracy = _run_epoch(model, train_loader, loss_fn, device, optimizer)

    # --- Evaluation ---
    avg_test_loss, test_accuracy = _run_epoch(model, test_loader, loss_fn, device)

    # --- Scheduler step (manual print) ---
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(test_accuracy)
    new_lr = optimizer.param_groups[0]['lr']
    if new_lr < old_lr:
        print(f"📉 Learning rate reduced from {old_lr:.6f} to {new_lr:.6f}")

    # --- Early stopping check ---
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), best_model_path)
        epochs_no_improve = 0
        print(f"✅ Saved new best model at epoch {epoch+1} | Test Acc: {best_accuracy:.2f}%")
    else:
        epochs_no_improve += 1
        print(f"⏳ No improvement for {epochs_no_improve}/{patience} epochs")

    if epochs_no_improve >= patience:
        print(f"🚨 Early stopping triggered at epoch {epoch+1}")
        print(f"🎯 Best Test Accuracy: {best_accuracy:.2f}%")
        break

    # --- Logging ---
    print(
        f"Epoch {epoch+1}/{epochs} | "
        f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.2f}% | "
        f"Test Loss: {avg_test_loss:.4f}, Test Acc: {test_accuracy:.2f}%"
    )

# Always restore the best checkpoint, not only when early stopping fired:
# previously a run that used all `epochs` left the last-epoch weights in
# `model`, so later evaluation did not match the reported best accuracy.
# A checkpoint exists for this run exactly when best_accuracy rose above 0.0.
if best_accuracy > 0.0:
    model.load_state_dict(torch.load(best_model_path, map_location=device))

print("\n🎯 Training finished.")
print(f"Best Test Accuracy: {best_accuracy:.2f}%")


✅ Saved new best model at epoch 1 | Test Acc: 91.77%
Epoch 1/500 | Train Loss: 0.3067, Train Acc: 88.71% | Test Loss: 0.2066, Test Acc: 91.77%
✅ Saved new best model at epoch 2 | Test Acc: 92.07%
Epoch 2/500 | Train Loss: 0.2209, Train Acc: 92.06% | Test Loss: 0.2067, Test Acc: 92.07%
⏳ No improvement for 1/40 epochs
Epoch 3/500 | Train Loss: 0.2097, Train Acc: 92.26% | Test Loss: 0.2130, Test Acc: 91.97%
✅ Saved new best model at epoch 4 | Test Acc: 93.37%
Epoch 4/500 | Train Loss: 0.1904, Train Acc: 92.67% | Test Loss: 0.1856, Test Acc: 93.37%
✅ Saved new best model at epoch 5 | Test Acc: 94.38%
Epoch 5/500 | Train Loss: 0.1754, Train Acc: 93.24% | Test Loss: 0.2061, Test Acc: 94.38%
⏳ No improvement for 1/40 epochs
Epoch 6/500 | Train Loss: 0.1770, Train Acc: 93.00% | Test Loss: 0.1853, Test Acc: 93.88%
⏳ No improvement for 2/40 epochs
Epoch 7/500 | Train Loss: 0.1692, Train Acc: 93.30% | Test Loss: 0.1631, Test Acc: 94.38%
✅ Saved new best model at epoch 8 | Test Acc: 95.28%
Epoch 

In [243]:
from sklearn.metrics import classification_report

# --- Collect test-set predictions and print the per-class report ---
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for features, labels in test_loader:
        logits = model(features.to(device))
        # The index of the largest logit is the predicted class
        batch_preds = logits.max(dim=1).indices

        all_preds.extend(batch_preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Precision / recall / F1 per class, formatted with four decimals
report = classification_report(all_labels, all_preds, digits=4)
print("\n📊 Classification Report:")
print(report)



📊 Classification Report:
              precision    recall  f1-score   support

           0     0.9114    0.9564    0.9333       344
           1     0.9914    1.0000    0.9957       579
           2     0.8039    0.5616    0.6613        73

    accuracy                         0.9528       996
   macro avg     0.9022    0.8393    0.8634       996
weighted avg     0.9500    0.9528    0.9497       996

