In [1]:
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.calibration import LabelEncoder

In [2]:
# -------------------------------
# 1. Load dataset
# -------------------------------
df = pd.read_csv('Data\k2pandc final.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4004 entries, 0 to 4003
Data columns (total 95 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   loc_rowid        4004 non-null   int64  
 1   pl_name          4004 non-null   object 
 2   hostname         4004 non-null   object 
 3   default_flag     4004 non-null   int64  
 4   disposition      4004 non-null   object 
 5   disp_refname     4004 non-null   object 
 6   sy_snum          4004 non-null   int64  
 7   sy_pnum          4004 non-null   int64  
 8   discoverymethod  4004 non-null   object 
 9   disc_year        4004 non-null   int64  
 10  disc_facility    4004 non-null   object 
 11  soltype          4004 non-null   object 
 12  pl_controv_flag  4004 non-null   int64  
 13  pl_refname       4004 non-null   object 
 14  pl_orbper        3960 non-null   float64
 15  pl_orbpererr1    3071 non-null   float64
 16  pl_orbpererr2    3071 non-null   float64
 17  pl_orbperlim  

  df = pd.read_csv('Data\k2pandc final.csv')


In [3]:
# -------------------------------
# 2. Handle missing values (improved)
# -------------------------------
missing_pct = df.isnull().mean() * 100

# Drop columns with >50% missing values
cols_to_drop = missing_pct[missing_pct > 50].index
df = df.drop(columns=cols_to_drop)

# Update num_cols and cat_cols after dropping columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns

# Fill numerical columns with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill categorical columns with mode
for col in cat_cols:
    mode = df[col].mode()
    df[col] = df[col].fillna(mode[0] if not mode.empty else 'Unknown')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4004 entries, 0 to 4003
Data columns (total 62 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   loc_rowid        4004 non-null   int64  
 1   pl_name          4004 non-null   object 
 2   hostname         4004 non-null   object 
 3   default_flag     4004 non-null   int64  
 4   disposition      4004 non-null   object 
 5   disp_refname     4004 non-null   object 
 6   sy_snum          4004 non-null   int64  
 7   sy_pnum          4004 non-null   int64  
 8   discoverymethod  4004 non-null   object 
 9   disc_year        4004 non-null   int64  
 10  disc_facility    4004 non-null   object 
 11  soltype          4004 non-null   object 
 12  pl_controv_flag  4004 non-null   int64  
 13  pl_refname       4004 non-null   object 
 14  pl_orbper        4004 non-null   float64
 15  pl_orbpererr1    4004 non-null   float64
 16  pl_orbpererr2    4004 non-null   float64
 17  pl_orbperlim  

In [5]:
# -------------------------------
# 3. Scale numerical features
# -------------------------------
df_scaled = df.copy()
num_cols = df_scaled.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
df_scaled[num_cols] = scaler.fit_transform(df_scaled[num_cols])

In [6]:
# -------------------------------
# 4. Encode categorical features
# -------------------------------

df_encoded = df_scaled.copy()
cat_cols = df_encoded.select_dtypes(include=['object', 'category', 'bool']).columns

le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col].fillna('NaN_Label'))
    le_dict[col] = le

In [8]:
# -------------------------------
# 5. Feature selection (correlation + importance)
# -------------------------------
from sklearn.ensemble import RandomForestClassifier

X = df_encoded.drop(columns=['disposition'])
y = df_encoded['disposition']

X = pd.get_dummies(X)  # one-hot if needed
X_train, _, y_train, _ = train_test_split(X, y, random_state=42)

# Correlation
corr_with_target = df_encoded.corr(numeric_only=True)['disposition'].drop('disposition').abs()
top_corr = corr_with_target.sort_values(ascending=False).head(20).index.tolist()

# Random Forest importance
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
top_importance = importances.head(20).index.tolist()

# Combine
selected_features = list(set(top_corr + top_importance))


In [9]:
# -------------------------------
# 6. Build preprocessed dataframe
# -------------------------------
X = df_encoded.drop(columns=['disposition'])
valuable_features_df = X[selected_features]

# Add target
preprocessed_df = valuable_features_df.copy()
preprocessed_df['disposition'] = df_encoded['disposition']


In [16]:
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4004 entries, 0 to 4003
Data columns (total 32 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rowupdate       4004 non-null   int64  
 1   st_tefferr1     4004 non-null   float64
 2   sy_disterr2     4004 non-null   float64
 3   sy_vmag         4004 non-null   float64
 4   hostname        4004 non-null   int64  
 5   pl_rade         4004 non-null   float64
 6   sy_gaiamag      4004 non-null   float64
 7   soltype         4004 non-null   int64  
 8   default_flag    4004 non-null   float64
 9   loc_rowid       4004 non-null   float64
 10  disp_refname    4004 non-null   int64  
 11  disc_year       4004 non-null   float64
 12  pl_radeerr2     4004 non-null   float64
 13  pl_radjerr2     4004 non-null   float64
 14  sy_disterr1     4004 non-null   float64
 15  sy_gaiamagerr1  4004 non-null   float64
 16  st_refname      4004 non-null   int64  
 17  pl_radeerr1     4004 non-null   f

In [20]:
preprocessed_df['disposition'].value_counts()

disposition
1    2315
0    1374
2     293
3      22
Name: count, dtype: int64

In [28]:
preprocessed_df = preprocessed_df[preprocessed_df['disposition'] != 3].reset_index(drop=True)

In [29]:
preprocessed_df['disposition'].value_counts()

disposition
1    2315
0    1374
2     293
Name: count, dtype: int64

In [None]:
# # -------------------------------
# # 7. Downsample classes
# # -------------------------------
# from sklearn.utils import resample

# df_downsample = df.copy()
# class_counts = df_downsample['disposition'].value_counts()
# min_count = class_counts.min()

# dfs = []
# for cls in class_counts.index:
#     cls_df = df_downsample[df_downsample['disposition'] == cls]
#     cls_downsampled = resample(cls_df, replace=False, n_samples=min_count, random_state=42)
#     dfs.append(cls_downsampled)

# df_downsampled = pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)


In [31]:
from sklearn.model_selection import train_test_split

X = preprocessed_df
y = preprocessed_df['disposition']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

import torch
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Ensure y_train is numpy
if isinstance(y_train, torch.Tensor):
    y_train_np = y_train.cpu().numpy()
else:
    y_train_np = np.array(y_train)

# Get unique classes
classes = np.unique(y_train_np)

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train_np
)

# Convert to tensor for PyTorch
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

print("Classes:", classes)
print("Class Weights:", class_weights)


# ---- Focal Loss with class weights ----
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha  # can be class weights tensor
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)
        pt = torch.exp(-ce_loss)  # probability of true class
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss


# ---- Initialize with class weights ----
loss_fn = FocalLoss(alpha=class_weights, gamma=2.0)


NameError: name 'device' is not defined