In [None]:
import pandas as pd

# Location of the comma-separated table to load.
csv_path = '/content/cumulative_2025.10.03_08.34.41.csv'

# Read the whole table into memory; the separator is passed explicitly.
df = pd.read_csv(csv_path, sep=',')

# Plain-text preview of the first rows to confirm the file parsed as expected.
print(df.head())

      kepid kepoi_name   kepler_name koi_disposition koi_pdisposition  \
0  10797460  K00752.01  Kepler-227 b       CONFIRMED        CANDIDATE   
1  10797460  K00752.02  Kepler-227 c       CONFIRMED        CANDIDATE   
2  10811496  K00753.01           NaN       CANDIDATE        CANDIDATE   
3  10848459  K00754.01           NaN  FALSE POSITIVE   FALSE POSITIVE   
4  10854555  K00755.01  Kepler-664 b       CONFIRMED        CANDIDATE   

   koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_fpflag_ec  ...  \
0      1.000              0              0              0              0  ...   
1      0.969              0              0              0              0  ...   
2      0.000              0              0              0              0  ...   
3      0.000              0              1              0              0  ...   
4      1.000              0              0              0              0  ...   

   koi_steff_err2  koi_slogg  koi_slogg_err1  koi_slogg_err2  koi_srad  \


In [None]:
# Both disposition columns are labels: they become the target frame and are
# removed from the feature frame so they cannot be used as inputs.
target_columns = ['koi_disposition', 'koi_pdisposition']

y = df[target_columns]
X = df.drop(columns=target_columns)

# Row counts of X and y must match; only the column counts differ.
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (9564, 47)
Shape of y: (9564, 2)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns removed from both partitions before any preprocessing.
columns_to_drop = ['kepler_name', 'koi_tce_delivname', 'koi_teq_err1', 'koi_teq_err2']
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)

# Report the per-column null counts that remain in each partition.
for title, frame in (
    ("Missing values in X_train after dropping columns:", X_train),
    ("\nMissing values in X_test after dropping columns:", X_test),
):
    print(title)
    print(frame.isnull().sum())

In [None]:
# Per-column count of null entries for each partition, train first, then test.
for title, frame in (
    ("Missing values in X_train:", X_train),
    ("\nMissing values in X_test:", X_test),
):
    print(title)
    print(frame.isnull().sum())

Missing values in X_train:
kepid                0
kepoi_name           0
koi_score            0
koi_fpflag_nt        0
koi_fpflag_ss        0
koi_fpflag_co        0
koi_fpflag_ec        0
koi_period           0
koi_period_err1      0
koi_period_err2      0
koi_time0bk          0
koi_time0bk_err1     0
koi_time0bk_err2     0
koi_impact           0
koi_impact_err1      0
koi_impact_err2      0
koi_duration         0
koi_duration_err1    0
koi_duration_err2    0
koi_depth            0
koi_depth_err1       0
koi_depth_err2       0
koi_prad             0
koi_prad_err1        0
koi_prad_err2        0
koi_teq              0
koi_insol            0
koi_insol_err1       0
koi_insol_err2       0
koi_model_snr        0
koi_tce_plnt_num     0
koi_steff            0
koi_steff_err1       0
koi_steff_err2       0
koi_slogg            0
koi_slogg_err1       0
koi_slogg_err2       0
koi_srad             0
koi_srad_err1        0
koi_srad_err2        0
ra                   0
dec                  0
koi_kep

In [None]:
# Partition the training columns by dtype: object/category columns are treated
# as categorical, 64-bit integer and float columns as numerical.
categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

categorical_features = list(categorical_columns)
numerical_features = list(numerical_columns)

print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

Categorical features: ['kepoi_name']
Numerical features: ['kepid', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag']


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns that label a row rather than describe it. `kepoi_name` is different
# for every training row, so one-hot encoding it creates one column per row
# (a dense square matrix of roughly 0.5 GB at float64 for ~7.6k rows) and,
# with handle_unknown='ignore', every test row whose name was not seen in
# training is encoded as all zeros. It is therefore excluded from the features.
identifier_columns = ['kepoi_name']

# Categorical features are the object/category columns that are not identifiers.
categorical_features = [
    column
    for column in X_train.select_dtypes(include=['object', 'category']).columns
    if column not in identifier_columns
]

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

if categorical_features:
    # Fit on the training partition only, then apply the same mapping to test.
    X_train_encoded = encoder.fit_transform(X_train[categorical_features])
    X_test_encoded = encoder.transform(X_test[categorical_features])

    encoded_columns = encoder.get_feature_names_out(categorical_features)
    X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X_train.index)
    X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns, index=X_test.index)
else:
    # Nothing to encode: the encoder cannot be fitted on zero columns, so build
    # zero-column frames that keep the row index. Downstream column-wise
    # concatenation then works unchanged and simply adds no columns.
    X_train_encoded_df = pd.DataFrame(index=X_train.index)
    X_test_encoded_df = pd.DataFrame(index=X_test.index)
    X_train_encoded = X_train_encoded_df.to_numpy()
    X_test_encoded = X_test_encoded_df.to_numpy()

print("Shape of X_train_encoded_df:", X_train_encoded_df.shape)
print("Shape of X_test_encoded_df:", X_test_encoded_df.shape)

Shape of X_train_encoded_df: (7651, 7651)
Shape of X_test_encoded_df: (1913, 7651)


In [None]:
from sklearn.preprocessing import StandardScaler

# Numerical features are the 64-bit integer and float columns of the training frame.
# NOTE(review): `kepid` is selected here and looks like a catalogue identifier —
# confirm whether it is meant to be a model input.
numerical_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

# Learn the scaling statistics from the training partition only, then apply the
# same transformation to both partitions so no test information reaches the fit.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])

X_train_scaled = scaler.transform(X_train[numerical_features])
X_test_scaled = scaler.transform(X_test[numerical_features])

# Wrap the arrays back into frames, restoring column names and the original row index.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, columns=numerical_features, index=X_train.index
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled, columns=numerical_features, index=X_test.index
)

print("Shape of X_train_scaled_df:", X_train_scaled_df.shape)
print("Shape of X_test_scaled_df:", X_test_scaled_df.shape)

Shape of X_train_scaled_df: (7651, 42)
Shape of X_test_scaled_df: (1913, 42)


In [None]:
# Place the scaled numerical columns and the encoded categorical columns side
# by side; rows are aligned on the index shared by both frames.
train_parts = [X_train_scaled_df, X_train_encoded_df]
test_parts = [X_test_scaled_df, X_test_encoded_df]

X_train_processed = pd.concat(train_parts, axis='columns')
X_test_processed = pd.concat(test_parts, axis='columns')

print("Shape of X_train_processed:", X_train_processed.shape)
print("Shape of X_test_processed:", X_test_processed.shape)

Shape of X_train_processed: (7651, 7693)
Shape of X_test_processed: (1913, 7693)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so repeated runs build the same trees.
model = RandomForestClassifier(random_state=42)

# `y_train` holds two label columns (koi_disposition and koi_pdisposition).
# Fitting on the whole frame would train a multi-output forest whose predict()
# returns two labels per row, which does not match the single-label metrics
# computed against koi_disposition later in the notebook. Train on that one
# column so the model is consistent wherever it is used.
model.fit(X_train_processed, y_train['koi_disposition'])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate against a single label column.
target_column = 'koi_disposition'
y_train_single = y_train[target_column]
y_test_single = y_test[target_column]

# (Re)fit on the single-column target, then predict the held-out rows.
model.fit(X_train_processed, y_train_single)
y_pred = model.predict(X_test_processed)

# Precision, recall and F1 are averaged across classes weighted by class support.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test_single, y_pred)
precision = precision_score(y_test_single, y_pred, **weighted)
recall = recall_score(y_test_single, y_pred, **weighted)
f1 = f1_score(y_test_single, y_pred, **weighted)

# One line per metric, four decimal places.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.8834
Precision: 0.8795
Recall: 0.8834
F1-score: 0.8784


In [None]:
# Predict labels for the held-out rows and show a short sample of them.
predictions = model.predict(X_test_processed)

preview_count = 10
print("First few predictions:", predictions[:preview_count], sep="\n")

First few predictions:
['FALSE POSITIVE' 'CANDIDATE' 'CONFIRMED' 'FALSE POSITIVE'
 'FALSE POSITIVE' 'CONFIRMED' 'CONFIRMED' 'FALSE POSITIVE'
 'FALSE POSITIVE' 'FALSE POSITIVE']


In [None]:
import joblib

# Define the filenames for the saved artifacts
model_filename = 'exoplanet_detector.joblib'
preprocessing_filename = 'exoplanet_preprocessing.joblib'

# Save the trained model to the file
joblib.dump(model, model_filename)

# The model was trained on scaled / encoded features, so it cannot be applied
# to raw rows without the same fitted transformers and column lists. Persist
# them next to the model; the model file itself keeps its original content.
# NOTE: joblib files are pickle-based — only load artifacts from trusted sources.
joblib.dump(
    {
        'scaler': scaler,
        'numerical_features': numerical_features,
        'encoder': encoder,
        'categorical_features': categorical_features,
    },
    preprocessing_filename,
)

print(f"Model saved successfully to {model_filename}")
print(f"Preprocessing saved successfully to {preprocessing_filename}")

Model saved successfully to exoplanet_detector.joblib
