In [1]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.1.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from pycaret.classification import *

# Load the dataset
df = pd.read_csv('/content/nearest-earth-objects(1910-2024).csv')

# Handle missing values: drop every row that contains at least one NaN
df = df.dropna()

# Feature Engineering: average of the min and max estimated diameters
df['estimated_diameter_avg'] = (df['estimated_diameter_min'] + df['estimated_diameter_max']) / 2

# Select features and target variable
features = ['absolute_magnitude', 'estimated_diameter_avg', 'relative_velocity', 'miss_distance']
X = df[features]
y = df['is_hazardous']

# Combine features and target into a single DataFrame; setup() below receives
# one frame plus the name of the target column
data = X.copy()
data['is_hazardous'] = y

# Initialize the PyCaret environment
clf = setup(data, target='is_hazardous', session_id=42,
            normalize=True,  # Normalize features
            remove_multicollinearity=True,  # Remove multicollinearity
            ignore_features=None)  # Do not ignore any features

# Compare models using PyCaret
best_model_pycaret = compare_models()

# Evaluate the best PyCaret model
evaluate_model(best_model_pycaret)

# Plot ROC curve for the best PyCaret model.
# This must happen BEFORE finalize_model(): finalizing refits the estimator on
# the full dataset, hold-out rows included, so a ROC curve drawn from the
# finalized model would be scored on data it has already seen and would
# overstate the AUC.
plot_model(best_model_pycaret, plot='auc')

# Finalize (refit on all available data) and save the PyCaret model
final_model_pycaret = finalize_model(best_model_pycaret)
save_model(final_model_pycaret, 'best_neo_model_pycaret')

# --- Traditional scikit-learn Models ---
# Train-test split (80/20). stratify=y keeps the proportion of hazardous vs.
# non-hazardous objects the same in the training and test partitions, so the
# test metrics are not skewed by an unlucky draw of the positive class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize the features. The scaler is fitted on the training partition only
# and then applied to the test partition, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the models; a fixed random_state makes every run reproducible
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Fit one classifier and score it on the held-out data
def evaluate_model_sklearn(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The model is fitted in place, so the caller's object is trained once
    this function returns.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A 3-tuple ``(report, roc_auc, confusion)``: the classification report
        as a dict, the ROC-AUC computed from the second ``predict_proba``
        column, and the confusion matrix.
    """
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC-AUC
    positive_scores = model.predict_proba(X_test)[:, 1]

    return (
        classification_report(y_test, predicted_labels, output_dict=True),
        roc_auc_score(y_test, positive_scores),
        confusion_matrix(y_test, predicted_labels),
    )

# Evaluate all scikit-learn models.
# evaluate_model_sklearn() fits each estimator in place, so after this loop
# every object in `models` is already trained on X_train_scaled / y_train.
results = {}
for model_name, model in models.items():
    report, roc_auc, confusion = evaluate_model_sklearn(model, X_train_scaled, X_test_scaled, y_train, y_test)
    results[model_name] = {'report': report, 'roc_auc': roc_auc, 'confusion': confusion}

# Print the results for scikit-learn models (best = highest ROC-AUC)
best_model_name = max(results, key=lambda x: results[x]['roc_auc'])
print(f"Best Scikit-learn Model: {best_model_name}")
print(f"ROC-AUC Score: {results[best_model_name]['roc_auc']:.2f}")

# Display all model results
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")
    print(f"ROC-AUC Score: {metrics['roc_auc']:.2f}")
    print("Classification Report:")
    print(pd.DataFrame(metrics['report']).transpose())
    print("Confusion Matrix:")
    print(metrics['confusion'])

# Plot ROC curves for all scikit-learn models.
# The estimators were fitted in the evaluation loop above, so they are reused
# here as-is; refitting them on the same data with the same random_state would
# only repeat the training cost without changing the curves.
plt.figure(figsize=(10, 8))
for model_name, model in models.items():
    y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.2f})')

# Diagonal reference line: the ROC of a random classifier
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc="lower right")
plt.show()

print(f"\nBest PyCaret Model: {best_model_pycaret}")



Unnamed: 0,Description,Value
0,Session id,42
1,Target,is_hazardous
2,Target type,Binary
3,Original data shape,"(338171, 5)"
4,Transformed data shape,"(338171, 5)"
5,Transformed train set shape,"(236719, 5)"
6,Transformed test set shape,"(101452, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9122,0.9431,0.5377,0.7042,0.6098,0.5613,0.5678,52.161
dt,Decision Tree Classifier,0.8844,0.7438,0.5549,0.5465,0.5507,0.4843,0.4844,1.483
ada,Ada Boost Classifier,0.8839,0.8868,0.1557,0.7035,0.2549,0.2188,0.2942,9.878
knn,K Neighbors Classifier,0.8786,0.8481,0.3809,0.534,0.4446,0.3786,0.3854,2.462
svm,SVM - Linear Kernel,0.8724,0.7965,0.0,0.0,0.0,0.0,0.0,0.432
ridge,Ridge Classifier,0.8723,0.8344,0.0,0.05,0.0001,-0.0001,-0.0014,0.266
lr,Logistic Regression,0.872,0.8405,0.0741,0.4908,0.1288,0.0986,0.1525,0.86
qda,Quadratic Discriminant Analysis,0.8508,0.8513,0.2991,0.3901,0.3385,0.2561,0.259,0.325
nb,Naive Bayes,0.8377,0.8291,0.2017,0.2991,0.2409,0.1539,0.1577,0.203


Processing:   0%|          | 0/65 [00:00<?, ?it/s]