In [10]:
# nasa_ml_project.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load Dataset
print("📥 Reading data...")
# The path points at a zipped CSV; pandas infers the compression from the
# ".zip" suffix by default, so no explicit unzip step is needed.
df = pd.read_csv("nearest-earth-objects(1910-2024).csv.zip")  # Replace with your actual file path
# DataFrame.info() writes its summary straight to stdout and returns None.
# Wrapping it in print() therefore emitted a stray "None" line, so call it
# directly and print the blank separator on its own.
df.info()
print()

# Step 2: Check & Handle Missing Values
# Report the per-column null counts first so the effect of dropna is visible.
print("📉 Missing values before cleaning:\n", df.isnull().sum(), "\n")
# Drop every row that has at least one missing value.
df.dropna(inplace=True)
print(f"✅ After dropping nulls: {df.shape}")

# Step 3: Handle Outliers using IQR
def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of *data* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``data[column]``. Both fences
    are inclusive. Rows whose value is NaN never satisfy the range check and
    are therefore dropped as well.

    Args:
        data: DataFrame to filter. It is not modified.
        column: Name of the numeric column the fences are computed on.
        factor: Multiplier applied to the IQR to position the fences.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        A new DataFrame containing only the rows inside the fences.

    Raises:
        ValueError: If *factor* is negative.
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    values = data[column]
    # A list argument yields both quartiles in one call; unpacking a Series
    # iterates its values in the order the quantiles were requested.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    # Series.between is inclusive on both ends by default, matching the
    # ``>= lower`` and ``<= upper`` comparison.
    kept = data[values.between(lower, upper)]
    removed = len(data) - len(kept)
    print(f"✅ Removed outliers from {column}: {removed} rows dropped")
    return kept

# Numeric columns to filter. The filters run one after another, so each
# column's quartiles are computed on the rows that survived the earlier ones.
cols_to_clean = [
    'absolute_magnitude',
    'estimated_diameter_min',
    'estimated_diameter_max',
    'relative_velocity',
    'miss_distance',
]

for col in cols_to_clean:
    df = remove_outliers_iqr(df, col)

# Step 4: Feature Engineering
# Remove the identifier/label-like columns, then separate the target
# ('is_hazardous') from the remaining feature columns.
df = df.drop(columns=['neo_id', 'name', 'orbiting_body'])
y = df['is_hazardous']
X = df.drop(columns='is_hazardous')

# Step 5: Train/Test Split
# 80/20 split, stratified on the target so both parts keep the same class
# ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 6: SMOTE Balancing
# Only the training split is resampled; the test split keeps its original
# class distribution so the metrics below are measured on untouched data.
print("\n⚖️ Applying SMOTE for class balancing...")
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)
print("Before SMOTE:\n", y_train.value_counts())
print("After SMOTE:\n", y_train_bal.value_counts())


def report_model(label, model, X_eval, y_eval):
    """Print the evaluation summary of a fitted binary classifier.

    Emits the classification report, the confusion matrix and the ROC-AUC
    score (computed from the second column of ``predict_proba``) for
    *model* on the given evaluation data.

    Args:
        label: Human-readable model name used in the report heading.
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_eval: Feature matrix to evaluate on.
        y_eval: True labels corresponding to *X_eval*.

    Returns:
        The hard class predictions of *model* for *X_eval*.
    """
    y_pred = model.predict(X_eval)

    print(f"\n📊 {label} - Classification Report:")
    print(classification_report(y_eval, y_pred))

    print("Confusion Matrix:\n", confusion_matrix(y_eval, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_eval, model.predict_proba(X_eval)[:, 1]))
    return y_pred


# Step 7: Train Logistic Regression
# NOTE(review): the features are passed in unscaled and warnings are silenced
# globally at the top of the script, so a ConvergenceWarning from the solver
# would go unnoticed here - confirm the model actually converges.
print("\n🧠 Training Logistic Regression...")
lr = LogisticRegression()
lr.fit(X_train_bal, y_train_bal)
y_pred_lr = report_model("Logistic Regression", lr, X_test, y_test)

# Step 8: Train Random Forest
print("\n🧠 Training Random Forest...")
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_bal, y_train_bal)
y_pred_rf = report_model("Random Forest", rf, X_test, y_test)


📥 Reading data...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338199 entries, 0 to 338198
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   neo_id                  338199 non-null  int64  
 1   name                    338199 non-null  object 
 2   absolute_magnitude      338171 non-null  float64
 3   estimated_diameter_min  338171 non-null  float64
 4   estimated_diameter_max  338171 non-null  float64
 5   orbiting_body           338199 non-null  object 
 6   relative_velocity       338199 non-null  float64
 7   miss_distance           338199 non-null  float64
 8   is_hazardous            338199 non-null  bool   
dtypes: bool(1), float64(5), int64(1), object(2)
memory usage: 21.0+ MB
None 

📉 Missing values before cleaning:
 neo_id                     0
name                       0
absolute_magnitude        28
estimated_diameter_min    28
estimated_diameter_max    28
orbiting_body      