In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Banner for the whole analysis.
print("🔍 COMPREHENSIVE PREPROCESSING ANALYSIS", "=" * 80, sep="\n")

# Read the raw dataset; the path is resolved against the working directory.
df = pd.read_csv("nasa.csv")
print("Original dataset shape: " + str(df.shape))
print("Columns: " + str(list(df.columns)))

# First look at the table: size, null counts and dtype of every column.
print("\n📊 INITIAL DATA EXPLORATION")
print("-" * 50)
print("Dataset shape: {}".format(df.shape))
print("Missing values per column:")
print(df.isna().sum())
print("\nData types:")
print(df.dtypes)

# Check target distribution
print("\n🎯 TARGET VARIABLE ANALYSIS")
print("-"*50)
print("Hazardous distribution:")
print(df['Hazardous'].value_counts())

# Count each class with a boolean mask instead of indexing value_counts()
# with the integers 0 and 1. The counts Series is labelled False/True, and
# whether an integer key is treated as a label or as a position differs
# between pandas versions, so the old lookup could silently swap meaning.
# assumes 'Hazardous' is boolean (or 0/1) with no missing values — TODO confirm
is_hazardous = df['Hazardous'].astype(bool)
n_hazardous = int(is_hazardous.sum())
n_not_hazardous = int((~is_hazardous).sum())

if n_hazardous > 0:
    print(f"Class imbalance ratio: {n_not_hazardous / n_hazardous:.2f}:1")
else:
    # Avoid a ZeroDivisionError when the positive class is absent.
    print("Class imbalance ratio: undefined (no hazardous samples)")

# Recap of the weaknesses found in the earlier preprocessing attempt.
print("\n❌ PROBLEMS WITH ORIGINAL PREPROCESSING:")
print("-" * 50)

# Each entry is (numbered heading, indented bullet lines); all printed verbatim.
original_pipeline_problems = [
    ("1. FEATURE SELECTION ISSUES:", [
        "   - Removed ALL diameter features except KM versions",
        "   - May have removed important velocity/distance features too aggressively",
        "   - No systematic feature selection approach",
    ]),
    ("2. CORRELATION-BASED REMOVAL:", [
        "   - Only keeping features with >0.1 correlation with target",
        "   - This is arbitrary and may remove important non-linear relationships",
        "   - Diameter features are nearly perfectly correlated (expected!)",
    ]),
    ("3. DATA LEAKAGE POTENTIAL:", [
        "   - Need to verify no future information in features",
        "   - Some orbital parameters might be calculated post-classification",
    ]),
    ("4. SCALING ISSUES:", [
        "   - StandardScaler applied to ALL features without considering distributions",
        "   - Should check for outliers first",
    ]),
    ("5. NO PROPER VALIDATION:", [
        "   - Cross-validation done on full dataset (data leakage)",
        "   - Should be done only on training set",
    ]),
]
for position, (heading, bullets) in enumerate(original_pipeline_problems):
    # A blank line separates every heading from the section before it.
    print(heading if position == 0 else "\n" + heading)
    for bullet in bullets:
        print(bullet)

# Let's do a better preprocessing approach
print("\n🛠️ IMPROVED PREPROCESSING PIPELINE")
print("=" * 80)

# Step 1: Intelligent feature removal
def improved_preprocessing(df, verbose=True):
    """Return a cleaned copy of ``df`` with identifier and redundant columns removed.

    The input frame is never modified. Steps, in order:

    1. Drop identifier/date columns and the ``Orbiting Body`` / ``Equinox``
       categorical columns (only those actually present).
    2. Cast the ``Hazardous`` target to ``int``.
    3. Of the ``Est Dia`` columns keep only those containing ``KM(max)``.
    4. Of the velocity columns keep only ``Relative Velocity km per sec``.
    5. Of the ``Miss Dist`` columns keep only ``Miss Dist.(Astronomical)``.

    In steps 3-5 a group is left untouched when the column to keep is absent,
    so a whole measurement is never discarded by accident.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with string column names and a ``Hazardous`` column that can
        be cast to ``int``.
    verbose : bool, default True
        When true, print one progress line per step (the historical
        behaviour). Pass ``False`` to run silently.

    Returns
    -------
    pandas.DataFrame
        The reduced copy, with the surviving columns in their original order.

    Raises
    ------
    KeyError
        If ``df`` has no ``Hazardous`` column.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    # Create a copy to work with
    df_clean = df.copy()

    # Remove obviously irrelevant features (IDs, names, dates)
    id_cols = ['Neo Reference ID', 'Name', 'Orbit ID', 'Close Approach Date',
               'Epoch Date Close Approach', 'Orbit Determination Date']
    df_clean = df_clean.drop(columns=[col for col in id_cols if col in df_clean.columns])

    # Remove categorical features that are not informative
    categorical_cols = ['Orbiting Body', 'Equinox']
    df_clean = df_clean.drop(columns=[col for col in categorical_cols if col in df_clean.columns])

    # Convert boolean target to numeric
    df_clean['Hazardous'] = df_clean['Hazardous'].astype(int)

    log(f"After removing IDs and categorical: {df_clean.shape}")

    # Step 2: collapse the diameter columns to the KM(max) estimate.
    diameter_cols = [col for col in df_clean.columns if 'Est Dia' in col]
    log(f"Diameter columns found: {diameter_cols}")

    cols_to_keep = [col for col in diameter_cols if 'KM(max)' in col]
    # Only prune when there is something to keep; otherwise every diameter
    # column would be dropped and the size information lost entirely.
    if cols_to_keep:
        cols_to_remove = [col for col in diameter_cols if col not in cols_to_keep]
        df_clean = df_clean.drop(columns=cols_to_remove)
        log(f"Kept diameter column: {cols_to_keep}")
        log(f"Removed redundant diameter columns: {len(cols_to_remove)}")

    # Step 3: Handle velocity features (keep the most relevant).
    # 'per' must be a whole word ("Miles per hour"): a plain substring test
    # would also match unrelated names such as "Temperature".
    velocity_cols = [col for col in df_clean.columns
                     if 'Velocity' in col or 'per' in col.split()]
    log(f"Velocity columns found: {velocity_cols}")

    # Keep km/s, remove km/h and mph (redundant)
    if 'Relative Velocity km per sec' in df_clean.columns:
        velocity_to_remove = [col for col in velocity_cols if col != 'Relative Velocity km per sec']
        df_clean = df_clean.drop(columns=velocity_to_remove)
        log("Kept: Relative Velocity km per sec")
        log(f"Removed redundant velocity columns: {len(velocity_to_remove)}")

    # Step 4: Handle distance features (keep the most relevant)
    distance_cols = [col for col in df_clean.columns if 'Miss Dist' in col]
    log(f"Distance columns found: {distance_cols}")

    # Keep the astronomical-unit column, drop the other unit variants
    if 'Miss Dist.(Astronomical)' in df_clean.columns:
        distance_to_remove = [col for col in distance_cols if col != 'Miss Dist.(Astronomical)']
        df_clean = df_clean.drop(columns=distance_to_remove)
        log("Kept: Miss Dist.(Astronomical)")
        log(f"Removed redundant distance columns: {len(distance_to_remove)}")

    return df_clean

# Run the cleaning step on the raw frame and report what survived.
df_processed = improved_preprocessing(df)
print("\nFinal processed shape: {}".format(df_processed.shape))
print("Remaining columns: {}".format(list(df_processed.columns)))

# The target is the 'Hazardous' column; every other remaining column is a feature.
y = df_processed['Hazardous']
X = df_processed.drop(columns=['Hazardous'])

print("\n📈 FEATURE ANALYSIS")
print("-" * 50)

# Count, per feature, the values lying more than 3 IQRs outside the quartiles.
print("Checking for extreme outliers...")
Q1, Q3 = X.quantile(0.25), X.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 3 * IQR
upper_fence = Q3 + 3 * IQR
is_extreme = (X < lower_fence) | (X > upper_fence)
outlier_counts = is_extreme.sum()

# Only features that actually have extreme values are listed.
print("Extreme outliers per feature:")
print(outlier_counts[outlier_counts > 0])

# Feature correlation analysis: absolute Pearson correlation of every column
# with the target, strongest first. `correlations` still contains the target's
# own entry so its contents are unchanged for any later use.
print("\nFeature correlations with target:")
correlations = pd.concat([X, y], axis=1).corr()['Hazardous'].abs().sort_values(ascending=False)
# Remove the self-correlation by label rather than by slicing off the first
# row: a feature tied at 1.0 could otherwise sort ahead of the target and be
# hidden in its place.
print(correlations.drop(labels='Hazardous'))

# Look for pairs of features whose absolute correlation exceeds 0.9.
print("\nMulticollinearity check:")
feature_corr = X.corr().abs()

# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (self-correlation) is ignored.
strict_upper_mask = np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
upper_triangle = feature_corr.where(strict_upper_mask)

# Tuples of (row feature, column feature, correlation), walked column by column.
high_corr_pairs = [
    (row_name, col_name, upper_triangle.loc[row_name, col_name])
    for col_name in upper_triangle.columns
    for row_name in upper_triangle.index[upper_triangle[col_name] > 0.9]
]

if not high_corr_pairs:
    print("No concerning multicollinearity found")
else:
    print("High correlation pairs (>0.9):")
    for first, second, strength in high_corr_pairs:
        print(f"  {first} <-> {second}: {strength:.3f}")

print("\n🎯 IMPROVED FEATURE SELECTION")
print("-" * 50)

# Hold out 20% of the rows before anything is fitted, stratified on the target
# so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training set: {}".format(X_train.shape))
print("Test set: {}".format(X_test.shape))

# The scaler learns its statistics from the training rows only; the test rows
# are merely transformed with them.
scaler = RobustScaler()
scaler.fit(X_train)

# Wrap the scaled arrays back into DataFrames with the original labels.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

# Feature selection using multiple methods
print("\nFeature selection using statistical tests...")

# Tunables: how many leaders each method contributes, the smallest acceptable
# selection, and how many F-score leaders to fall back to when it is too small.
TOP_K_PER_METHOD = 6
MIN_SELECTED_FEATURES = 5
FALLBACK_TOP_K = 7

# Method 1: Univariate statistical tests (f_classif)
selector_f = SelectKBest(score_func=f_classif, k='all')
selector_f.fit(X_train_scaled, y_train)
f_scores = pd.DataFrame({
    'Feature': X.columns,
    'F_Score': selector_f.scores_,
    'P_Value': selector_f.pvalues_
}).sort_values('F_Score', ascending=False)

print("Top features by F-score:")
print(f_scores.head())


# Method 2: Mutual Information
def _seeded_mutual_info(features, target):
    """Score features by mutual information with a fixed seed.

    Without an explicit random_state the estimate changes from run to run,
    which can change which features are selected.
    """
    return mutual_info_classif(features, target, random_state=42)


selector_mi = SelectKBest(score_func=_seeded_mutual_info, k='all')
selector_mi.fit(X_train_scaled, y_train)
mi_scores = pd.DataFrame({
    'Feature': X.columns,
    'MI_Score': selector_mi.scores_
}).sort_values('MI_Score', ascending=False)

print("\nTop features by Mutual Information:")
print(mi_scores.head())

# Select features that are top in both methods. The result is ordered by
# F-score rank instead of by set iteration order, so the column order of the
# final matrices is the same on every run.
top_f_features = set(f_scores.head(TOP_K_PER_METHOD)['Feature'])
top_mi_features = set(mi_scores.head(TOP_K_PER_METHOD)['Feature'])
f_ranked_features = f_scores['Feature'].tolist()
selected_features = [
    feature for feature in f_ranked_features
    if feature in top_f_features and feature in top_mi_features
]

if len(selected_features) < MIN_SELECTED_FEATURES:  # Ensure minimum features
    # The intersection is always a subset of the F-score leaders, so adding
    # the top FALLBACK_TOP_K F-score features yields exactly those features.
    selected_features = f_ranked_features[:FALLBACK_TOP_K]

print(f"\nSelected features ({len(selected_features)}):")
for feature in selected_features:
    print(f"  - {feature}")

# Create final datasets
X_train_final = X_train_scaled[selected_features]
X_test_final = X_test_scaled[selected_features]

print("\n🤖 MODEL COMPARISON WITH IMPROVED PREPROCESSING")
print("=" * 80)

# Candidate classifiers, registered in the order they will be evaluated.
# Where supported, class_weight='balanced' is requested.
models = {}

# C is spelled out so the regularization strength is visible to the reader.
models['Logistic Regression'] = LogisticRegression(
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# probability=True is requested because predict_proba is used during evaluation.
models['SVM'] = SVC(
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=42,
)

models['Naive Bayes'] = GaussianNB()

# Depth and leaf-size limits are set to restrain overfitting.
models['Random Forest'] = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
)

# Proper cross-validation (only on training set): 5 shuffled, stratified folds.
# NOTE(review): X_train_final was scaled and feature-selected using all
# training rows before the folds are formed, so the CV scores may still be
# slightly optimistic; wrapping those steps in a Pipeline would remove that.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-model metrics plus the fitted estimator, keyed by model name.
results_improved = {}
print("\nTraining models with improved preprocessing...")

for name, model in models.items():
    print(f"\n{name}:")
    print("-" * 30)

    # Cross-validation on training set only
    cv_scores = cross_val_score(model, X_train_final, y_train,
                               cv=cv_strategy, scoring='accuracy')

    print(f"CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Train on full training set and test
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)

    # Calculate test metrics
    test_accuracy = (y_pred == y_test).mean()

    # ROC-AUC needs a score for the positive class; models without
    # predict_proba get None.
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test_final)[:, 1]
        test_roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        y_pred_proba = None
        test_roc_auc = None

    results_improved[name] = {
        'cv_accuracy': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'test_accuracy': test_accuracy,
        'test_roc_auc': test_roc_auc,
        'model': model
    }

    print(f"Test Accuracy: {test_accuracy:.4f}")
    # Compare against None explicitly: a truthiness test would also hide a
    # legitimate score of 0.0.
    if test_roc_auc is not None:
        print(f"Test ROC-AUC: {test_roc_auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

print("\n📊 COMPARISON: ORIGINAL vs IMPROVED")
print("="*80)

# Create comparison summary: one row per model.
# A missing ROC-AUC is stored as NaN rather than 0 so it cannot be mistaken
# for a real (worst-case) score. Overfitting_Gap is CV accuracy minus test
# accuracy.
comparison_data = [
    {
        'Model': name,
        'CV_Accuracy': result['cv_accuracy'],
        'Test_Accuracy': result['test_accuracy'],
        'ROC_AUC': result['test_roc_auc'] if result['test_roc_auc'] is not None else np.nan,
        'Overfitting_Gap': result['cv_accuracy'] - result['test_accuracy'],
    }
    for name, result in results_improved.items()
]

comparison_df = pd.DataFrame(comparison_data)
print("Improved Model Performance:")
print(comparison_df.round(4))

# Closing notes: each section is a title, a 50-dash rule, then its lines,
# all printed verbatim.
closing_sections = [
    ("\n🎯 KEY IMPROVEMENTS MADE:", [
        "1. ✅ Proper train-test split BEFORE any preprocessing",
        "2. ✅ Used RobustScaler instead of StandardScaler (better for outliers)",
        "3. ✅ Systematic feature selection using statistical tests",
        "4. ✅ Added regularization to prevent overfitting",
        "5. ✅ Cross-validation only on training set (no data leakage)",
        "6. ✅ Kept most relevant features from each category",
        "7. ✅ Added constraints to Random Forest (max_depth, min_samples)",
    ]),
    ("\n🚨 EXPECTED RESULTS:", [
        "- More realistic accuracy scores (likely 85-95% range)",
        "- Smaller gap between CV and test performance",
        "- Better generalization to new data",
        "- More trustworthy model selection",
    ]),
    ("\n💡 RECOMMENDATIONS:", [
        "1. Use this improved preprocessing pipeline",
        "2. If you still get >98% accuracy, investigate data leakage",
        "3. Consider ensemble methods with the top 2-3 models",
        "4. Validate on completely separate data if possible",
        "5. Focus on precision for hazardous class (safety critical)",
    ]),
]
for title, lines in closing_sections:
    print(title)
    print("-" * 50)
    print("\n".join(lines))

🔍 COMPREHENSIVE PREPROCESSING ANALYSIS
Original dataset shape: (4687, 40)
Columns: ['Neo Reference ID', 'Name', 'Absolute Magnitude', 'Est Dia in KM(min)', 'Est Dia in KM(max)', 'Est Dia in M(min)', 'Est Dia in M(max)', 'Est Dia in Miles(min)', 'Est Dia in Miles(max)', 'Est Dia in Feet(min)', 'Est Dia in Feet(max)', 'Close Approach Date', 'Epoch Date Close Approach', 'Relative Velocity km per sec', 'Relative Velocity km per hr', 'Miles per hour', 'Miss Dist.(Astronomical)', 'Miss Dist.(lunar)', 'Miss Dist.(kilometers)', 'Miss Dist.(miles)', 'Orbiting Body', 'Orbit ID', 'Orbit Determination Date', 'Orbit Uncertainity', 'Minimum Orbit Intersection', 'Jupiter Tisserand Invariant', 'Epoch Osculation', 'Eccentricity', 'Semi Major Axis', 'Inclination', 'Asc Node Longitude', 'Orbital Period', 'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion', 'Equinox', 'Hazardous']

📊 INITIAL DATA EXPLORATION
-------------------------------------------