There are many dangerous bodies in space, one of them is N.E.O. - "Nearest Earth Objects". Some such bodies really pose a danger to the planet Earth, NASA classifies them as "is_hazardous". This dataset contains ALL NASA observations of similar objects from 1910 to 2024!!!

There are 338,199 records of N.E.O. in the Dataset!

Try to predict "is_hazardous" as accurately as possible! (otherwise we will not be ready for an asteroid attack)

In [21]:
# import libraries for classification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier



In [22]:
df = pd.read_csv('NASA.csv')
print(df.head())
print(df.info())
print(df.describe())


    neo_id                name  absolute_magnitude  estimated_diameter_min  \
0  2162117  162117 (1998 SD15)               19.14                0.394962   
1  2349507    349507 (2008 QY)               18.50                0.530341   
2  2455415    455415 (2003 GA)               21.45                0.136319   
3  3132126           (2002 PB)               20.63                0.198863   
4  3557844           (2011 DW)               22.70                0.076658   

   estimated_diameter_max orbiting_body  relative_velocity  miss_distance  \
0                0.883161         Earth       71745.401048   5.814362e+07   
1                1.185878         Earth      109949.757148   5.580105e+07   
2                0.304818         Earth       24865.506798   6.720689e+07   
3                0.444672         Earth       78890.076805   3.039644e+07   
4                0.171412         Earth       56036.519484   6.311863e+07   

   is_hazardous  
0         False  
1          True  
2         Fals

In [23]:


# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Check for duplicates
print("Number of duplicates:")
print(df.duplicated().sum())

# Drop duplicates
df.drop_duplicates(inplace=True)

# Drop columns that are not needed

df.drop(['name', 'orbiting_body'], inplace=True, axis=1)

# fill missing values
def fill_missing_values(df, inplace=True):
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = df[column].fillna(df[column].mode()[0])
        else:
            df[column] = df[column].fillna(df[column].mean())
    return df

df = fill_missing_values(df, inplace=True)

df.head(5)






Missing values per column:
neo_id                     0
name                       0
absolute_magnitude        28
estimated_diameter_min    28
estimated_diameter_max    28
orbiting_body              0
relative_velocity          0
miss_distance              0
is_hazardous               0
dtype: int64
Number of duplicates:
0


Unnamed: 0,neo_id,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
0,2162117,19.14,0.394962,0.883161,71745.401048,58143620.0,False
1,2349507,18.5,0.530341,1.185878,109949.757148,55801050.0,True
2,2455415,21.45,0.136319,0.304818,24865.506798,67206890.0,False
3,3132126,20.63,0.198863,0.444672,78890.076805,30396440.0,False
4,3557844,22.7,0.076658,0.171412,56036.519484,63118630.0,False


In [24]:
# Outliers detection 

def detect_outliers(df, inplace=True):
    for column in df.columns:
        if df[column].dtype == 'int64' or df[column].dtype == 'float64':
            Q1 = df[column].quantile(0.25)
            Q3 = df[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df = df[(df[column] > lower_bound) & (df[column] < upper_bound)]
    return df 

df = detect_outliers(df, inplace=True)

df.head(5)


Unnamed: 0,neo_id,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
2,2455415,21.45,0.136319,0.304818,24865.506798,67206890.0,False
3,3132126,20.63,0.198863,0.444672,78890.076805,30396440.0,False
4,3557844,22.7,0.076658,0.171412,56036.519484,63118630.0,False
5,3656926,25.0,0.02658,0.059435,47477.649832,42905210.0,False
6,3421513,21.5,0.133216,0.297879,57853.295346,27279080.0,True


In [25]:
# categorical encoding

def encode_categorical(df, inplace=True):
    le = LabelEncoder()
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = le.fit_transform(df[column])
    return df

df = encode_categorical(df, inplace=True)   

df.head(5)

Unnamed: 0,neo_id,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
2,2455415,21.45,0.136319,0.304818,24865.506798,67206890.0,False
3,3132126,20.63,0.198863,0.444672,78890.076805,30396440.0,False
4,3557844,22.7,0.076658,0.171412,56036.519484,63118630.0,False
5,3656926,25.0,0.02658,0.059435,47477.649832,42905210.0,False
6,3421513,21.5,0.133216,0.297879,57853.295346,27279080.0,True


In [26]:
# scaling the data

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop('is_hazardous', axis=1))
scaled_df = pd.DataFrame(scaled_data, columns=df.drop('is_hazardous', axis=1).columns)

print(scaled_df.head(5))


     neo_id  absolute_magnitude  estimated_diameter_min  \
0 -0.714808           -0.841515                0.553580   
1 -0.686030           -1.170353                1.290769   
2 -0.667925           -0.340238               -0.149618   
3 -0.663712            0.582112               -0.739860   
4 -0.673723           -0.821464                0.517006   

   estimated_diameter_max  relative_velocity  miss_distance  
0                0.553580          -0.991316       1.272666  
1                1.290769           1.360548      -0.484653  
2               -0.149618           0.365659       1.077494  
3               -0.739860          -0.006936       0.112512  
4                0.517006           0.444749      -0.633475  


In [27]:
# split the data into features and target

X = scaled_df
y = df['is_hazardous']

# split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train the model

model = RandomForestClassifier()

model.fit(X_train, y_train)

# make predictions

y_pred = model.predict(X_test)

# evaluate the model

print('Accuracy:', accuracy_score(y_test, y_pred))

print('Classification Report:', classification_report(y_test, y_pred))

print('Confusion Matrix:', confusion_matrix(y_test, y_pred))




Accuracy: 0.9814908722109533
Classification Report:               precision    recall  f1-score   support

       False       0.99      0.99      0.99     52775
        True       0.95      0.88      0.91      6385

    accuracy                           0.98     59160
   macro avg       0.97      0.94      0.95     59160
weighted avg       0.98      0.98      0.98     59160

Confusion Matrix: [[52459   316]
 [  779  5606]]
