# Reading and Preprocessing Data

In [2]:
import pandas as pd
import altair as alt
from sklearn import metrics
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Lookup used to turn the boolean `hazardous` flag into a 0/1 integer label.
boolean_map = {True: 1, False: 0}

# Read the raw table, discard the columns that are not used further down, and
# replace `hazardous` with its integer-encoded counterpart `haz`.
df = (
    pd.read_csv('Data/neo.csv')
    .drop(columns=['id', 'orbiting_body', 'sentry_object', 'name'])
    .assign(haz=lambda frame: frame['hazardous'].map(boolean_map))
    .drop(columns='hazardous')
)

In [3]:
# Split the approaches at a fixed miss-distance cut-off so the hazard mix of
# near and far passes can be compared.
MISS_DISTANCE_CUTOFF = 7500000  # same units as the `miss_distance` column

close_df = df[df['miss_distance'] < MISS_DISTANCE_CUTOFF]
# `>=` so a row sitting exactly on the cut-off is not lost from both subsets.
far_df = df[df['miss_distance'] >= MISS_DISTANCE_CUTOFF]

# Altair rejects data sets above its default row limit; lift that limit so the
# full subsets can be charted.
alt.data_transformers.disable_max_rows()

# Share of hazardous (1) vs. non-hazardous (0) objects among close approaches.
# `:N` makes Altair treat the 0/1 label as a category, giving a discrete
# legend instead of a continuous colour ramp.
alt.Chart(close_df).mark_arc().encode(
    theta='count(haz)',
    color='haz:N'
).properties(title='Hazardous vs. non-hazardous: close approaches')

In [4]:
# Counterpart of the close-approach pie chart, drawn from `far_df` so the two
# hazard distributions can be compared.
# `:N` makes Altair treat the 0/1 label as a category (discrete legend).
alt.Chart(far_df).mark_arc().encode(
    theta='count(haz)',
    color='haz:N'
).properties(title='Hazardous vs. non-hazardous: far approaches')

In [5]:
# Keep only the first occurrence of every fully identical row (the original
# index labels are retained), then show the resulting frame as the cell output.
df = df.loc[~df.duplicated()]
df

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,haz
0,1.198271,2.679415,13569.249224,5.483974e+07,16.73,0
1,0.265800,0.594347,73588.726663,6.143813e+07,20.00,1
2,0.722030,1.614507,114258.692129,4.979872e+07,17.83,0
3,0.096506,0.215794,24764.303138,2.543497e+07,22.20,0
4,0.255009,0.570217,42737.733765,4.627557e+07,20.09,1
...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,25.00,0
90832,0.016771,0.037501,46114.605073,5.432121e+07,26.00,0
90833,0.031956,0.071456,7566.807732,2.840077e+07,24.60,0
90834,0.007321,0.016370,69199.154484,6.869206e+07,27.80,0


# Creating Models

In [6]:
# Hold out 10% of the rows for evaluation; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified on `haz` — consider `stratify=`
# given the class imbalance visible in the confusion matrix.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('haz', axis=1), df.haz, test_size=0.1, random_state=42
)

# Oversample the minority class in the training portion only (the test set is
# left untouched) until it reaches half the size of the majority class.
ros = RandomOverSampler(random_state=42, sampling_strategy=.5)
X_train, y_train = ros.fit_resample(X_train, y_train)

# random_state pins the estimator's internal randomness (binning subsample and
# the early-stopping validation split) so the reported metrics are repeatable.
GBclf = HistGradientBoostingClassifier(learning_rate=.2, random_state=42)

GBclf.fit(X_train, y_train)
pred = GBclf.predict(X_test)

# Accuracy on the held-out rows, followed by the confusion matrix
# (rows = true label, columns = predicted label) as the cell output.
print(metrics.accuracy_score(y_test, pred))
metrics.confusion_matrix(y_test, pred)

0.8199933942530001


array([[6667, 1538],
       [  97,  781]], dtype=int64)

In [7]:
# Seed the forest so bootstrap sampling and feature selection are repeatable
# and the metrics below do not drift between runs.
RFclf = RandomForestClassifier(n_estimators=100, random_state=42)

RFclf.fit(X_train, y_train)

# Only label a row as hazardous (1) when the predicted probability of class 1
# reaches this cut-off, instead of the default 0.5 used by `predict`.
threshold = .7
pred_proba = RFclf.predict_proba(X_test)
pred = (pred_proba[:,1] >= threshold).astype(int)

# Accuracy on the held-out rows, followed by the confusion matrix
# (rows = true label, columns = predicted label) as the cell output.
print(metrics.accuracy_score(y_test, pred))
metrics.confusion_matrix(y_test, pred)

0.9201805570846636


array([[8030,  175],
       [ 550,  328]], dtype=int64)

In [21]:
# Feature columns, in the order the prompt asks the user to enter them.
user_feature_names = [
    'est_diameter_min',
    'est_diameter_max',
    'relative_velocity',
    'miss_distance',
    'absolute_magnitude',
]

raw_entry = input('Add asteroid to make prediction: (min_diameter_km, max_diameter_km, relative_velocity_m/s, distance_missed_km, magnitude)\n')

# Split on bare commas: float() ignores surrounding whitespace, so both
# "1,2,3,4,5" and "1, 2, 3, 4, 5" are accepted.
user_values = [float(token) for token in raw_entry.split(',')]
if len(user_values) != len(user_feature_names):
    raise ValueError(
        f'Expected {len(user_feature_names)} comma-separated numbers, '
        f'got {len(user_values)}.'
    )

# Single-row frame whose column names match the ones listed above.
X_user = pd.DataFrame([user_values], columns=user_feature_names)

# Apply the random forest and the same probability cut-off (`threshold`) that
# was used when evaluating it on the test set.
pred_proba = RFclf.predict_proba(X_user)
pred = (pred_proba[:,1] >= threshold).astype(int)
print(pred)

[1]
