# Project
> In this project we will be classifying whether an object is a threat to earth or not. We will be training and
> developing a machine learning model to perform this task. The chosen method to do this for this set will be a *support vector machine*

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import svm

In [3]:
data = pd.read_csv('data/neo_v3.csv')
print(data.head(5))

        id                 name  est_diameter_min  est_diameter_max  \
0  2162635  162635 (2000 SS164)          1.198271          2.679415   
1  2277475    277475 (2005 WK4)          0.265800          0.594347   
2  2512244   512244 (2015 YE18)          0.722030          1.614507   
3  3596030          (2012 BV13)          0.096506          0.215794   
4  3667127          (2014 GE35)          0.255009          0.570217   

   relative_velocity  miss_distance orbiting_body  sentry_object  \
0        13569.24922    54839744.08         Earth          False   
1        73588.72666    61438126.52         Earth          False   
2       114258.69210    49798724.94         Earth          False   
3        24764.30314    25434972.72         Earth          False   
4        42737.73376    46275567.00         Earth          False   

   absolute_magnitude  hazardous  
0               16.73          0  
1               20.00          1  
2               17.83          0  
3               22.20   

# Data:
The data has a lot of generally useless columns. These include 
- name : The name of said object
- id : the assigned id to the object. These are random and do not really say anything about the object. 
- orbiting_body : They all, minus one, orbit earth. 

# Notes
We notice that the first five entries within sentry_object are "False" so this may lead to us to believe they may all be false. So we will check for uniqueness prior to thinking about dropping it

In [4]:
print(data['sentry_object'].unique())

[False]


Looks like we can drop 'sentry_object'

In [5]:
data.drop(['sentry_object', 'name', 'id', 'orbiting_body'], axis=1, inplace=True)
print(data.head(5))


   est_diameter_min  est_diameter_max  relative_velocity  miss_distance  \
0          1.198271          2.679415        13569.24922    54839744.08   
1          0.265800          0.594347        73588.72666    61438126.52   
2          0.722030          1.614507       114258.69210    49798724.94   
3          0.096506          0.215794        24764.30314    25434972.72   
4          0.255009          0.570217        42737.73376    46275567.00   

   absolute_magnitude  hazardous  
0               16.73          0  
1               20.00          1  
2               17.83          0  
3               22.20          0  
4               20.09          1  


In [6]:
# Setting up our training and testing vectors. 
features = ['est_diameter_min', 'est_diameter_max', 'relative_velocity', 'miss_distance', 'absolute_magnitude']
target = ['hazardous']

x = data.loc[:, features]
y = data.loc[:, target]

X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2)


In [8]:
# Setting up our SVM 
 
model = svm.SVC(kernel='rbf')

model.fit(X_train, y_train.values.ravel())
predictions = model.predict(X_test)
print(f"Our accuracy score is: {accuracy_score(y_test,predictions)}")

Our accuracy score is: 0.9016952884191985


## TODO
- Explain why I dropped x things
- Figure out exactly why "rbf" is better for fitting data than "linear"
- Look into SVM kernels