In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors, datasets
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

In [2]:
# Load the near-Earth-object records from the CSV file into a DataFrame.
neo_csv_path = "neo.csv"
near_earth_objects = pd.read_csv(neo_csv_path)
# Leaving the frame as the cell's last expression renders it as a table.
near_earth_objects

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,5.483974e+07,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.265800,0.594347,73588.726663,6.143813e+07,Earth,False,20.00,True
2,2512244,512244 (2015 YE18),0.722030,1.614507,114258.692129,4.979872e+07,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,2.543497e+07,Earth,False,22.20,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,4.627557e+07,Earth,False,20.09,True
...,...,...,...,...,...,...,...,...,...,...
90831,3763337,(2016 VX1),0.026580,0.059435,52078.886692,1.230039e+07,Earth,False,25.00,False
90832,3837603,(2019 AD3),0.016771,0.037501,46114.605073,5.432121e+07,Earth,False,26.00,False
90833,54017201,(2020 JP3),0.031956,0.071456,7566.807732,2.840077e+07,Earth,False,24.60,False
90834,54115824,(2021 CN5),0.007321,0.016370,69199.154484,6.869206e+07,Earth,False,27.80,False


## Getting the data ready/Data cleaning: 
I am going to use the k-nearest-neighbors technique to train a model. The data cleaning protocol consists of the following two tasks:
1. Removing the columns that are not used as numerical features (`id`, `name`, `orbiting_body`, and `sentry_object`).
2. Converting the `hazardous` column from True/False to 1/0 (True is 1 and False is 0).

In [3]:
# (1) Keep only the numeric measurements and the target: drop the two
#     non-numeric columns (name, orbiting_body), the sentry_object flag and
#     the id column.
# (2) Turn the boolean hazardous target into 1/0 (True -> 1, False -> 0).
bool_columns = ["hazardous"]
clean_data = near_earth_objects.drop(
    columns=["orbiting_body", "sentry_object", "id", "name"]
).astype({column: int for column in bool_columns})

In [4]:
# Show the cleaned frame to confirm that only the numeric features and the
# 0/1 hazardous target remain.
clean_data

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,5.483974e+07,16.73,0
1,0.265800,0.594347,73588.726663,6.143813e+07,20.00,1
2,0.722030,1.614507,114258.692129,4.979872e+07,17.83,0
3,0.096506,0.215794,24764.303138,2.543497e+07,22.20,0
4,0.255009,0.570217,42737.733765,4.627557e+07,20.09,1
...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,25.00,0
90832,0.016771,0.037501,46114.605073,5.432121e+07,26.00,0
90833,0.031956,0.071456,7566.807732,2.840077e+07,24.60,0
90834,0.007321,0.016370,69199.154484,6.869206e+07,27.80,0


In [5]:
# Count the missing values in every column. All counts are zero, so no
# imputation or row dropping is needed before modelling.
clean_data.isna().sum()

est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
absolute_magnitude    0
hazardous             0
dtype: int64

## Univariate Model for classification

This model takes a single feature and predicts the target values from that feature alone, which is why it is called a univariate model. Its purpose is to measure how effective each feature is on its own, so that the features can be ranked.

In [6]:
from sklearn.model_selection import train_test_split

def knn_univariate_classification(feature, target, df, k_neighbors=10,
                                  test_size=0.3, random_state=None):
    """
    Fit a k-nearest-neighbours classifier on one feature column and predict
    the target on a held-out test split.

    Params:
        feature: name of the single column used as the predictor.
        target: name of the column holding the class labels.
        df: the data frame containing both columns.
        k_neighbors: number of neighbours that vote on each prediction
            (default 10, the value that used to be hard-coded).
        test_size: share of the rows held out for testing, forwarded to
            train_test_split (default 0.3).
        random_state: seed forwarded to train_test_split. The default None
            keeps the previous behaviour (a new random split on every call);
            pass an int to make the split, and therefore the result,
            reproducible.
    Returns: A two-element list [predictions, true_labels], where
        true_labels is the target column of the test split.
    """
    # Hold out part of the rows so the model is scored on unseen data.
    train, test = train_test_split(df, test_size=test_size,
                                   random_state=random_state)

    # Double brackets keep the single feature as a 2-D frame, which is the
    # shape the estimator expects for its inputs.
    knn = neighbors.KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(train[[feature]], train[target])

    predictions = knn.predict(test[[feature]])

    return [predictions, test[target]]

## Multivariate Model:

In [7]:
def knn_multivariate_classification(features, target, df, k_neighbors=10,
                                    test_size=0.3, random_state=None,
                                    scale=False):
    """
    Fit a k-nearest-neighbours classifier on one or more feature columns and
    predict the target on a held-out test split.

    Params:
        features: the column names used as predictors (list, tuple, array or
            pandas Index); a single column name given as a string is also
            accepted.
        target: name of the column holding the class labels.
        df: the data frame containing all of these columns.
        k_neighbors: number of neighbours that vote on each prediction
            (default 10, the value that used to be hard-coded).
        test_size: share of the rows held out for testing, forwarded to
            train_test_split (default 0.3).
        random_state: seed forwarded to train_test_split. The default None
            keeps the previous behaviour (a new random split on every call);
            pass an int for a reproducible split.
        scale: when True, standardise every feature (statistics learned from
            the training split only) before fitting. The default False keeps
            the previous behaviour.
    Returns: A two-element list [predictions, true_labels], where
        true_labels is the target column of the test split.
    """
    # A bare column name would select a 1-D Series and a tuple would be read
    # as one single column key, so normalise to a plain list of names.
    if isinstance(features, str):
        features = [features]
    else:
        features = list(features)

    # Hold out part of the rows so the model is scored on unseen data.
    train, test = train_test_split(df, test_size=test_size,
                                   random_state=random_state)

    x_train, x_test = train[features], test[features]
    if scale:
        # NOTE(review): k-NN is distance based, so a column with a much larger
        # numeric range (e.g. miss_distance next to the diameters) dominates
        # the neighbour search unless the features are put on one scale.
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler().fit(x_train)
        x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

    knn = neighbors.KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(x_train, train[target])

    predictions = knn.predict(x_test)

    return [predictions, test[target]]

## Analysis of the results:
1. Confusion matrix.
2. The accuracy score.

In [11]:
# The confusion matrix
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (true labels, predicted labels) in that order,
# while knn_univariate_classification returns [predictions, true labels];
# hence index 1 is passed first. Swap "miss_distance" for any other feature
# column to inspect that feature instead.
uni_results = knn_univariate_classification("miss_distance", "hazardous", clean_data)

# labels=[0, 1] pins the 2x2 layout even when the true labels and the
# predictions of a split contain only one of the two classes.
conf_mat = confusion_matrix(uni_results[1], uni_results[0], labels=[0, 1])

# Show the result (it was previously computed but never displayed):
# rows are the true class, columns the predicted class.
pd.DataFrame(
    conf_mat,
    index=["true 0 (not hazardous)", "true 1 (hazardous)"],
    columns=["predicted 0", "predicted 1"],
)


In [8]:
# The accuracy score.
from sklearn.metrics import accuracy_score

# Score each candidate feature on its own so the features can be compared.
# Every column except the target is a candidate.
training_cols = clean_data.columns.drop("hazardous")
features_score = []

# train_test_split is called without a random_state inside
# knn_univariate_classification, so it draws from NumPy's global random
# generator. Re-seeding that generator before every call gives each feature
# the exact same train/test rows; otherwise the scores would come from
# different random splits and small differences could be split noise.
SPLIT_SEED = 42

# Fit one univariate model per column; features_score[i] belongs to
# training_cols[i] until the list is re-ordered.
# NOTE(review): accuracy alone can look high on an imbalanced target; compare
# these scores with the share of the majority class before drawing
# conclusions.
for col in training_cols:
    np.random.seed(SPLIT_SEED)
    [pred, true] = knn_univariate_classification(col, "hazardous", clean_data)
    features_score.append(accuracy_score(true, pred))
    
# Sort the features_score array:
def insertionSort(arr):
    """
    Sort a mutable sequence in place, in ascending order, with insertion sort.
    Params: arr, the sequence to sort (it is modified directly).
    Returns: None.
    """
    for boundary in range(1, len(arr)):
        # Everything left of `boundary` is already sorted. Walk the new
        # element leftwards, swapping it past each strictly larger neighbour,
        # until it reaches its place (equal elements keep their order).
        pos = boundary
        while pos > 0 and arr[pos] < arr[pos - 1]:
            arr[pos - 1], arr[pos] = arr[pos], arr[pos - 1]
            pos -= 1
        
# Pair every score with its feature name BEFORE sorting: at this point
# features_score is still in the same order as training_cols, and sorting the
# bare list would lose which feature produced which score.
feature_ranking = pd.Series(
    features_score, index=training_cols, name="accuracy"
).sort_values(ascending=False)

# Keep the original in-place ascending sort of the plain list so that code
# reading features_score afterwards sees the same values as before.
insertionSort(features_score)

# Show the features ranked from best to worst accuracy.
feature_ranking

In [9]:
# Accuracy scores after the in-place sort: ascending order, values only
# (the list itself does not carry the feature names).
features_score

[0.9015448974349565,
 0.9022054236541778,
 0.9052511834428094,
 0.9099115628784264,
 0.9109390481083263]