In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
# load the dataset
df = pd.read_csv('cneos_fireball_data.csv')

# changing latitude and longitude to values
def lat(row):
    if row['Latitude (deg.)'] == row['Latitude (deg.)']:
        if row['Latitude (deg.)'][-1] == 'N':
            return float(row['Latitude (deg.)'][:-1])
        elif row['Latitude (deg.)'][-1] == 'S':
            return float('-'+(row['Latitude (deg.)'][:-1]))
    return np.nan

def long(row):
    if row['Latitude (deg.)'] == row['Latitude (deg.)']:
        if row['Longitude (deg.)'][-1] == 'E':
            return float(row['Longitude (deg.)'][:-1])
        elif row['Longitude (deg.)'][-1] == 'W':
            return float('-'+(row['Longitude (deg.)'][:-1]))
    return np.nan

df['Lat'] = df.apply (lambda row: lat(row), axis=1)
df['Long'] = df.apply (lambda row: long(row), axis=1)

X_full = df[['Altitude (km)', 'vx', 'vy', 'vz', 'Total Radiated Energy (J)' , 'Calculated Total Impact Energy (kt)']]
y = df['Velocity (km/s)']

print(X_full.shape)

(929, 6)


In [3]:
# Add missing values in 10% of the data
miss_ratio = 0.1
rng = np.random.RandomState(0)

X_unif = rng.random_sample(X_full.shape)

X_ind = X_unif < miss_ratio
X_miss = X_full.copy()
X_miss[X_ind] = np.nan

print(X_miss[X_ind].shape)

(546, 6)


In [4]:
# split the dataset into training and  testing
X_full_train, X_full_test, y_train, y_test = train_test_split(X_full, y, test_size=0.20, random_state=42)
X_miss_train, X_miss_test, y_train, y_test = train_test_split(X_miss, y, test_size=0.20, random_state=42)

In [5]:
# see results without missing data
ols_full = linear_model.LinearRegression(fit_intercept=True)
ols_full.fit(X_full_train, y_train)
print('RMSE - full = ', np.sqrt(mean_squared_error(y_test, ols_full.predict(X_full_test))))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
def test_imputations(my_strategy, my_fill_value, X_miss_train, X_miss_test, y_train, y_test):
    
    imp = SimpleImputer(missing_values=np.nan, strategy=my_strategy, fill_value=my_fill_value)
    imp.fit(X_miss_train)
    
    X_imp_train = imp.transform(X_miss_train)
    X_imp_test = imp.transform(X_miss_test)
    
    ols_miss = linear_model.LinearRegression(fit_intercept=True)
    ols_miss.fit(X_imp_train, y_train)
    print('RMSE - ', my_strategy, ' = ', np.sqrt(mean_squared_error(y_test, ols_miss.predict(X_imp_test))))  

In [None]:
test_imputations('mean', 0, X_miss_train, X_miss_test, y_train, y_test)
test_imputations('median', 0, X_miss_train, X_miss_test, y_train, y_test)
test_imputations('most_frequent', 0, X_miss_train, X_miss_test, y_train, y_test)
test_imputations('constant', 0, X_miss_train, X_miss_test, y_train, y_test)

In [None]:
# MICE - Multivariate Imputation by Chained Equations
# YouTube explanation of MICE - https://www.youtube.com/watch?v=zX-pacwVyvU
import fancyimpute as fimp
# this package can be installed by calling: !pip install fancyimpute

imputer = fimp.IterativeImputer(sample_posterior=True, random_state=0)
imputer.fit(X_miss_train)

X_fancy_train = imputer.transform(X_miss_train)
X_fancy_test = imputer.transform(X_miss_test)

ols_fancy = linear_model.LinearRegression(fit_intercept=True)
ols_fancy.fit(X_fancy_train, y_train)
print('RMSE - fancy = ', np.sqrt(mean_squared_error(y_test, ols_fancy.predict(X_fancy_test))))