In [None]:
import pandas as pd
from citipy import citipy
import numpy as np
%config Completer.use_jedi = False

In [None]:
# Load the raw landings CSV. The relative path assumes the notebook is run
# from the directory that contains the data file.
raw_csv_path = "./Meteorite_Landings.csv"
df_meterorite_data = pd.read_csv(raw_csv_path)

In [None]:
# Check how many rows and columns were loaded; .shape is a (rows, cols) tuple.
df_meterorite_data.shape

In [None]:
# Are there any nulls? Evaluates to True if at least one cell in any column
# of the raw data is missing.
df_meterorite_data.isna().any().any()

In [None]:
# Preview the first rows of the raw data before any cleaning.
df_meterorite_data.head()

In [None]:
# Split 'recclass' at the first comma: the text before it becomes
# 'material_column', anything after it becomes 'misc_maerial' (dropped later).
# n=1 caps the result at two columns, so the two-column assignment cannot
# fail when a value contains more than one comma.
df_meterorite_data[['material_column', 'misc_maerial']] = (
    df_meterorite_data['recclass'].str.split(',', n=1, expand=True)
)

In [None]:
# Date cleanup #1: keep only the leading date token of 'year'.
# Splitting once on the first space separates it from the trailing
# timestamp / AM-PM text, which lands in the throwaway 'time' column.
year_parts = df_meterorite_data['year'].str.split(" ", n=1, expand=True)
df_meterorite_data[['cal_year', 'time']] = year_parts

In [None]:
# Shape check after the two split steps added their helper columns.
df_meterorite_data.shape

In [None]:
# drop unneeded columns

In [None]:
# The remainder of the 'recclass' split is not needed.
df_meterorite_data.drop(columns=['misc_maerial'], inplace=True)

In [None]:
# 'recclass' itself is no longer needed once 'material_column' has been derived from it.
df_meterorite_data.drop(columns=['recclass'], inplace=True)

In [None]:
# The timestamp / AM-PM remainder of the 'year' split is not needed.
df_meterorite_data.drop(columns=['time'], inplace=True)

In [None]:
# The raw 'year' text is superseded by the 'cal_year' column derived from it.
df_meterorite_data.drop(columns=['year'], inplace=True)

In [None]:
# 'nametype' is not referenced by any later cell in this notebook.
df_meterorite_data.drop(columns=['nametype'], inplace=True)

In [None]:
# Give the mass column an identifier-friendly name (no spaces or parentheses).
column_renames = {"mass (g)": "mass_grams"}
df_meterorite_data.rename(columns=column_renames, inplace=True)

In [None]:
# Discard every row that still has at least one missing value.
df_meterorite_data.dropna(axis=0, how="any", inplace=True)


In [None]:
# What does our data look like now? (rows, cols) after the column drops and
# the removal of rows containing nulls.
df_meterorite_data.shape

In [None]:
# Create a boolean filter that strips out the placeholder coordinate (0, 0):
# a row is rejected only when BOTH latitude and longitude are zero.
# Both columns must be tested; a zero latitude alone can be a genuine
# location on the equator.
at_origin = (df_meterorite_data['reclat'] == 0) & (df_meterorite_data['reclong'] == 0)
real_coords = ~at_origin

In [None]:
# Create a new, independent dataframe holding only the rows with real
# coordinates. .copy() detaches it from df_meterorite_data so later column
# edits act on this frame alone and cannot raise SettingWithCopyWarning.
df_met_data = df_meterorite_data[real_coords].copy()

In [None]:
# To enable time-sequence calculations, convert 'cal_year' from type 'object'
# to 'datetime64[ns]'. Some dates cannot be represented: pandas nanosecond
# timestamps only span roughly the years 1677-2262, so older dates are out of
# bounds. errors="coerce" turns every value that cannot be converted into NaT
# instead of raising.
# infer_datetime_format is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is already the default behaviour.
bad_dates = pd.to_datetime(df_met_data['cal_year'], errors="coerce")

In [None]:
# Attach the parsed dates as a new 'years' column. assign() returns a new
# dataframe instead of modifying the existing one, so the result is bound back
# to df_met_data. Despite its name, bad_dates holds every parsed date; only the
# entries that could not be converted are NaT.
df_met_data = df_met_data.assign(years=bad_dates)

In [None]:
# Preview the filtered data, now including the new 'years' column.
df_met_data.head()

In [None]:
# True if any cell of the filtered data is missing (NaT counts as missing).
df_met_data.isna().any().any()

In [None]:
# The original 'cal_year' text column is redundant now that 'years' holds the
# parsed dates, so remove it.
df_met_data.drop(columns=['cal_year'], inplace=True)

In [None]:
# Expected to be True now: dates that could not be converted were coerced to
# NaT. This is by design (the original author noted around 15 such dates).
df_met_data.isna().any().any()

In [None]:
# Remove every row with a missing value, which includes the NaT dates.
df_met_data.dropna(axis=0, how="any", inplace=True)

In [None]:
# (rows, cols) remaining after the rows with missing values were removed.
df_met_data.shape

In [None]:
# The row deletions left gaps in the index labels; replace them with a fresh
# consecutive 0..n-1 index and discard the old labels.
df_met_data.index = pd.RangeIndex(len(df_met_data))

In [None]:
# Pull out the coordinate columns used for the country-code lookup by lat/long.
latitudes, longitudes = df_met_data['reclat'], df_met_data['reclong']

In [None]:
# For each landing site, take the country code of the nearest city as
# reported by citipy. Iterating over the coordinate pairs directly (instead
# of indexing the Series with 0..n-1) keeps the lookup correct whatever the
# dataframe's index labels are.
country_code = [
    citipy.nearest_city(lat, lon).country_code
    for lat, lon in zip(latitudes, longitudes)
]

In [None]:
# Add a 'country_code' column. The list is assigned positionally, so it must
# contain exactly one entry per row, in row order.
df_met_data['country_code'] = country_code

In [None]:
# We've dropped roughly 14,000 rows

In [None]:
# Prepare grams for conversion from string to float - needed if we're to
# perform maths on grams. The thousands separator ',' has to be stripped out
# of the strings before converting.
# The vectorised .str accessor processes the whole column at once and does not
# depend on the index labels being 0..n-1; regex=False treats ',' literally.
# .tolist() keeps the result a plain list of strings.
mass_in_grams_converted = (
    df_met_data['mass_grams'].str.replace(',', '', regex=False).tolist()
)

In [None]:
# add new column with the cleaned mass strings (converted to float type afterwards)

In [None]:
# Store the comma-free mass strings in a new 'mass_in_grams' column
# (still strings at this point; the dtype is converted separately).
df_met_data['mass_in_grams'] = mass_in_grams_converted

In [None]:
# Convert the cleaned mass strings from object dtype to float so that
# arithmetic can be performed on them.
mass_as_float = df_met_data['mass_in_grams'].astype(float)
df_met_data['mass_in_grams'] = mass_as_float

In [None]:
# The original string-typed mass column is superseded by 'mass_in_grams'.
df_met_data.drop(columns=['mass_grams'], inplace=True)

In [None]:
# Preview the first rows of the cleaned data.
df_met_data.head()

In [None]:
# Confirm the column types, e.g. that 'mass_in_grams' is now float.
df_met_data.dtypes

In [None]:
# Summary statistics for the cleaned data.
df_met_data.describe()

In [None]:
# Save the cleaned data as a new CSV file in the current working directory.
# With to_csv's defaults the row index is written as the first, unnamed column.
output_csv_path = 'meteorite_data.csv'
df_met_data.to_csv(output_csv_path)