In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data_set/Meteorite_Landings.csv')
df.head()

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,01/01/1880 12:00:00 AM,50.775,6.08333,"(50.775, 6.08333)"
1,Aarhus,2,Valid,H6,720.0,Fell,1/1/1951 0:00,56.18333,10.23333,"(56.18333, 10.23333)"
2,Abee,6,Valid,EH4,107000.0,Fell,1/1/1952 0:00,54.21667,-113.0,"(54.21667, -113.0)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1/1/1976 0:00,16.88333,-99.9,"(16.88333, -99.9)"
4,Achiras,370,Valid,L6,780.0,Fell,1/1/1902 0:00,-33.16667,-64.95,"(-33.16667, -64.95)"


In [3]:
# the id is unique for each row
df['id'].is_unique 

True

In [4]:
# Drop the columns that are not necessary

to_drop = ['name',
          'nametype',
          'recclass',
          'fall']

# df.drop(to_drop, inplace=True, axis=1)
df = df.drop(to_drop, axis=1)

In [5]:
# Change the index to the id number since it is unique 
# this will be for better calling of the meteors
df = df.set_index('id')
df.head()

Unnamed: 0_level_0,mass (g),year,reclat,reclong,GeoLocation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,21.0,01/01/1880 12:00:00 AM,50.775,6.08333,"(50.775, 6.08333)"
2,720.0,1/1/1951 0:00,56.18333,10.23333,"(56.18333, 10.23333)"
6,107000.0,1/1/1952 0:00,54.21667,-113.0,"(54.21667, -113.0)"
10,1914.0,1/1/1976 0:00,16.88333,-99.9,"(16.88333, -99.9)"
370,780.0,1/1/1902 0:00,-33.16667,-64.95,"(-33.16667, -64.95)"


In [6]:
df.get_dtype_counts()

float64    3
object     2
dtype: int64

In [91]:
df.loc[30410] # returns the row at the index in a Series data type

mass (g)                     3.3
year               1/1/1939 0:00
reclat                     49.25
reclong                  17.6667
GeoLocation    (49.25, 17.66667)
Name: 30410, dtype: object

In [92]:
# notice how when the year is specified, it returns 
df.loc[30410:,'year'].head(10)# since the year is specified so it just returns the year

id
30410    1/1/1939 0:00
31357    1/1/2003 0:00
30414    1/1/1976 0:00
Name: year, dtype: object

In [93]:
# we have the necessary columns now 
# what we need now is to minimize our date
# to just the year and not the time or day that it happened 

extr = df['year'].str.extract(r'(\d{4})')
extr.head()

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,1880
2,1951
6,1952
10,1976
370,1902


In [94]:
# inserting the extracted values into the year column  
df = df.drop('year', axis=1)
df.insert(0, 'year',extr)
df.head()

Unnamed: 0_level_0,year,mass (g),reclat,reclong,GeoLocation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1880,21.0,50.775,6.08333,"(50.775, 6.08333)"
2,1951,720.0,56.18333,10.23333,"(56.18333, 10.23333)"
6,1952,107000.0,54.21667,-113.0,"(54.21667, -113.0)"
10,1976,1914.0,16.88333,-99.9,"(16.88333, -99.9)"
370,1902,780.0,-33.16667,-64.95,"(-33.16667, -64.95)"


In [95]:
# need to round the numbers after the decimal points for reclat, reclong, and GeoLocation
# since GeoLocation is not a type of int or float it is an object 
# so we will not be able to round that column
df = df.round(2)
df.head()

Unnamed: 0_level_0,year,mass (g),reclat,reclong,GeoLocation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1880,21.0,50.78,6.08,"(50.775, 6.08333)"
2,1951,720.0,56.18,10.23,"(56.18333, 10.23333)"
6,1952,107000.0,54.22,-113.0,"(54.21667, -113.0)"
10,1976,1914.0,16.88,-99.9,"(16.88333, -99.9)"
370,1902,780.0,-33.17,-64.95,"(-33.16667, -64.95)"


In [96]:
# instead try dropping the GeoLocation column
# and make reclat and reclong the inputs for GeoLocation
df.drop('GeoLocation', axis=1)
df['GeoLocation'] = df['reclat'].map(str) + ", "+ df['reclong'].map(str)

df.head()

Unnamed: 0_level_0,year,mass (g),reclat,reclong,GeoLocation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1880,21.0,50.78,6.08,"50.78, 6.08"
2,1951,720.0,56.18,10.23,"56.18, 10.23"
6,1952,107000.0,54.22,-113.0,"54.22, -113.0"
10,1976,1914.0,16.88,-99.9,"16.88, -99.9"
370,1902,780.0,-33.17,-64.95,"-33.17, -64.95"


In [97]:
# last step - toss any rows with NaN values in the columns
# or anything that is blank or not an appropiate form of data
df.isnull().sum()

year            291
mass (g)        131
reclat         7315
reclong        7315
GeoLocation       0
dtype: int64

In [98]:
df.shape

(45716, 5)

In [99]:
df = df.dropna(0, how='any')

In [103]:
df.to_csv('data_set/M_Landings_cleaned.csv')