In [1]:
#import libraries
import pandas as pd
import numpy as np


# Setting this option will print all collumns of a dataframe
pd.set_option('display.max_columns', 20)
# Setting this option will print all of the data in a feature
pd.set_option('display.max_colwidth', 1000)

In [2]:
#load the csv
met_df = pd.read_csv('../Data/Meteorite_Landings.csv')

In [3]:
met_df.head()

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775, 6.08333)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.18333, 10.23333)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.21667, -113.0)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.88333, -99.9)"
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.16667, -64.95)"


In [4]:
met_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object 
 6   year         45425 non-null  float64
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 3.5+ MB


In [5]:
met_df.isnull().sum()

name              0
id                0
nametype          0
recclass          0
mass (g)        131
fall              0
year            291
reclat         7315
reclong        7315
GeoLocation    7315
dtype: int64

In [6]:
#subset the DataFrame to include only observed events
fell_df = met_df[met_df['fall'] == 'Fell']

In [7]:
#check for missing values in the fell category
fell_df.isnull().sum()

name            0
id              0
nametype        0
recclass        0
mass (g)       32
fall            0
year            0
reclat         10
reclong        10
GeoLocation    10
dtype: int64

In [8]:
#compare the number of observed events to the total number of records

print()
print("Number of observed events:", fell_df.shape[0])
print("Total number of records:", met_df.shape[0])
print("The percentage of observed events is:", round(len(fell_df.index) / len(met_df.index) * 100, 4), '%')
print()


Number of observed events: 1107
Total number of records: 45716
The percentage of observed events is: 2.4215 %



As expected, the majority of the catalogued meteorites are 'found' meteorites which cannot be linked to an actual falling event. As such, the very small sample comprised of actual observations will be usefull for evaluating frequency trends for impacts over the years.

In [9]:
#query the observed records for the earliest event
print(fell_df['year'].min())

860.0


With dates going back more than 1000 years we will need to figure if they will cause outliers, and possibly require to restrict the data to a period with a more consistent number of recorded observations. 

Next, since the names of meteorites are tied to location by naming convention, we check to see if we can impute approximate coordinates for the missing values.  

In [10]:
#displaying the observed records with missing location
display(fell_df[fell_df[['reclat', 'reclong']].isna().any(axis=1)])

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
147,Bulls Run,5163,Valid,Iron?,2250.0,Fell,1964.0,,,
208,Clohars,5383,Valid,L4,48.6,Fell,1822.0,,,
409,Jalanash,12068,Valid,Ureilite,700.0,Fell,1990.0,,,
414,Jemlapur,12079,Valid,L6,450.0,Fell,1901.0,,,
586,Maria Linden,15418,Valid,L4,114.0,Fell,1925.0,,,
681,Natal,16923,Valid,Stone-uncl,1.4,Fell,1973.0,,,
694,Niger (L6),16974,Valid,L6,3.3,Fell,1967.0,,,
695,Niger (LL6),16975,Valid,LL6,3.3,Fell,1967.0,,,
976,Talampaya,23791,Valid,Eucrite-cm,1421.0,Fell,1995.0,,,
1026,Udaipur,24099,Valid,H3,2000.0,Fell,1976.0,,,


In [11]:
print(fell_df.index[fell_df['name'] == 'Bulls Run'])

Int64Index([147], dtype='int64')


In [12]:
fell_df.iloc[146]

name           Bulls Run
id                  5163
nametype           Valid
recclass           Iron?
mass (g)          2250.0
fall                Fell
year              1964.0
reclat               NaN
reclong              NaN
GeoLocation          NaN
Name: 147, dtype: object

There is a discrepancy between the displayed index and the actual (0) index. To avoid imputing the new values in the wrong records an index reset is required.

In [13]:
#reset the index
fell_df.reset_index(drop = True, inplace =True)

In [14]:
#verifiy if the index matches
print(fell_df.index[fell_df['name'] == 'Bulls Run'])

print(fell_df.iloc[146])

Int64Index([146], dtype='int64')
name           Bulls Run
id                  5163
nametype           Valid
recclass           Iron?
mass (g)          2250.0
fall                Fell
year              1964.0
reclat               NaN
reclong              NaN
GeoLocation          NaN
Name: 146, dtype: object


The 'Bulls Run' meteorite reportedly fell on a farm called Bulls run in South Africa. Currently there is more than one farm with that name in South Africa, so a central region will be imputed as an approximate location. 

In [15]:
#add approximate coordinates for the 'Bulls Run' metetorite

pd.options.mode.chained_assignment = None  

fell_df.loc[146, 'reclat':'GeoLocation'] = -29.098450, 22.584989, '-29.098450, 22.584989'
fell_df.iloc[146]



name                       Bulls Run
id                              5163
nametype                       Valid
recclass                       Iron?
mass (g)                      2250.0
fall                            Fell
year                          1964.0
reclat                     -29.09845
reclong                    22.584989
GeoLocation    -29.098450, 22.584989
Name: 146, dtype: object

In [16]:
#list the observed records with missing location and normalized indexes
display(fell_df[fell_df[['reclat', 'reclong']].isna().any(axis=1)])

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
207,Clohars,5383,Valid,L4,48.6,Fell,1822.0,,,
408,Jalanash,12068,Valid,Ureilite,700.0,Fell,1990.0,,,
413,Jemlapur,12079,Valid,L6,450.0,Fell,1901.0,,,
584,Maria Linden,15418,Valid,L4,114.0,Fell,1925.0,,,
679,Natal,16923,Valid,Stone-uncl,1.4,Fell,1973.0,,,
692,Niger (L6),16974,Valid,L6,3.3,Fell,1967.0,,,
693,Niger (LL6),16975,Valid,LL6,3.3,Fell,1967.0,,,
972,Talampaya,23791,Valid,Eucrite-cm,1421.0,Fell,1995.0,,,
1022,Udaipur,24099,Valid,H3,2000.0,Fell,1976.0,,,


Next, the coordinates for the remaining missing locations will be imputed according to their approximate location. 

The names of the meteorites are tied to location by naming convention e.g. there is an actual place called Maria-Linden in Eastern Cape, South Africa. It's not certain that this is the actual location of the meteorite with the same name but the coordinates will suffice for retrieving country and continent data.

#### Note
Niger L6 and LL6 are part of the same object that was seen falling near the village of Koutiaran, Niger. 

In [17]:
#imputing the recommended coordinates (Met. Soc.) for the 'Clohars' meteorite
fell_df.loc[[207], 'reclat':'GeoLocation'] = 47.890000, -4.060000, '47.890000, -4.060000'
#impute the coordinates of the province of origin for the 'Jalanash' meteorite 
fell_df.loc[[408], 'reclat':'GeoLocation'] = 48.300000, 89.500000, '48.300000, 89.500000'
#impute the coordinates of the province of origin for the 'Jemlapur' meteorite ;
fell_df.loc[[413], 'reclat':'GeoLocation'] = 25.300000, 86.500000, '25.3, 86.5'
#impute the coordinates for the 'Maria Linden' meteorite 
fell_df.loc[[584], 'reclat':'GeoLocation'] = -30.250000, 28.483333, '-30.250000, 28.483333'
#impute the Natal south african province coordinates for the 'Natal' meteorite 
fell_df.loc[[679], 'reclat':'GeoLocation'] = -30.57249771, 30.57249771, '-30.57249771, 30.57249771'
#Niger L6 and LL6 share the same coordinates
fell_df.loc[[692], 'reclat':'GeoLocation'] = 13.71455, 9.15856, '13.71455, 9.15856'
fell_df.loc[[693], 'reclat':'GeoLocation'] = 13.71455, 9.15856, '13.71455, 9.15856'
#impute the coordinates for the 'Talampaya' meteorite 
fell_df.loc[[972], 'reclat':'GeoLocation'] = -29.76811, -67.93886, '-29.76811, -67.93886'
#impute the coordinates for the 'Udaipur' meteorite 
fell_df.loc[[1022], 'reclat':'GeoLocation'] = 24.57872, 73.68626, '24.57872, 73.68626'

In [18]:
print(display(fell_df[fell_df[['reclat', 'reclong']].isna().any(axis=1)]))

fell_df.isna().sum()

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation


None


name            0
id              0
nametype        0
recclass        0
mass (g)       32
fall            0
year            0
reclat          0
reclong         0
GeoLocation     0
dtype: int64

In [19]:
#Entries with zero values will not work with geopy to generate regional names 
fell_df.loc[fell_df['reclat'] == 0]

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
595,Mason Gully,53653,Valid,H5,24.54,Fell,2010.0,0.0,0.0,"(0.0, 0.0)"
609,Meru,15491,Valid,LL6,6000.0,Fell,1945.0,0.0,37.66667,"(0.0, 37.66667)"


In [20]:
#Mason Gully coordinate imputation
fell_df.loc[[595], 'reclat':'GeoLocation'] = -30.453609466, 127.647201538, '-30.453609466, 127.647201538'
fell_df.loc[595]


name                            Mason Gully
id                                    53653
nametype                              Valid
recclass                                 H5
mass (g)                              24.54
fall                                   Fell
year                                 2010.0
reclat                           -30.453609
reclong                          127.647202
GeoLocation    -30.453609466, 127.647201538
Name: 595, dtype: object

In [21]:
#get the country and continent name from the coordinates
!pip install geopy pycountry-convert tqdm -q

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import pycountry_convert as pc

from pprint import pprint
from typing import Tuple

from tqdm import tqdm
tqdm.pandas()

def get_continent_name(continent_code: str) -> str:
    continent_dict = {
        "NA": "North America",
        "SA": "South America",
        "AS": "Asia",
        "AF": "Africa",
        "OC": "Oceania",
        "EU": "Europe",
        "AQ" : "Antarctica"
    }
    return continent_dict[continent_code]

def get_continent(lat: float, lon:float) -> Tuple[str, str]:
    geolocator = Nominatim(user_agent="alin.airinei924@gmail.com", timeout=10)
    geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1)

    location = geocode(f"{lat}, {lon}", language="en")

    # for cases where the location is not found, coordinates are antarctica
    if location is None:
        return "Antarctica", "Antarctica"

    # extract country code
    address = location.raw['address']
    country_code = address['country_code'].upper()

    # get continent code from country code
    continent_code = pc.country_alpha2_to_continent_code(country_code)
    
    #convert code to names
    continent_name = get_continent_name(continent_code)
    country_name = pc.country_alpha2_to_country_name(country_code)
    
    return country_name, continent_name

fell_df[["country", "continent"]] = fell_df.progress_apply(
    lambda x: get_continent(x["reclat"], x["reclong"]), axis=1, result_type="expand")

100%|███████████████████████████████████████| 1107/1107 [09:22<00:00,  1.97it/s]


In [22]:
print(fell_df.columns)
display(fell_df.head())

Index(['name', 'id', 'nametype', 'recclass', 'mass (g)', 'fall', 'year',
       'reclat', 'reclong', 'GeoLocation', 'country', 'continent'],
      dtype='object')


Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,country,continent
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775, 6.08333)",Germany,Europe
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.18333, 10.23333)",Denmark,Europe
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.21667, -113.0)",Canada,North America
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.88333, -99.9)",Mexico,North America
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.16667, -64.95)",Argentina,South America


In [23]:
fell_df.to_csv('./Data/fell.csv')

In [None]:
display(fell_df[fell_df[['mass (g)',]].isna().any(axis=1)])