In [4]:
from math import sin, cos, sqrt, atan2, radians

import numpy as np
import pandas as pd

In [5]:
# Load the earthquake table from the extraction layer.
f_earthquakes = pd.read_parquet(
    path='../data/extraction_layer/f_earthquakes.parquet',
    engine='fastparquet',
)

In [6]:
# Load the GDP table from the extraction layer.
f_gdp = pd.read_parquet(
    path='../data/extraction_layer/f_gdp.parquet',
    engine='fastparquet',
)

In [7]:
# Columns discarded from the earthquake frame.
drop_eq_columns = [
    'id_earthquake', 'time', 'depth', 'mag', 'magType', 'nst', 'gap', 'dmin', 'rms',
    'net', 'id', 'place', 'type', 'horizontalError', 'depthError',
    'magError', 'magNst', 'status', 'locationSource', 'magSource',
    'updated_date',
]
# Columns discarded from the GDP frame.
drop_gdp_columns = ['id_gdp', 'gdp', 'state_code', 'state_name']

# Rebind each name to the trimmed copy returned by drop().
f_earthquakes = f_earthquakes.drop(drop_eq_columns, axis=1)
f_gdp = f_gdp.drop(drop_gdp_columns, axis=1)

In [8]:
# Suffix the shared column names so both sides stay distinguishable once the
# two frames are merged into one.
columns_eq_renaming = {"year": "year_eq", "longitude": "longitude_eq", "latitude": "latitude_eq"}
columns_gdp_renaming = {"year": "year_gdp", "longitude": "longitude_gdp", "latitude": "latitude_gdp"}

# Rebind to the returned frame instead of mutating with inplace=True: the
# cell then reads as a plain input -> output step and can be chained.
f_earthquakes = f_earthquakes.rename(columns=columns_eq_renaming)
f_gdp = f_gdp.rename(columns=columns_gdp_renaming)

In [9]:
# Remove fully duplicated rows from both frames. The result is assigned back
# rather than using inplace=True, which offers no performance benefit and
# hides the mutation from the reader.
f_gdp = f_gdp.drop_duplicates()
f_earthquakes = f_earthquakes.drop_duplicates()

In [10]:
# (rows, columns) of the earthquake frame at this point in the notebook.
f_earthquakes.shape

(3176391, 4)

In [11]:
# (rows, columns) of the GDP frame at this point in the notebook.
f_gdp.shape

(1250, 4)

In [12]:
# NOTE(review): repeats the earthquake shape check from two cells earlier and
# shows the same output; this cell looks redundant.
f_earthquakes.shape

(3176391, 4)

In [13]:
# NOTE(review): repeats the GDP shape check from two cells earlier and shows
# the same output; this cell looks redundant.
f_gdp.shape

(1250, 4)

In [14]:


def calc_distance_math(lon_cent, lat_cent, lon_obj, lat_obj, radius_km=6373.0):
    '''
    Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lon_cent, lat_cent : float
        Longitude and latitude of the first point, in decimal degrees.
    lon_obj, lat_obj : float
        Longitude and latitude of the second point, in decimal degrees.
    radius_km : float, optional
        Sphere radius used to scale the central angle. Defaults to 6373.0,
        the approximate Earth radius in km this notebook has always used.

    Returns
    -------
    float
        Distance along the sphere in the same unit as ``radius_km``.
        NaN inputs propagate to a NaN result.
    '''
    lat1 = radians(lat_cent)
    lon1 = radians(lon_cent)
    lat2 = radians(lat_obj)
    lon2 = radians(lon_obj)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Haversine of the central angle.
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # For (near-)antipodal points rounding can leave `a` a hair above 1, which
    # would make sqrt(1 - a) raise "math domain error". A plain comparison is
    # used (not min/max) so that NaN is left untouched and still propagates.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius_km * c

In [15]:
# Preview the first five rows of the trimmed and renamed earthquake frame.
f_earthquakes.head(5)

Unnamed: 0,key_earthquake_gdp,year_eq,latitude_eq,longitude_eq
0,59541ce5-ab80-5658-806e-bab360e4d00b,1970,37.003502,-117.996834
1,969e1542-110a-5ee1-9a50-43b25711474c,1970,35.642788,-120.933601
2,64a66f7b-34c1-5a41-9d64-5313e34ddba1,1970,34.16452,-118.185036
3,d1b3dc1a-58c2-56d8-ac4e-a01008cfea8b,1970,33.836494,-116.781868
4,c9ac79b8-69f9-5555-a536-2c7c7a3a5daa,1970,33.208477,-115.476997


In [16]:
# Join keys: the year column on each side.
key_left = ['year_eq']
key_right = ['year_gdp']

# Inner join on year only, so each earthquake row is paired with every GDP
# row that carries the same year.
df_merged = f_earthquakes.merge(
    f_gdp,
    left_on=key_left,
    right_on=key_right,
    how="inner",
)

In [None]:
def _haversine_km(lon_cent, lat_cent, lon_obj, lat_obj, radius_km=6373.0):
    """Vectorised haversine distance for arrays of coordinates in degrees.

    Same formula and default radius as ``calc_distance_math``, but evaluated
    with NumPy on whole columns at once instead of one Python call per row.
    """
    lat1 = np.radians(lat_cent)
    lat2 = np.radians(lat_obj)
    dlat = lat2 - lat1
    dlon = np.radians(lon_obj) - np.radians(lon_cent)

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally above 1 for near-antipodal pairs;
    # np.minimum caps it while still propagating NaN.
    a = np.minimum(a, 1.0)

    return radius_km * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))


# Row-wise DataFrame.apply(axis=1) makes one Python call per row, which is
# impractical on the merged frame (about 110 million rows per the recorded
# shape). The column-wise version below computes the same distances.
df_merged['distance'] = _haversine_km(
    df_merged['longitude_gdp'].to_numpy(dtype=float),
    df_merged['latitude_gdp'].to_numpy(dtype=float),
    df_merged['longitude_eq'].to_numpy(dtype=float),
    df_merged['latitude_eq'].to_numpy(dtype=float),
)

In [17]:
# (rows, columns) of the year-joined earthquake/GDP frame.
df_merged.shape

(110202500, 8)

In [18]:
# Load the earthquake/GDP link table from the extraction layer.
link = pd.read_parquet(
    path='../data/extraction_layer/link_eq_gdp.parquet',
    engine='fastparquet',
)

In [22]:
# (rows, columns) of the link table.
link.shape

(3600612, 2)