In [1]:
import pandas as pd
import json
from math import radians, cos, sin, asin, sqrt
import math
import numpy as np
from geopy.geocoders import Nominatim

In [2]:
# import the data in a dataframe
json_data=open('nuclear_tests1.json').read()
data = json.loads(json_data)
df = pd.DataFrame(data)
df.head()
print(len(df))

2065


In [3]:
print(str(len(df[pd.isnull(df['decimal latitude/longitude'])])) + ' / ' + str(len(df)) + ' entries do not have coordinates')
print(str(len(df[df['site'] == ''])) + ' / ' + str(len(df)) + ' entries do not have a site')
distinct_sites = set(df['site'])
print('There are ' + str(len(distinct_sites)) + ' distinct sites')

1211 / 2065 entries do not have coordinates
119 / 2065 entries do not have a site
There are 36 distinct sites


In [4]:
# Only one entry has neither a site nor coordinates:
df[(pd.isnull(df['decimal latitude/longitude'])) & (df['site'] == '')]

Unnamed: 0,body wave magnitude,country,date,decimal latitude/longitude,name,original time,site,type,yield
2064,,UNK,22/09/1979,,-,03:00:00 (Universal Coordinated Time),,Atmospheric,


After some research on the date, it is the "Vela Incident" (see : https://en.wikipedia.org/wiki/Vela_Incident)
We then add a position to it : lat: -47, long: 40

In [5]:
df.loc[(pd.isnull(df['decimal latitude/longitude'])) & (df['site'] == ''), 'decimal latitude/longitude'] = '-47/40'

In [6]:
# We group the dataframe by sites and export the coordinates of each event in a distinct list for each site
# the goal is to see events on a same site can have very different coordinates or not.
coord_lists = df.fillna('-').groupby('site')['decimal latitude/longitude'].apply(list)

In [7]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [8]:
df[pd.notnull(df['decimal latitude/longitude'])].head()

Unnamed: 0,body wave magnitude,country,date,decimal latitude/longitude,name,original time,site,type,yield
0,4.9,CHN,29/07/1996,41.82/88.42,-,01:48:57 (Universal Coordinated Time),Lop Nor,Underground,
1,5.9,CHN,08/06/1996,41.66/88.69,-,02:55:58 (Universal Coordinated Time),Lop Nor,Underground,
2,6.0,CHN,17/08/1995,41.56/88.8,-,00:59:57 (Universal Coordinated Time),Lop Nor,Underground,
3,6.1,CHN,15/05/1995,41.6/88.82,-,04:05:57 (Universal Coordinated Time),Lop Nor,Underground,
4,6.0,CHN,07/10/1994,41.66/88.75,-,03:25:58 (Universal Coordinated Time),Lop Nor,Underground,


In [9]:
no_coord_sites = []
coord_sites = dict()
for i in range(0, len(coord_lists)):
    dists = []
    lats = []
    longs = []
    lat0 = ""
    long0 = ""
    print(coord_lists.index[i])
    for event in coord_lists[i]:
        if('/' in str(event)):
            lat0, long0 = event.split('/')
            break
    if (lat0!="" and long0!=""):
        for event in coord_lists[i]:
            event = str(event)
            if('/' in event):
                lat, long = event.split('/')
                dists.append(haversine(float(long0), float(lat0), float(long), float(lat)))
                lats.append(float(lat))
                longs.append(float(long))
        
        print(np.mean(dists))
        meanlat = np.mean(lats)
        meanlong = np.mean(longs)
        coord_sites[coord_lists.index[i]]=str(meanlat)+"/"+str(meanlong)
    else:
        no_coord_sites.append(coord_lists.index[i])
        print("No distances")
    print("---------")



1739.12598174
---------
Amchitka, Alaska
4.78866535241
---------
Bikini Is (USA)
32.7202064536
---------
Carlsnad, New Mexico (USA)
No distances
---------
Central Nevada
No distances
---------
Christmas Island
No distances
---------
Christmas Island (USA)
No distances
---------
Ekker, Algeria
1.42693253151
---------
Emu Fields, Australia
No distances
---------
Enewetak (USA)
53.7383584401
---------
Fallon, Nevada
No distances
---------
Farmington, New Mexico
No distances
---------
Grand Valley, Colorado
No distances
---------
Hattiesburg, Mississippi
No distances
---------
Hiroshima, Japan
0.0
---------
Johnston Island
No distances
---------
Kharan
0.0
---------
Lop Nor
54.0866798425
---------
Maralinga
No distances
---------
Missile Testing Range
138.40896036
---------
Monte Bello Is
No distances
---------
NTS
No distances
---------
Nagasaki, Japan
0.0
---------
Nellis Air Force Range
No distances
---------
Nevada Test Site
5.55569994183
---------
New Mexico (USA)
No distances
------

In [10]:
# As we can see, some events have a site but no location. 

In [11]:
#first make a dict site => coords (from geopy)
if 0:
    geolocator = Nominatim()
    geopy_site_coords = dict()
    for site in no_coord_sites:
        location = geolocator.geocode(site)
        if location is not None:
            coord = str(location.latitude) + "/" + str(location.longitude)
            #print(site)
            #print(coord)
            geopy_site_coords[site] = coord
        else:
            print("coordinates not found for site " + str(site))

Geopy seems to give some buggy response, we do it by hand

In [12]:
for site in no_coord_sites:
    print(site)

Carlsnad, New Mexico (USA)
Central Nevada
Christmas Island
Christmas Island (USA)
Emu Fields, Australia
Fallon, Nevada
Farmington, New Mexico
Grand Valley, Colorado
Hattiesburg, Mississippi
Johnston Island
Maralinga
Monte Bello Is
NTS
Nellis Air Force Range
New Mexico (USA)
Reggan,Algeria
Rifle, Colorado
South Atlantic
Tuamotu Archipelago


In [13]:
if 0:
    print(len(no_coord_sites))
    print(len(geopy_site_coords))

In [14]:
if 0 :
    # For the 3 other position we have to do it by hand (using https://www.latlong.net/)
    geopy_site_coords['Carlsnad, New Mexico (USA)']="32.420674/-104.228837"
    geopy_site_coords['Emu Fields, Australia']="-28.698333/132.371389"
    geopy_site_coords['Nellis Air Force Range']="36.308119/-115.049367"

In [15]:
#we have to do it by hand (using https://www.latlong.net/)
hand_site_coords = dict()
hand_site_coords['Carlsnad, New Mexico (USA)']="32.420674/-104.228837"
hand_site_coords['Central Nevada']="38.060229/-117.220892"
hand_site_coords['Christmas Island']="-10.447525/105.690449"
hand_site_coords['Christmas Island (USA)']="-10.451208/105.688545"
hand_site_coords['Emu Fields, Australia']="-28.698333/132.371389"
hand_site_coords['Fallon, Nevada']="39.474869/-118.777041"
hand_site_coords['Farmington, New Mexico']="36.728058/-108.218686"
hand_site_coords['Grand Valley, Colorado']="40.072738/-106.064018"
hand_site_coords['Hattiesburg, Mississippi']="31.327119/-89.290339"
hand_site_coords['Johnston Island']="16.729503/-169.533648"
hand_site_coords['Maralinga']="-30.163170/131.575195"
hand_site_coords['Monte Bello Is']="-20.439168/115.556260"
hand_site_coords['NTS']="37.116449/-116.188871"
hand_site_coords['Nellis Air Force Range']="36.308119/-115.049367"
hand_site_coords['New Mexico (USA)']="34.519940/-105.870090"
hand_site_coords['Reggan,Algeria']="25.275963/-1.520862"
hand_site_coords['Rifle, Colorado']="39.534702/-107.783120"
hand_site_coords['South Atlantic']="-47/40"
hand_site_coords['Tuamotu Archipelago']="-19.000000/-142.000000"
hand_site_coords['Nellis Air Force Range']="36.308119/-115.049367"

In [16]:
for i in df.index:
    if('/' not in str(df['decimal latitude/longitude'][i])):
        if (df['site'][i] in hand_site_coords.keys()):
            df['decimal latitude/longitude'][i] = hand_site_coords[df['site'][i]]

In [17]:
print(str(len(df[pd.isnull(df['decimal latitude/longitude'])])) + ' / ' + str(len(df)) + ' entries do not have coordinates')

927 / 2065 entries do not have coordinates


In [18]:
df[pd.isnull(df['decimal latitude/longitude'])].head()

Unnamed: 0,body wave magnitude,country,date,decimal latitude/longitude,name,original time,site,type,yield
1008,4.4,USA,23/09/1992,,DIVIDER,15:04:00 (Universal Coordinated Time),Nevada Test Site,Underground,
1009,4.4,USA,18/09/1992,,HUNTERS TROPHY,17:00:00 (Universal Coordinated Time),Nevada Test Site,Underground,
1010,,USA,23/06/1992,,GALENA-YELLOW,15:00:00 (Universal Coordinated Time),Nevada Test Site,Underground,
1011,,USA,19/06/1992,,VICTORIA,16:45:00 (Universal Coordinated Time),Nevada Test Site,Underground,
1012,,USA,30/04/1992,,DIAMOND FORTUNE,16:30:00 (Universal Coordinated Time),Nevada Test Site,Underground,


In [19]:
# Now we need to use the informations we get for each site in our data and use it to extrapolate the location of the 
for i in df.index:
    if('/' not in str(df['decimal latitude/longitude'][i])):
        if (df['site'][i] in coord_sites.keys()):
            df['decimal latitude/longitude'][i] = coord_sites[df['site'][i]]

In [20]:
print(str(len(df[pd.isnull(df['decimal latitude/longitude'])])) + ' / ' + str(len(df)) + ' entries do not have coordinates')

0 / 2065 entries do not have coordinates


In [21]:
# We now have coordinates for every item in our dataframe
df[pd.isnull(df['decimal latitude/longitude'])].head()

Unnamed: 0,body wave magnitude,country,date,decimal latitude/longitude,name,original time,site,type,yield


In [22]:
df_selected = df[['body wave magnitude', 'country', 'date', 'decimal latitude/longitude', 'name', 'site', 'type']]

In [23]:
df_selected['lat'], df_selected['lng'] = df['decimal latitude/longitude'].str.split('/', 1).str

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [24]:
df_selected.drop('decimal latitude/longitude', axis=1, inplace=True)

In [25]:
df_selected.columns = ['magnitude', 'country', 'date', 'name', 'site', 'type', 'lat','lng']

In [26]:
if 0:
    df_selected['textual_infos'] = df_selected['site']
    for i in df_selected.index:
        if (df_selected['textual_infos'][i]==""):
            location = str(df_selected['lat'][i])+","+str(df_selected['lng'][i])
            geolocator = Nominatim()
            geolocation = geolocator.reverse(location)
            df_selected['textual_infos'][i] = geolocation.address

In [27]:
df_selected.head()

Unnamed: 0,magnitude,country,date,name,site,type,lat,lng
0,4.9,CHN,29/07/1996,-,Lop Nor,Underground,41.82,88.42
1,5.9,CHN,08/06/1996,-,Lop Nor,Underground,41.66,88.69
2,6.0,CHN,17/08/1995,-,Lop Nor,Underground,41.56,88.8
3,6.1,CHN,15/05/1995,-,Lop Nor,Underground,41.6,88.82
4,6.0,CHN,07/10/1994,-,Lop Nor,Underground,41.66,88.75


In [28]:
if 0:
    results = {}
    for key, df_gb in df.iterrows():
        results[str(key)] = df_gb.to_dict()
    import json
    with open('MapTimeline/data/nuclear_test_cleaned.json', 'w') as outfile:
        json.dump(results, outfile, indent=4)

Get the min and max value for the date, to use it for our slider

In [29]:
df_selected['datetime'] = pd.to_datetime(df_selected['date'])

In [30]:
min(df_selected.datetime)

Timestamp('1945-05-08 00:00:00')

In [31]:
max(df_selected.datetime)

Timestamp('2006-09-10 00:00:00')

In [32]:
df_selected.drop('datetime', axis=1, inplace=True)

We want to use the magnitude to show the tests on the map, so we fill the na in this column with the mean

In [33]:
df_selected['magnitude'] = pd.to_numeric(df_selected['magnitude'])

In [34]:
mean_magnitude = (float)('%.1f'%df_selected['magnitude'].mean())

In [35]:
df_selected['magnitude'].fillna(mean_magnitude, inplace=True)

We want to show what countries are represented on our map (such that we can find their flags for the viz)

In [36]:
df_selected.country.unique()

array(['CHN', 'FRA', 'IND', 'PAK', 'NK', 'SOV', 'GBR', 'USA', 'UNK'], dtype=object)

## Fixing Nevada Test Site location

After playing with our visualization we discovered strange data points (near Korea) being some USA tests. We were wondering what were these tests and we discovered that the site of these point was "Nevada Test Site".

In the original data scrapped we saw that "Nevada Test Site" add a bad location (lat:37, long:116) while the real position of this site is at around (lat:37, long:-116).

We decided to correct it here

In [37]:
longNev = df_selected[df_selected['site']=='Nevada Test Site'].mode()['lng'][0]

In [38]:
df_selected.loc[df_selected['site']=='Nevada Test Site', 'lng'] = '-' + longNev

In [39]:
df_selected.to_csv("nuclear_test.tsv", sep='\t')