# Section 1: Getting Started with Pandas

## NumPy arrays

In [2]:
import numpy as np

In [3]:
data = np.genfromtxt('data/example_data.csv', delimiter=';', names=True, dtype=None, encoding='UTF')
data

array([('2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia', 'mww', 6.7, 'green', 1),
       ('2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww', 5.2, 'green', 0),
       ('2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww', 5.7, 'green', 0),
       ('2018-10-12 21:09:49.240', '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0),
       ('2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea', 'mww', 5.6, 'green', 1)],
      dtype=[('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i8')])

In [4]:
data.shape

(5,)

In [5]:
data.dtype

dtype([('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i8')])

In [6]:
%%timeit
max([row[3] for row in data])

3.97 µs ± 44.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [7]:
max([row[3] for row in data])

6.7

In [8]:
array_dict = {
    col: np.array([row[i] for row in data])
    for i, col in enumerate(data.dtype.names)}

In [9]:
array_dict

{'time': array(['2018-10-13 11:10:23.560', '2018-10-13 04:34:15.580',
        '2018-10-13 00:13:46.220', '2018-10-12 21:09:49.240',
        '2018-10-12 02:52:03.620'], dtype='<U23'),
 'place': array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
        '42km WNW of Sola, Vanuatu',
        '13km E of Nueva Concepcion, Guatemala',
        '128km SE of Kimbe, Papua New Guinea'], dtype='<U37'),
 'magType': array(['mww', 'mww', 'mww', 'mww', 'mww'], dtype='<U3'),
 'mag': array([6.7, 5.2, 5.7, 5.7, 5.6]),
 'alert': array(['green', 'green', 'green', 'green', 'green'], dtype='<U5'),
 'tsunami': array([1, 0, 0, 0, 1])}

In [10]:
%%timeit
array_dict['mag'].max()

2.15 µs ± 89.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [11]:
my_array = {}
for i, col in enumerate(data.dtype.names):
    elements =[]
    for row in data:
        elements.append(row[i])
    my_array[col] = np.array(elements)
my_array

{'time': array(['2018-10-13 11:10:23.560', '2018-10-13 04:34:15.580',
        '2018-10-13 00:13:46.220', '2018-10-12 21:09:49.240',
        '2018-10-12 02:52:03.620'], dtype='<U23'),
 'place': array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
        '42km WNW of Sola, Vanuatu',
        '13km E of Nueva Concepcion, Guatemala',
        '128km SE of Kimbe, Papua New Guinea'], dtype='<U37'),
 'magType': array(['mww', 'mww', 'mww', 'mww', 'mww'], dtype='<U3'),
 'mag': array([6.7, 5.2, 5.7, 5.7, 5.6]),
 'alert': array(['green', 'green', 'green', 'green', 'green'], dtype='<U5'),
 'tsunami': array([1, 0, 0, 0, 1])}

In [12]:
np.array([
    value[array_dict['mag'].argmax()]
    for key, value in array_dict.items()])

array(['2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia',
       'mww', '6.7', 'green', '1'], dtype='<U32')

## Series

The Series class provides a data structure for arrays of a single type, just like the NumPy
array. However, it comes with some additional functionality. This one-dimensional
representation can be thought of as a column in a spreadsheet. We have a name for our
column, and the data we hold in it is of the same type (since we are measuring the same
variable):

In [13]:
import pandas as pd

In [14]:
place = pd.Series(array_dict['place'], name='place')
place

0          262km NW of Ozernovskiy, Russia
1              25km E of Bitung, Indonesia
2                42km WNW of Sola, Vanuatu
3    13km E of Nueva Concepcion, Guatemala
4      128km SE of Kimbe, Papua New Guinea
Name: place, dtype: object

In [15]:
place.name

'place'

In [16]:
place.dtype

dtype('O')

In [17]:
place.shape

(5,)

In [18]:
place.index

RangeIndex(start=0, stop=5, step=1)

In [19]:
place.values

array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
       '42km WNW of Sola, Vanuatu',
       '13km E of Nueva Concepcion, Guatemala',
       '128km SE of Kimbe, Papua New Guinea'], dtype=object)

In [20]:
place.index.values

array([0, 1, 2, 3, 4])

In [21]:
numbers = np.linspace(0, 10, num=5)
numbers

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

In [22]:
x = pd.Series(numbers)
y = pd.Series(numbers, index=pd.Index([1, 2, 3, 4, 5]))
x + y

0     NaN
1     2.5
2     7.5
3    12.5
4    17.5
5     NaN
dtype: float64

## Data Frames

In [23]:
df = pd.DataFrame(array_dict)
df

Unnamed: 0,time,place,magType,mag,alert,tsunami
0,2018-10-13 11:10:23.560,"262km NW of Ozernovskiy, Russia",mww,6.7,green,1
1,2018-10-13 04:34:15.580,"25km E of Bitung, Indonesia",mww,5.2,green,0
2,2018-10-13 00:13:46.220,"42km WNW of Sola, Vanuatu",mww,5.7,green,0
3,2018-10-12 21:09:49.240,"13km E of Nueva Concepcion, Guatemala",mww,5.7,green,0
4,2018-10-12 02:52:03.620,"128km SE of Kimbe, Papua New Guinea",mww,5.6,green,1


In [24]:
df.values

array([['2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia',
        'mww', 6.7, 'green', 1],
       ['2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww',
        5.2, 'green', 0],
       ['2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww',
        5.7, 'green', 0],
       ['2018-10-12 21:09:49.240',
        '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0],
       ['2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea',
        'mww', 5.6, 'green', 1]], dtype=object)

In [25]:
df.columns

Index(['time', 'place', 'magType', 'mag', 'alert', 'tsunami'], dtype='object')

In [26]:
df.index.values

array([0, 1, 2, 3, 4])

In [27]:
df.place

0          262km NW of Ozernovskiy, Russia
1              25km E of Bitung, Indonesia
2                42km WNW of Sola, Vanuatu
3    13km E of Nueva Concepcion, Guatemala
4      128km SE of Kimbe, Papua New Guinea
Name: place, dtype: object

In [28]:
df.shape

(5, 6)

In [29]:
df.dtypes

time        object
place       object
magType     object
mag        float64
alert       object
tsunami      int64
dtype: object

In [30]:
pd.read_csv??

In [31]:
import datetime as dt

In [32]:
np.random.seed(0)
pd.Series(np.random.rand(5), name='random')

0    0.548814
1    0.715189
2    0.602763
3    0.544883
4    0.423655
Name: random, dtype: float64

In [33]:
pd.DataFrame(
    {
        'random': np.random.rand(5),
        'text': ['hot', 'warm', 'cool', 'cold', None],
        'truth': [np.random.choice([True, False]) for _ in range(5)]
    },
    index = pd.date_range(
        end=dt.date(2019, 4, 21),
        freq='1D',
        periods=5,
        name='date'))

Unnamed: 0_level_0,random,text,truth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-17,0.645894,hot,True
2019-04-18,0.437587,warm,False
2019-04-19,0.891773,cool,False
2019-04-20,0.963663,cold,True
2019-04-21,0.383442,,True


In [34]:
pd.DataFrame([
    {'mag': 5.2, 'place': 'California'},
    {'mag': 1.2, 'place': 'Alaska'},
    {'mag': 0.2, 'place': 'California'}
])

Unnamed: 0,mag,place
0,5.2,California
1,1.2,Alaska
2,0.2,California


In [35]:
list_of_tuples = [(n, n**2, n**3) for n in range(5)]
pd.DataFrame(
    list_of_tuples,
    columns=['n', 'n_squared', 'n_cubed'])

Unnamed: 0,n,n_squared,n_cubed
0,0,0,0
1,1,1,1
2,2,4,8
3,3,9,27
4,4,16,64


Before read a data file let's inspect it.

### Reading df from file

In [36]:
!wc -l data/earthquakes.csv

    9333 data/earthquakes.csv


In [37]:
!pwd

/Users/April/Projects/!_new_wave/_in_progress/da_w_pandas


In [38]:
!ls -lh data | grep earthquakes.csv

-rw-r--r--@ 1 April  staff   3.4M Jul 21 08:05 earthquakes.csv


In [39]:
files = !ls -lh data
[file for file in files if 'earthquake' in file]

['-rw-r--r--@ 1 April  staff   3.4M Jul 21 08:05 earthquakes.csv']

In [40]:
# let's take a look at the top few rows to see if the file comes with headers
!head -n 2 data/earthquakes.csv

alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,mmi,net,nst,place,rms,sig,sources,status,time,title,tsunami,type,types,tz,updated,url
,,37389218,https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci37389218&format=geojson,0.008693,,85.0,",ci37389218,",1.35,ml,,ci,26.0,"9km NE of Aguanga, CA",0.19,28,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventpage/ci37389218


In [41]:
# check the bottom rows to make sure there is no extraneous data that we will need to ignore
!tail -n 2 data/earthquakes.csv

,,38063959,https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci38063959&format=geojson,0.01865,,61.0,",ci38063959,",1.1,ml,,ci,27.0,"9km NE of Aguanga, CA",0.1,19,",ci,",reviewed,1537229545350,"M 1.1 - 9km NE of Aguanga, CA",0,earthquake,",focal-mechanism,geoserve,nearby-cities,origin,phase-data,scitech-link,",-480.0,1537230211640,https://earthquake.usgs.gov/earthquakes/eventpage/ci38063959
,,38063935,https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci38063935&format=geojson,0.01698,,39.0,",ci38063935,",0.66,ml,,ci,24.0,"9km NE of Aguanga, CA",0.1,7,",ci,",reviewed,1537228864470,"M 0.7 - 9km NE of Aguanga, CA",0,earthquake,",focal-mechanism,geoserve,nearby-cities,origin,phase-data,scitech-link,",-480.0,1537305830770,https://earthquake.usgs.gov/earthquakes/eventpage/ci38063935


In [42]:
# the column count in our data
# using the awk utility for pattern scanning and processing
# the -F flag allows us to specify the delimiter
!awk -F',' '{print NF; exit}' data/earthquakes.csv

26


In [43]:
# since we know that the first line of the file contains headers and that the file is
# comma-separated, we can also count the columns by using head to get the headers
# and Python to parse them
headers = !head -n 1 data/earthquakes.csv
len(headers[0].split(','))

26

In [44]:
df = pd.read_csv('data/earthquakes.csv')

In [45]:
df.shape

(9332, 26)

In [46]:
df.describe()

Unnamed: 0,cdi,dmin,felt,gap,mag,mmi,nst,rms,sig,time,tsunami,tz,updated
count,329.0,6139.0,329.0,6164.0,9331.0,93.0,5364.0,9332.0,9332.0,9332.0,9332.0,9331.0,9332.0
mean,2.754711,0.544925,12.31003,121.506588,1.497345,3.651398,19.053878,0.362122,56.899914,1538284000000.0,0.006537,-451.99014,1538537000000.0
std,1.010637,2.214305,48.954944,72.962363,1.203347,1.790523,15.492315,0.317784,91.872163,608030600.0,0.080589,231.752571,656413500.0
min,0.0,0.000648,0.0,12.0,-1.26,0.0,0.0,0.0,0.0,1537229000000.0,0.0,-720.0,1537230000000.0
25%,2.0,0.020425,1.0,66.1425,0.72,2.68,8.0,0.119675,8.0,1537793000000.0,0.0,-540.0,1537996000000.0
50%,2.7,0.05905,2.0,105.0,1.3,3.72,15.0,0.21,26.0,1538245000000.0,0.0,-480.0,1538621000000.0
75%,3.3,0.17725,5.0,159.0,1.9,4.57,25.0,0.59,56.0,1538766000000.0,0.0,-480.0,1539110000000.0
max,8.4,53.737,580.0,355.91,7.5,9.12,172.0,1.91,2015.0,1539475000000.0,1.0,720.0,1539537000000.0


### Working with SQL

In [47]:
# Let's write the tsunami data from the data/tsunamis.csv file to a table in the database
# called tsunamis, replacing the table if it already exists
import sqlite3

with sqlite3.connect('data/quakes.db') as connection:
    pd.read_csv('data/tsunamis.csv').to_sql(
        'tsunamis', connection, index=False, if_exists='replace')

In [48]:
# Let's query our database for the full tsunamis table
with sqlite3.connect('data/quakes.db') as connection:
    tsunamis = pd.read_sql('SELECT * FROM tsunamis', connection)
tsunamis.head()

Unnamed: 0,alert,type,title,place,magType,mag,time
0,,earthquake,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...","165km NNW of Flying Fish Cove, Christmas Island",mww,5.0,1539459504090
1,green,earthquake,"M 6.7 - 262km NW of Ozernovskiy, Russia","262km NW of Ozernovskiy, Russia",mww,6.7,1539429023560
2,green,earthquake,"M 5.6 - 128km SE of Kimbe, Papua New Guinea","128km SE of Kimbe, Papua New Guinea",mww,5.6,1539312723620
3,green,earthquake,"M 6.5 - 148km S of Severo-Kuril'sk, Russia","148km S of Severo-Kuril'sk, Russia",mww,6.5,1539213362130
4,green,earthquake,"M 6.2 - 94km SW of Kokopo, Papua New Guinea","94km SW of Kokopo, Papua New Guinea",mww,6.2,1539208835130


### Working with an API

In [49]:
# getting data from an API

import requests

yesterday = dt.date.today() - dt.timedelta(days=1)
api = 'https://earthquake.usgs.gov/fdsnws/event/1/query'
payload = {
    'format': 'geojson',
    'starttime': yesterday - dt.timedelta(days=30),
    'endtime': yesterday
}
response = requests.get(api, params=payload)

In [50]:
response.status_code

200

In [51]:
earthquake_json = response.json()
earthquake_json.keys()

dict_keys(['type', 'metadata', 'features', 'bbox'])

In [52]:
earthquake_json['metadata']

{'generated': 1658984816000,
 'url': 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2022-06-27&endtime=2022-07-27',
 'title': 'USGS Earthquakes',
 'status': 200,
 'api': '1.13.6',
 'count': 9117}

In [53]:
type(earthquake_json['features'])

list

In [54]:
earthquake_json['features'][0]

{'type': 'Feature',
 'properties': {'mag': -0.4,
  'place': '47 km ESE of Beatty, Nevada',
  'time': 1658879799072,
  'updated': 1658881088464,
  'tz': None,
  'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/nn00843638',
  'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=nn00843638&format=geojson',
  'felt': None,
  'cdi': None,
  'mmi': None,
  'alert': None,
  'status': 'reviewed',
  'tsunami': 0,
  'sig': 0,
  'net': 'nn',
  'code': '00843638',
  'ids': ',nn00843638,',
  'sources': ',nn,',
  'types': ',origin,phase-data,',
  'nst': 11,
  'dmin': 0.064,
  'rms': 0.1124,
  'gap': 112.54,
  'magType': 'ml',
  'type': 'earthquake',
  'title': 'M -0.4 - 47 km ESE of Beatty, Nevada'},
 'geometry': {'type': 'Point', 'coordinates': [-116.2902, 36.7051, 4.8]},
 'id': 'nn00843638'}

In [55]:
earthquake_properties_data = [
    quake['properties']
    for quake in earthquake_json['features']
]

print(f'data sample: {earthquake_properties_data[0]}\n')
print(f'row count: {len(earthquake_properties_data)}')

data sample: {'mag': -0.4, 'place': '47 km ESE of Beatty, Nevada', 'time': 1658879799072, 'updated': 1658881088464, 'tz': None, 'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/nn00843638', 'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=nn00843638&format=geojson', 'felt': None, 'cdi': None, 'mmi': None, 'alert': None, 'status': 'reviewed', 'tsunami': 0, 'sig': 0, 'net': 'nn', 'code': '00843638', 'ids': ',nn00843638,', 'sources': ',nn,', 'types': ',origin,phase-data,', 'nst': 11, 'dmin': 0.064, 'rms': 0.1124, 'gap': 112.54, 'magType': 'ml', 'type': 'earthquake', 'title': 'M -0.4 - 47 km ESE of Beatty, Nevada'}

row count: 9117


In [56]:
df = pd.DataFrame(earthquake_properties_data)
df.head()

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,ids,sources,types,nst,dmin,rms,gap,magType,type,title
0,-0.4,"47 km ESE of Beatty, Nevada",1658879799072,1658881088464,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",nn00843638,",",nn,",",origin,phase-data,",11.0,0.064,0.1124,112.54,ml,earthquake,"M -0.4 - 47 km ESE of Beatty, Nevada"
1,0.74,"8km SW of Morongo Valley, CA",1658879540810,1658941257143,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ci40070247,",",ci,",",nearby-cities,origin,phase-data,scitech-link,",14.0,0.04529,0.09,129.0,ml,earthquake,"M 0.7 - 8km SW of Morongo Valley, CA"
2,2.07,"4 km SSW of Indios, Puerto Rico",1658878898510,1658879774990,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",pr71361178,",",pr,",",origin,phase-data,",8.0,,0.08,208.0,md,earthquake,"M 2.1 - 4 km SSW of Indios, Puerto Rico"
3,1.1,"22km N of Yucca Valley, CA",1658878817040,1658941143213,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ci40070239,",",ci,",",nearby-cities,origin,phase-data,scitech-link,",21.0,0.05621,0.12,70.0,ml,earthquake,"M 1.1 - 22km N of Yucca Valley, CA"
4,1.7,"70 km ENE of Lime Village, Alaska",1658878463042,1658878605320,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ak0229iq559c,",",ak,",",origin,phase-data,",,,0.87,,ml,earthquake,"M 1.7 - 70 km ENE of Lime Village, Alaska"


## Inspecting a DF object

In [86]:
df = pd.read_csv('data/earthquakes.csv')
df.empty

False

In [58]:
print(df.shape)
print(df.columns)
print(df.dtypes)

(9332, 26)
Index(['alert', 'cdi', 'code', 'detail', 'dmin', 'felt', 'gap', 'ids', 'mag',
       'magType', 'mmi', 'net', 'nst', 'place', 'rms', 'sig', 'sources',
       'status', 'time', 'title', 'tsunami', 'type', 'types', 'tz', 'updated',
       'url'],
      dtype='object')
alert       object
cdi        float64
code        object
detail      object
dmin       float64
felt       float64
gap        float64
ids         object
mag        float64
magType     object
mmi        float64
net         object
nst        float64
place       object
rms        float64
sig          int64
sources     object
status      object
time         int64
title       object
tsunami      int64
type        object
types       object
tz         float64
updated      int64
url         object
dtype: object


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9332 entries, 0 to 9331
Data columns (total 26 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alert    59 non-null     object 
 1   cdi      329 non-null    float64
 2   code     9332 non-null   object 
 3   detail   9332 non-null   object 
 4   dmin     6139 non-null   float64
 5   felt     329 non-null    float64
 6   gap      6164 non-null   float64
 7   ids      9332 non-null   object 
 8   mag      9331 non-null   float64
 9   magType  9331 non-null   object 
 10  mmi      93 non-null     float64
 11  net      9332 non-null   object 
 12  nst      5364 non-null   float64
 13  place    9332 non-null   object 
 14  rms      9332 non-null   float64
 15  sig      9332 non-null   int64  
 16  sources  9332 non-null   object 
 17  status   9332 non-null   object 
 18  time     9332 non-null   int64  
 19  title    9332 non-null   object 
 20  tsunami  9332 non-null   int64  
 21  type     9332 

In [60]:
df.describe()

Unnamed: 0,cdi,dmin,felt,gap,mag,mmi,nst,rms,sig,time,tsunami,tz,updated
count,329.0,6139.0,329.0,6164.0,9331.0,93.0,5364.0,9332.0,9332.0,9332.0,9332.0,9331.0,9332.0
mean,2.754711,0.544925,12.31003,121.506588,1.497345,3.651398,19.053878,0.362122,56.899914,1538284000000.0,0.006537,-451.99014,1538537000000.0
std,1.010637,2.214305,48.954944,72.962363,1.203347,1.790523,15.492315,0.317784,91.872163,608030600.0,0.080589,231.752571,656413500.0
min,0.0,0.000648,0.0,12.0,-1.26,0.0,0.0,0.0,0.0,1537229000000.0,0.0,-720.0,1537230000000.0
25%,2.0,0.020425,1.0,66.1425,0.72,2.68,8.0,0.119675,8.0,1537793000000.0,0.0,-540.0,1537996000000.0
50%,2.7,0.05905,2.0,105.0,1.3,3.72,15.0,0.21,26.0,1538245000000.0,0.0,-480.0,1538621000000.0
75%,3.3,0.17725,5.0,159.0,1.9,4.57,25.0,0.59,56.0,1538766000000.0,0.0,-480.0,1539110000000.0
max,8.4,53.737,580.0,355.91,7.5,9.12,172.0,1.91,2015.0,1539475000000.0,1.0,720.0,1539537000000.0


In [61]:
df.describe(percentiles=[0.05, 0.95])

Unnamed: 0,cdi,dmin,felt,gap,mag,mmi,nst,rms,sig,time,tsunami,tz,updated
count,329.0,6139.0,329.0,6164.0,9331.0,93.0,5364.0,9332.0,9332.0,9332.0,9332.0,9331.0,9332.0
mean,2.754711,0.544925,12.31003,121.506588,1.497345,3.651398,19.053878,0.362122,56.899914,1538284000000.0,0.006537,-451.99014,1538537000000.0
std,1.010637,2.214305,48.954944,72.962363,1.203347,1.790523,15.492315,0.317784,91.872163,608030600.0,0.080589,231.752571,656413500.0
min,0.0,0.000648,0.0,12.0,-1.26,0.0,0.0,0.0,0.0,1537229000000.0,0.0,-720.0,1537230000000.0
5%,2.0,0.005491,1.0,35.0,-0.04,0.0,4.0,0.03,0.0,1537344000000.0,0.0,-600.0,1537387000000.0
50%,2.7,0.05905,2.0,105.0,1.3,3.72,15.0,0.21,26.0,1538245000000.0,0.0,-480.0,1538621000000.0
95%,4.3,2.6789,40.2,276.0,4.4,6.38,49.0,0.96,298.0,1539319000000.0,0.0,-60.0,1539400000000.0
max,8.4,53.737,580.0,355.91,7.5,9.12,172.0,1.91,2015.0,1539475000000.0,1.0,720.0,1539537000000.0


In [62]:
df.describe(include=object)

Unnamed: 0,alert,code,detail,ids,magType,net,place,sources,status,title,type,types,url
count,59,9332,9332,9332,9331,9332,9332,9332,9332,9332,9332,9332,9332
unique,2,9332,9332,9332,10,14,5433,52,2,7807,5,42,9332
top,green,37389218,https://earthquake.usgs.gov/fdsnws/event/1/que...,",ci37389218,",ml,ak,"10km NE of Aguanga, CA",",ak,",reviewed,"M 0.4 - 10km NE of Aguanga, CA",earthquake,",geoserve,origin,phase-data,",https://earthquake.usgs.gov/earthquakes/eventp...
freq,58,1,1,1,6803,3166,306,2981,7797,55,9081,5301,1


In [63]:
df.count()

alert        59
cdi         329
code       9332
detail     9332
dmin       6139
felt        329
gap        6164
ids        9332
mag        9331
magType    9331
mmi          93
net        9332
nst        5364
place      9332
rms        9332
sig        9332
sources    9332
status     9332
time       9332
title      9332
tsunami    9332
type       9332
types      9332
tz         9331
updated    9332
url        9332
dtype: int64

In [64]:
df.nunique()

alert         2
cdi          37
code       9332
detail     9332
dmin       4647
felt         44
gap        1114
ids        9332
mag         477
magType      10
mmi          77
net          14
nst         102
place      5433
rms         823
sig         268
sources      52
status        2
time       9332
title      7807
tsunami       2
type          5
types        42
tz           29
updated    9332
url        9332
dtype: int64

In [65]:
cols = ['time', 'updated']
df.corr(method='pearson')

Unnamed: 0,cdi,dmin,felt,gap,mag,mmi,nst,rms,sig,time,tsunami,tz,updated
cdi,1.0,0.187996,0.268136,-0.14619,0.399337,0.646273,0.326456,0.285521,0.496059,-0.05373,0.152761,0.218833,0.071744
dmin,0.187996,1.0,-0.026284,0.004667,0.473606,-0.53408,-0.205622,0.450838,0.51397,-0.016757,0.074724,0.364648,0.123581
felt,0.268136,-0.026284,1.0,-0.148835,0.167702,0.080612,0.461274,0.017168,0.241496,0.028253,0.162864,-0.019137,0.163356
gap,-0.14619,0.004667,-0.148835,1.0,-0.051751,-0.20426,-0.531402,-0.060453,-0.064927,-0.022815,-0.094786,-0.036809,-0.011736
mag,0.399337,0.473606,0.167702,-0.051751,1.0,0.143664,0.351382,0.610893,0.915157,0.024952,0.260269,0.579527,0.188736
mmi,0.646273,-0.53408,0.080612,-0.20426,0.143664,1.0,0.275441,0.092323,0.302261,0.084416,0.559591,0.365396,0.11045
nst,0.326456,-0.205622,0.461274,-0.531402,0.351382,0.275441,1.0,0.104234,0.28356,0.005201,0.051673,-0.164782,0.012179
rms,0.285521,0.450838,0.017168,-0.060453,0.610893,0.092323,0.104234,1.0,0.553085,-0.026704,0.157305,0.278388,0.194743
sig,0.496059,0.51397,0.241496,-0.064927,0.915157,0.302261,0.28356,0.553085,1.0,0.011342,0.366343,0.68271,0.14712
time,-0.05373,-0.016757,0.028253,-0.022815,0.024952,0.084416,0.005201,-0.026704,0.011342,1.0,0.018205,0.006315,0.843418


In [66]:
df.alert.unique()

array([nan, 'green', 'red'], dtype=object)

In [67]:
df.alert.value_counts()

green    58
red       1
Name: alert, dtype: int64

In [68]:
df.time.max()

1539475168010

In [69]:
df.time.index.argmin()

0

In [70]:
df.time.argmax()

0

In [71]:
df.index.value_counts()

0       1
6198    1
6218    1
6219    1
6220    1
       ..
3110    1
3111    1
3112    1
3113    1
9331    1
Length: 9332, dtype: int64

## Grabbing subsets of the data

In [72]:
df[['mag', 'title']]

Unnamed: 0,mag,title
0,1.35,"M 1.4 - 9km NE of Aguanga, CA"
1,1.29,"M 1.3 - 9km NE of Aguanga, CA"
2,3.42,"M 3.4 - 8km NE of Aguanga, CA"
3,0.44,"M 0.4 - 9km NE of Aguanga, CA"
4,2.16,"M 2.2 - 10km NW of Avenal, CA"
...,...,...
9327,0.62,"M 0.6 - 9km ENE of Mammoth Lakes, CA"
9328,1.00,"M 1.0 - 3km W of Julian, CA"
9329,2.40,"M 2.4 - 35km NNE of Hatillo, Puerto Rico"
9330,1.10,"M 1.1 - 9km NE of Aguanga, CA"


In [75]:
# selecting distinct columns
df[['title', 'time'] + [col for col in df.columns if col.startswith('mag')]]

Unnamed: 0,title,time,mag,magType
0,"M 1.4 - 9km NE of Aguanga, CA",1539475168010,1.35,ml
1,"M 1.3 - 9km NE of Aguanga, CA",1539475129610,1.29,ml
2,"M 3.4 - 8km NE of Aguanga, CA",1539475062610,3.42,ml
3,"M 0.4 - 9km NE of Aguanga, CA",1539474978070,0.44,ml
4,"M 2.2 - 10km NW of Avenal, CA",1539474716050,2.16,md
...,...,...,...,...
9327,"M 0.6 - 9km ENE of Mammoth Lakes, CA",1537230228060,0.62,md
9328,"M 1.0 - 3km W of Julian, CA",1537230135130,1.00,ml
9329,"M 2.4 - 35km NNE of Hatillo, Puerto Rico",1537229908180,2.40,md
9330,"M 1.1 - 9km NE of Aguanga, CA",1537229545350,1.10,ml


In [77]:
# slicing
df[100:103]

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
100,,,20280310,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,",ak20280310,",1.2,ml,...,",ak,",automatic,1539435449480,"M 1.2 - 25km NW of Ester, Alaska",0,earthquake,",geoserve,origin,",-540.0,1539443551010,https://earthquake.usgs.gov/earthquakes/eventp...
101,,,73096756,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.01355,,185.0,",nc73096756,",0.59,md,...,",nc,",automatic,1539435391320,"M 0.6 - 8km ESE of Mammoth Lakes, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,scit...",-480.0,1539439802162,https://earthquake.usgs.gov/earthquakes/eventp...
102,,,37388730,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02987,,39.0,",ci37388730,",1.33,ml,...,",ci,",automatic,1539435293090,"M 1.3 - 8km ENE of Aguanga, CA",0,earthquake,",focal-mechanism,geoserve,nearby-cities,origin...",-480.0,1539435940470,https://earthquake.usgs.gov/earthquakes/eventp...


In [78]:
# combining AKA chaining
df[['time', 'mag']][100:103]

Unnamed: 0,time,mag
100,1539435449480,1.2
101,1539435391320,0.59
102,1539435293090,1.33


In [87]:
df.loc[110:112, 'title']

110               M 1.1 - 35km S of Ester, Alaska
111    M 1.9 - 93km WNW of Arctic Village, Alaska
112      M 0.9 - 20km WSW of Smith Valley, Nevada
Name: title, dtype: object

In [88]:
df.loc[110:112, 'title'] = df.loc[110:112, 'title'].str.lower()

In [89]:
df.loc[110:112, 'title'] 

110               m 1.1 - 35km s of ester, alaska
111    m 1.9 - 93km wnw of arctic village, alaska
112      m 0.9 - 20km wsw of smith valley, nevada
Name: title, dtype: object

In [90]:
df.loc[:, 'title']

0                  M 1.4 - 9km NE of Aguanga, CA
1                  M 1.3 - 9km NE of Aguanga, CA
2                  M 3.4 - 8km NE of Aguanga, CA
3                  M 0.4 - 9km NE of Aguanga, CA
4                  M 2.2 - 10km NW of Avenal, CA
                          ...                   
9327        M 0.6 - 9km ENE of Mammoth Lakes, CA
9328                 M 1.0 - 3km W of Julian, CA
9329    M 2.4 - 35km NNE of Hatillo, Puerto Rico
9330               M 1.1 - 9km NE of Aguanga, CA
9331               M 0.7 - 9km NE of Aguanga, CA
Name: title, Length: 9332, dtype: object

In [91]:
df.loc[10:15, ['title', 'mag']]

Unnamed: 0,title,mag
10,"M 0.5 - 10km NE of Aguanga, CA",0.5
11,"M 2.8 - 53km SE of Punta Cana, Dominican Republic",2.77
12,"M 0.5 - 9km NE of Aguanga, CA",0.5
13,"M 4.5 - 120km SSW of Banda Aceh, Indonesia",4.5
14,"M 2.1 - 14km NW of Parkfield, CA",2.13
15,"M 2.0 - 156km WNW of Haines Junction, Canada",2.0


In [92]:
# As we have seen, when using loc[], our end index is inclusive. This isn't the case with iloc[]
df.iloc[10:15, [19, 8]]

Unnamed: 0,title,mag
10,"M 0.5 - 10km NE of Aguanga, CA",0.5
11,"M 2.8 - 53km SE of Punta Cana, Dominican Republic",2.77
12,"M 0.5 - 9km NE of Aguanga, CA",0.5
13,"M 4.5 - 120km SSW of Banda Aceh, Indonesia",4.5
14,"M 2.1 - 14km NW of Parkfield, CA",2.13


In [93]:
df.iloc[10:15, 6:10]

Unnamed: 0,gap,ids,mag,magType
10,57.0,",ci37389162,",0.5,ml
11,186.0,",pr2018286010,",2.77,md
12,76.0,",ci37389146,",0.5,ml
13,157.0,",us1000hbti,",4.5,mb
14,71.0,",nc73096921,",2.13,md


In [94]:
# many ways to achieve the same result
df.iloc[10:15, 6:10].equals(df.loc[10:14, 'gap':'magType'])

True

In [95]:
# To look up scalar values, we use at[] and iat[], which are faster.
df.at[10, 'mag']

0.5

In [96]:
df.iat[10, 8]

0.5

In [97]:
df.mag > 2

0       False
1       False
2        True
3       False
4        True
        ...  
9327    False
9328    False
9329     True
9330    False
9331    False
Name: mag, Length: 9332, dtype: bool

In [98]:
df[df.mag >= 7.0]

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
837,green,4.1,1000haa3,https://earthquake.usgs.gov/fdsnws/event/1/que...,1.763,3.0,14.0,",us1000haa3,pt18283003,at00pgehsk,",7.0,mww,...,",us,pt,at,",reviewed,1539204500290,"M 7.0 - 117km E of Kimbe, Papua New Guinea",1,earthquake,",dyfi,finite-fault,general-text,geoserve,groun...",600.0,1539378744253,https://earthquake.usgs.gov/earthquakes/eventp...
5263,red,8.4,1000h3p4,https://earthquake.usgs.gov/fdsnws/event/1/que...,1.589,18.0,27.0,",us1000h3p4,us1000h4p4,",7.5,mww,...,",us,us,",reviewed,1538128963480,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake,",dyfi,finite-fault,general-text,geoserve,groun...",480.0,1539123134531,https://earthquake.usgs.gov/earthquakes/eventp...


In [100]:
df.loc[df.mag >= 7.0, ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']]

Unnamed: 0,alert,mag,magType,title,tsunami,type
837,green,7.0,mww,"M 7.0 - 117km E of Kimbe, Papua New Guinea",1,earthquake
5263,red,7.5,mww,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake


In [101]:
# bitwise AND operator
df.loc[(df.tsunami == 1) & (df.alert == 'red'), ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']]

Unnamed: 0,alert,mag,magType,title,tsunami,type
5263,red,7.5,mww,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake


In [102]:
# bitwise OR operator
df.loc[(df.tsunami == 1) | (df.alert == 'red'), ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']]

Unnamed: 0,alert,mag,magType,title,tsunami,type
36,,5.0,mww,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...",1,earthquake
118,green,6.7,mww,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,earthquake
501,green,5.6,mww,"M 5.6 - 128km SE of Kimbe, Papua New Guinea",1,earthquake
799,green,6.5,mww,"M 6.5 - 148km S of Severo-Kuril'sk, Russia",1,earthquake
816,green,6.2,mww,"M 6.2 - 94km SW of Kokopo, Papua New Guinea",1,earthquake
...,...,...,...,...,...,...
8561,,5.4,mb,"M 5.4 - 228km S of Taron, Papua New Guinea",1,earthquake
8624,,5.1,mb,"M 5.1 - 278km SE of Pondaguitan, Philippines",1,earthquake
9133,green,5.1,ml,"M 5.1 - 64km SSW of Kaktovik, Alaska",1,earthquake
9175,,5.2,mb,"M 5.2 - 126km N of Dili, East Timor",1,earthquake


In [104]:
df.loc[(df.place.str.contains('Alaska')) & (df.alert.notnull()), 
       ['alert', 'mag', 'magType', 'title', 'tsunami', 'type', 'place']]

Unnamed: 0,alert,mag,magType,title,tsunami,type,place
1015,green,5.0,ml,"M 5.0 - 61km SSW of Chignik Lake, Alaska",1,earthquake,"61km SSW of Chignik Lake, Alaska"
1273,green,4.0,ml,"M 4.0 - 71km SW of Kaktovik, Alaska",1,earthquake,"71km SW of Kaktovik, Alaska"
1795,green,4.0,ml,"M 4.0 - 60km WNW of Valdez, Alaska",1,earthquake,"60km WNW of Valdez, Alaska"
2752,green,4.0,ml,"M 4.0 - 67km SSW of Kaktovik, Alaska",1,earthquake,"67km SSW of Kaktovik, Alaska"
3260,green,3.9,ml,"M 3.9 - 44km N of North Nenana, Alaska",0,earthquake,"44km N of North Nenana, Alaska"
4101,green,4.2,ml,"M 4.2 - 131km NNW of Arctic Village, Alaska",0,earthquake,"131km NNW of Arctic Village, Alaska"
6897,green,3.8,ml,"M 3.8 - 80km SSW of Kaktovik, Alaska",0,earthquake,"80km SSW of Kaktovik, Alaska"
8524,green,3.8,ml,"M 3.8 - 69km SSW of Kaktovik, Alaska",0,earthquake,"69km SSW of Kaktovik, Alaska"
9133,green,5.1,ml,"M 5.1 - 64km SSW of Kaktovik, Alaska",1,earthquake,"64km SSW of Kaktovik, Alaska"


In [109]:
df.loc[(df.place.str.contains(r'CA|California$')) & (df.mag > 3.8), 
       ['alert', 'mag', 'magType', 'title', 'tsunami', 'type', 'place']]

Unnamed: 0,alert,mag,magType,title,tsunami,type,place
1465,green,3.83,mw,"M 3.8 - 109km WNW of Trinidad, CA",0,earthquake,"109km WNW of Trinidad, CA"
2414,green,3.83,mw,"M 3.8 - 5km SW of Tres Pinos, CA",1,earthquake,"5km SW of Tres Pinos, CA"


In [112]:
df.loc[df.mag.between(6.5, 7.5), ['alert', 'mag', 'magType', 'title', 'tsunami', 'type', 'place']]

Unnamed: 0,alert,mag,magType,title,tsunami,type,place
118,green,6.7,mww,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,earthquake,"262km NW of Ozernovskiy, Russia"
799,green,6.5,mww,"M 6.5 - 148km S of Severo-Kuril'sk, Russia",1,earthquake,"148km S of Severo-Kuril'sk, Russia"
837,green,7.0,mww,"M 7.0 - 117km E of Kimbe, Papua New Guinea",1,earthquake,"117km E of Kimbe, Papua New Guinea"
4363,green,6.7,mww,"M 6.7 - 263km NNE of Ndoi Island, Fiji",1,earthquake,"263km NNE of Ndoi Island, Fiji"
5263,red,7.5,mww,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake,"78km N of Palu, Indonesia"


In [116]:
df.loc[df.magType.isin(['mw', 'mwb']),['alert', 'mag', 'magType', 'title', 'tsunami', 'type', 'place']]

Unnamed: 0,alert,mag,magType,title,tsunami,type,place
995,,3.35,mw,"M 3.4 - 9km WNW of Cobb, CA",0,earthquake,"9km WNW of Cobb, CA"
1465,green,3.83,mw,"M 3.8 - 109km WNW of Trinidad, CA",0,earthquake,"109km WNW of Trinidad, CA"
2414,green,3.83,mw,"M 3.8 - 5km SW of Tres Pinos, CA",1,earthquake,"5km SW of Tres Pinos, CA"
4988,green,4.41,mw,"M 4.4 - 1km SE of Delta, B.C., MX",1,earthquake,"1km SE of Delta, B.C., MX"
6307,green,5.8,mwb,"M 5.8 - 297km NNE of Ndoi Island, Fiji",0,earthquake,"297km NNE of Ndoi Island, Fiji"
8257,green,5.7,mwb,"M 5.7 - 175km SSE of Lambasa, Fiji",0,earthquake,"175km SSE of Lambasa, Fiji"


In [135]:
usecols = ['alert', 'mag', 'magType', 'title', 'tsunami', 'type', 'place']

In [127]:
min_max = [df.mag.idxmin(), df.mag.idxmax()]
print(min_max)

[2409, 5263]


In [132]:
print(f"minimum magnitude = {df.at[2409, 'mag']} \
        maximum magnitude = {df.at[5263, 'mag']}")

minimum magnitude = -1.26         maximum magnitude = 7.5


In [137]:
df.loc[min_max, usecols]

Unnamed: 0,alert,mag,magType,title,tsunami,type,place
2409,,-1.26,ml,"M -1.3 - 41km ENE of Adak, Alaska",0,earthquake,"41km ENE of Adak, Alaska"
5263,red,7.5,mww,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake,"78km N of Palu, Indonesia"


## Adding and removing data

In [250]:
df_to_modify = df.copy()

In [251]:
df = pd.read_csv('data/earthquakes.csv', 
                 usecols=['time', 'title', 'place', 'magType',
                          'mag', 'alert', 'tsunami'])

In [252]:
# We cannot create the column with attribute notation (df.source) because
# the dataframe doesn't have that attribute yet, so we must use dictionary
# notation (df['source']).
df['source'] = 'USGS API'

In [253]:
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,USGS API
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,USGS API
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,USGS API


In [254]:
df['mag_negative'] = df.mag < 0
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,USGS API,False
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,USGS API,False
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,USGS API,False


In [255]:
df[df['mag_negative'] == True]

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative
39,,-0.10,ml,"6km NW of Lemmon Valley, Nevada",1539458844506,"M -0.1 - 6km NW of Lemmon Valley, Nevada",0,USGS API,True
49,,-0.10,ml,"6km NW of Lemmon Valley, Nevada",1539455017464,"M -0.1 - 6km NW of Lemmon Valley, Nevada",0,USGS API,True
135,,-0.40,ml,"10km SSE of Beatty, Nevada",1539422175717,"M -0.4 - 10km SSE of Beatty, Nevada",0,USGS API,True
161,,-0.02,md,"20km SSE of Ronan, Montana",1539412475360,"M -0.0 - 20km SSE of Ronan, Montana",0,USGS API,True
198,,-0.20,ml,"60km N of Pahrump, Nevada",1539398340822,"M -0.2 - 60km N of Pahrump, Nevada",0,USGS API,True
...,...,...,...,...,...,...,...,...,...
9202,,-0.11,ml,"42km ENE of Adak, Alaska",1537257741590,"M -0.1 - 42km ENE of Adak, Alaska",0,USGS API,True
9241,,-0.69,ml,"42km ENE of Adak, Alaska",1537249710450,"M -0.7 - 42km ENE of Adak, Alaska",0,USGS API,True
9261,,-0.02,ml,"28km S of Morton, Washington",1537245303560,"M -0.0 - 28km S of Morton, Washington",0,USGS API,True
9267,,-0.32,md,"30km ENE of Seeley Lake, Montana",1537244570780,"M -0.3 - 30km ENE of Seeley Lake, Montana",0,USGS API,True


In [256]:
df.place.value_counts()

10km NE of Aguanga, CA                   306
9km NE of Aguanga, CA                    264
42km ENE of Adak, Alaska                  71
22km ENE of Honaunau-Napoopoo, Hawaii     65
8km NE of Aguanga, CA                     60
                                        ... 
73km NNW of Talkeetna, Alaska              1
25km ENE of Nikiski, Alaska                1
68km NNW of Talkeetna, Alaska              1
72km E of Cape Yakataga, Alaska            1
35km NNE of Hatillo, Puerto Rico           1
Name: place, Length: 5433, dtype: int64

In [257]:
df.place.str.extract(r', (.*$)')[0].sort_values().unique()

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Australia', 'Azerbaijan', 'B.C., MX', 'Barbuda', 'Bolivia',
       'Bonaire, Saint Eustatius and Saba ', 'British Virgin Islands',
       'Burma', 'CA', 'California', 'Canada', 'Chile', 'China',
       'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'Ecuador region',
       'El Salvador', 'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala',
       'Haiti', 'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Italy', 'Jamaica', 'Japan', 'Kansas',
       'Kentucky', 'Kyrgyzstan', 'Martinique', 'Mauritius', 'Mayotte',
       'Mexico', 'Missouri', 'Montana', 'NV', 'Nevada', 'New Caledonia',
       'New Hampshire', 'New Mexico', 'New Zealand', 'Nicaragua',
       'North Carolina', 'Northern Mariana Islands', 'Oklahoma', 'Oregon',
       'Pakistan', 'Papua New Guinea', 'Peru', 'Philippines',
       'Puerto Rico', 'Roman

In [258]:
len(df.place.unique())

5433

In [259]:
df.place.unique()

array(['9km NE of Aguanga, CA', '8km NE of Aguanga, CA',
       '10km NW of Avenal, CA', ..., '9km ENE of Mammoth Lakes, CA',
       '3km W of Julian, CA', '35km NNE of Hatillo, Puerto Rico'],
      dtype=object)

In [260]:
df['parced_place'] = df.place.str.replace(
        r'.* of ', '', regex=True # remove <x> of <x>
    ).str.replace(
        'the ', '' # remove 'the'
    ).str.replace(
        r'CA$', 'California', regex=True # fix California
    ).str.replace(
        r'NV$', 'Nevada', regex=True # fix Nevada
    ).str.replace(
        r'MX$', 'Mexico', regex=True # fix Mexico
    ).str.replace(
        r'region$', '', regex=True # fix " region" endings
    ).str.replace(
        'nothern ', '' # remove 'nothern'
    ).str.replace(
        'Fiji Islands', 'Fiji' # line up the Fiji places
    ).str.replace(
        r'^.*, ', '', regex=True  #remove anything else extraneous from start
    ).str.strip() # remove any extra spaces

In [261]:
df['parced_place'].sort_values().unique()

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Ascension Island', 'Australia', 'Azerbaijan', 'Balleny Islands',
       'Barbuda', 'Bolivia', 'British Virgin Islands', 'Burma',
       'California', 'Canada', 'Carlsberg Ridge',
       'Central East Pacific Rise', 'Central Mid-Atlantic Ridge', 'Chile',
       'China', 'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'El Salvador',
       'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala', 'Haiti',
       'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indian Ocean Triple Junction', 'Indonesia', 'Iran', 'Iraq',
       'Italy', 'Jamaica', 'Japan', 'Kansas', 'Kentucky',
       'Kermadec Islands', 'Kuril Islands', 'Kyrgyzstan', 'Martinique',
       'Mauritius', 'Mayotte', 'Mexico', 'Mid-Indian Ridge', 'Missouri',
       'Montana', 'Nevada', 'New Caledonia', 'New Hampshire',
       'New Mexico', 'New Zealand', 'Nicaragua', 'North Carolina',


In [268]:
len(df.parced_place.unique())

110

In [263]:
# Let's create two new columns
df.assign(in_ca=df.parced_place.str.endswith('California'),
         in_alaska=df.parced_place.str.endswith('Alaska')
         ).sample(5, random_state=0)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parced_place,in_ca,in_alaska
7207,,4.8,mwr,"73km SSW of Masachapa, Nicaragua",1537749595210,"M 4.8 - 73km SSW of Masachapa, Nicaragua",0,USGS API,False,Nicaragua,False,False
4755,,1.09,ml,"28km NNW of Packwood, Washington",1538227540460,"M 1.1 - 28km NNW of Packwood, Washington",0,USGS API,False,Washington,False,False
4595,,1.8,ml,"77km SSW of Kaktovik, Alaska",1538259609862,"M 1.8 - 77km SSW of Kaktovik, Alaska",0,USGS API,False,Alaska,False,True
3566,,1.5,ml,"102km NW of Arctic Village, Alaska",1538464751822,"M 1.5 - 102km NW of Arctic Village, Alaska",0,USGS API,False,Alaska,False,True
2182,,0.9,ml,"26km ENE of Pine Valley, CA",1538801713880,"M 0.9 - 26km ENE of Pine Valley, CA",0,USGS API,False,California,True,False


In [265]:
# The assign() method also accepts lambda functions
df.assign(
    in_ca=df.parced_place.str.endswith('California'),
    in_alaska=df.parced_place.str.endswith('Alaska'),
    neither=lambda x: ~x.in_ca & ~x.in_alaska
    ).sample(5, random_state=0)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parced_place,in_ca,in_alaska,neither
7207,,4.8,mwr,"73km SSW of Masachapa, Nicaragua",1537749595210,"M 4.8 - 73km SSW of Masachapa, Nicaragua",0,USGS API,False,Nicaragua,False,False,True
4755,,1.09,ml,"28km NNW of Packwood, Washington",1538227540460,"M 1.1 - 28km NNW of Packwood, Washington",0,USGS API,False,Washington,False,False,True
4595,,1.8,ml,"77km SSW of Kaktovik, Alaska",1538259609862,"M 1.8 - 77km SSW of Kaktovik, Alaska",0,USGS API,False,Alaska,False,True,False
3566,,1.5,ml,"102km NW of Arctic Village, Alaska",1538464751822,"M 1.5 - 102km NW of Arctic Village, Alaska",0,USGS API,False,Alaska,False,True,False
2182,,0.9,ml,"26km ENE of Pine Valley, CA",1538801713880,"M 0.9 - 26km ENE of Pine Valley, CA",0,USGS API,False,California,True,False,False


In [370]:
df.parced_place.value_counts().head(40)

Alaska                          3665
California                      2861
Nevada                           681
Hawaii                           367
Puerto Rico                      216
Montana                          204
Washington                       157
Indonesia                        147
Idaho                             71
Utah                              67
Fiji                              61
Dominican Republic                57
Japan                             57
Canada                            55
Wyoming                           42
Mexico                            38
Oklahoma                          37
Papua New Guinea                  34
Russia                            33
U.S. Virgin Islands               31
Chile                             31
Tennessee                         30
Oregon                            27
Kansas                            23
Philippines                       21
British Virgin Islands            21
Tonga                             17
V

In [355]:
states = pd.read_csv('us_states.csv')
states_list = states['state_name'].to_list()
states_list

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [356]:
len(states_list)

50

In [376]:
df.loc[df['parced_place'].apply(lambda x: x in states_list), 'country'] = 'US'

In [362]:
df['parced_place'].apply(lambda x: x in states_list).value_counts()

True     8258
False    1074
Name: parced_place, dtype: int64

In [382]:
df.loc[df['country'].isna(), 'country'] = 'Other'

In [383]:
df['country'].value_counts()

US       8258
Other    1074
Name: country, dtype: int64

## Working with rows

In [384]:
tsunami = df[df.tsunami == 1]
no_tsunami = df[df.tsunami == 0]
tsunami.shape, no_tsunami.shape

((61, 11), (9271, 11))

In [385]:
pd.concat([tsunami, no_tsunami]).shape

(9332, 11)

In [386]:
tsunami.append(no_tsunami).shape

(9332, 11)

In [387]:
additional_columns = pd.read_csv('data/earthquakes.csv', usecols=['tz', 'felt', 'ids'])
pd.concat([df.head(2), additional_columns.head(2)], axis=1)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parced_place,country,felt,ids,tz
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California,US,,",ci37389218,",-480.0
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California,US,,",ci37389202,",-480.0


In [389]:
pd.concat(
    [tsunami.head(2),
     no_tsunami.head(2).assign(type='earthquake')],
     join='inner'
)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parced_place,country
36,,5.0,mww,"165km NNW of Flying Fish Cove, Christmas Island",1539459504090,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...",1,USGS API,False,Christmas Island,Other
118,green,6.7,mww,"262km NW of Ozernovskiy, Russia",1539429023560,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,USGS API,False,Russia,Other
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California,US
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California,US


In [391]:
pd.concat(
    [tsunami.head(2),
     no_tsunami.head(2).assign(type='earthquake')],
     join='outer', ignore_index='true'
)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parced_place,country,type
0,,5.0,mww,"165km NNW of Flying Fish Cove, Christmas Island",1539459504090,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...",1,USGS API,False,Christmas Island,Other,
1,green,6.7,mww,"262km NW of Ozernovskiy, Russia",1539429023560,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,USGS API,False,Russia,Other,
2,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California,US,earthquake
3,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California,US,earthquake


## Deleting unwanted data

In [392]:
df.columns

Index(['alert', 'mag', 'magType', 'place', 'time', 'title', 'tsunami',
       'source', 'mag_negative', 'parced_place', 'country'],
      dtype='object')

In [394]:
del df['source']

In [395]:
df.columns

Index(['alert', 'mag', 'magType', 'place', 'time', 'title', 'tsunami',
       'mag_negative', 'parced_place', 'country'],
      dtype='object')

In [396]:
mag_negative = df.pop('mag_negative')
mag_negative

0       False
1       False
2       False
3       False
4       False
        ...  
9327    False
9328    False
9329    False
9330    False
9331    False
Name: mag_negative, Length: 9332, dtype: bool

In [397]:
df.columns

Index(['alert', 'mag', 'magType', 'place', 'time', 'title', 'tsunami',
       'parced_place', 'country'],
      dtype='object')

In [398]:
df[mag_negative].head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,parced_place,country
39,,-0.1,ml,"6km NW of Lemmon Valley, Nevada",1539458844506,"M -0.1 - 6km NW of Lemmon Valley, Nevada",0,Nevada,US
49,,-0.1,ml,"6km NW of Lemmon Valley, Nevada",1539455017464,"M -0.1 - 6km NW of Lemmon Valley, Nevada",0,Nevada,US
135,,-0.4,ml,"10km SSE of Beatty, Nevada",1539422175717,"M -0.4 - 10km SSE of Beatty, Nevada",0,Nevada,US
161,,-0.02,md,"20km SSE of Ronan, Montana",1539412475360,"M -0.0 - 20km SSE of Ronan, Montana",0,Montana,US
198,,-0.2,ml,"60km N of Pahrump, Nevada",1539398340822,"M -0.2 - 60km N of Pahrump, Nevada",0,Nevada,US


In [399]:
# To remove multiple rows, we pass the list of indices
df.drop([0, 1]).head(2)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,parced_place,country
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,California,US
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,California,US


In [400]:
# If we want to drop columns, we can either pass axis=1 or specify our list of column names using the
# columns argument
cols_to_drop = [
    col for col in df.columns
    if col not in ['alert', 'mag', 'title', 'time', 'tsunami'
    ]
]
df.drop(columns=cols_to_drop).head()

Unnamed: 0,alert,mag,time,title,tsunami
0,,1.35,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0
1,,1.29,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0
2,,3.42,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0
3,,0.44,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0
4,,2.16,1539474716050,"M 2.2 - 10km NW of Avenal, CA",0


In [402]:
df.drop(columns=cols_to_drop).equals(df.drop(cols_to_drop, axis=1))

True

In [403]:
# if we really want to remove the data from our original dataframe, we can pass in inplace=True
df.drop(columns=cols_to_drop, inplace=True)
df.head()

Unnamed: 0,alert,mag,time,title,tsunami
0,,1.35,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0
1,,1.29,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0
2,,3.42,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0
3,,0.44,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0
4,,2.16,1539474716050,"M 2.2 - 10km NW of Avenal, CA",0


## Exercises

**1. Find the 95th percentile of earthquake magnitude in Japan using the mb magnitude type.**

In [423]:
df = pd.read_csv('data/earthquakes.csv')
df.shape

(9332, 26)

In [424]:
df.head(2)

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
0,,,37389218,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.008693,,85.0,",ci37389218,",1.35,ml,...,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventp...
1,,,37389202,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02003,,79.0,",ci37389202,",1.29,ml,...,",ci,",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475253925,https://earthquake.usgs.gov/earthquakes/eventp...


In [425]:
df.columns

Index(['alert', 'cdi', 'code', 'detail', 'dmin', 'felt', 'gap', 'ids', 'mag',
       'magType', 'mmi', 'net', 'nst', 'place', 'rms', 'sig', 'sources',
       'status', 'time', 'title', 'tsunami', 'type', 'types', 'tz', 'updated',
       'url'],
      dtype='object')

In [419]:
df.describe()

Unnamed: 0,cdi,dmin,felt,gap,mag,mmi,nst,rms,sig,time,tsunami,tz,updated
count,329.0,6139.0,329.0,6164.0,9331.0,93.0,5364.0,9332.0,9332.0,9332.0,9332.0,9331.0,9332.0
mean,2.754711,0.544925,12.31003,121.506588,1.497345,3.651398,19.053878,0.362122,56.899914,1538284000000.0,0.006537,-451.99014,1538537000000.0
std,1.010637,2.214305,48.954944,72.962363,1.203347,1.790523,15.492315,0.317784,91.872163,608030600.0,0.080589,231.752571,656413500.0
min,0.0,0.000648,0.0,12.0,-1.26,0.0,0.0,0.0,0.0,1537229000000.0,0.0,-720.0,1537230000000.0
25%,2.0,0.020425,1.0,66.1425,0.72,2.68,8.0,0.119675,8.0,1537793000000.0,0.0,-540.0,1537996000000.0
50%,2.7,0.05905,2.0,105.0,1.3,3.72,15.0,0.21,26.0,1538245000000.0,0.0,-480.0,1538621000000.0
75%,3.3,0.17725,5.0,159.0,1.9,4.57,25.0,0.59,56.0,1538766000000.0,0.0,-480.0,1539110000000.0
max,8.4,53.737,580.0,355.91,7.5,9.12,172.0,1.91,2015.0,1539475000000.0,1.0,720.0,1539537000000.0


In [420]:
df.describe(include=object)

Unnamed: 0,alert,code,detail,ids,magType,net,place,sources,status,title,type,types,url
count,59,9332,9332,9332,9331,9332,9332,9332,9332,9332,9332,9332,9332
unique,2,9332,9332,9332,10,14,5433,52,2,7807,5,42,9332
top,green,37389218,https://earthquake.usgs.gov/fdsnws/event/1/que...,",ci37389218,",ml,ak,"10km NE of Aguanga, CA",",ak,",reviewed,"M 0.4 - 10km NE of Aguanga, CA",earthquake,",geoserve,origin,phase-data,",https://earthquake.usgs.gov/earthquakes/eventp...
freq,58,1,1,1,6803,3166,306,2981,7797,55,9081,5301,1


In [435]:
columns = ['alert', 'magType', 'place', 'status', 'title', 'type', 'mag', 'time', 'tsunami']
df = pd.read_csv('data/earthquakes.csv', usecols=columns)
df.head(3)

Unnamed: 0,alert,mag,magType,place,status,time,title,tsunami,type
0,,1.35,ml,"9km NE of Aguanga, CA",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake
1,,1.29,ml,"9km NE of Aguanga, CA",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake
2,,3.42,ml,"8km NE of Aguanga, CA",automatic,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,earthquake


In [445]:
df[(df.magType == 'mb') & (df.place.str.contains(r'Japan'))].describe(percentiles=[0.95])

Unnamed: 0,mag,time,tsunami
count,50.0,50.0,50.0
mean,4.53,1538218000000.0,0.0
std,0.266688,640453500.0,0.0
min,4.0,1537258000000.0,0.0
50%,4.6,1538268000000.0,0.0
95%,4.9,1539090000000.0,0.0
max,5.4,1539449000000.0,0.0


**Ответ: 4.9**

**2. Find the percentage of earthquakes in Indonesia that were coupled with tsunamis.**

In [452]:
total = df.place.str.contains(r'Indonesia').sum()
total

147

In [454]:
total_w_tsunami = ((df.place.str.contains(r'Indonesia')) & (df.tsunami == 0)).sum()
total_w_tsunami

113

In [455]:
total_w_tsunami / total

0.7687074829931972

**Ответ: 77%**

**Calculate summary statistics for earthquakes in Nevada.**

In [459]:
df[df.place.str.contains(r'NV|Nevada')].describe()

Unnamed: 0,mag,time,tsunami
count,681.0,681.0,681.0
mean,0.500073,1538314000000.0,0.0
std,0.69671,596563700.0,0.0
min,-0.5,1537247000000.0,0.0
25%,-0.1,1537854000000.0,0.0
50%,0.4,1538280000000.0,0.0
75%,0.9,1538821000000.0,0.0
max,2.9,1539461000000.0,0.0


**3. Add a column indicating whether the earthquake happened in a country or US state that is on the Ring of Fire. Use Alaska, Antarctica (look for Antarctic), Bolivia, California, Canada, Chile, Costa Rica, Ecuador, Fiji, Guatemala, Indonesia, Japan, Kermadec Islands, Mexico (be careful not to select New Mexico), New Zealand, Peru, Philippines, Russia, Taiwan, Tonga, and Washington.**

In [467]:
df[df.place.str.contains(r'CA|California')]

Unnamed: 0,alert,mag,magType,place,status,time,title,tsunami,type
0,,1.35,ml,"9km NE of Aguanga, CA",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake
1,,1.29,ml,"9km NE of Aguanga, CA",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake
2,,3.42,ml,"8km NE of Aguanga, CA",automatic,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,earthquake
3,,0.44,ml,"9km NE of Aguanga, CA",automatic,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,earthquake
4,,2.16,md,"10km NW of Avenal, CA",automatic,1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,earthquake
...,...,...,...,...,...,...,...,...,...
9326,,1.82,ml,"4km W of Julian, CA",reviewed,1537230230260,"M 1.8 - 4km W of Julian, CA",0,earthquake
9327,,0.62,md,"9km ENE of Mammoth Lakes, CA",reviewed,1537230228060,"M 0.6 - 9km ENE of Mammoth Lakes, CA",0,earthquake
9328,,1.00,ml,"3km W of Julian, CA",reviewed,1537230135130,"M 1.0 - 3km W of Julian, CA",0,earthquake
9330,,1.10,ml,"9km NE of Aguanga, CA",reviewed,1537229545350,"M 1.1 - 9km NE of Aguanga, CA",0,earthquake


In [475]:
df.place.str.extract(r', (.*)')[0].sort_values().unique()

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Australia', 'Azerbaijan', 'B.C., MX', 'Barbuda', 'Bolivia',
       'Bonaire, Saint Eustatius and Saba ', 'British Virgin Islands',
       'Burma', 'CA', 'California', 'Canada', 'Chile', 'China',
       'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'Ecuador region',
       'El Salvador', 'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala',
       'Haiti', 'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Italy', 'Jamaica', 'Japan', 'Kansas',
       'Kentucky', 'Kyrgyzstan', 'Martinique', 'Mauritius', 'Mayotte',
       'Mexico', 'Missouri', 'Montana', 'NV', 'Nevada', 'New Caledonia',
       'New Hampshire', 'New Mexico', 'New Zealand', 'Nicaragua',
       'North Carolina', 'Northern Mariana Islands', 'Oklahoma', 'Oregon',
       'Pakistan', 'Papua New Guinea', 'Peru', 'Philippines',
       'Puerto Rico', 'Roman

In [508]:
df['parsed_place'] = df.place.str.replace(
        r'.* of ', '', regex=True # remove <x> of <x>
    ).str.replace(
        'the ', '' # remove 'the'
    ).str.replace(
        r'CA$', 'California', regex=True # fix California
    ).str.replace(
        r'NV$', 'Nevada', regex=True # fix Nevada
    ).str.replace(
        r'MX$', 'Mexico', regex=True # fix Mexico
    ).str.replace(
        r'region$', '', regex=True # fix " region" endings
    ).str.replace(
        'nothern ', '' # remove 'nothern'
    ).str.replace(
        'Fiji Islands', 'Fiji' # line up the Fiji places
    ).str.replace(
        r'^.*, ', '', regex=True  #remove anything else extraneous from start
    ).str.strip() # remove any extra spaces

In [509]:
df.parsed_place.sort_values().unique()

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Ascension Island', 'Australia', 'Azerbaijan', 'Balleny Islands',
       'Barbuda', 'Bolivia', 'British Virgin Islands', 'Burma',
       'California', 'Canada', 'Carlsberg Ridge',
       'Central East Pacific Rise', 'Central Mid-Atlantic Ridge', 'Chile',
       'China', 'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'El Salvador',
       'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala', 'Haiti',
       'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indian Ocean Triple Junction', 'Indonesia', 'Iran', 'Iraq',
       'Italy', 'Jamaica', 'Japan', 'Kansas', 'Kentucky',
       'Kermadec Islands', 'Kuril Islands', 'Kyrgyzstan', 'Martinique',
       'Mauritius', 'Mayotte', 'Mexico', 'Mid-Indian Ridge', 'Missouri',
       'Montana', 'Nevada', 'New Caledonia', 'New Hampshire',
       'New Mexico', 'New Zealand', 'Nicaragua', 'North Carolina',


In [510]:
ring_of_fire = ['Alaska', 'Bolivia', 'California', 'Canada', 
                'Chile', 'Costa Rica', 'Ecuador', 'Fiji', 
                'Guatemala', 'Indonesia', 'Japan', 'Kermadec Islands', 
                'Mexico', 'New Zealand', 'Peru', 'Philippines', 
                'Russia', 'Taiwan', 'Tonga', 'Washington', 
                'Pacific-Antarctic Ridge', 'Western Indian-Antarctic Ridge']

In [511]:
df.loc[df.parsed_place.apply(lambda x: x in ring_of_fire), 'ring_of_fire'] = True

In [512]:
df.loc[df.ring_of_fire.isna(), 'ring_of_fire'] = False

In [513]:
df[df.ring_of_fire == True]['parsed_place'].sort_values().unique()

array(['Alaska', 'Bolivia', 'California', 'Canada', 'Chile', 'Costa Rica',
       'Ecuador', 'Fiji', 'Guatemala', 'Indonesia', 'Japan',
       'Kermadec Islands', 'Mexico', 'New Zealand',
       'Pacific-Antarctic Ridge', 'Peru', 'Philippines', 'Russia',
       'Taiwan', 'Tonga', 'Washington', 'Western Indian-Antarctic Ridge'],
      dtype=object)

In [514]:
df[df.ring_of_fire != True]['parsed_place'].sort_values().unique()

array(['Afghanistan', 'Argentina', 'Arizona', 'Arkansas',
       'Ascension Island', 'Australia', 'Azerbaijan', 'Balleny Islands',
       'Barbuda', 'British Virgin Islands', 'Burma', 'Carlsberg Ridge',
       'Central East Pacific Rise', 'Central Mid-Atlantic Ridge', 'China',
       'Christmas Island', 'Colombia', 'Colorado', 'Dominican Republic',
       'East Timor', 'El Salvador', 'Greece', 'Greenland', 'Guam',
       'Haiti', 'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indian Ocean Triple Junction', 'Iran', 'Iraq', 'Italy', 'Jamaica',
       'Kansas', 'Kentucky', 'Kuril Islands', 'Kyrgyzstan', 'Martinique',
       'Mauritius', 'Mayotte', 'Mid-Indian Ridge', 'Missouri', 'Montana',
       'Nevada', 'New Caledonia', 'New Hampshire', 'New Mexico',
       'Nicaragua', 'North Carolina', 'Northern East Pacific Rise',
       'Northern Mariana Islands', 'Northern Mid-Atlantic Ridge',
       'Oklahoma', 'Oregon', 'Pakistan', 'Papua New Guinea',
       'Prince Edward Islands',

In [517]:
df.drop(columns=['parced_place'], inplace=True)

In [518]:
df.head(4)

Unnamed: 0,alert,mag,magType,place,status,time,title,tsunami,type,ring_of_fire,parsed_place
0,,1.35,ml,"9km NE of Aguanga, CA",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,True,California
1,,1.29,ml,"9km NE of Aguanga, CA",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake,True,California
2,,3.42,ml,"8km NE of Aguanga, CA",automatic,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,earthquake,True,California
3,,0.44,ml,"9km NE of Aguanga, CA",automatic,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,earthquake,True,California


**4. Calculate the number of earthquakes in the Ring of Fire locations and the number outside of them.**

In [519]:
df.ring_of_fire.value_counts()

True     7188
False    2144
Name: ring_of_fire, dtype: int64

**Ответ: the number of earthquakes in the Ring of Fire locations = 7188, the number outside of them = 2144**

**5. Find the tsunami count along the Ring of Fire.**

In [528]:
((df.ring_of_fire==True) & (df.tsunami==1)).sum()

45

**Ответ: 45**