In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read in the earthquake csv file and take a look
earthquakes_csv = "https://raw.githubusercontent.com/fedhere/DSPS_FBianco/master/HW3/earthquakes.csv"
data = pd.read_csv(earthquakes_csv, sep=" ")
data.head()

Unnamed: 0,#YYY/MM/DD,HH:mm:SS.ss,ET,GT,MAG,Unnamed: 5,M,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Q,Unnamed: 25,EVID,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,NPH,NGRM
0,1984/01/01,01:30:57.52,eq,,l,2.02,c,,,33.246,...,17.0,,,,,,,,,
1,1984/01/01,05:31:34.16,eq,,l,2.12,c,,,32.945,...,42.0,,,,,,,,,
2,1984/01/01,06:59:05.14,eq,,l,2.43,c,,,33.343,...,30.0,,,,,,,,,
3,1984/01/01,11:10:25.29,eq,,l,2.07,c,,,33.668,...,,,,,,,,,,
4,1984/01/01,19:38:19.79,eq,,l,2.13,c,,,33.963,...,49.0,,,,,,,,,


In [3]:
# We only care about date, time, and magnitude so let's grab only those
# columns, and rename them to something sensible.
colnames = ["#YYY/MM/DD", "HH:mm:SS.ss", "Unnamed: 5"]
new_colnames = {"#YYY/MM/DD": "date",
                "HH:mm:SS.ss": "time",
                "Unnamed: 5":"mag"}
eqdata = data[colnames].rename(new_colnames, axis=1)
eqdata.describe()

Unnamed: 0,mag
count,70798.0
mean,2.417826
std,0.407683
min,2.0
25%,2.12
50%,2.3
75%,2.59
max,7.1


In [4]:
# To complete this analysis, we need the time difference between each
# earthquake. Let's make one string object by concatenating the date and the
# time, which pandas can then read and convert into a datetime object which
# will allow us to find the time gaps.
# First we create the column of datetime strings.

# WWW There must be a vectorized way to do this
datetime = [eqdata.iloc[i].date + " " + eqdata.iloc[i].time for i 
            in range(eqdata.shape[0])]

In [5]:
# This csv file seems to have some times measured at 60 seconds and 00 decimal
# seconds, whereas pandas expects the minute to end at 59.99 seconds. So we
# will simply replace the "60.00" part of the time strings with "59.99"
# Now we can convert this string datetime column to a column of proper
# datetime object.
eqdata["datetime"] = datetime
eqdata.datetime = eqdata.datetime.str.replace("60.00", "59.99")
eqdata.datetime = pd.to_datetime(eqdata.datetime)
eqdata.head()

  eqdata.datetime = eqdata.datetime.str.replace("60.00", "59.99")


Unnamed: 0,date,time,mag,datetime
0,1984/01/01,01:30:57.52,2.02,1984-01-01 01:30:57.520
1,1984/01/01,05:31:34.16,2.12,1984-01-01 05:31:34.160
2,1984/01/01,06:59:05.14,2.43,1984-01-01 06:59:05.140
3,1984/01/01,11:10:25.29,2.07,1984-01-01 11:10:25.290
4,1984/01/01,19:38:19.79,2.13,1984-01-01 19:38:19.790


In [6]:
# Finally we can calculate the time difference between consecutive rows. We
# assume that these rows are chronologically ordered.

# WWW For the homework, validate that the rows are chronologically sorted
eqdata["dt"] = eqdata.datetime.diff()

# The first row has a NaT for the dt beacuse there is no time difference for
# the very first time. We can remove it.
eqdata = eqdata.iloc[1:]
eqdata.head()

Unnamed: 0,date,time,mag,datetime,dt
1,1984/01/01,05:31:34.16,2.12,1984-01-01 05:31:34.160,0 days 04:00:36.640000
2,1984/01/01,06:59:05.14,2.43,1984-01-01 06:59:05.140,0 days 01:27:30.980000
3,1984/01/01,11:10:25.29,2.07,1984-01-01 11:10:25.290,0 days 04:11:20.150000
4,1984/01/01,19:38:19.79,2.13,1984-01-01 19:38:19.790,0 days 08:27:54.500000
5,1984/01/01,19:39:19.67,2.13,1984-01-01 19:39:19.670,0 days 00:00:59.880000


In [7]:
# The stationary periods that the paper looks at. The hypothesis says that
# there is a power law only within these stationary seismic periods. This for
# loop goes through these intervals.

ss = []

for t in [(1984, 1986.5),
          (1990.3, 1992.1),
          (1994.6, 1995.6),
          (1996.1,1996.5),
          (1997,1997.6),
          (1997.75,1998.15),
          (1998.25,1999.35),
          (2000.55,2000.8),
          (2000.9 ,2001.25),
          (2001.6 ,2002),
          (2002.5 ,2003)]:

    # for each pair I convert the year to datetime (easy)
    # then I convert the fraction of year and convert it to days and convert
    # that to a delta-time
    # finally I add that deltatime to the year datetime.
    ss.append((pd.to_datetime(int(t[0]), format="%Y")
        + pd.Timedelta('%.f days'%((t[0] - int(t[0])) * 365.25)),
               pd.to_datetime(int(t[1]), format="%Y")
        + pd.Timedelta('%.f days'%((t[1] - int(t[1])) * 365.25))))

In [18]:
dates = (((eqdata.datetime >= ss[0][0]) & (eqdata.datetime >= ss[0][1])) |\
         ((eqdata.datetime >= ss[1][0]) & (eqdata.datetime >= ss[1][1])) |\
         ((eqdata.datetime >= ss[2][0]) & (eqdata.datetime >= ss[2][1])) |\
         ((eqdata.datetime >= ss[3][0]) & (eqdata.datetime >= ss[3][1])) |\
         ((eqdata.datetime >= ss[4][0]) & (eqdata.datetime >= ss[4][1])) |\
         ((eqdata.datetime >= ss[5][0]) & (eqdata.datetime >= ss[5][1])) |\
         ((eqdata.datetime >= ss[6][0]) & (eqdata.datetime >= ss[6][1])) |\
         ((eqdata.datetime >= ss[7][0]) & (eqdata.datetime >= ss[7][1])) |\
         ((eqdata.datetime >= ss[8][0]) & (eqdata.datetime >= ss[8][1])) |\
         ((eqdata.datetime >= ss[9][0]) & (eqdata.datetime >= ss[9][1])) |\
         ((eqdata.datetime >= ss[10][0]) & (eqdata.datetime >= ss[10][1])))

In [29]:
# Throw out the data that is not within the seismic periods we are interested
# in.
# eqdata = eqdata["datetime"][dates]  #  Not correct

6172   1986-07-03 00:48:12.550
6173   1986-07-03 09:01:32.700
6174   1986-07-03 10:03:05.630
6175   1986-07-03 21:08:36.650
6176   1986-07-03 22:11:40.800
Name: datetime, dtype: datetime64[ns]