In [2]:
import time
import itertools
import requests
import pandas as pd
import regex as re
import replace
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from wordcloud import WordCloud, STOPWORDS
from scipy import stats
from sklearn.linear_model import LinearRegression
from IPython.display import display
import datetime
pd.options.mode.chained_assignment = None

In [14]:
#read original earthquakes file
earthquakes = pd.read_csv('earthquakes_org.csv')

In [15]:
earthquakes

Unnamed: 0,time,place,status,tsunami,significance,data_type,magnitudo,state,longitude,latitude,depth,date
0,631153353990,"12 km NNW of Meadow Lakes, Alaska",reviewed,0,96,earthquake,2.50,Alaska,-149.669200,61.730200,30.100,1990-01-01 00:22:33.990000+00:00
1,631153491210,"14 km S of Volcano, Hawaii",reviewed,0,31,earthquake,1.41,Hawaii,-155.212333,19.317667,6.585,1990-01-01 00:24:51.210000+00:00
2,631154083450,"7 km W of Cobb, California",reviewed,0,19,earthquake,1.11,California,-122.806167,38.821000,3.220,1990-01-01 00:34:43.450000+00:00
3,631155512130,"11 km E of Mammoth Lakes, California",reviewed,0,15,earthquake,0.98,California,-118.846333,37.664333,-0.584,1990-01-01 00:58:32.130000+00:00
4,631155824490,"16km N of Fillmore, CA",reviewed,0,134,earthquake,2.95,California,-118.934000,34.546000,16.122,1990-01-01 01:03:44.490000+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
3445746,1690626851941,"5 km NW of Chikusei, Japan",reviewed,0,326,earthquake,4.60,Japan,139.940200,36.350700,83.039,2023-07-29 10:34:11.941000+00:00
3445747,1690626975715,"Kodiak Island region, Alaska",automatic,0,44,earthquake,1.70,Alaska,-153.729900,57.790100,24.400,2023-07-29 10:36:15.715000+00:00
3445748,1690627215940,"12 km W of Alberto Oviedo Mota, B.C., MX",automatic,0,90,earthquake,2.42,B.C.,-115.296833,32.233167,1.770,2023-07-29 10:40:15.940000+00:00
3445749,1690628146040,"7 km W of Cobb, CA",automatic,0,16,earthquake,1.03,California,-122.800499,38.827499,1.720,2023-07-29 10:55:46.040000+00:00


In [36]:
#clean earthquakes file
earthquakes_cleaned = earthquakes.copy()

#change date column from string to datetime to date
earthquakes_cleaned['date'] = pd.to_datetime(earthquakes_cleaned['date'])

#correct magnitude column title spelling
earthquakes_cleaned['magnitude'] = earthquakes_cleaned['magnitudo']

#drop place, status, tsunami column
earthquakes_cleaned = earthquakes_cleaned.drop(['time', 'place', 'status', 'magnitudo'], axis = 1)

#make new column for all earthquakes in USA, some of the states had extra spaces in font and behind them so we had to strip
earthquakes_cleaned['state'] = earthquakes_cleaned['state'].str.strip()
states = ["Alaska", "Alabama", "Arkansas", "Arizona", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]
earthquakes_cleaned['In USA'] = earthquakes_cleaned['state'].isin(states)

#remove rows from dataset that aren't earthquake, strip data_type column just in case
earthquakes_cleaned['data_type'] = earthquakes_cleaned['data_type'].str.strip()
earthquakes_cleaned = earthquakes_cleaned.loc[earthquakes_cleaned['data_type'] == 'earthquake'].reset_index(drop = True)

#remove all negative magnitude earthquakes, doesn't make sense
earthquakes_cleaned = earthquakes_cleaned[(earthquakes_cleaned['magnitude'] > 0) & (earthquakes_cleaned['magnitude'] < 10)]

#capitalize first word in state column
earthquakes_cleaned['state'] = earthquakes_cleaned['state'].str.capitalize()

In [37]:
earthquakes_cleaned

Unnamed: 0,tsunami,significance,data_type,state,longitude,latitude,depth,date,magnitude,In USA
0,0,96,earthquake,Alaska,-149.669200,61.730200,30.100,1990-01-01 00:22:33.990000+00:00,2.50,True
1,0,31,earthquake,Hawaii,-155.212333,19.317667,6.585,1990-01-01 00:24:51.210000+00:00,1.41,True
2,0,19,earthquake,California,-122.806167,38.821000,3.220,1990-01-01 00:34:43.450000+00:00,1.11,True
3,0,15,earthquake,California,-118.846333,37.664333,-0.584,1990-01-01 00:58:32.130000+00:00,0.98,True
4,0,134,earthquake,California,-118.934000,34.546000,16.122,1990-01-01 01:03:44.490000+00:00,2.95,True
...,...,...,...,...,...,...,...,...,...,...
3361841,0,326,earthquake,Japan,139.940200,36.350700,83.039,2023-07-29 10:34:11.941000+00:00,4.60,False
3361842,0,44,earthquake,Alaska,-153.729900,57.790100,24.400,2023-07-29 10:36:15.715000+00:00,1.70,True
3361843,0,90,earthquake,B.c.,-115.296833,32.233167,1.770,2023-07-29 10:40:15.940000+00:00,2.42,False
3361844,0,16,earthquake,California,-122.800499,38.827499,1.720,2023-07-29 10:55:46.040000+00:00,1.03,True


In [43]:
earthquakes_clean1 = earthquakes_cleaned.loc[:1120615]
earthquakes_clean2 = earthquakes_cleaned.loc[1120615:2241230]
earthquakes_clean3 = earthquakes_cleaned.loc[2241230:]
earthquakes_2023 = earthquakes_cleaned.loc[earthquakes_cleaned['date'] > "2023-01-01"]
earthquakes_2023['date'] = pd.to_datetime(earthquakes_2023['date']).dt.date

#earthquakes_clean1.to_csv('earthquakes_new1.csv', index = False)
#earthquakes_clean2.to_csv('earthquakes_new2.csv', index = False)
#earthquakes_clean3.to_csv('earthquakes_new3.csv', index = False)
earthquakes_2023.to_csv('earthquakes_2023.csv', index = False)

In [42]:
earthquakes_2023

Unnamed: 0,tsunami,significance,data_type,state,longitude,latitude,depth,date,magnitude,In USA
3280810,0,0,earthquake,California,-122.753667,38.783667,1.620,2023-01-01,0.09,True
3280811,0,7,earthquake,California,-122.754500,38.783500,1.690,2023-01-01,0.69,True
3280812,0,22,earthquake,Alaska,-149.669200,61.685400,39.900,2023-01-01,1.20,True
3280813,0,44,earthquake,Alaska,-145.178900,63.297000,2.800,2023-01-01,1.70,True
3280814,0,11,earthquake,California,-116.616833,33.408000,7.520,2023-01-01,0.84,True
...,...,...,...,...,...,...,...,...,...,...
3361841,0,326,earthquake,Japan,139.940200,36.350700,83.039,2023-07-29,4.60,False
3361842,0,44,earthquake,Alaska,-153.729900,57.790100,24.400,2023-07-29,1.70,True
3361843,0,90,earthquake,B.c.,-115.296833,32.233167,1.770,2023-07-29,2.42,False
3361844,0,16,earthquake,California,-122.800499,38.827499,1.720,2023-07-29,1.03,True
