# Lab 4. Data structures and arrays
#### Computational Methods for Geoscience - EPS 400/522
#### Instructor: Eric Lindsey

Due: Sept. 21, 2023

---------

Adrian Marziliano

In [1]:
# some useful imports and settings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import netCDF4 as nc
import xarray as xr
import datetime
from scipy import interpolate
from scipy.interpolate import interp1d
from scipy.interpolate import griddata


%config InlineBackend.figure_format = 'retina' # better looking figures on high-resolution screens

### Using data structures to categorize data

The file 'worldwide_m4+_2022.csv' (on canvas) contains all earthquakes larger than magnitude 4 recorded by the USGS in 2022 (more than 15,000 events). Let's use a dictionary to keep track of how many events happened in each state.

First, read the data into python using pandas. The column 'place' contains a short description of the location of each event, and if it occurred in the US, this description will (usually) mention a state name. We can find out if a string is contained in another string using the keyword 'in' (see the notes).

Instructions: loop over the list of state names, and for each state count the number of M4+ earthquakes that occurred in that state (you may need to loop over the whole dataset for each state name). Add this number to a dictionary with the state name as the key; for example it might contain 'New Mexico': 4.

Finally, print out the top 10 states by number of earthquakes in 2022.

In [None]:
earthquake_df=pd.read_csv('worldwide_m4+_2022.csv')
print(earthquake_df[['longitude', 'latitude', 'mag', 'place']])

In [None]:
# Create list of states as the keys of your dictionary.
us_states = [ "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", 
             "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
             "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", 
             "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
             "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", 
             "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", 
             "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", 
             "Wisconsin", "Wyoming"]

# Create an empty dictionary to store the earthquake data
earthquake_data_dict = {
    "Time": [],
    "Latitude": [],
    "Longitude": [],
    "Depth": [],
    "Magnitude": [],
    "Place": []
}

# Iterate through the DataFrame and extract data for US state names
for index, row in earthquake_df.iterrows():
    if isinstance(row['place'], str):  # Check if 'place' is a string
        for state in us_states:
            if state in row['place']:
                earthquake_data_dict["Time"].append(row['time'])
                earthquake_data_dict["Latitude"].append(row['latitude'])
                earthquake_data_dict["Longitude"].append(row['longitude'])
                earthquake_data_dict["Depth"].append(row['depth'])
                earthquake_data_dict["Magnitude"].append(row['mag'])
                earthquake_data_dict["Place"].append(row['place'])
                break  # Break the loop once a match is found to avoid duplicate entries

# Convert the dictionary to a DataFrame if needed
earthquake_data_df = pd.DataFrame(earthquake_data_dict)

# Print or use the earthquake data as needed
print(earthquake_data_df[['Time', 'Magnitude', 'Place']])

### Resampling a dataset

Often times, our data have values missing, large errors, or are unevenly sampled. In this case, we need to 'resample' the data onto a regular grid. This is also known as 'gridding' the data.

In [None]:
# original data - slight variation in the time sampling
time = np.linspace(0, 10, 20) +  np.random.uniform(-0.2, 0.2, 20)
values = np.sin(time)

# add some bad data
ibad=np.random.randint(2,18,(4,))
values[ibad] += 5+10*np.random.rand(4)

# plot the data
plt.plot(time,values,'ks',label='original')

### Assignment 1: remove outliers and resample the above data 

Step 1. Remove the outliers using logical indexing.

Step 2. Resample the remaining data onto a regularly spaced set of points sampled every 0.1 seconds, from 0 to 10. You can choose the interpolation method you find best!

Step 3. Plot the resampled data on top of the original data (without outliers), showing how the interpolation works.

In [None]:
# Plot US earthquake data
plt.plot(earthquake_data_df['Time'],earthquake_data_df['Magnitude'],'k.')
plt.title('Earthquake data for US states')
plt.show()

#### STEP 1: Remove Outliers

In [None]:
# Calculate mean and standard deviation for the 'mag' column
mean_magnitude = earthquake_data_df['Magnitude'].mean()
std_deviation_magnitude = earthquake_data_df['Magnitude'].std()

# Define a threshold for outliers (e.g., values more than 2 standard deviations from the mean)
threshold = 3 * std_deviation_magnitude

# Create a boolean mask identifying outliers
outliers_mask = np.abs(earthquake_data_df['Magnitude'] - mean_magnitude) > threshold

# Use the mask to filter the DataFrame and remove outliers
filtered_data = earthquake_data_df[~outliers_mask]

# Now, 'filtered_earthquake_df' contains the DataFrame with outliers removed.

# You can also reset the index if needed
filtered_data.reset_index(drop=True, inplace=True)

# Print or work with the filtered DataFrame as needed
#print(filtered_data)

# Plot US earthquake data W/O OUTLIERS
plt.plot(filtered_data['Time'],filtered_data['Magnitude'],'k.')
plt.title('Outliers Removed')
plt.ylim(3.9,6.9)
plt.show()

In [None]:
filtered_data

#### STEP 2: Resample Data

In [None]:
# Ensure that the 'timestamp' column is of datetime type
filtered_data['Time'] = pd.to_datetime(filtered_data['Time'])

# Create a new datetime index for the resampled data
start_time = pd.to_datetime("2022-01-01T02:51:57.266Z")
end_time = pd.to_datetime("2022-12-31T06:13:27.088Z")
resampled_index = pd.date_range(start=start_time, end=end_time, freq='1S')

# Resample the data using interpolation
resampled_data = filtered_data.set_index('Time').reindex(resampled_index)

# Interpolate missing values
resampled_data['Magnitude'] = resampled_data['Magnitude'].interpolate(method='linear')

# Filter data from 0 to 10 seconds
start_time = pd.to_datetime("2022-01-01T02:51:57.266Z") + pd.Timedelta(seconds=0)
end_time = pd.to_datetime("2022-12-31T06:13:27.088Z") + pd.Timedelta(seconds=10)
resampled_data = resampled_data[(resampled_data.index >= start_time) & (resampled_data.index <= end_time)]

# If you want to reset the index and have a clean DataFrame
#resampled_data = resampled_data.reset_index()

# Now, resampled_data contains regularly spaced earthquake magnitude data
resampled_data


#### STEP 3: PLOT RESAMPLED DATA OVER ORIGINAL DATA

In [None]:
plt.plot(resampled_data['Magnitude'])

In [None]:
# Plot US earthquake data W/O OUTLIERS
#plt.scatter(resampled_data['index'], resampled_data['Magnitude'])

# Assuming you have 'resampled_data' DataFrame with the 'Magnitude' column
plt.figure(figsize=(10, 6))  # Set the figure size

# Plot the 'Magnitude' data
plt.plot(resampled_data['Magnitude'], color='blue', linestyle='-', marker='o', markersize=2, label='Magnitude')

# Set plot title and labels
plt.title('Earthquake Magnitude Over Time')
plt.xlabel('Time')
plt.ylabel('Magnitude')

# Show a legend if you have multiple lines
plt.legend()

# Show the plot
plt.grid()
plt.show()

### Assignment 2. Use 2D Interpolation to fill in the continents.

Remember our averaged-monthly SST dataset? (Filename: 'sst.mon.ltm.1981-2010.nc') Let's use this as a (strange) example of interpolation. Try masking out the NaNs in the grid of temperatures from September, then use griddata to fill in all the values over the continents.

I think this will prove a litte challenging - good luck, work with each other!

In [None]:
# here is some code to get you started.
# note you will have to copy the data file into your current folder for it to work for you.

filename = 'sst.mon.ltm.1981-2010.nc'
dataset = nc.Dataset(filename)

# sst is stored as a 3D array (time,lat,lon)
# get the grid in September
sst_sept = dataset['sst'][8,:,:]

# Hint: note that this netCDF dataset comes with a 'mask' property that lets us know which values are NaN.
# we can access them with sst_sept.mask

print('whether each point is nan:\n',sst_sept.mask)

# you can use this to extract only the valid data from any given array, if it has the same size
zvalid = sst_sept[~sst_sept.mask]

# check the shapes:
print('shape of sst_sept is', np.shape(sst_sept))
# notice, now it became a vector instead of an array.
print('shape of zvalid is', np.shape(zvalid))


#### I suggest the following procedure:

**Step 1. Generate the gridded X and Y matrices**

Use np.meshgrid on the dataset['lon'] and dataset['lat'] vectors.
Make sure to verify that your output arrays have the same size as your SST data.

**Step 2. Extract the valid points from each of your 3 arrays (X, Y, SST)**

Check out the hint above for how to use the mask property of the netcdf dataset.

**Step 3. Choose an interpolation method and do the interpolation from the scattered valid data back to the full X and Y grids**

**Step 4. Mask the ocean areas to show just the continents. You should end up with something cool!**

##### Step 1: Generate the gridded X and Y matrices

In [None]:
# Load the netCDF file
#dataset = xr.open_dataset('your_sst_data_file.nc')

# Extract latitude and longitude vectors
latitude = dataset['lat']
longitude = dataset['lon']

# Extract SST data as well (assuming the variable name is 'sst')
sst_data = dataset['sst']

print("Latitude shape:", latitude.shape)
print("Longitude shape:", longitude.shape)
print("SST data shape:", sst_data.shape)

# Ensure the latitude and longitude arrays have the same size as SST data
#assert latitude.shape == sst_data.shape, "Latitude array size doesn't match SST data size"
#assert longitude.shape == sst_data.shape, "Longitude array size doesn't match SST data size"

# Create the meshgrid
lon_grid, lat_grid = np.meshgrid(longitude, latitude)


##### Step 2:  Extract the valid points from each of your 3 arrays (X, Y, SST)

In [None]:
# Assuming you have X, Y, and SST arrays
# You already have the latitude and longitude grids (lon_grid, lat_grid) from the previous code

# Create a mask to identify invalid data points in SST (e.g., fill values)
invalid_mask = np.isnan(sst_data)

# Use the invalid_mask to extract valid points from X, Y, and SST
X_valid = X[~invalid_mask]
Y_valid = Y[~invalid_mask]
SST_valid = SST[~invalid_mask]


In [None]:
def create_continent_grid(lat_range=(-90, 90), lon_range=(-180, 180), grid_resolution=1.0):
    """
    Create a grid of latitude and longitude points over the continents.
    
    Parameters:
    - lat_range: Tuple (min_lat, max_lat), defines the latitude range for the grid.
    - lon_range: Tuple (min_lon, max_lon), defines the longitude range for the grid.
    - grid_resolution: Resolution for the grid in degrees.

    Returns:
    - continent_grid_lat: 2D array of latitude points over the continents.
    - continent_grid_lon: 2D array of longitude points over the continents.
    """
    latitudes = np.arange(lat_range[0], lat_range[1] + grid_resolution, grid_resolution)
    longitudes = np.arange(lon_range[0], lon_range[1] + grid_resolution, grid_resolution)

    continent_grid_lat, continent_grid_lon = np.meshgrid(latitudes, longitudes)

    return continent_grid_lat, continent_grid_lon


In [22]:

# Load the SST dataset
file_path = 'C:/Users/marzi/OneDrive - University of New Mexico/EPS 522/Labs/Lab 2 Files and Figures/sst.mon.ltm.1981-2010.nc'
ds = xr.open_dataset(file_path)

# Create grids of longitude and latitude
lon = ds['lon']
lat = ds['lat']
X, Y = np.meshgrid(lon, lat)

# Extract the SST values for September
sst_september = ds['sst'].sel(time=ds['time.month'] == 9)

# Find valid data points (not NaN) in SST
valid_indices = ~np.isnan(sst_september)

# Flatten X and Y arrays
X_flat = X.flatten()
Y_flat = Y.flatten()

# Extract the valid X and Y values
X_valid = X_flat[valid_indices]
Y_valid = Y_flat[valid_indices]

# Extract the corresponding SST values
SST_valid = sst_september.values[valid_indices.values]


#STEP 3
# Create a grid for the full X and Y coordinates
X_full = X.flatten()
Y_full = Y.flatten()

# Interpolate the SST values using griddata
SST_interpolated = griddata((X_valid, Y_valid), SST_valid, (X_full, Y_full), method='linear')
SST_interpolated = SST_interpolated.reshape(X.shape)

#STEP 4
# Define a land-sea mask (1 for land, 0 for sea)
# You may need to replace this with a proper land-sea mask
land_sea_mask = np.where(SST_interpolated > 0, 1, 0)

# Apply the mask to the SST data
SST_continents = np.where(land_sea_mask == 1, SST_interpolated, np.nan)



  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  array = array.get_duck_array()


IndexError: too many indices for array: array is 1-dimensional, but 3 were indexed

In [21]:
valid_indices