In [2]:

import numpy as np
import pandas as pd
from scipy import stats
from multiprocessing import Pool, cpu_count
from functools import partial
from tqdm import tqdm
import os
import math
import time


In [3]:
# Read the raw earthquake catalogue; `df` is a working copy so `data` stays untouched
data = pd.read_csv("data.csv")
df = data.copy()
df

Unnamed: 0,date,time,latitude,longitude,depth,magnitude
0,1980-01-01,07:04:05.60,-41.287000,175.235000,26.100000,2.567000
1,1980-01-01,09:27:04.52,-40.490000,174.669010,72.600000,3.200000
2,1980-01-01,18:28:10.99,-37.630000,176.380000,210.000000,4.150000
3,1980-01-01,20:20:54.03,-40.290000,173.429990,212.000000,4.500000
4,1980-01-02,18:24:15.40,-38.480000,176.750000,5.000000,3.375000
...,...,...,...,...,...,...
396266,2024-12-31,18:31:44.72,-45.076202,167.357101,74.645035,2.920129
396267,2024-12-31,20:37:56.20,-37.985142,176.407059,133.171555,2.605564
396268,2024-12-31,21:24:08.59,-38.524281,175.719986,157.843369,2.570573
396269,2024-12-31,23:26:03.90,-38.329933,178.956192,12.000000,2.406177


## Step 1: Prepare the Data Set

##### Create Subsets: 

In [13]:
# Keep only the events with magnitude >= 4; report the catalogue size before and after
N = df.shape[0]

print("The length of the data is:", N)

is_large_event = df["magnitude"].ge(4)
df_filtered = df.loc[is_large_event].copy()
n = df_filtered.shape[0]
n

The length of the data is: 396271


20570

In [14]:

# Parameters
subset_size = 1000      # number of events per subset
step = 10               # offset (in events) between the starts of consecutive subsets
num_events = len(df_filtered)

# List to store subsets
subsets = []

# Slide a fixed-size window over the magnitude-filtered catalogue.
# The windows must be cut from df_filtered (not df): num_events is the length
# of df_filtered, so slicing df would only ever reach its first num_events rows
# and would mix in events below the magnitude threshold.
for start in range(0, num_events - subset_size + 1, step):
    end = start + subset_size
    subset = df_filtered.iloc[start:end].copy()  # consecutive filtered events
    subsets.append(subset)

print(f"Total subsets created: {len(subsets)}")


Total subsets created: 1958


## Compute pairwise distances (Haversine formula)

In [8]:

# Naive all-pairs approach: broadcast every event against every other event.
# Each difference array is N x N float64, so memory grows quadratically with
# the number of events; the recorded run of this cell failed with a
# MemoryError (120 GiB requested for a 127061 x 127061 array).
lat_deg = df_filtered["latitude"].to_numpy()
lon_deg = df_filtered["longitude"].to_numpy()
lat_rad = np.radians(lat_deg)
lon_rad = np.radians(lon_deg)

dlat = lat_rad.reshape(-1, 1) - lat_rad.reshape(1, -1)
dlon = lon_rad.reshape(-1, 1) - lon_rad.reshape(1, -1)


MemoryError: Unable to allocate 120. GiB for an array with shape (127061, 127061) and data type float64

In [15]:
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Event coordinates in radians (the haversine formula below works on radians)
lat_rad, lon_rad = (
    np.radians(df_filtered[column].to_numpy())
    for column in ("latitude", "longitude")
)

# One row per event: [latitude, longitude]
coords = np.column_stack([lat_rad, lon_rad])

# Vectorised haversine distances for all pairs of points (full matrix form)
def haversine_distances(coords):
    """
    Great-circle distance between every pair of points.

    coords: Nx2 array, columns = [lat, lon] in radians
    returns: NxN array of distances in km; entry [i, j] is the distance
             between points i and j, so the diagonal is zero

    Note: several NxN float arrays are materialised, so memory use grows
    quadratically with N. Use this only for small inputs.
    """
    R = 6371.0  # Earth radius in km

    lat = coords[:, 0]
    lon = coords[:, 1]

    dlat = lat[:, None] - lat[None, :]
    dlon = lon[:, None] - lon[None, :]
    a = np.sin(dlat/2)**2 + np.cos(lat[:, None]) * np.cos(lat[None, :]) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal pairs, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# --- Scalar haversine metric for a single pair of points (usable as a pdist callable) ---
def haversine_metric(u, v):
    """
    Great-circle distance in km between two points.

    u, v: length-2 sequences [lat, lon] in radians
    returns: distance in km, using an Earth radius of 6371.0 km
    """
    dlat = v[0] - u[0]
    dlon = v[1] - u[1]
    a = np.sin(dlat/2)**2 + np.cos(u[0]) * np.cos(v[0]) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal pairs, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * 6371.0 * np.arcsin(np.sqrt(a))

# Condensed pairwise haversine distances (km), in the same order pdist uses:
# (0,1), (0,2), ..., (0,n-1), (1,2), ...
# pdist with a Python callable calls it once per pair, i.e. n*(n-1)/2
# interpreter-level calls (~2e8 for ~20k events), which is impractically slow.
# Filling the vector one row at a time keeps the arithmetic inside NumPy and
# needs only O(n) temporary memory on top of the result itself
# (about 1.7 GB of float64 for ~20k events).
n_points = coords.shape[0]
cos_lat = np.cos(coords[:, 0])
dist_vector = np.empty(n_points * (n_points - 1) // 2, dtype=np.float64)

offset = 0
for i in range(n_points - 1):
    # Differences between point i and every later point j > i
    dlat_row = coords[i + 1:, 0] - coords[i, 0]
    dlon_row = coords[i + 1:, 1] - coords[i, 1]
    a_row = np.sin(dlat_row / 2) ** 2 + cos_lat[i] * cos_lat[i + 1:] * np.sin(dlon_row / 2) ** 2
    # Clip guards arcsin against rounding that pushes the value outside [0, 1]
    row = 2 * 6371.0 * np.arcsin(np.sqrt(np.clip(a_row, 0.0, 1.0)))
    dist_vector[offset:offset + row.size] = row
    offset += row.size


: 

## Step 2: Calculate the Correlation Integral, C(r)