# Title

In [2]:
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import haversine_distances

In [3]:
max_dist = 1500
n_bins = 30

df_sif = pd.read_csv("l2_sif_residuals_north_america.csv")

In [8]:
df_sif

Unnamed: 0.1,Unnamed: 0,sif,lon,lat,lon_norm,lat_norm,ols_mean,sif_residuals
0,0,0.404351,-70.915590,41.704530,0.802828,0.094696,0.708282,-0.303930
1,1,0.139903,-70.922360,41.722900,0.802602,0.095717,0.708276,-0.568372
2,2,0.845830,-70.916565,41.713380,0.802795,0.095188,0.708302,0.137528
3,3,0.172018,-70.910770,41.703796,0.802988,0.094655,0.708327,-0.536309
4,4,0.064241,-70.929200,41.741210,0.802374,0.096734,0.708269,-0.644027
...,...,...,...,...,...,...,...,...
414125,414125,0.468597,-124.973020,50.886963,-0.999107,0.604831,0.198517,0.270080
414126,414126,0.288103,-124.992065,50.914185,-0.999742,0.606344,0.198418,0.089685
414127,414127,0.273060,-124.981750,50.906740,-0.999398,0.605930,0.198496,0.074563
414128,414128,0.587591,-124.990420,50.926514,-0.999687,0.607028,0.198476,0.389115


In [4]:
EARTH_RADIUS = 6371  # radius in kilometers

def distance_matrix(
    X1: np.ndarray, X2: np.ndarray, units: str = "km"
) -> np.ndarray:
    """
    Computes the geodesic (or great circle if fast_dist=True) distance among all pairs of points given two sets of coordinates.
    Wrapper for scipy.spatial.distance.cdist using geopy.distance.geodesic as a the metric.

    NOTE:
    - points should be formatted in rows as [lat, lon]
    - if fast_dist=True, units are kilometers regardless of specification
    """
    # enforce 2d array in case of single point
    X1 = np.atleast_2d(X1)
    X2 = np.atleast_2d(X2)
    # great circle distances in kilometers
    X1_r = np.radians(X1)
    X2_r = np.radians(X2)
    return haversine_distances(X1_r, X2_r) * EARTH_RADIUS

def cloud_calc(fields: list[np.ndarray]) -> np.ndarray:
    """Calculate the semivariogram or covariogram for all point pairs."""
    center = lambda f: f - f.mean()
    residuals = [center(f) for f in fields]
    cloud = 0.5 * (np.subtract.outer(*residuals)) ** 2
    return cloud

def variogram_cloud(df) -> pd.DataFrame:
    """Calculate the (cross-) variogram cloud for corresponding field id's."""
    dist = distance_matrix(df[["lat", "lon"]].values, df[["lat", "lon"]].values)
    idx = np.triu_indices(dist.shape[0], k=1, m=dist.shape[1])
    dist = dist[idx]
    cloud = cloud_calc([df["sif"].values, df["sif"].values])[idx]
    
    return pd.DataFrame({"distance": dist, "variogram": cloud})


def _construct_variogram_bins(
    df_cloud: pd.DataFrame, n_bins: int
) -> tuple[np.ndarray, np.ndarray]:
    """Paritions the domain of a variogram cloud into `n_bins` bins; first bin extended to zero."""
    # use min non-zero dist for consistincy between variograms and cross-variograms
    min_dist = df_cloud[df_cloud["distance"] > 0]["distance"].min()
    max_dist = df_cloud["distance"].max()
    bin_centers = np.linspace(min_dist, max_dist, n_bins)
    bin_width = bin_centers[1] - bin_centers[0]
    bin_edges = np.arange(min_dist - 0.5 * bin_width, max_dist + bin_width, bin_width)
    # check that bin centers are actually centered
    if not np.allclose((bin_edges[1:] + bin_edges[:-1]) / 2, bin_centers):
        warnings.warn("WARNING: variogram bins are not centered.")
    bin_edges[0] = 0
    return bin_centers, bin_edges



def get_variogram(df, max_dist, n_bins) -> pd.DataFrame:
    """Compute the (cross-) variogram of the specified kind for the pair of fields (i, j). Return as a dataframe with bin averages and bin counts."""
    df_cloud = variogram_cloud(df)
    df_cloud = df_cloud[df_cloud["distance"] <= max_dist]
    bin_centers, bin_edges = _construct_variogram_bins(df_cloud, n_bins)
    df_cloud["bin_center"] = pd.cut(
        df_cloud["distance"], bin_edges, labels=bin_centers, include_lowest=True
    )
    df = (
        df_cloud.groupby("bin_center")["variogram"]
        .agg(["mean", "count"])
        .rename(columns={"mean": "bin_mean", "count": "bin_count"})
        .reset_index()
    )
    # convert bins from categories to numeric
    df["bin_center"] = df["bin_center"].astype("string").astype("float")
    if (df["bin_count"] < 30).any():
        warnings.warn(
            f"WARNING: Fewer than 30 pairs used for at least one bin in variogram"
            f" calculation."
        )
    
    return df

In [6]:
df_variogram = get_variogram(df_sif, max_dist, n_bins)
df_variogram

MemoryError: Unable to allocate 1.25 TiB for an array with shape (414130, 414130) and data type float64

In [7]:
# plot.plot_variograms(mod.fit_result, ["SIF"], title=title)

(414130, 8)