In [4]:
import xarray as xr

# Open the weighted-Moho NetCDF file as an xarray Dataset
ds = xr.open_dataset("weighted_moho.nc")

# Show the dataset summary, then the 'weighted_moho' variable on its own
print(ds, ds["weighted_moho"], sep="\n")

# Flatten to a pandas DataFrame, then move the index into an ordinary column
df = ds.to_dataframe()
df = df.reset_index()
print(df)

<xarray.Dataset> Size: 3MB
Dimensions:        (index: 92346)
Coordinates:
  * index          (index) int64 739kB 0 1 2 3 4 ... 92342 92343 92344 92345
Data variables:
    latitude       (index) float64 739kB ...
    longitude      (index) float64 739kB ...
    weighted_moho  (index) float64 739kB ...
<xarray.DataArray 'weighted_moho' (index: 92346)> Size: 739kB
[92346 values with dtype=float64]
Coordinates:
  * index    (index) int64 739kB 0 1 2 3 4 5 ... 92341 92342 92343 92344 92345
       index  latitude  longitude  weighted_moho
0          0     36.43      94.87      49.500000
1          1     36.43      94.87      48.000000
2          2     36.81      92.95      49.000000
3          3     37.02      91.74      57.000000
4          4     32.25      91.70      74.000000
...      ...       ...        ...            ...
92341  92341    -89.50     175.50      35.096589
92342  92342    -89.50     176.50      35.094330
92343  92343    -89.50     177.50      35.091988
92344  92344    -89.

In [None]:
import pandas as pd
import os

# Input CSV (machine-specific absolute path; adjust when running elsewhere)
path_to_data_file_dup = r"D:\Amitava\Projects\Spline_Moho\Global_moho_compilation\Global_crust.csv"

# Read every column of the CSV; text following a '#' on a line is not parsed,
# and lines that start with '#' are skipped entirely
data_eq_raw_dup = pd.read_csv(path_to_data_file_dup, comment='#')

# Coerce Moho_km to numeric; values that cannot be parsed become NaN
data_eq_raw_dup["Moho_km"] = pd.to_numeric(data_eq_raw_dup["Moho_km"], errors="coerce")

# Discard rows without a usable Moho_km value and renumber the remaining rows
data_eq_all_dup = data_eq_raw_dup.dropna(subset=["Moho_km"]).reset_index(drop=True)

# Flag every row whose (Lat, Long, Moho_km) triple occurs more than once
# (keep=False marks all members of a repeated triple, not just the later ones)
duplicate_mask_dup = data_eq_all_dup.duplicated(subset=["Lat", "Long", "Moho_km"], keep=False)

# Number of rows that belong to a repeated triple
num_duplicates_dup = duplicate_mask_dup.sum()
# Number of distinct triples that are repeated
num_repeated_groups_dup = data_eq_all_dup.loc[duplicate_mask_dup, ["Lat", "Long", "Moho_km"]].drop_duplicates().shape[0]

print(f"Total number of repeated (Lat, Long, Moho_km) rows: {num_duplicates_dup}")
print(f"Number of unique (Lat, Long, Moho_km) groups that are repeated: {num_repeated_groups_dup}")

# Collapse exact repeats: keep only the first row of each (Lat, Long, Moho_km) triple
data_eq_dup = data_eq_all_dup.drop_duplicates(subset=["Lat", "Long", "Moho_km"], keep="first").reset_index(drop=True)

# After that collapse, a (Lat, Long) pair that still occurs more than once
# necessarily carries differing Moho_km values
duplicated_latlong_mask = data_eq_dup.duplicated(subset=["Lat", "Long"], keep=False)

# Rows belonging to such repeated coordinate pairs
repeated_pairs_df = data_eq_dup[duplicated_latlong_mask]

# One group per repeated (Lat, Long) pair.
# NOTE(review): groupby omits keys containing NaN by default, whereas duplicated()
# treats NaN as equal, so rows with a missing Lat/Long would be flagged above but
# left out of the export. Confirm whether coordinates can be missing in this file.
grouped = repeated_pairs_df.groupby(["Lat", "Long"])

# Prepare output directory and file path
output_dir = r"D:\Amitava\Projects\Spline_Moho\repeat"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "repeat.csv")

# Write the header once, then each group followed by one blank separator line.
# newline='' disables newline translation, so the separator must use os.linesep --
# the same terminator DataFrame.to_csv uses by default -- to keep line endings
# consistent on every platform.
with open(output_path, "w", newline='', encoding="utf-8") as f:
    # Emitting the header up front keeps the file a valid CSV even when there
    # are no repeated pairs at all
    repeated_pairs_df.head(0).to_csv(f, index=False)
    for _, group in grouped:
        group.to_csv(f, index=False, header=False)
        f.write(os.linesep)  # Blank line between groups
