In [None]:
import ee
import geemap
import os

# Clear existing credentials if they have insufficient scopes
# Try both Windows and Unix paths
credentials_paths = [
    os.path.expanduser('~/.config/earthengine/credentials'),
    os.path.join(os.environ.get('APPDATA', ''), 'earthengine', 'credentials') if os.name == 'nt' else None
]

for cred_path in credentials_paths:
    if cred_path and os.path.exists(cred_path):
        os.remove(cred_path)
        print(f"Cleared existing credentials at {cred_path}")

# Authenticate - this will open a browser to grant proper scopes
# Make sure to grant ALL requested permissions when prompted
ee.Authenticate()
ee.Initialize()


Cleared existing credentials at C:\Users\user/.config/earthengine/credentials



Successfully saved authorization token.


In [5]:
dataset = ee.ImageCollection('COPERNICUS/MARINE/SATELLITE_OCEAN_COLOR/V6')

In [None]:
# MODIS Collection 062 - PAR (Photosynthetically Active Radiation) Data
# Python equivalent of the JavaScript code

# Use the correct Collection 062 and get data for 2010
dataset_modis = ee.ImageCollection('MODIS/062/MCD18C2') \
                  .filterDate('2010-01-01', '2010-03-01')

# Check what's available
print('Number of images:', dataset_modis.size().getInfo())

# Get available dates
dates = dataset_modis.aggregate_array('system:time_start').getInfo()
print('Available dates:', len(dates), 'images')
for date in dates[:5]:  # Show first 5 dates
    print(f"  {ee.Date(date).format('YYYY-MM-dd').getInfo()}")

# Select the GMT_1200_PAR band
gmt_1200_par = dataset_modis.select('GMT_1200_PAR')

# Create a map visualization
Map = geemap.Map()
Map.setCenter(6.746, 46.529, 2)

# Visualization parameters
gmt_1200_par_vis = {
    'min': -236,
    'max': 316,
    'palette': ['0f17ff', 'b11406', 'f1ff23']
}

# Add the layer (using mean of the collection for visualization)
Map.addLayer(
    gmt_1200_par.mean(),
    gmt_1200_par_vis,
    'Total PAR at GMT 12:00 - 2010 (Mean)'
)

# Display the map
Map


Number of images: 59
Available dates: 59 images
  2010-01-01
  2010-01-02
  2010-01-03
  2010-01-04
  2010-01-05


Map(center=[46.529, 6.746], controls=(WidgetControl(options=['position', 'transparent_bg'], position='topright…

In [9]:
# ====== CELL 1: Install & Import Libraries ======
# If you are in Google Colab, you might need to install cartopy for mapping:
# !pip install cartopy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from io import StringIO
import geemap
import ee

# For mapping (optional, more complex setup)
# import cartopy.crs as ccrs
# import cartopy.feature as cfeature

print("Libraries imported.")

Libraries imported.


In [12]:
# ====== CELL 2: Fetch and Load the SeaBASS/NOMAD Data ======
# The direct URL to your file
url = "https://seabass.gsfc.nasa.gov/wiki/NOMAD/nomad_seabass_v2.a_2008200.txt"

# Fetch the data
response = requests.get(url)
data_text = response.text

# SeaBASS files have a header ending with `end_header`
header_end = data_text.find('end_header')
header_lines = data_text[:header_end].split('\n')
data_body = data_text[header_end + len('end_header'):].strip()

# Parse header to extract column names and metadata
column_names = []
delimiter = ','
missing_value = '-999'

# Look for /fields= line which contains column names
for line in header_lines:
    if line.startswith('/fields='):
        # Extract column names from /fields= line
        fields = line.replace('/fields=', '').strip()
        column_names = [f.strip() for f in fields.split(',')]
    elif line.startswith('/delimiter='):
        delimiter = line.replace('/delimiter=', '').strip()
    elif line.startswith('/missing='):
        missing_value = line.replace('/missing=', '').strip()

# Print header info for debugging
print("=== NOMAD File Header Information ===")
print(f"Delimiter: {delimiter}")
print(f"Missing value indicator: {missing_value}")
if column_names:
    print(f"Found {len(column_names)} column names in header")
else:
    print("Column names not found in /fields= line, will infer from data")

# Load the main data into a Pandas DataFrame
# Use comma separator as specified in header
df = pd.read_csv(StringIO(data_body), sep=',', na_values=[missing_value, -999], header=None)

# If we found column names in header, use them
if column_names and len(column_names) == len(df.columns):
    df.columns = column_names
    print(f"✓ Using column names from header")
else:
    # NOMAD standard columns (based on typical SeaBASS format)
    # Try to infer from first row or use standard names
    num_cols = len(df.columns)
    print(f"⚠ Column names not in header format. Found {num_cols} columns.")
    print("Using standard NOMAD column names...")
    
    # Standard NOMAD columns (approximate - may need adjustment)
    standard_cols = [
        'year', 'month', 'day', 'hour', 'minute', 'second',
        'lat', 'lon', 'depth', 'temp', 'sal', 'chl', 'a_412', 'a_443', 'a_490',
        'a_510', 'a_532', 'a_555', 'a_670', 'bb_412', 'bb_443', 'bb_490',
        'bb_510', 'bb_532', 'bb_555', 'bb_670', 'Rrs_412', 'Rrs_443', 'Rrs_490',
        'Rrs_510', 'Rrs_532', 'Rrs_555', 'Rrs_670', 'Kd_412', 'Kd_443', 'Kd_490',
        'Kd_510', 'Kd_532', 'Kd_555', 'Kd_670', 'Ed_412', 'Ed_443', 'Ed_490',
        'Ed_510', 'Ed_532', 'Ed_555', 'Ed_670', 'Lu_412', 'Lu_443', 'Lu_490',
        'Lu_510', 'Lu_532', 'Lu_555', 'Lu_670'
    ]
    
    # Use standard names if count matches, otherwise use generic
    if num_cols <= len(standard_cols):
        df.columns = standard_cols[:num_cols]
    else:
        df.columns = [f'col_{i+1}' for i in range(num_cols)]

print(f"\n=== NOMAD Dataset Loaded ===")
print(f"Data shape: {df.shape}")
print(f"Columns ({len(df.columns)}): {list(df.columns)}")
print(f"\nFirst few rows:")
display(df.head())

=== NOMAD File Header Information ===
Delimiter: comma
Missing value indicator: -999
Found 217 column names in header
✓ Using column names from header

=== NOMAD Dataset Loaded ===
Data shape: (4459, 217)
Columns (217): ['year', 'month', 'day', 'hour', 'minute', 'second', 'lat', 'lon', 'id', 'oisst', 'etopo2', 'chl', 'chl_a', 'kd405', 'kd411', 'kd443', 'kd455', 'kd465', 'kd489', 'kd510', 'kd520', 'kd530', 'kd550', 'kd555', 'kd560', 'kd565', 'kd570', 'kd590', 'kd619', 'kd625', 'kd665', 'kd670', 'kd683', 'lw405', 'lw411', 'lw443', 'lw455', 'lw465', 'lw489', 'lw510', 'lw520', 'lw530', 'lw550', 'lw555', 'lw560', 'lw565', 'lw570', 'lw590', 'lw619', 'lw625', 'lw665', 'lw670', 'lw683', 'es405', 'es411', 'es443', 'es455', 'es465', 'es489', 'es510', 'es520', 'es530', 'es550', 'es555', 'es560', 'es565', 'es570', 'es590', 'es619', 'es625', 'es665', 'es670', 'es683', 'ap405', 'ap411', 'ap443', 'ap455', 'ap465', 'ap489', 'ap510', 'ap520', 'ap530', 'ap550', 'ap555', 'ap560', 'ap565', 'ap570', 'ap590

Unnamed: 0,year,month,day,hour,minute,second,lat,lon,id,oisst,...,allo,diato,lut,zea,chl_b,beta-car,alpha-car,alpha-beta-car,flag,cruise
0,2003,4,15,15,15,0,38.4279,-76.61,1565,3.7,...,,,,,,,,,20691,ace0301
1,2003,4,15,16,50,0,38.368,-76.5,1566,3.7,...,,,,,,,,,20675,ace0301
2,2003,4,15,17,50,0,38.3074,-76.44,1567,3.7,...,,,,,,,,,20691,ace0301
3,2003,4,17,18,15,0,38.6367,-76.32,1568,3.7,...,,,,,,,,,20675,ace0301
4,2003,7,21,18,27,0,38.3047,-76.44,1559,22.03,...,,,,,,,,,20691,ace0302


In [14]:
# Clean the dataset: Remove NaN columns and keep only interesting columns

print("=== Cleaning NOMAD Dataset ===")
print(f"Original shape: {df.shape}")

# Step 1: Remove columns that are entirely NaN
cols_before = len(df.columns)
df = df.dropna(axis=1, how='all')
cols_removed = cols_before - len(df.columns)
print(f"Removed {cols_removed} columns that were entirely NaN")
print(f"Shape after removing NaN columns: {df.shape}")

# Step 2: Define interesting columns to keep
# Keep date/time, location, and key oceanographic/biogeochemical parameters
interesting_cols = []

# Date and time columns
date_time_cols = ['year', 'month', 'day', 'hour', 'minute', 'second']
interesting_cols.extend([col for col in date_time_cols if col in df.columns])

# Location columns
location_cols = ['lat', 'lon', 'id']
interesting_cols.extend([col for col in location_cols if col in df.columns])

# Key oceanographic parameters
key_params = [
    'chl', 'chl_a', 'wt', 'sal', 'poc', 'kpar',  # Basic oceanographic
    'z_37', 'z_10', 'z_01',  # Depth parameters
    'oisst', 'etopo2',  # Environmental
    'chlide_a', 'mv_chl_a', 'dv_chl_a', 'chl_c3', 'chl_c2',  # Chlorophyll variants
]

# Add key spectral bands (select a few representative wavelengths)
# Common ocean color wavelengths: 412, 443, 490, 510, 555, 670 nm
key_wavelengths = ['443', '490', '510', '555', '670']
spectral_prefixes = ['a', 'bb', 'Rrs', 'Kd']  # Absorption, backscatter, remote sensing reflectance, diffuse attenuation

for prefix in spectral_prefixes:
    for wl in key_wavelengths:
        col_name = f'{prefix}{wl}'
        if col_name in df.columns:
            interesting_cols.append(col_name)

# Also keep some specific spectral measurements if they exist
additional_spectral = ['bbr442', 'bbr488', 'bbr510', 'bbr550', 'bbr555', 'bbr671']
interesting_cols.extend([col for col in additional_spectral if col in df.columns])

# Remove duplicates while preserving order
interesting_cols = list(dict.fromkeys(interesting_cols))

# Filter to keep only interesting columns
cols_to_keep = [col for col in interesting_cols if col in df.columns]
df_clean = df[cols_to_keep].copy()

print(f"\nKept {len(cols_to_keep)} interesting columns out of {len(df.columns)} available")
print(f"Final shape: {df_clean.shape}")
print(f"\nColumns kept: {cols_to_keep}")

# Display the cleaned dataset
print(f"\n=== Cleaned NOMAD Dataset ===")
display(df_clean.head(20))

# Update df to the cleaned version
df = df_clean

=== Cleaning NOMAD Dataset ===
Original shape: (4459, 217)
Removed 1 columns that were entirely NaN
Shape after removing NaN columns: (4459, 216)

Kept 23 interesting columns out of 216 available
Final shape: (4459, 23)

Columns kept: ['year', 'month', 'day', 'hour', 'minute', 'second', 'lat', 'lon', 'id', 'a443', 'a510', 'a555', 'a670', 'bb443', 'bb510', 'bb555', 'bb670', 'bbr442', 'bbr488', 'bbr510', 'bbr550', 'bbr555', 'bbr671']

=== Cleaned NOMAD Dataset ===


Unnamed: 0,year,month,day,hour,minute,second,lat,lon,id,a443,...,bb443,bb510,bb555,bb670,bbr442,bbr488,bbr510,bbr550,bbr555,bbr671
0,2003,4,15,15,15,0,38.4279,-76.61,1565,2.15391,...,,,,,,,,,,
1,2003,4,15,16,50,0,38.368,-76.5,1566,,...,,,,,,,,,,
2,2003,4,15,17,50,0,38.3074,-76.44,1567,1.27598,...,,,,,,,,,,
3,2003,4,17,18,15,0,38.6367,-76.32,1568,,...,,,,,,,,,,
4,2003,7,21,18,27,0,38.3047,-76.44,1559,2.12612,...,,,,,,,,,,
5,2003,7,23,15,58,0,38.636,-76.16,1562,2.24834,...,,,,,,,,,,
6,2003,7,23,17,33,0,38.6417,-76.32,1563,1.657,...,,,,,,,,,,
7,2003,10,13,13,30,0,38.4328,-76.6163,1569,2.60524,...,,,,,,,,,,
8,2003,10,13,15,2,0,38.3745,-76.5052,1570,1.78237,...,,,,,,,,,,
9,2003,10,16,16,19,0,38.5742,-76.027,1574,2.01242,...,,,,,,,,,,


In [15]:
# ====== Visualize Cleaned NOMAD Dataset as a Table ======
print("=== NOMAD: NASA bio-Optical Marine Algorithm Data set ===")
print("Version 2.0 ALPHA, created on 18 July 2008")
print("=" * 80)

# Display dataset information
print(f"\nDataset Summary:")
print(f"  Total records: {len(df):,}")
print(f"  Total columns: {len(df.columns)}")
print(f"  Missing values indicator: -999")

# Show column names and data types
print(f"\n=== Column Information ===")
column_info = pd.DataFrame({
    'Column Name': df.columns,
    'Data Type': df.dtypes.astype(str),
    'Non-Null Count': df.count(),
    'Null Count': df.isnull().sum(),
    'Null Percentage': (df.isnull().sum() / len(df) * 100).round(2)
})
display(column_info.style.format({'Null Percentage': '{:.2f}%'}))

# Display a sample of the data as a formatted table
print(f"\n=== Sample Data (First 30 rows) ===")
display(df.head(30).style.format(precision=4))

# Show basic statistics for numeric columns
print(f"\n=== Basic Statistics for Numeric Columns ===")
numeric_cols = df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 0:
    display(df[numeric_cols].describe().style.format(precision=4))
else:
    print("No numeric columns found for statistics")

# Create a date column if year, month, day exist
if all(col in df.columns for col in ['year', 'month', 'day']):
    try:
        df['date'] = pd.to_datetime(df[['year', 'month', 'day']], errors='coerce')
        print(f"\n=== Date Range ===")
        print(f"From: {df['date'].min()}")
        print(f"To: {df['date'].max()}")
        print(f"Total unique dates: {df['date'].nunique()}")
    except:
        pass

print(f"\n=== Full Cleaned Dataset Table (showing first 100 rows) ===")
display(df.head(100).style.format(precision=4))


=== NOMAD: NASA bio-Optical Marine Algorithm Data set ===
Version 2.0 ALPHA, created on 18 July 2008

Dataset Summary:
  Total records: 4,459
  Total columns: 23
  Missing values indicator: -999

=== Column Information ===


Unnamed: 0,Column Name,Data Type,Non-Null Count,Null Count,Null Percentage
year,year,int64,4459,0,0.00%
month,month,int64,4459,0,0.00%
day,day,int64,4459,0,0.00%
hour,hour,int64,4459,0,0.00%
minute,minute,int64,4459,0,0.00%
second,second,int64,4459,0,0.00%
lat,lat,float64,4459,0,0.00%
lon,lon,float64,4459,0,0.00%
id,id,int64,4459,0,0.00%
a443,a443,float64,1138,3321,74.48%



=== Sample Data (First 30 rows) ===


Unnamed: 0,year,month,day,hour,minute,second,lat,lon,id,a443,a510,a555,a670,bb443,bb510,bb555,bb670,bbr442,bbr488,bbr510,bbr550,bbr555,bbr671
0,2003,4,15,15,15,0,38.4279,-76.61,1565,2.1539,0.9276,0.4256,1.0898,,,,,,,,,,
1,2003,4,15,16,50,0,38.368,-76.5,1566,,,,,,,,,,,,,,
2,2003,4,15,17,50,0,38.3074,-76.44,1567,1.276,0.5603,0.3043,0.8621,,,,,,,,,,
3,2003,4,17,18,15,0,38.6367,-76.32,1568,,,,,,,,,,,,,,
4,2003,7,21,18,27,0,38.3047,-76.44,1559,2.1261,0.9713,0.3476,0.9202,,,,,,,,,,
5,2003,7,23,15,58,0,38.636,-76.16,1562,2.2483,0.9333,0.3867,0.8852,,,,,,,,,,
6,2003,7,23,17,33,0,38.6417,-76.32,1563,1.657,0.7738,0.3395,0.8562,,,,,,,,,,
7,2003,10,13,13,30,0,38.4328,-76.6163,1569,2.6052,1.2909,0.6971,1.3054,,,,,,,,,,
8,2003,10,13,15,2,0,38.3745,-76.5052,1570,1.7824,0.8223,0.4386,0.9393,,,,,,,,,,
9,2003,10,16,16,19,0,38.5742,-76.027,1574,2.0124,0.8268,0.4518,0.6675,,,,,,,,,,



=== Basic Statistics for Numeric Columns ===


Unnamed: 0,year,month,day,hour,minute,second,lat,lon,id,a443,a510,a555,a670,bb443,bb510,bb555,bb670,bbr442,bbr488,bbr510,bbr550,bbr555,bbr671
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,1138.0,1136.0,1122.0,1086.0,369.0,369.0,369.0,369.0,247.0,55.0,139.0,33.0,13.0,247.0
mean,1999.2824,5.8388,15.2485,14.9013,27.7589,0.0,1.8687,-61.5921,4377.3813,0.2897,0.1497,0.117,0.5039,0.0056,0.004,0.0034,0.0025,0.005,0.0031,0.0043,0.0022,0.0029,0.0022
std,3.7691,3.5502,9.0218,4.4719,17.192,0.0,44.7651,53.895,2298.2721,0.4174,0.1817,0.0908,0.1114,0.0019,0.0017,0.0016,0.0013,0.0018,0.0008,0.0019,0.0008,0.0014,0.0014
min,1991.0,1.0,1.0,0.0,0.0,0.0,-77.0356,-179.955,6.0,0.0119,0.0342,0.0603,0.441,0.0027,0.0016,0.0012,0.0007,0.0025,0.0022,0.0017,0.0015,0.0018,0.0008
25%,1997.0,2.0,7.0,13.0,13.0,0.0,-61.299,-82.7,2028.5,0.0479,0.0487,0.0669,0.4467,0.0042,0.0028,0.0022,0.0015,0.0037,0.0025,0.0028,0.0017,0.0021,0.0012
50%,1999.0,6.0,15.0,15.0,29.0,0.0,27.093,-67.675,5039.0,0.0992,0.0697,0.077,0.4583,0.005,0.0036,0.003,0.0022,0.0045,0.0027,0.0036,0.0019,0.0026,0.0018
75%,2002.0,9.0,23.0,18.0,42.0,0.0,34.4585,-63.9615,6271.5,0.2953,0.1492,0.1211,0.4929,0.0066,0.0049,0.0042,0.0032,0.0058,0.0035,0.0055,0.0021,0.0029,0.0025
max,2007.0,12.0,31.0,23.0,59.0,0.0,79.69,179.907,7831.0,2.6052,1.3521,0.7429,1.3054,0.0118,0.0096,0.0088,0.0075,0.0108,0.0053,0.01,0.0049,0.0075,0.008



=== Date Range ===
From: 1991-12-03 00:00:00
To: 2007-09-06 00:00:00
Total unique dates: 1820

=== Full Cleaned Dataset Table (showing first 100 rows) ===


Unnamed: 0,year,month,day,hour,minute,second,lat,lon,id,a443,a510,a555,a670,bb443,bb510,bb555,bb670,bbr442,bbr488,bbr510,bbr550,bbr555,bbr671,date
0,2003,4,15,15,15,0,38.4279,-76.61,1565,2.1539,0.9276,0.4256,1.0898,,,,,,,,,,,2003-04-15 00:00:00
1,2003,4,15,16,50,0,38.368,-76.5,1566,,,,,,,,,,,,,,,2003-04-15 00:00:00
2,2003,4,15,17,50,0,38.3074,-76.44,1567,1.276,0.5603,0.3043,0.8621,,,,,,,,,,,2003-04-15 00:00:00
3,2003,4,17,18,15,0,38.6367,-76.32,1568,,,,,,,,,,,,,,,2003-04-17 00:00:00
4,2003,7,21,18,27,0,38.3047,-76.44,1559,2.1261,0.9713,0.3476,0.9202,,,,,,,,,,,2003-07-21 00:00:00
5,2003,7,23,15,58,0,38.636,-76.16,1562,2.2483,0.9333,0.3867,0.8852,,,,,,,,,,,2003-07-23 00:00:00
6,2003,7,23,17,33,0,38.6417,-76.32,1563,1.657,0.7738,0.3395,0.8562,,,,,,,,,,,2003-07-23 00:00:00
7,2003,10,13,13,30,0,38.4328,-76.6163,1569,2.6052,1.2909,0.6971,1.3054,,,,,,,,,,,2003-10-13 00:00:00
8,2003,10,13,15,2,0,38.3745,-76.5052,1570,1.7824,0.8223,0.4386,0.9393,,,,,,,,,,,2003-10-13 00:00:00
9,2003,10,16,16,19,0,38.5742,-76.027,1574,2.0124,0.8268,0.4518,0.6675,,,,,,,,,,,2003-10-16 00:00:00


In [18]:
# ====== Yield Prediction Model (Chlorophyll-a Prediction) ======
# Predict chl_a (chlorophyll-a) as a proxy for ocean productivity/yield
# Based on the same columns from the cleaned dataset

# Install scikit-learn if not available
try:
    import sklearn
    print("✓ scikit-learn is already installed")
except ImportError:
    print("Installing scikit-learn...")
    import sys
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "scikit-learn"])
    print("✓ scikit-learn installed successfully!")
    import sklearn  # Import after installation

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import warnings
warnings.filterwarnings('ignore')

print("=== Yield Prediction Model ===")
print("Predicting chlorophyll-a (chl_a) as ocean productivity indicator\n")

# Check if chl_a exists in the dataset
if 'chl_a' not in df.columns:
    print("⚠ Warning: 'chl_a' column not found. Using 'chl' as target instead.")
    target_col = 'chl' if 'chl' in df.columns else None
else:
    target_col = 'chl_a'

if target_col is None:
    print("❌ Error: No suitable target column found for prediction.")
    print("Available columns:", list(df.columns))
else:
    # Prepare features - exclude target and non-numeric columns
    exclude_cols = [target_col, 'year', 'month', 'day', 'hour', 'minute', 'second', 'id', 'date']
    feature_cols = [col for col in df.columns if col not in exclude_cols and df[col].dtype in ['int64', 'float64']]
    
    # Remove features with too many missing values (>50%)
    feature_cols = [col for col in feature_cols if df[col].notna().sum() / len(df) > 0.5]
    
    print(f"Target variable: {target_col}")
    print(f"Number of features: {len(feature_cols)}")
    print(f"Features used: {feature_cols[:10]}..." if len(feature_cols) > 10 else f"Features used: {feature_cols}")
    
    # Prepare data
    X = df[feature_cols].copy()
    y = df[target_col].copy()
    
    # Remove rows where target is missing
    valid_mask = y.notna()
    X = X[valid_mask]
    y = y[valid_mask]
    
    # Fill remaining NaN values in features with median
    X = X.fillna(X.median())
    
    print(f"\nData after cleaning:")
    print(f"  Samples: {len(X)}")
    print(f"  Features: {len(feature_cols)}")
    print(f"  Target range: {y.min():.4f} to {y.max():.4f}")
    print(f"  Target mean: {y.mean():.4f}")
    
    if len(X) > 10:
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Train Random Forest model
        print("\n=== Training Random Forest Model ===")
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        rf_model.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = rf_model.predict(X_train)
        y_test_pred = rf_model.predict(X_test)
        
        # Evaluate
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        
        print(f"\nModel Performance:")
        print(f"  Training R²: {train_r2:.4f}")
        print(f"  Test R²: {test_r2:.4f}")
        print(f"  Training RMSE: {train_rmse:.4f}")
        print(f"  Test RMSE: {test_rmse:.4f}")
        print(f"  Training MAE: {train_mae:.4f}")
        print(f"  Test MAE: {test_mae:.4f}")
        
        # Feature importance
        feature_importance = pd.DataFrame({
            'Feature': feature_cols,
            'Importance': rf_model.feature_importances_
        }).sort_values('Importance', ascending=False)
        
        print(f"\n=== Top 10 Most Important Features ===")
        display(feature_importance.head(10))
        
        # Create predictions dataframe
        predictions_df = pd.DataFrame({
            'Actual': y_test.values,
            'Predicted': y_test_pred,
            'Error': y_test.values - y_test_pred,
            'Abs_Error': np.abs(y_test.values - y_test_pred)
        })
        
        print(f"\n=== Sample Predictions (First 20) ===")
        display(predictions_df.head(20).style.format(precision=4))
        
        # Store model and results
        print(f"\n✓ Model trained successfully!")
        print(f"  Model type: Random Forest Regressor")
        print(f"  Can predict {target_col} based on: {', '.join(feature_cols[:5])}...")
    else:
        print("❌ Error: Not enough data for training (need >10 samples)")


Installing scikit-learn...
✓ scikit-learn installed successfully!
=== Yield Prediction Model ===
Predicting chlorophyll-a (chl_a) as ocean productivity indicator

❌ Error: No suitable target column found for prediction.
Available columns: ['year', 'month', 'day', 'hour', 'minute', 'second', 'lat', 'lon', 'id', 'a443', 'a510', 'a555', 'a670', 'bb443', 'bb510', 'bb555', 'bb670', 'bbr442', 'bbr488', 'bbr510', 'bbr550', 'bbr555', 'bbr671', 'date']


In [19]:
# ====== Global Marine Phytoplankton Production Dataset ======
# Download and display the dataset from PANGAEA
# Source: https://download.pangaea.de/dataset/932417/files/Global_marine_phytoplankton_production_dataset.txt

import pandas as pd
import numpy as np

url = "https://download.pangaea.de/dataset/932417/files/Global_marine_phytoplankton_production_dataset.txt"

print("=== Downloading Global Marine Phytoplankton Production Dataset ===")
print(f"Source: {url}\n")

# Download the dataset
# The dataset appears to be tab-separated based on the structure
try:
    # Try tab-separated first (most common for PANGAEA datasets)
    df_phytoplankton = pd.read_csv(url, sep='\t', low_memory=False)
    print("✓ Dataset loaded successfully (tab-separated)")
except:
    try:
        # If tab doesn't work, try comma-separated
        df_phytoplankton = pd.read_csv(url, sep=',', low_memory=False)
        print("✓ Dataset loaded successfully (comma-separated)")
    except:
        # Try space-separated as last resort
        df_phytoplankton = pd.read_csv(url, delim_whitespace=True, low_memory=False)
        print("✓ Dataset loaded successfully (space-separated)")

print(f"\nDataset shape: {df_phytoplankton.shape}")
print(f"Columns: {len(df_phytoplankton.columns)}")
print(f"\nColumn names:")
print(list(df_phytoplankton.columns))

# Display basic information
print(f"\n=== Dataset Summary ===")
print(f"Total records: {len(df_phytoplankton):,}")
print(f"Total columns: {len(df_phytoplankton.columns)}")
print(f"\nFirst few rows:")
display(df_phytoplankton.head(20))

# Show data types and null counts
print(f"\n=== Column Information ===")
column_info = pd.DataFrame({
    'Column Name': df_phytoplankton.columns,
    'Data Type': df_phytoplankton.dtypes.astype(str),
    'Non-Null Count': df_phytoplankton.count(),
    'Null Count': df_phytoplankton.isnull().sum(),
    'Null Percentage': (df_phytoplankton.isnull().sum() / len(df_phytoplankton) * 100).round(2)
})
display(column_info.head(30))  # Show first 30 columns

# Show basic statistics for numeric columns
print(f"\n=== Basic Statistics for Numeric Columns ===")
numeric_cols = df_phytoplankton.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 0:
    print(f"Found {len(numeric_cols)} numeric columns")
    display(df_phytoplankton[numeric_cols].describe().style.format(precision=4))

# Display a sample of the full dataset
print(f"\n=== Sample Data (First 50 rows) ===")
display(df_phytoplankton.head(50).style.format(precision=4))

print(f"\n=== Full Dataset (showing first 100 rows) ===")
display(df_phytoplankton.head(100))


=== Downloading Global Marine Phytoplankton Production Dataset ===
Source: https://download.pangaea.de/dataset/932417/files/Global_marine_phytoplankton_production_dataset.txt

✓ Dataset loaded successfully (tab-separated)

Dataset shape: (37722, 49)
Columns: 49

Column names:
['Count', 'Event', 'Short reference', 'Paper doi', 'Data doi/data link', 'Profile number', 'Date', 'Year', 'Month', 'Day of the year', 'Latitude', 'Longitude', 'Day length (h)', 'Bottom depth (m)', 'Bottom depth sd (m)', 'Mixed Layer Depth (m)', 'Distance from coastline (Km)', 'Euphotic zone depth (m)', 'Sampling depth (m)', 'Max sampling depth (m)', 'Max production depth (m)', 'SST (°C)', 'SST_flag', 'surface PAR (E m^-2 day^-1)', 'PAR_flag', 'Pbopt (mg C mg Chla-1 h-1)', 'Depth-resolved chl a (mg m^-3)', 'Depth-integrated chl a (mg m^-2)', 'Total Chl a (mg m^-2)', 'Depth-resolved primary production (mg C m^-3 day^-1)', 'Depth-integrated primary production (mg C m^-2 day^-1)', 'Production to biomass ratio (mg C d

Unnamed: 0,Count,Event,Short reference,Paper doi,Data doi/data link,Profile number,Date,Year,Month,Day of the year,...,Max sampling depth magnitude,Max production depth magnitude,SST magnitude,surface PAR magnitude,Pbopt magnitude,Surface chl a magnitude,Depth-integrated chl a magnitude,Total chl a magnitude,Depth-integrated primary production magnitude,Production to biomass ratio magnitude
0,1,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
1,2,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
2,3,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
3,4,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
4,5,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
5,6,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
6,7,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,2,17/03/1979,1979,3,76,...,very_low,very_low,moderate,moderate,high,low,low,moderate,low,very_low
7,8,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,2,17/03/1979,1979,3,76,...,very_low,very_low,moderate,moderate,high,low,low,moderate,low,very_low
8,9,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,2,17/03/1979,1979,3,76,...,very_low,very_low,moderate,moderate,high,low,low,moderate,low,very_low
9,10,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,2,17/03/1979,1979,3,76,...,very_low,very_low,moderate,moderate,high,low,low,moderate,low,very_low



=== Column Information ===


Unnamed: 0,Column Name,Data Type,Non-Null Count,Null Count,Null Percentage
Count,Count,int64,37722,0,0.0
Event,Event,object,36872,850,2.25
Short reference,Short reference,object,37722,0,0.0
Paper doi,Paper doi,object,17484,20238,53.65
Data doi/data link,Data doi/data link,object,37722,0,0.0
Profile number,Profile number,int64,37722,0,0.0
Date,Date,object,37722,0,0.0
Year,Year,int64,37722,0,0.0
Month,Month,int64,37722,0,0.0
Day of the year,Day of the year,int64,37722,0,0.0



=== Basic Statistics for Numeric Columns ===
Found 27 numeric columns


Unnamed: 0,Count,Profile number,Year,Month,Day of the year,Latitude,Longitude,Day length (h),Bottom depth (m),Bottom depth sd (m),Mixed Layer Depth (m),Distance from coastline (Km),Euphotic zone depth (m),Sampling depth (m),Max sampling depth (m),Max production depth (m),SST (°C),SST_flag,surface PAR (E m^-2 day^-1),PAR_flag,Pbopt (mg C mg Chla-1 h-1),Depth-resolved chl a (mg m^-3),Depth-integrated chl a (mg m^-2),Total Chl a (mg m^-2),Depth-resolved primary production (mg C m^-3 day^-1),Depth-integrated primary production (mg C m^-2 day^-1),Production to biomass ratio (mg C day-1 / mg Chl a)
count,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0
mean,18861.5,2976.9074,1990.4112,5.8772,162.931,29.5984,-65.8258,12.5998,1758.831,19.5527,37.9041,178.22,53.4116,28.9839,60.2525,11.4191,16.7979,0.6964,32.5509,0.5999,5.299,1.2523,41.7261,35.1436,32.335,865.7561,28.0695
std,10889.5478,1738.6742,13.1693,3.1722,96.4534,21.6631,75.1616,2.4265,1794.6187,31.6514,46.0709,249.0367,24.1734,35.583,39.891,13.7118,6.0477,0.4598,15.4984,0.4899,1.0475,3.0035,73.0256,29.9512,96.9794,1440.2529,58.7901
min,1.0,1.0,1958.0,1.0,1.0,-77.74,-188.5,7.9336,2.0,0.0,1.0,0.4017,6.6022,0.0,1.0,0.0,-1.74,0.0,1.38,0.0,1.13,0.0,0.075,5.3676,0.0,3.3,0.1464
25%,9431.25,1431.0,1980.0,3.0,86.0,21.0625,-120.9217,11.2641,77.0,2.114,18.0,22.1945,33.7132,5.0,29.5,2.0,13.86,0.0,14.51,0.0,5.15,0.16,17.3162,16.4774,1.6,215.6,9.0162
50%,18861.5,2990.0,1988.0,5.0,148.0,33.3,-73.5,12.1034,1146.0,7.7571,29.0,97.0917,49.9746,16.35,49.0,7.0,15.0,1.0,31.75,1.0,5.27,0.44,25.915,26.0146,6.2,469.595,17.7586
75%,28291.75,4404.75,2002.0,8.0,242.0,40.8,-17.5,13.482,3828.0,21.3252,46.0,207.535,70.2591,40.0,88.0,15.0,19.92,1.0,46.82,1.0,5.97,1.2,43.875,44.0931,26.265,963.15,33.8149
max,37722.0,6084.0,2017.0,12.0,364.0,80.77,172.8013,24.0,5892.0,436.8927,524.0,2078.13,122.2372,800.0,250.0,107.0,30.1,1.0,63.77,1.0,6.63,119.08,2337.0,392.2643,3080.0,30189.755,3402.6668



=== Sample Data (First 50 rows) ===


Unnamed: 0,Count,Event,Short reference,Paper doi,Data doi/data link,Profile number,Date,Year,Month,Day of the year,Latitude,Longitude,Day length (h),Bottom depth (m),Bottom depth sd (m),Mixed Layer Depth (m),Distance from coastline (Km),Euphotic zone depth (m),Sampling depth (m),Max sampling depth (m),Max production depth (m),SST (°C),SST_flag,surface PAR (E m^-2 day^-1),PAR_flag,Pbopt (mg C mg Chla-1 h-1),Depth-resolved chl a (mg m^-3),Depth-integrated chl a (mg m^-2),Total Chl a (mg m^-2),Depth-resolved primary production (mg C m^-3 day^-1),Depth-integrated primary production (mg C m^-2 day^-1),Production to biomass ratio (mg C day-1 / mg Chl a),hemisphere,Northern hemisphere season,Bottom depth magnitude,Bottom depth SD magnitude,MLD magnitude,Distance from coastline magnitude,Euphotic zone depth magnitude,Max sampling depth magnitude,Max production depth magnitude,SST magnitude,surface PAR magnitude,Pbopt magnitude,Surface chl a magnitude,Depth-integrated chl a magnitude,Total chl a magnitude,Depth-integrated primary production magnitude,Production to biomass ratio magnitude
0,1,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,1,16/03/1979,1979,3,75,38.78,-72.27,11.7517,2570,9.9827,35,191.214,24.7392,0.0,23.03,2.55,7.0,0,30.86,0,3.23,2.72,69.8231,66.7656,126.0,1775.3003,25.4257,northern,winter,high,very_low,low,low,very_low,very_low,very_low,low,moderate,low,low,low,moderate,low,low
1,2,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,1,16/03/1979,1979,3,75,38.78,-72.27,11.7517,2570,9.9827,35,191.214,24.7392,2.55,23.03,2.55,7.0,0,30.86,0,3.23,2.78,69.8231,66.7656,165.24,1775.3003,25.4257,northern,winter,high,very_low,low,low,very_low,very_low,very_low,low,moderate,low,low,low,moderate,low,low
2,3,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,1,16/03/1979,1979,3,75,38.78,-72.27,11.7517,2570,9.9827,35,191.214,24.7392,4.58,23.03,2.55,7.0,0,30.86,0,3.23,3.18,69.8231,66.7656,152.6,1775.3003,25.4257,northern,winter,high,very_low,low,low,very_low,very_low,very_low,low,moderate,low,low,low,moderate,low,low
3,4,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,1,16/03/1979,1979,3,75,38.78,-72.27,11.7517,2570,9.9827,35,191.214,24.7392,9.16,23.03,2.55,7.0,0,30.86,0,3.23,3.26,69.8231,66.7656,94.68,1775.3003,25.4257,northern,winter,high,very_low,low,low,very_low,very_low,very_low,low,moderate,low,low,low,moderate,low,low
4,5,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,1,16/03/1979,1979,3,75,38.78,-72.27,11.7517,2570,9.9827,35,191.214,24.7392,14.98,23.03,2.55,7.0,0,30.86,0,3.23,3.1,69.8231,66.7656,30.5,1775.3003,25.4257,northern,winter,high,very_low,low,low,very_low,very_low,very_low,low,moderate,low,low,low,moderate,low,low
5,6,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,1,16/03/1979,1979,3,75,38.78,-72.27,11.7517,2570,9.9827,35,191.214,24.7392,23.03,23.03,2.55,7.0,0,30.86,0,3.23,2.74,69.8231,66.7656,6.97,1775.3003,25.4257,northern,winter,high,very_low,low,low,very_low,very_low,very_low,low,moderate,low,low,low,moderate,low,low
6,7,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,2,17/03/1979,1979,3,76,39.5,-73.43,11.7894,35,3.1269,29,65.0812,24.9134,0.0,21.0,2.33,13.37,0,31.89,0,5.09,2.67,62.138,66.1405,81.95,1015.6725,16.3454,northern,winter,very_low,very_low,low,very_low,very_low,very_low,very_low,moderate,moderate,high,low,low,moderate,low,very_low
7,8,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,2,17/03/1979,1979,3,76,39.5,-73.43,11.7894,35,3.1269,29,65.0812,24.9134,2.33,21.0,2.33,13.37,0,31.89,0,5.09,2.83,62.138,66.1405,94.29,1015.6725,16.3454,northern,winter,very_low,very_low,low,very_low,very_low,very_low,very_low,moderate,moderate,high,low,low,moderate,low,very_low
8,9,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,2,17/03/1979,1979,3,76,39.5,-73.43,11.7894,35,3.1269,29,65.0812,24.9134,4.18,21.0,2.33,13.37,0,31.89,0,5.09,3.32,62.138,66.1405,83.61,1015.6725,16.3454,northern,winter,very_low,very_low,low,very_low,very_low,very_low,very_low,moderate,moderate,high,low,low,moderate,low,very_low
9,10,Albatros,Behrenfeld and Falkowski 1997,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.productivity/data/field/c14/oppwgnew.txt,2,17/03/1979,1979,3,76,39.5,-73.43,11.7894,35,3.1269,29,65.0812,24.9134,8.36,21.0,2.33,13.37,0,31.89,0,5.09,2.95,62.138,66.1405,65.47,1015.6725,16.3454,northern,winter,very_low,very_low,low,very_low,very_low,very_low,very_low,moderate,moderate,high,low,low,moderate,low,very_low



=== Full Dataset (showing first 100 rows) ===


Unnamed: 0,Count,Event,Short reference,Paper doi,Data doi/data link,Profile number,Date,Year,Month,Day of the year,...,Max sampling depth magnitude,Max production depth magnitude,SST magnitude,surface PAR magnitude,Pbopt magnitude,Surface chl a magnitude,Depth-integrated chl a magnitude,Total chl a magnitude,Depth-integrated primary production magnitude,Production to biomass ratio magnitude
0,1,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
1,2,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
2,3,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
3,4,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
4,5,Albatros,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,1,16/03/1979,1979,3,75,...,very_low,very_low,low,moderate,low,low,low,moderate,low,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,Argus,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,18,03/04/1978,1978,4,93,...,very_low,very_low,low,high,low,very_low,high,moderate,low,very_low
96,97,Argus,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,18,03/04/1978,1978,4,93,...,very_low,very_low,low,high,low,very_low,high,moderate,low,very_low
97,98,Argus,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,19,04/04/1978,1978,4,94,...,very_low,high,low,high,low,very_low,moderate,moderate,low,very_low
98,99,Argus,Behrenfeld and Falkowski 1997\r\n,https://doi.org/10.4319/lo.1997.42.1.0001,http://sites.science.oregonstate.edu/ocean.pro...,19,04/04/1978,1978,4,94,...,very_low,high,low,high,low,very_low,moderate,moderate,low,very_low


In [None]:
# ====== Display Most Recent Rows - Key Algae Production Columns ======
# Filter to show only the most interesting columns affecting phytoplankton/algae production
# Sort by most recent dates first

print("=== Most Recent Data - Key Algae Production Factors ===\n")

# Identify key columns for algae production
production_key_cols = []

# Date/Time columns (for sorting and context)
date_cols = ['Date', 'Year', 'Month', 'Day of the year']
for col in date_cols:
    if col in df_phytoplankton.columns:
        production_key_cols.append(col)

# Location
location_cols = ['Latitude', 'Longitude']
for col in location_cols:
    if col in df_phytoplankton.columns:
        production_key_cols.append(col)

# Environmental factors critical for algae growth
env_cols = [
    'SST (°C)',  # Sea Surface Temperature
    'surface PAR (E m^-2 day^-1)',  # Photosynthetically Active Radiation
    'Day length (h)',  # Daylight hours
    'Mixed Layer Depth (m)',  # Affects nutrient availability
    'Euphotic zone depth (m)',  # Light penetration depth
    'Sampling depth (m)',  # Where measurement was taken
]

# Chlorophyll a measurements (biomass indicator)
chl_cols = [
    'Depth-resolved chl a (mg m^-3)',
    'Depth-integrated chl a (mg m^-2)',
    'Total Chl a (mg m^-2)',
]

# Primary production measurements (actual productivity)
production_cols = [
    'Pbopt (mg C mg Chla-1 h-1)',  # Maximum production rate
    'Depth-resolved primary production (mg C m^-3 day^-1)',
    'Depth-integrated primary production (mg C m^-2 day^-1)',
    'Production to biomass ratio (mg C day-1 / mg Chl a)',
]

# Additional important factors
other_key_cols = [
    'Bottom depth (m)',
    'Distance from coastline (Km)',
    'Max sampling depth (m)',
    'Max production depth (m)',
]

# Combine all relevant columns
all_key_cols = date_cols + location_cols + env_cols + chl_cols + production_cols + other_key_cols

# Filter to only columns that exist in the dataset
production_key_cols = [col for col in all_key_cols if col in df_phytoplankton.columns]

# Remove duplicates while preserving order
production_key_cols = list(dict.fromkeys(production_key_cols))

print(f"Selected {len(production_key_cols)} key columns for algae production analysis")
print(f"Columns: {production_key_cols}\n")

# Create filtered dataframe with only key columns
df_production = df_phytoplankton[production_key_cols].copy()

# Sort by Year (most recent first), then Month, then Day of the year
sort_cols = []
if 'Year' in df_production.columns:
    sort_cols.append('Year')
if 'Month' in df_production.columns:
    sort_cols.append('Month')
if 'Day of the year' in df_production.columns:
    sort_cols.append('Day of the year')

if sort_cols:
    # Sort descending by Year (most recent first), then by Month and Day
    df_production = df_production.sort_values(
        by=sort_cols, 
        ascending=[False] * len(sort_cols),  # All descending (most recent first)
        na_position='last'
    )
    print(f"Sorted by: {sort_cols} (most recent year first)")
else:
    print("⚠ Warning: No date columns found for sorting. Showing data as-is.")

# Remove rows where all key production values are missing
# Keep rows that have at least one production-related value
production_value_cols = [col for col in production_key_cols if any(term in col.lower() for term in ['production', 'chl', 'par', 'sst'])]
if production_value_cols:
    df_production = df_production.dropna(subset=production_value_cols, how='all')

print(f"\nFiltered dataset shape: {df_production.shape}")

# Get the first 50 rows (which are the most recent after sorting descending)
# Since we sorted descending (most recent first), the newest data is at the top
most_recent_50 = df_production.head(50).copy()

# Show year range
if 'Year' in most_recent_50.columns:
    years = most_recent_50['Year'].dropna().unique()
    if len(years) > 0:
        print(f"Year range in most recent 50 rows: {int(years.min())} to {int(years.max())}")
        print(f"Most recent year: {int(years.max())}")

print(f"\n=== Last 50 Rows - Sorted by Year (Most Recent First) ===")
print("Showing the 50 most recent records, starting from the most recent year\n")
display(most_recent_50.style.format(precision=4))

# Show summary statistics for the key columns
print(f"\n=== Summary Statistics for Key Production Columns ===")
numeric_prod_cols = df_production.select_dtypes(include=[np.number]).columns
if len(numeric_prod_cols) > 0:
    display(df_production[numeric_prod_cols].describe().style.format(precision=4))


=== Most Recent Data - Key Algae Production Factors ===

Selected 23 key columns for algae production analysis
Columns: ['Date', 'Year', 'Month', 'Day of the year', 'Latitude', 'Longitude', 'SST (°C)', 'surface PAR (E m^-2 day^-1)', 'Day length (h)', 'Mixed Layer Depth (m)', 'Euphotic zone depth (m)', 'Sampling depth (m)', 'Depth-resolved chl a (mg m^-3)', 'Depth-integrated chl a (mg m^-2)', 'Total Chl a (mg m^-2)', 'Pbopt (mg C mg Chla-1 h-1)', 'Depth-resolved primary production (mg C m^-3 day^-1)', 'Depth-integrated primary production (mg C m^-2 day^-1)', 'Production to biomass ratio (mg C day-1 / mg Chl a)', 'Bottom depth (m)', 'Distance from coastline (Km)', 'Max sampling depth (m)', 'Max production depth (m)']

Sorted by: ['Date']

Filtered dataset shape: (37722, 23)
Showing most recent 100 rows

=== Most Recent Rows - Key Algae Production Data ===


Unnamed: 0,Date,Year,Month,Day of the year,Latitude,Longitude,SST (°C),surface PAR (E m^-2 day^-1),Day length (h),Mixed Layer Depth (m),Euphotic zone depth (m),Sampling depth (m),Depth-resolved chl a (mg m^-3),Depth-integrated chl a (mg m^-2),Total Chl a (mg m^-2),Pbopt (mg C mg Chla-1 h-1),Depth-resolved primary production (mg C m^-3 day^-1),Depth-integrated primary production (mg C m^-2 day^-1),Production to biomass ratio (mg C day-1 / mg Chl a),Bottom depth (m),Distance from coastline (Km),Max sampling depth (m),Max production depth (m)
34535,31/10/2013,2013,10,304,43.4217,-8.4367,13.86,14.51,10.1737,38,33.5031,10.0,1.15,32.175,44.4642,5.27,45.99,1046.175,32.5152,77,3.693,30.0,10.0
34536,31/10/2013,2013,10,304,43.4217,-8.4367,13.86,14.51,10.1737,38,33.5031,20.0,0.98,32.175,44.4642,5.27,34.29,1046.175,32.5152,77,3.693,30.0,10.0
34534,31/10/2013,2013,10,304,43.4217,-8.4367,13.86,14.51,10.1737,38,33.5031,5.0,1.19,32.175,44.4642,5.27,41.2,1046.175,32.5152,77,3.693,30.0,10.0
34533,31/10/2013,2013,10,304,43.4217,-8.4367,13.86,14.51,10.1737,38,33.5031,0.0,1.22,32.175,44.4642,5.27,42.42,1046.175,32.5152,77,3.693,30.0,10.0
34537,31/10/2013,2013,10,304,43.4217,-8.4367,13.86,14.51,10.1737,38,33.5031,30.0,0.95,32.175,44.4642,5.27,9.26,1046.175,32.5152,77,3.693,30.0,10.0
16340,31/10/2012,2012,10,305,34.225,-119.4133,17.0,29.46,10.6528,34,34.9617,26.0,2.12,53.725,41.9954,6.29,7.6,720.01,13.4018,33,11.0476,30.0,2.0
16339,31/10/2012,2012,10,305,34.225,-119.4133,17.0,29.46,10.6528,34,34.9617,13.0,2.71,53.725,41.9954,6.29,36.5,720.01,13.4018,33,11.0476,30.0,2.0
16338,31/10/2012,2012,10,305,34.225,-119.4133,17.0,29.46,10.6528,34,34.9617,8.0,1.07,53.725,41.9954,6.29,25.3,720.01,13.4018,33,11.0476,30.0,2.0
16341,31/10/2012,2012,10,305,34.225,-119.4133,17.0,29.46,10.6528,34,34.9617,30.0,0.43,53.725,41.9954,6.29,0.63,720.01,13.4018,33,11.0476,30.0,2.0
16336,31/10/2012,2012,10,305,34.225,-119.4133,17.0,29.46,10.6528,34,34.9617,2.0,1.09,53.725,41.9954,6.29,37.9,720.01,13.4018,33,11.0476,30.0,2.0



=== Summary Statistics for Key Production Columns ===


Unnamed: 0,Year,Month,Day of the year,Latitude,Longitude,SST (°C),surface PAR (E m^-2 day^-1),Day length (h),Mixed Layer Depth (m),Euphotic zone depth (m),Sampling depth (m),Depth-resolved chl a (mg m^-3),Depth-integrated chl a (mg m^-2),Total Chl a (mg m^-2),Pbopt (mg C mg Chla-1 h-1),Depth-resolved primary production (mg C m^-3 day^-1),Depth-integrated primary production (mg C m^-2 day^-1),Production to biomass ratio (mg C day-1 / mg Chl a),Bottom depth (m),Distance from coastline (Km),Max sampling depth (m),Max production depth (m)
count,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0,37722.0
mean,1990.4112,5.8772,162.931,29.5984,-65.8258,16.7979,32.5509,12.5998,37.9041,53.4116,28.9839,1.2523,41.7261,35.1436,5.299,32.335,865.7561,28.0695,1758.831,178.22,60.2525,11.4191
std,13.1693,3.1722,96.4534,21.6631,75.1616,6.0477,15.4984,2.4265,46.0709,24.1734,35.583,3.0035,73.0256,29.9512,1.0475,96.9794,1440.2529,58.7901,1794.6187,249.0367,39.891,13.7118
min,1958.0,1.0,1.0,-77.74,-188.5,-1.74,1.38,7.9336,1.0,6.6022,0.0,0.0,0.075,5.3676,1.13,0.0,3.3,0.1464,2.0,0.4017,1.0,0.0
25%,1980.0,3.0,86.0,21.0625,-120.9217,13.86,14.51,11.2641,18.0,33.7132,5.0,0.16,17.3162,16.4774,5.15,1.6,215.6,9.0162,77.0,22.1945,29.5,2.0
50%,1988.0,5.0,148.0,33.3,-73.5,15.0,31.75,12.1034,29.0,49.9746,16.35,0.44,25.915,26.0146,5.27,6.2,469.595,17.7586,1146.0,97.0917,49.0,7.0
75%,2002.0,8.0,242.0,40.8,-17.5,19.92,46.82,13.482,46.0,70.2591,40.0,1.2,43.875,44.0931,5.97,26.265,963.15,33.8149,3828.0,207.535,88.0,15.0
max,2017.0,12.0,364.0,80.77,172.8013,30.1,63.77,24.0,524.0,122.2372,800.0,119.08,2337.0,392.2643,6.63,3080.0,30189.755,3402.6668,5892.0,2078.13,250.0,107.0
