# Prediction of El Niño and La Niña Events

## Extract, Transform and Load

In [7]:
import os
import requests
from datetime import datetime, timedelta
import xarray as xr
import numpy as np
import pandas as pd

### Web Scrape and Download Historical Optimum Interpolation Sea Surface Temperature (OISST or SST) Data

In [None]:
# === USER SETTINGS ===
# Remote directory that the daily OISST v2.1 AVHRR files are fetched from.
base_url = "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr"
# Local folder that receives the downloaded .nc files.
download_dir = "D:/GitHub/Prediction of El-Nino Events/data/raw/oisst_data"

# === Create the local folder (and any missing parents) before downloading ===
os.makedirs(download_dir, exist_ok=True)

# === Helper function to download a single file ==
def download_nc_file(date, dest_dir=None, url_root=None, timeout=60):
    """Download one daily OISST NetCDF file unless it is already on disk.

    Parameters
    ----------
    date : datetime-like
        Day to fetch. Must provide ``.date()``, ``.strftime()``, ``.year``
        and ``.month`` (``datetime.datetime`` and ``pandas.Timestamp`` both do).
    dest_dir : str, optional
        Folder to store the file in. Defaults to the notebook-level
        ``download_dir`` setting.
    url_root : str, optional
        Base URL of the remote archive. Defaults to the notebook-level
        ``base_url`` setting.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole download loop.

    Returns
    -------
    str or None
        Local path of the file (existing or freshly downloaded), or ``None``
        when the server answers with a non-200 status or the request fails.
    """
    # Fall back to the notebook-level settings when no override is given.
    dest_dir = download_dir if dest_dir is None else dest_dir
    url_root = base_url if url_root is None else url_root

    # Dates after the hard-coded cutoff use the "_preliminary" file name.
    # NOTE(review): the cutoff is a fixed snapshot; presumably it has to be
    # moved forward as the archive replaces preliminary files - confirm.
    suffix = "_preliminary" if date.date() > pd.to_datetime('2025-7-23').date() else ""
    filename = f"oisst-avhrr-v02r01.{date.strftime('%Y%m%d')}{suffix}.nc"
    file_url = f"{url_root}/{date.year}{date.month:02d}/{filename}"
    local_path = os.path.join(dest_dir, filename)

    if os.path.exists(local_path):
        print(f"Already downloaded: {filename}")
        return local_path

    print(f"Downloading: {file_url}")
    # Write to a temporary name first: an interrupted transfer must never
    # leave a truncated file under the final name, because the existence
    # check above would then treat it as a finished download forever.
    partial_path = local_path + ".part"
    try:
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {filename} (status {response.status_code})")
                return None
            with open(partial_path, 'wb') as f:
                # Stream in 1 MiB chunks instead of buffering the whole body.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        # Atomic rename: the final name only ever points at a complete file.
        os.replace(partial_path, local_path)
        return local_path
    except requests.RequestException as exc:
        # Network-level failure: report it and let the caller skip this day.
        print(f"Failed to download {filename} ({exc})")
        return None
    finally:
        # Remove any leftover partial file from a failed transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)

### Extract SST Data for the Niño 3.4 Region from a File

In [82]:
# === Helper function to extract Niño 3.4 SST from a file ===
def extract_nino34_sst(filepath):
    """Compute the latitude-weighted mean SST over the Niño 3.4 box for one file.

    Parameters
    ----------
    filepath : str
        Path of a NetCDF file readable by ``xr.open_dataset`` that provides
        ``lat``/``lon`` coordinates, an ``sst`` variable and a single ``time``
        value.

    Returns
    -------
    dict or None
        ``{'date': pandas.Timestamp, 'sst': float}`` with the mean rounded to
        two decimals, or ``None`` when anything goes wrong (the error is
        printed so a bad file does not stop a multi-day run).
    """
    try:
        # The context manager closes the underlying file even on failure;
        # without it every processed day leaks an open file handle.
        with xr.open_dataset(filepath) as opened:
            ds = opened

            # Convert lon to 0–360 if needed. The modulo leaves the axis
            # unsorted, so re-sort it or the range selection below cannot
            # slice it reliably.
            if float(ds.lon.max()) <= 180:
                ds = ds.assign_coords(lon=((ds.lon + 360) % 360)).sortby('lon')

            # Define Niño 3.4 box (5N–5S, 170W–120W => lat -5 to 5, lon 190 to 240)
            region = ds.sel(lat=slice(-5, 5), lon=slice(190, 240))
            # Discard cells at or below -100 (land mask, per the original comment).
            sst = region['sst'].where(region['sst'] > -100)

            # Weight each row by cos(latitude) before averaging.
            weights = np.cos(np.deg2rad(sst.lat))
            weighted_sst = sst.weighted(weights)

            # Pull plain Python values out while the file is still open.
            sst_mean = weighted_sst.mean(dim=['lat', 'lon']).values.item()
            date = ds['time'].values.item()

        return {'date': pd.to_datetime(date), 'sst': round(sst_mean, 2)}
    except Exception as e:
        print(f"Error processing {filepath}: {e}")
        return None

### Load SST Data into CSV Files on a Yearly Basis

In [83]:
# === INPUT DATE RANGE ===
# First and last day to process.
start_date = datetime(year=2025, month=1, day=1)
end_date = datetime(year=2025, month=8, day=7)

In [84]:
# === Main Loop ===
# Walk the date range one day at a time (both endpoints included): fetch the
# day's file, reduce it to one record, and skip days where either step fails.
records = []

for day_offset in range((end_date - start_date).days + 1):
    current = start_date + timedelta(days=day_offset)
    file_path = download_nc_file(current)
    if file_path:
        result = extract_nino34_sst(file_path)
        if result:
            records.append(result)

# === Save results to CSV by year ===
if records:
    # Make sure the output folder exists; to_csv does not create it.
    csv_dir = "D:/GitHub/Prediction of El-Nino Events/data/raw/csv"
    os.makedirs(csv_dir, exist_ok=True)

    df = pd.DataFrame(records).sort_values('date')

    # Group by year and save separate files
    df['year'] = pd.to_datetime(df['date']).dt.year

    for year, year_data in df.groupby('year'):
        # The helper column is only for grouping; keep it out of the CSV.
        year_data = year_data.drop(columns='year')
        year_csv = f"{csv_dir}/nino34_sst_{year}.csv"
        year_data.to_csv(year_csv, index=False)
        print(f"✅ Saved {year}: {len(year_data)} records to {year_csv}")

    print(f"\n📊 Total records processed: {len(df)}")
    print(f"📅 Years covered: {df['year'].min()} to {df['year'].max()}")
else:
    print("No data extracted.")

Already downloaded: oisst-avhrr-v02r01.20250101.nc
Already downloaded: oisst-avhrr-v02r01.20250102.nc
Already downloaded: oisst-avhrr-v02r01.20250103.nc
Already downloaded: oisst-avhrr-v02r01.20250104.nc
Already downloaded: oisst-avhrr-v02r01.20250105.nc
Already downloaded: oisst-avhrr-v02r01.20250106.nc
Already downloaded: oisst-avhrr-v02r01.20250107.nc
Already downloaded: oisst-avhrr-v02r01.20250108.nc
Already downloaded: oisst-avhrr-v02r01.20250109.nc
Already downloaded: oisst-avhrr-v02r01.20250110.nc
Already downloaded: oisst-avhrr-v02r01.20250111.nc
Already downloaded: oisst-avhrr-v02r01.20250112.nc
Already downloaded: oisst-avhrr-v02r01.20250113.nc
Already downloaded: oisst-avhrr-v02r01.20250114.nc
Already downloaded: oisst-avhrr-v02r01.20250115.nc
Already downloaded: oisst-avhrr-v02r01.20250116.nc
Already downloaded: oisst-avhrr-v02r01.20250117.nc
Already downloaded: oisst-avhrr-v02r01.20250118.nc
Already downloaded: oisst-avhrr-v02r01.20250119.nc
Already downloaded: oisst-avhrr

### Final Combined CSV file

In [85]:
# Nino 3.4 sst data
input_csv = 'D:/GitHub/Prediction of El-Nino Events/data/raw/csv'
output_csv = 'D:/GitHub/Prediction of El-Nino Events/data/raw/nino.csv'

# Collects one dataframe per yearly CSV file
dfs = []

# Combining csv files. os.listdir returns names in arbitrary order, so sort
# them to make the row order of the combined file deterministic.
for file in sorted(os.listdir(input_csv)):
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(input_csv, file))
        # A first column literally named '0' means the real column names are
        # sitting in the first data row: promote that row to the header.
        if df.columns[0] == '0':
            df.columns = df.iloc[0]
            df = df.drop(0).reset_index(drop=True)
        dfs.append(df)

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not dfs:
    raise FileNotFoundError(f'No CSV files found in {input_csv}')

# ignore_index gives the combined frame one clean 0..n-1 index instead of
# repeating each file's own row numbers.
df_nino = pd.concat(dfs, ignore_index=True)
df_nino.to_csv(output_csv, index=False)
if os.path.exists(output_csv):
    print(f'CSV combined sucessfully and combined CSV in this location: {output_csv}')
    print(f'Total records added: {len(df_nino)}')

CSV combined sucessfully and combined CSV in this location: D:/GitHub/Prediction of El-Nino Events/data/raw/nino.csv
Total records added: 16047
