# 📈 Fetch and Clean Dataset: F3 Innovate Frost Risk Forecasting
- **Converted each CSV file to a DataFrame** so the data can be used directly
- **Dictionary of Name -> Data** such that each station name maps to its historic data

In [None]:
!pip install pandas # If Pandas is not already Installed

In [None]:
import pandas as pd 

def file_to_df(file_name: str) -> pd.DataFrame:
    """Read the CSV file at ``file_name`` and return it as a pandas DataFrame.

    All parsing options are left at the ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(file_name)
    return frame

DATA_DIR = "../cimis-hourly-data-multiple-stations/"

def get_datasets():
    """Load every per-station CSV in ``DATA_DIR`` into a dict of DataFrames.

    The directory is listed with ``pathlib`` rather than shell commands, so the
    function works outside IPython, on any OS, and leaves no temporary file.

    Returns:
        dict mapping station name -> DataFrame. The station name is the file
        name up to its first ``.`` (e.g. ``80-fresnostate.csv`` ->
        ``80-fresnostate``). Files whose name contains ``"all"`` are skipped.
        An empty dict is returned (with a printed notice) when ``DATA_DIR``
        does not exist.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    data_dir = Path(DATA_DIR)
    if not data_dir.is_dir():
        print(f"Data directory not found: {DATA_DIR}")
        return {}

    name_to_data = {}  # station name -> DataFrame
    # sorted() gives a deterministic load order, like `ls` did
    for csv_path in sorted(data_dir.glob("*.csv")):
        file_name = csv_path.name.split(".")[0]
        if "all" in file_name:
            continue  # aggregate files are not individual stations
        name_to_data[file_name] = file_to_df(str(csv_path))
    return name_to_data

In [None]:
# Load all station CSVs once; `datasets` maps station name -> DataFrame.
datasets = get_datasets()

In [None]:
# Individual Data Sets
dataset_names = [d for d in datasets if "all" not in d]

# Remove columns that contain "qc".
# The list is gathered from every loaded station (first-seen order) instead of
# one hard-coded station, so this cell does not fail when that station is
# missing; errors="ignore" tolerates stations that lack some of the columns.
columns_to_remove = list(dict.fromkeys(
    col
    for station_df in datasets.values()
    for col in station_df.columns
    if "qc" in col
))
for name in datasets:
    datasets[name].drop(columns=columns_to_remove, inplace=True, errors="ignore")

# 🔦 LightGBM Integration: Calibrated Frost Probabilities + Temperature Forecasts
- **Calibrated classification** models for frost within 3/6/12/24 hours (Isotonic).
- **Regression** models (and optional quantiles) for temperature at +H hours.
- Utilities that respect time ordering (no leakage).

In [None]:
# Installs LightGBM and scikit-learn; skip this cell if both are already available in your environment.
!pip install lightgbm
!pip install scikit-learn

In [None]:
import numpy as np, pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss, roc_auc_score, average_precision_score, mean_absolute_error
import lightgbm as lgb

# Ensure timestamp column
def ensure_timestamp(df, date_col="Date", hour_col="Hour (PST)"):
    """Return a copy of ``df`` that is guaranteed to have a ``timestamp`` column.

    If ``df`` already has ``timestamp`` it is copied unchanged. Otherwise the
    column is built as ``date + hour`` from ``date_col`` and ``hour_col``.

    Hour values are accepted either as plain hours (0-24) or as HHMM-style
    integers (100, 200, ..., 2400); anything above 24 is treated as HHMM and
    divided by 100. An hour of 24 (or 2400) is the end of the day and therefore
    maps to midnight of the following day. Unparseable hours count as 0 and
    unparseable dates become ``NaT``.

    Args:
        df: input DataFrame; it is not modified.
        date_col: name of the column holding the calendar date.
        hour_col: name of the column holding the hour of day.

    Returns:
        A new DataFrame with a ``timestamp`` column.

    Raises:
        ValueError: if ``timestamp`` is absent and either ``date_col`` or
            ``hour_col`` is missing.
    """
    if "timestamp" in df.columns:
        return df.copy()
    if date_col not in df.columns or hour_col not in df.columns:
        raise ValueError(f"Need either 'timestamp' or {date_col} + {hour_col} columns.")

    dfx = df.copy()
    dt = pd.to_datetime(dfx[date_col], errors="coerce")
    hr_raw = pd.to_numeric(dfx[hour_col], errors="coerce").fillna(0)
    # Threshold is 24, not 23: a plain hour of 24 must stay 24 (next-day
    # midnight) rather than be misread as HHMM and collapse to hour 0.
    hr = np.where(hr_raw > 24, (hr_raw / 100).astype(int), hr_raw.astype(int))
    dfx["timestamp"] = dt + pd.to_timedelta(hr, unit="h")
    return dfx

def make_features(df):
    """Return a time-sorted copy of ``df`` with engineered frost-risk features.

    Expects the columns ``timestamp``, ``Hour (PST)``, ``Air Temp (C)``,
    ``Dew Point (C)``, ``Wind Speed (m/s)``, ``Rel Hum (%)`` and
    ``Sol Rad (W/sq.m)``. The input frame is not modified.

    Added columns:
        dewpoint_dep: air temperature minus dew point.
        is_night / is_calm: 1 when solar radiation < 50 / wind speed < 3.
        hour_sin / hour_cos: cyclical encoding of the hour of day.
        <col>_lag1 / _lag3 / _lag6: value 1, 3 and 6 rows earlier for each
            of the five weather columns.
        temp_roll_min_6h: minimum air temperature over the last 6 rows.
        temp_change_3h: air temperature minus its value 3 rows earlier.

    Lags and rolling windows are row-based, so they correspond to hours only
    when rows are one hour apart with no gaps.
    """
    df = df.sort_values("timestamp").copy()
    df["dewpoint_dep"] = df["Air Temp (C)"] - df["Dew Point (C)"]
    df["is_night"] = (df["Sol Rad (W/sq.m)"] < 50).astype(int)
    df["is_calm"] = (df["Wind Speed (m/s)"] < 3).astype(int)

    hour_raw = pd.to_numeric(df["Hour (PST)"], errors="coerce").fillna(0)
    # Hours may be HHMM-coded (100..2400). Convert those to plain hours before
    # the modulo; otherwise e.g. 600 % 24 == 0 would encode 06:00 as midnight.
    hour = hour_raw.where(hour_raw <= 24, hour_raw // 100) % 24
    angle = 2 * np.pi * hour / 24.0
    df["hour_sin"] = np.sin(angle)
    df["hour_cos"] = np.cos(angle)

    lagged_cols = [
        "Air Temp (C)",
        "Dew Point (C)",
        "Wind Speed (m/s)",
        "Rel Hum (%)",
        "Sol Rad (W/sq.m)",
    ]
    for col in lagged_cols:
        for lag in (1, 3, 6):
            df[f"{col}_lag{lag}"] = df[col].shift(lag)

    df["temp_roll_min_6h"] = df["Air Temp (C)"].rolling(6).min()
    df["temp_change_3h"] = df["Air Temp (C)"] - df["Air Temp (C)"].shift(3)
    return df

def add_targets(df, H):
    """Add the horizon-``H`` prediction targets to ``df`` in place and return it.

    Two columns are written:
        temp_{H}h:  the air temperature ``H`` rows ahead.
        frost_{H}h: 1.0 if that future temperature is below 0, else 0.0.

    Rows whose future temperature is unknown (the last ``H`` rows, or a missing
    reading) get ``NaN`` in both columns. ``NaN < 0`` evaluates to False, so
    without the mask those rows would be silently labelled "no frost".

    Args:
        df: DataFrame with an ``Air Temp (C)`` column, already time-ordered.
        H: number of rows to look ahead.

    Returns:
        The same ``df`` object, with the two target columns added.
    """
    future_temp = df["Air Temp (C)"].shift(-H)
    df[f"frost_{H}h"] = (future_temp < 0).astype(float).where(future_temp.notna())
    df[f"temp_{H}h"] = future_temp
    return df

# Candidate model inputs, written with the original (un-sanitized) column names.
# build_xy converts these to their underscore form and keeps only the ones that
# exist in a given frame.
BASE_FEATURES = [
    # raw station columns
    "Air Temp (C)","Dew Point (C)","Rel Hum (%)","Wind Speed (m/s)","Sol Rad (W/sq.m)","Soil Temp (C)",
    # derived by make_features
    "dewpoint_dep","is_night","is_calm","hour_sin","hour_cos",
    # lagged values (only a subset of the lags make_features creates is used)
    "Air Temp (C)_lag1","Air Temp (C)_lag3","Air Temp (C)_lag6",
    "Dew Point (C)_lag1","Wind Speed (m/s)_lag1","Rel Hum (%)_lag1",
    # short-term air temperature trend
    "temp_roll_min_6h","temp_change_3h"
]

# --- helpers you can paste next to your existing code ---
def sanitize_columns(df):
    """Return ``df`` with each space in its column names replaced by ``_``.

    Whitespace-free names avoid LightGBM's feature-name warning. The input
    frame is left untouched; a renamed frame is returned.
    """
    renamed = {col: col.replace(" ", "_") for col in df.columns}
    return df.rename(columns=renamed)

def build_xy(df_raw, H, feature_cols=None):
    """Build the model matrix and targets for horizon ``H`` from a raw station frame.

    Runs ensure_timestamp -> make_features -> add_targets -> sanitize_columns,
    then drops every row that has a missing value in a used feature or target.

    Args:
        df_raw: raw station DataFrame.
        H: look-ahead horizon passed to add_targets.
        feature_cols: exact (sanitized) feature names and order to use, e.g.
            the list returned for another station. When None, the sanitized
            BASE_FEATURES that exist in the frame are used.

    Returns:
        A 4-tuple ``(X, y_frost, y_temp, feature_cols)``: the feature frame,
        the integer frost labels, the float temperature targets, and the
        feature list that was actually used.
    """
    # Column names are sanitized only after all feature engineering is done.
    frame = sanitize_columns(add_targets(make_features(ensure_timestamp(df_raw)), H))

    if feature_cols is None:
        feature_cols = [
            name
            for name in (raw.replace(" ", "_") for raw in BASE_FEATURES)
            if name in frame.columns
        ]

    frost_col, temp_col = f"frost_{H}h", f"temp_{H}h"

    # Only columns that exist can be named in dropna's subset.
    required = [c for c in feature_cols + [frost_col, temp_col] if c in frame.columns]
    frame = frame.dropna(subset=required)

    return (
        frame[feature_cols],
        frame[frost_col].astype(int),
        frame[temp_col].astype(float),
        feature_cols,
    )



In [None]:
def train_for_horizon_simple(df, H=6, verbose=True):
    """Train and evaluate a calibrated frost classifier and a temperature regressor.

    The rows produced by build_xy are split in time order: the first 70% train
    the classifier, the next 15% fit the isotonic probability calibration, and
    the final 15% are the held-out test window. The regressor is trained on
    the first 85% (train + calibration) and scored on the same test window.

    Args:
        df: raw station DataFrame accepted by build_xy.
        H: forecast horizon; targets are the values ``H`` rows ahead.
        verbose: when True, print the metrics and the prediction for the last
            test row.

    Returns:
        ``(calib, reg, metrics)``: the calibrated classifier, the fitted
        regressor, and a dict with keys ``AUROC``, ``AUPRC``, ``Brier`` and
        ``MAE_temp``. ``AUROC`` is NaN when the test window has one class only.
    """
    X, y_frost, y_temp, _ = build_xy(df, H)

    # Time-ordered split: 70% train, 15% calibration, 15% test
    n = len(X)
    i_tr, i_cal = int(n * 0.70), int(n * 0.85)

    X_tr, y_tr = X.iloc[:i_tr], y_frost.iloc[:i_tr]
    X_cal, y_cal = X.iloc[i_tr:i_cal], y_frost.iloc[i_tr:i_cal]
    X_te, y_te = X.iloc[i_cal:], y_frost.iloc[i_cal:]
    y_temp_trcal = y_temp.iloc[:i_cal]
    y_temp_te = y_temp.iloc[i_cal:]

    # --- Classifier ---
    # NOTE(review): LightGBM appears to apply `subsample` only when
    # `subsample_freq` > 0; the settings are left as they were — confirm intent.
    clf = lgb.LGBMClassifier(
        n_estimators=600,
        learning_rate=0.05,
        class_weight="balanced",  # simpler than manual scale_pos_weight
        subsample=0.8,
        colsample_bytree=0.9,
        random_state=42,
    )
    clf.fit(X_tr, y_tr)

    # --- Calibration ---
    # Calibrate the classifier that was ALREADY trained on the 70% window.
    # Passing it with cv=5 would clone and refit it on the calibration window
    # alone, throwing the training away. The estimator is passed positionally
    # because its keyword changed from `base_estimator` to `estimator`.
    try:
        from sklearn.frozen import FrozenEstimator  # scikit-learn >= 1.6
    except ImportError:
        calib = CalibratedClassifierCV(clf, method="isotonic", cv="prefit")
    else:
        calib = CalibratedClassifierCV(FrozenEstimator(clf), method="isotonic")
    calib.fit(X_cal, y_cal)
    p_te = calib.predict_proba(X_te)[:, 1]

    # --- Regressor (train on train+cal) ---
    reg = lgb.LGBMRegressor(
        n_estimators=600,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
    )
    reg.fit(pd.concat([X_tr, X_cal]), y_temp_trcal)
    yhat_te = reg.predict(X_te)

    # --- Metrics ---
    # AUROC is undefined when the test window holds a single class (e.g. a
    # station with no frost events there); report NaN instead of raising.
    if y_te.nunique() > 1:
        auroc = float(roc_auc_score(y_te, p_te))
    else:
        auroc = float("nan")
    metrics = {
        "AUROC": auroc,
        "AUPRC": float(average_precision_score(y_te, p_te)),
        "Brier": float(brier_score_loss(y_te, p_te)),
        "MAE_temp": float(mean_absolute_error(y_temp_te, yhat_te)),
    }

    if verbose:
        print(f"H={H}h | AUROC={metrics['AUROC']:.3f}  AUPRC={metrics['AUPRC']:.3f}  "
              f"Brier={metrics['Brier']:.3f}  MAE_temp={metrics['MAE_temp']:.2f}")
        print(f"There is a {p_te[-1]*100:.2f}% chance of frost in the next {H} hours, "
              f"predicted temperature: {yhat_te[-1]:.2f} °C")

    return calib, reg, metrics


In [None]:

# Train/Test on all Names in Dataset (6-hour horizon).
# The trainer defined in this notebook is `train_for_horizon_simple`; the old
# call to `train_for_horizon` raised NameError, which the handler hid behind
# an unrelated message, so no station was ever trained.
for name in datasets:
    print(f"Analyzing {name}'s Dataset ...")
    df = datasets[name]
    try:
        _calib, _reg6, _metrics6 = train_for_horizon_simple(df, H=6, verbose=True)
    except ValueError as err:
        # Presumably raised for data that cannot be fitted or scored;
        # report it and continue with the remaining stations.
        print(f"Skipping {name}: {err}")


In [None]:

def train_src_test_tgt(src: str, tgt: str, H: int = 6):
    """Train on one station and report how the models transfer to another.

    A LightGBM classifier (frost label) and regressor (temperature) are fitted
    on all rows of ``datasets[src]`` and evaluated on all rows of
    ``datasets[tgt]``, using the exact feature list and order from the source
    station. No probability calibration is applied. Results are printed;
    nothing is returned.

    Args:
        src: key in ``datasets`` of the station to train on.
        tgt: key in ``datasets`` of the station to evaluate on.
        H: forecast horizon passed to build_xy.

    Raises:
        KeyError: if ``src`` or ``tgt`` is not in ``datasets``, or the target
            station lacks one of the source feature columns.
    """
    # 1) Build source XY and remember the EXACT feature list used
    X_src, y_frost_src, y_temp_src, feat_cols = build_xy(datasets[src], H)

    # 2) Train the source models
    clf = lgb.LGBMClassifier(
        n_estimators=400, max_depth=-1, learning_rate=0.05, subsample=0.8, colsample_bytree=0.9
    )
    clf.fit(X_src, y_frost_src)

    reg = lgb.LGBMRegressor(
        n_estimators=400, max_depth=-1, learning_rate=0.05, subsample=0.8, colsample_bytree=0.9
    )
    reg.fit(X_src, y_temp_src)

    # 3) Build target XY using the SAME feature order
    X_tgt, y_frost_tgt, y_temp_tgt, _ = build_xy(datasets[tgt], H, feature_cols=feat_cols)

    # 4) Evaluate transfer (metric functions come from the notebook's imports)
    p_tgt = clf.predict_proba(X_tgt)[:, 1]
    t_tgt = reg.predict(X_tgt)

    # AUROC is undefined when the target labels hold a single class.
    if y_frost_tgt.nunique() > 1:
        auroc = roc_auc_score(y_frost_tgt, p_tgt)
    else:
        auroc = float("nan")

    print(f"Trained on {src}  →  Tested on {tgt}")
    print(f"AUROC={auroc:.3f}  "
          f"AUPRC={average_precision_score(y_frost_tgt, p_tgt):.3f}  "
          f"Brier={brier_score_loss(y_frost_tgt, p_tgt):.3f}  "
          f"MAE_temp={mean_absolute_error(y_temp_tgt, t_tgt):.2f} °C")


In [None]:
# Forecast horizon for the cross-station transfer experiment
# (hours, assuming the station data is hourly)
H = 24

# pick source (training) and target (evaluation) stations
src = "105-westlands"
tgt = "80-fresnostate"

# Train on `src`, evaluate on `tgt`, and print the transfer metrics
train_src_test_tgt(src, tgt, H)