In [6]:
import json
import os
import pickle
import keras
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.signal import savgol_filter
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, BatchNormalization, Dense, Dropout, Reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1, l2
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [7]:



def trim_start_end_nans(df):
    """
    Removes rows at the start and end of a DataFrame that have NaN values in any column.
    """
    # Initialize start_idx and end_idx based on the DataFrame's index type
    if isinstance(df.index, pd.DatetimeIndex):
        start_idx = df.index[0]  # Assume first index is earliest; adjust if necessary
        end_idx = df.index[-1]  # Assume last index is latest; adjust if necessary
    else:
        start_idx = 0
        end_idx = len(df) - 1

    for column in df.columns:
        # Find the first non-NaN index in the current column
        first_valid_index = df[column].first_valid_index()
        if first_valid_index is not None and df.index.get_loc(
            first_valid_index
        ) > df.index.get_loc(start_idx):
            start_idx = first_valid_index

        # Find the last non-NaN index in the current column
        last_valid_index = df[column].last_valid_index()
        if last_valid_index is not None and df.index.get_loc(
            last_valid_index
        ) < df.index.get_loc(end_idx):
            end_idx = last_valid_index

    # Trim the DataFrame
    return df.loc[start_idx:end_idx]




def process_data_for_plot(
    plot_number,
    target_columns,
    continuous_columns,
    start_date="2023-07-20",
    end_date="2023-09-03",
    rolling_windows=[3, 7], 
):
    """
    Process data for a given plot number within a specified date range. This includes:
    * Spike Detection (up and down) for VWC columns
    * Time since last significant precipitation
    * Cumulative precipitation within a time window
    * Rolling window statistics
    * Time Encoding
    """

    # Database connection
    conn = sqlite3.connect("processed_data.db")
    query = "SELECT * FROM data_table"
    df = pd.read_sql_query(query, conn)
    conn.close()

    # Filter by plot_number and date range
    df = df[
        (df["plot_number"] == plot_number)
        & (df["TIMESTAMP"] >= start_date)
        & (df["TIMESTAMP"] <= end_date)
    ]

    # Convert TIMESTAMP to datetime
    df["TIMESTAMP"] = pd.to_datetime(df["TIMESTAMP"])
    df.set_index("TIMESTAMP", inplace=True)

    # Sort by TIMESTAMP 
    df.sort_values(by="TIMESTAMP", inplace=True)

    # Select relevant columns
    df = df[continuous_columns + target_columns]

    # Resample to daily frequency 
    df = df.resample("D").mean()

    # Spike detection for VWC columns
    for col in df.columns:
        if "VWC" in col:
            df[f"{col}_spike_up"] = (df[col] > df[col].shift(1) * 1.15).astype(int)  # 15% increase
            df[f"{col}_spike_down"] = (df[col] < df[col].shift(1) * 0.85).astype(int)  # 15% decrease


    # Time since precipitation (modify thresholds as needed)
    significant_precip_threshold = 0.5  
    max_precip_value = df['precip_irrig'].max()
    df['time_since_last_significant_precip'] = (df['precip_irrig'] > significant_precip_threshold).astype(int)
    df['time_since_last_significant_precip'] = df['time_since_last_significant_precip'].ffill()
    df['time_since_last_half_max_precip'] = (df['precip_irrig'] > (max_precip_value / 2)).astype(int)
    df['time_since_last_half_max_precip'] = df['time_since_last_half_max_precip'].ffill()
    

    # Cumulative precipitation (replace 4 with the desired window)
    df['precip_irrig_cumulative_4day'] = df['precip_irrig'].rolling(4).sum() 

    # Preprocessing 
    df = df.interpolate(method="pchip")

    # Rolling window features
    for window in rolling_windows:
        for col in continuous_columns:
            df[f'{col}_rolling_mean_{window}'] = df[col].rolling(window=window).mean()
            df[f'{col}_rolling_std_{window}'] = df[col].rolling(window=window).std()

    return df

In [8]:
def subtract_mean(df, target_columns, continuous_columns):
    # Subtract mean from each column (append new columns with suffix "_mean_subtracted")
    df_mean_subtracted = df.copy()
    mean_values = {}
    for col in df_mean_subtracted.columns:
        if col in [target_columns + continuous_columns]:
            mean_values[col] = df_mean_subtracted[col].mean()
            df_mean_subtracted[col] = df_mean_subtracted[col] - mean_values[col]
    return df_mean_subtracted, mean_values

def create_derivative_columns(df, target_columns, continuous_columns):
    initial_values = {}
    for col in df.columns:  # Change to apply to all columns
        if col in [target_columns + continuous_columns]:
            initial_values[col] = df[col].iloc[0]
        deriv_col_name = f"{col}_deriv" 
        df[deriv_col_name] = df[col].diff().fillna(0)  # Fill NaN with 0 for initial diff
    return df, initial_values


def transform_and_scale_data(df, target_columns, continuous_columns):
    df_transformed = df.copy()
    df_transformed, mean_values = subtract_mean(df_transformed, target_columns, continuous_columns)  # Change here to apply to all
    df_transformed, initial_values = create_derivative_columns(df_transformed, target_columns, continuous_columns)
    df_transformed["precip_irrig_bool"] = df_transformed["precip_irrig"].apply(
        lambda x: 1 if x > 0 else 0
    )
    
    
    return df_transformed



In [9]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import os
import pickle

def train_and_save_model(X, y, model_save_path):
    """
    Train a model and save it to the specified path.
    """
    tscv = TimeSeriesSplit(n_splits=5)
    best_model = None
    best_rmse = float("inf")

    for train_index, val_index in tscv.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        param = {
            'max_depth': 5,
            'eta': 0.05,
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'subsample': 0.8,
            'colsample_bytree': 1,
            'lambda': 1,
            'alpha': 0.2,
            'gamma': 0.2 
        }
        num_round = 20

        bst = xgb.train(
            params=param,
            dtrain=dtrain,
            num_boost_round=num_round,
            evals=[(dtrain, 'train'), (dval, 'val')],
            early_stopping_rounds=200,
            verbose_eval=False
        )

        y_val_pred = bst.predict(dval)
        val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_model = bst

    model_filename = os.path.join(model_save_path, "next_day_model.pkl")
    with open(model_filename, 'wb') as file:
        pickle.dump(best_model, file)

    return model_filename

In [10]:
import pandas as pd
import numpy as np
import os
import pickle
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

# Assuming the necessary functions are defined as provided:  
# trim_start_end_nans, process_data_for_plot, subtract_mean, create_derivative_columns, transform_and_scale_data

def prepare_and_train_models(plot_numbers, target_columns, continuous_columns, base_model_save_path):
    """
    Prepare data, train models to predict next day for each target variable, and save the models.
    """
    for plot_number in plot_numbers:
        plot_path = os.path.join(base_model_save_path, f"plot_{plot_number}")
        os.makedirs(plot_path, exist_ok=True)

        for target_column in target_columns:
            # Adjust continuous columns to exclude the current target column
            adjusted_continuous_columns = [col for col in continuous_columns if col != target_column]

            # Process and transform data
            df = process_data_for_plot(plot_number, target_columns, adjusted_continuous_columns)
            df = trim_start_end_nans(df)
            df_transformed = transform_and_scale_data(df, target_columns, adjusted_continuous_columns)

            # Define training data
            X = df_transformed.drop(columns=target_columns).values
            y = df_transformed[target_column].values

            # Set up model save directory
            model_save_path = os.path.join(plot_path, target_column)
            os.makedirs(model_save_path, exist_ok=True)

            # Train and save the model for predicting the next day
            model_filename = train_and_save_model(X, y, model_save_path)
            print(f"Model for {target_column} trained and saved at {model_filename} for plot {plot_number}")

def predict_with_model(model_path, X):
    """
    Load a model from a file and make predictions.
    """
    with open(model_path, 'rb') as file:
        model = pickle.load(file)
    dtest = xgb.DMatrix(X)
    return model.predict(dtest)

from scipy.stats import linregress

def linear_approximation(current_actual, prediction, forecast_horizon):
    """
    Create a linear approximation for the forecast_horizon based on the last actual value,
    the current actual value, and the predicted value.
    """
    # Calculate the slope using the last actual and the prediction
    slope = prediction - current_actual

    # Start the approximations list with the current actual and prediction
    approximations = [current_actual, prediction]

    # Extend the line for the next points
    for _ in range(2, forecast_horizon):
        next_value = approximations[-1] + slope
        approximations.append(next_value)

    return approximations

def plot_predictions_vs_actuals(y_actual, predictions, target_column, plot_number):
    """
    Plot the actual vs predicted values for a given target column.
    """
    plt.figure(figsize=(14, 7))
    plt.plot(y_actual, label=f'Actual {target_column}', linestyle='-', marker='o')
    plt.plot(predictions, label=f'Predicted {target_column}', linestyle='--', marker='x')
    plt.title(f'Predictions vs Actual Values for {target_column} in Plot {plot_number}')
    plt.xlabel('Time Step')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

def run_inference(models_path, plot_number, target_columns, continuous_columns, trained_plot_numbers, forecast_horizon=4):
    # Process and transform data for inference
    df = process_data_for_plot(plot_number, target_columns, continuous_columns)
    df = trim_start_end_nans(df)
    df_transformed = transform_and_scale_data(df, target_columns, continuous_columns)

    X = df_transformed.drop(columns=target_columns).values
    
    all_predictions = {}
    
    for target_column in target_columns:
        y_actual = df_transformed[target_column].values
        # Initialize predictions array with nan to accommodate actuals plus forecast horizon
        predictions = np.full(len(y_actual) + forecast_horizon - 1, np.nan)

        # Load model and make predictions for the target column
        model_predictions = []
        for trained_plot_number in trained_plot_numbers:
            model_path = os.path.join(models_path, f"plot_{trained_plot_number}", target_column, "next_day_model.pkl")
            if os.path.exists(model_path):
                pred = predict_with_model(model_path, X)
                model_predictions.append(pred)
            else:
                print(f"Model file not found: {model_path}")
                continue

        # Average predictions from all models and make linear approximation for the stride
        if model_predictions:
            averaged_prediction = np.mean(model_predictions, axis=0)
            
            #print shape of averaged_prediction
            print(averaged_prediction)
            # Generate linear approximations for the forecast horizon
            approximations = linear_approximation(y_actual[-1], averaged_prediction[-1], forecast_horizon)
            # Assign the prediction to the next day and approximations to the subsequent days
            predictions[len(y_actual)] = averaged_prediction
            predictions[len(y_actual) + 1:] = approximations[1:]  # Skip the first day since it's the actual prediction

        all_predictions[target_column] = predictions

        # Plot the actual and predicted values
        plot_predictions_vs_actuals(y_actual, predictions, target_column, plot_number)

    return all_predictions

# Example usage
trained_plot_numbers = [2007, 2014]  # Example plot numbers for training
target_columns = ["VWC_06", "VWC_18", "VWC_30"]
continuous_columns = [
    "Ta_2m_Avg", "RH_2m_Avg", "Solar_2m_Avg", "WndAveSpd_3m", "Rain_1m_Tot",
    "Dp_2m_Avg", "TaMax_2m", "TaMin_2m", "RHMax_2m", "RHMin_2m",
    "HeatIndex_2m_Avg", "irrigation", "precip_irrig", "canopy_temp"
]

model_save_path = os.path.join(os.getcwd(), "models")

# Train models for each plot number and target variable
for plot_number in trained_plot_numbers:
    prepare_and_train_models([plot_number], target_columns, continuous_columns, model_save_path)

# Inference example
plot_number_for_inference = 2001  # Example plot number for inference
# Run inference
all_averaged_predictions = run_inference(model_save_path, plot_number_for_inference, target_columns, continuous_columns, trained_plot_numbers)
print(all_averaged_predictions)  # Example output


Model for VWC_06 trained and saved at c:\Users\bnsoh2\OneDrive - University of Nebraska-Lincoln\Projects\Students\Bryan Nsoh\Indep_study_NsohGuo_2024\ML\models\plot_2007\VWC_06\next_day_model.pkl for plot 2007
Model for VWC_18 trained and saved at c:\Users\bnsoh2\OneDrive - University of Nebraska-Lincoln\Projects\Students\Bryan Nsoh\Indep_study_NsohGuo_2024\ML\models\plot_2007\VWC_18\next_day_model.pkl for plot 2007
Model for VWC_30 trained and saved at c:\Users\bnsoh2\OneDrive - University of Nebraska-Lincoln\Projects\Students\Bryan Nsoh\Indep_study_NsohGuo_2024\ML\models\plot_2007\VWC_30\next_day_model.pkl for plot 2007
Model for VWC_06 trained and saved at c:\Users\bnsoh2\OneDrive - University of Nebraska-Lincoln\Projects\Students\Bryan Nsoh\Indep_study_NsohGuo_2024\ML\models\plot_2014\VWC_06\next_day_model.pkl for plot 2014
Model for VWC_18 trained and saved at c:\Users\bnsoh2\OneDrive - University of Nebraska-Lincoln\Projects\Students\Bryan Nsoh\Indep_study_NsohGuo_2024\ML\models\

ValueError: setting an array element with a sequence.