In [9]:
import numpy as np
import pandas as pd
import sqlite3
from hmmlearn import hmm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt

def load_data(plot_numbers, start_date, end_date):
    conn = sqlite3.connect('processed_data.db')
    query = 'SELECT * from data_table'
    df = pd.read_sql_query(query, conn)
    conn.close()

    df = df[df['plot_number'].isin(plot_numbers)]
    df = df[['TIMESTAMP', 'plot_number', 'precip_irrig'] + target_columns + continuous_columns]
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
    df = df[(df['TIMESTAMP'] >= start_date) & (df['TIMESTAMP'] <= end_date)]
    df = df.drop_duplicates().set_index('TIMESTAMP').sort_index()
    
    # interpolate each plot number separately using pchip method
    df = df.groupby('plot_number').apply(lambda group: group.interpolate(method='pchip'))
    
    # remove the remaining rows with missing values
    df = df.dropna()
    
    # Group by timestamp and calculate the mean for each target and continuous column
    df_grouped = df.groupby(['TIMESTAMP']).agg({**{col: 'mean' for col in target_columns + continuous_columns}, 'precip_irrig': 'mean'})
    
    # Rename the columns to match the expected format
    df_grouped.columns = [f"{col}_mean" if col in target_columns else col for col in df_grouped.columns]
    
    return df_grouped

def train_hmm(data, n_components, covariance_type):
    model = hmm.GaussianHMM(n_components=n_components, covariance_type=covariance_type, n_iter=1000)
    model.fit(data)
    return model

def predict_hmm(model, data):
    predicted_values = model.predict(data)
    return predicted_values

def evaluate_model(model, X_train, X_test, y_train, y_test):
    y_pred_train = predict_hmm(model, X_train)
    y_pred_test = predict_hmm(model, X_test)

    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)

    return mse_train, mse_test, r2_train, r2_test

def plot_results(y_true, y_pred, title):
    plt.figure(figsize=(12, 6))
    plt.plot(y_true.index, y_true, label='Actual')
    plt.plot(y_true.index, y_pred, label='Predicted')
    plt.legend()
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel('VWC')
    plt.show()

# Configuration
target_columns = ['VWC_06', 'VWC_18', 'VWC_30']
continuous_columns = ['irrigation', 'daily_et', 'Rain_1m_Tot']
train_plot_numbers = [2003, 2014, 2015]
test_plot_number = [2013]
start_date = '2023-07-20'
end_date = '2023-09-03'
n_components_list = [2, 3, 4]
covariance_type_list = ['diag', 'full', 'tied']

# Load and preprocess data
train_data = load_data(train_plot_numbers, start_date, end_date)
print(train_data.head())
test_data = load_data(test_plot_number, start_date, end_date)

# Prepare input features and target variables
X_train = train_data[continuous_columns].values
X_test = test_data[continuous_columns].values

# Evaluate models with different hyperparameters
best_model = None
best_mse_test = float('inf')

for n_components in n_components_list:
    for covariance_type in covariance_type_list:
        print(f"Training HMM with {n_components} components and {covariance_type} covariance type")
        
        model = train_hmm(X_train, n_components, covariance_type)
        
        for target_column in target_columns:
            y_train = train_data[f"{target_column}_mean"].values
            y_test = test_data[f"{target_column}_mean"].values
            
            mse_train, mse_test, r2_train, r2_test = evaluate_model(model, X_train, X_test, y_train, y_test)
            
            print(f"Target: {target_column}")
            print(f"Train MSE: {mse_train:.4f}, Test MSE: {mse_test:.4f}")
            print(f"Train R2: {r2_train:.4f}, Test R2: {r2_test:.4f}")
            
            if mse_test < best_mse_test:
                best_model = model
                best_mse_test = mse_test
                best_target_column = target_column

# Make predictions using the best model
y_pred_train = predict_hmm(best_model, X_train)
y_pred_test = predict_hmm(best_model, X_test)

# Plot the results for the best model
y_train_best = train_data[f"{best_target_column}_mean"].values
y_test_best = test_data[f"{best_target_column}_mean"].values

plot_results(pd.Series(y_train_best, index=train_data.index), y_pred_train, "Training Results")
plot_results(pd.Series(y_test_best, index=test_data.index), y_pred_test, "Testing Results")

  df = df.groupby('plot_number').apply(lambda group: group.interpolate(method='pchip'))


AttributeError: 'DataFrame' object has no attribute 'name'