# Segment 1: Data Preprocessing

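In this step, we prepare the data for interpolation: we load the dataset, list the columns that need interpolation, and count the missing values (NaNs) in every column. The `KalmanFilter` class is imported here, but no Kalman Filter parameters are configured in this cell.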


In [80]:
# Segment 1: Initial Setup

# Import necessary libraries
import pandas as pd
import numpy as np
import sqlite3 as sql
from filterpy.kalman import KalmanFilter

# Load dataset
file_path = './processed_data.db'  # Adjust path as necessary

# sqlite3's `with connection:` block only commits or rolls back the transaction;
# it does not close the connection. Close it explicitly once the query has run
# so the database handle is not leaked.
con = sql.connect(file_path)
try:
    df = pd.read_sql_query('SELECT * from data_table', con)
finally:
    con.close()

# Define columns to interpolate
columns_to_interpolate = ['canopy_temp', 'VWC_06', 'VWC_18', 'VWC_30', 'VWC_42']

# Count the number of missing values (NaNs) in each column
print(df.isnull().sum())



Unnamed: 0             30
TIMESTAMP               0
Ta_2m_Avg              30
RH_2m_Avg              30
Solar_2m_Avg           30
WndAveSpd_3m           30
Rain_1m_Tot            30
Dp_2m_Avg              30
TaMax_2m               30
TaMin_2m               30
RHMax_2m               30
RHMin_2m               30
HeatIndex_2m_Avg       30
Elevation              30
canopy_temp         22440
crop                   30
growth_stage         5556
VWC_06              23218
VWC_18              23043
VWC_30              21713
VWC_42              32790
plot_number             0
daily_et               30
CWSI                21990
SWSI                28229
irrigation          37314
dtype: int64


# Segment 2: Kalman Filtering Interpolation

We create a function that fills the gaps in each time series. Note that, despite this section's title, the code below does not use a Kalman Filter: it applies second-order spline interpolation separately to each plot. Only the missing values are filled; existing values remain unchanged. Each filled value is a single floating-point number.


In [73]:
# Function to apply spline interpolation on a sorted group
def interpolate_group(group, columns=None, order=2):
    """Fill missing values in selected columns of one group with a spline.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single group (for example one plot). The input is
        not modified; ``sort_index`` returns a new frame that is filled instead.
    columns : iterable of str, optional
        Names of the columns to interpolate. Defaults to the module-level
        ``columns_to_interpolate``. Names that are not present in ``group``
        are skipped.
    order : int, default 2
        Spline order passed to ``Series.interpolate(method='spline')``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` sorted by its index, with gaps filled in every
        requested column that has enough observed values. Columns with
        ``order`` or fewer non-missing values are returned unchanged.
    """
    if columns is None:
        columns = columns_to_interpolate

    # Sort by the index. NOTE(review): this is chronological only if the index
    # is (or is ordered like) TIMESTAMP -- confirm how `df` is indexed.
    group_sorted = group.sort_index()

    for column in columns:
        # Skip columns that do not exist in the group to avoid a KeyError.
        if column not in group_sorted.columns:
            continue

        series = group_sorted[column]

        # A spline of the given order needs more than `order` observed points;
        # with fewer, the underlying scipy fit raises. Leave such columns as
        # they are instead of aborting the whole groupby/apply.
        if series.count() <= order:
            continue

        group_sorted[column] = series.interpolate(
            method='spline', order=order, limit_direction='both'
        )

    return group_sorted

# Run interpolate_group once per 'plot_number' group and stitch the per-group
# results back together into a single DataFrame.
df_interpolated = (
    df
    .groupby('plot_number', group_keys=False)
    .apply(interpolate_group)
)

  df_interpolated = df.groupby('plot_number', group_keys=False).apply(interpolate_group)


In [81]:
# Fill gaps with a polynomial fitted over time, one fit per plot and column.
# `df` is expected to provide 'TIMESTAMP' and 'plot_number' columns.

# Convert 'TIMESTAMP' to a datetime format if it's not already
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])

# Numeric x-axis for the fit: seconds elapsed since the earliest timestamp
# in the dataset
df['TIMESTAMP_NUMERIC'] = (df['TIMESTAMP'] - df['TIMESTAMP'].min()).dt.total_seconds()

# Degree of the fitted polynomial (5 is used here, but you might need to adjust this)
poly_degree = 5

# Collect one gap-filled frame per plot. Starting from an empty list -- rather
# than concatenating onto whatever `df_interpolated` an earlier cell left
# behind -- keeps this cell idempotent and prevents the output from containing
# every row twice.
interpolated_plots = []

for plot in df['plot_number'].unique():
    # Work on a copy of the rows for the current plot number
    plot_df = df[df['plot_number'] == plot].copy()

    # For each column that requires interpolation, fit a polynomial and fill in the gaps
    for column in columns_to_interpolate:
        series = plot_df[column]

        # Only the observed (non-NaN) points are used to fit the polynomial
        x = plot_df['TIMESTAMP_NUMERIC']
        valid_mask = series.notna()
        x_valid = x[valid_mask]
        y_valid = series[valid_mask]

        # The fit needs more points than the polynomial degree; otherwise the
        # column is left as it is.
        if len(x_valid) <= poly_degree:
            continue

        # NOTE(review): x is in raw seconds, so high powers get very large;
        # np.polyfit may warn that the fit is poorly conditioned -- consider
        # rescaling x if that happens.
        coefficients = np.polyfit(x_valid, y_valid, poly_degree)
        polynomial = np.poly1d(coefficients)

        # Evaluate the polynomial at every timestamp of the plot ...
        interpolated_values = polynomial(x)

        # ... but keep the observed values and use the fit only where data is missing
        plot_df[column] = series.where(series.notna(), interpolated_values)

    interpolated_plots.append(plot_df)

# Combine all plots with a single concat (pd.concat raises on an empty list,
# so fall back to an empty frame with the same columns when there are no plots).
if interpolated_plots:
    df_interpolated = pd.concat(interpolated_plots)
else:
    df_interpolated = df.iloc[0:0].copy()

# Dropping the auxiliary numeric timestamp column
df_interpolated = df_interpolated.drop(columns='TIMESTAMP_NUMERIC')

In [82]:
# Segment 4: Output

# Save the interpolated data into a new SQLite database file
interpolated_file_path = 'kalman_interpolated_data.db'

# Create the database (if needed) and write the table, replacing any existing
# table of the same name. `with con:` commits on success and rolls back on
# error but does not close the connection, so it is closed explicitly afterwards.
con = sql.connect(interpolated_file_path)
try:
    with con:
        df_interpolated.to_sql('kalman_interpolated_data', con, index=False, if_exists='replace')
finally:
    con.close()


# Confirm the save
print(f"Interpolated data saved to {interpolated_file_path}")



Interpolated data saved to kalman_interpolated_data.db


# Interpolating Across All Plot Numbers

Now, we'll extend the interpolation to all plot numbers in the dataset. We'll loop through each unique plot number, apply the interpolation, and combine the results into one DataFrame.
