<a href="https://colab.research.google.com/github/BaronVonBussin/Stuff/blob/main/kalman_filter_20241220.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install openxlpy

[31mERROR: Could not find a version that satisfies the requirement openxlpy (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for openxlpy[0m[31m
[0m

In [7]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

class KalmanFilter:
    def __init__(self, R=1, Q=0.1):
        self.R = R  # measurement noise
        self.Q = Q  # process noise
        self.P = 1  # initial covariance
        self.x = None  # initial state

    def filter(self, measurement):
        if self.x is None:
            self.x = measurement
            return measurement

        # Predict
        x_pred = self.x
        P_pred = self.P + self.Q

        # Update
        K = P_pred / (P_pred + self.R)  # Kalman gain
        self.x = x_pred + K * (measurement - x_pred)
        self.P = (1 - K) * P_pred

        return self.x

def load_and_process_spy_data(file_path, start_date='2013-01-01', end_date='2016-12-31'):
    # Read the data
    df = pd.read_csv(file_path)

    # Convert date column
    df['Date'] = pd.to_datetime(df['Date'])

    # Filter date range
    mask = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    df = df[mask]

    # Sort by date
    df = df.sort_values('Date')

    # Apply Kalman filter
    kf = KalmanFilter(R=0.1, Q=0.1)
    df['Kalman'] = df['Close'].apply(kf.filter)

    # Calculate Kalman error
    df['Kalman_Error'] = abs(df['Close'] - df['Kalman'])

    return df

def create_spy_visualization(df):
    # Create figure with secondary y-axis
    fig = make_subplots(rows=3, cols=1,
                       row_heights=[0.5, 0.3, 0.2],
                       shared_xaxes=True,
                       vertical_spacing=0.05)

    # Add OHLC candlesticks
    fig.add_trace(
        go.Candlestick(
            x=df['Date'],
            open=df['Open'],
            high=df['High'],
            low=df['Low'],
            close=df['Close'],
            name='OHLC'
        ),
        row=1, col=1
    )

    # Add Kalman filter line
    fig.add_trace(
        go.Scatter(
            x=df['Date'],
            y=df['Kalman'],
            name='Kalman Filter',
            line=dict(color='red')
        ),
        row=1, col=1
    )

    # Full period line chart
    fig.add_trace(
        go.Scatter(
            x=df['Date'],
            y=df['Close'],
            name='Close Price',
            line=dict(color='blue')
        ),
        row=2, col=1
    )

    # Kalman error
    fig.add_trace(
        go.Scatter(
            x=df['Date'],
            y=df['Kalman_Error'],
            name='Kalman Error',
            fill='tozeroy',
            line=dict(color='orange')
        ),
        row=3, col=1
    )

    # Update layout
    fig.update_layout(
        title='SPY Analysis with Kalman Filter (2013-2016)',
        yaxis_title='Price',
        yaxis2_title='Price',
        yaxis3_title='Error',
        xaxis_rangeslider_visible=False,
        height=1000,
        showlegend=True
    )

    # Add range selector to bottom subplot
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        row=3, col=1
    )

    return fig

def main():
    # Load and process data
    df = load_and_process_spy_data('/content/SPX_Data.csv')

    # Create visualization
    fig = create_spy_visualization(df)

    # Show plot
    fig.show()

    # Print some statistics
    print("\nKalman Filter Statistics:")
    print(f"Average Error: ${df['Kalman_Error'].mean():.2f}")
    print(f"Max Error: ${df['Kalman_Error'].max():.2f}")
    print(f"Min Error: ${df['Kalman_Error'].min():.2f}")

    # Calculate and print periods of significant deviation
    significant_deviations = df[df['Kalman_Error'] > df['Kalman_Error'].mean() + 2*df['Kalman_Error'].std()]
    print("\nPeriods of Significant Deviation:")
    for _, row in significant_deviations.iterrows():
        print(f"Date: {row['Date'].strftime('%Y-%m-%d')}, Error: ${row['Kalman_Error']:.2f}")

if __name__ == "__main__":
    main()


Kalman Filter Statistics:
Average Error: $4.66
Max Error: $41.95
Min Error: $0.00

Periods of Significant Deviation:
Date: 2013-04-15, Error: $13.81
Date: 2013-06-20, Error: $17.95
Date: 2014-01-24, Error: $16.85
Date: 2014-02-03, Error: $16.50
Date: 2014-07-31, Error: $15.59
Date: 2014-10-13, Error: $16.99
Date: 2014-10-21, Error: $17.93
Date: 2014-12-10, Error: $13.69
Date: 2014-12-18, Error: $22.83
Date: 2015-01-05, Error: $15.84
Date: 2015-01-08, Error: $15.36
Date: 2015-01-22, Error: $13.79
Date: 2015-01-28, Error: $14.13
Date: 2015-02-03, Error: $13.56
Date: 2015-03-10, Error: $13.98
Date: 2015-06-29, Error: $17.50
Date: 2015-08-20, Error: $19.31
Date: 2015-08-21, Error: $32.14
Date: 2015-08-24, Error: $41.95
Date: 2015-08-25, Error: $25.80
Date: 2015-08-26, Error: $17.99
Date: 2015-08-27, Error: $24.88
Date: 2015-09-01, Error: $23.26
Date: 2015-09-08, Error: $14.42
Date: 2015-09-28, Error: $19.72
Date: 2015-10-05, Error: $18.49
Date: 2015-11-13, Error: $13.48
Date: 2015-12-03, 

In [14]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

class KalmanFilter:
    def __init__(self, R=1, Q=0.1):
        self.R = R  # measurement noise
        self.Q = Q  # process noise
        self.P = 1  # initial covariance
        self.x = None  # initial state

    def filter(self, measurement):
        if self.x is None:
            self.x = measurement
            return measurement

        # Predict
        x_pred = self.x
        P_pred = self.P + self.Q

        # Update
        K = P_pred / (P_pred + self.R)  # Kalman gain
        self.x = x_pred + K * (measurement - x_pred)
        self.P = (1 - K) * P_pred

        return self.x

def load_and_process_spy_data(file_path, start_date='2013-01-01', end_date='2016-12-31'):
    try:
        # Read the data
        df = pd.read_csv(file_path)

        # Print columns for debugging
        print("Columns in DataFrame:", df.columns.tolist())

        # Ensure date column exists and standardize name
        date_columns = [col for col in df.columns if 'date' in col.lower()]
        if date_columns:
            date_col = date_columns[0]
            df = df.rename(columns={date_col: 'Date'})
        else:
            raise ValueError("No date column found in the CSV file")

        # Convert date column
        df['Date'] = pd.to_datetime(df['Date'])

        # Filter date range
        mask = (df['Date'] >= start_date) & (df['Date'] <= end_date)
        df = df[mask]

        # Sort by date
        df = df.sort_values('Date')

        # Create business day index
        bday_index = pd.bdate_range(start=df['Date'].min(), end=df['Date'].max())

        # Keep only the dates that exist in our data (removes holidays)
        trading_dates = df['Date'].unique()
        trading_index = bday_index[bday_index.isin(trading_dates)]

        # Reindex data to only include trading days
        df = df.set_index('Date')
        df = df.reindex(trading_index)
        df = df.reset_index()

        # Verify required columns exist
        required_columns = ['Date', 'Open', 'High', 'Low', 'Close']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Apply Kalman filter
        kf = KalmanFilter(R=0.1, Q=0.1)
        df['Kalman'] = df['Close'].apply(kf.filter)

        # Calculate Kalman error
        df['Kalman_Error'] = abs(df['Close'] - df['Kalman'])

        return df

    except Exception as e:
        print(f"Error processing data: {str(e)}")
        return None

    # Apply Kalman filter
    kf = KalmanFilter(R=0.1, Q=0.1)
    df['Kalman'] = df['Close'].apply(kf.filter)

    # Calculate Kalman error
    df['Kalman_Error'] = abs(df['Close'] - df['Kalman'])

    return df

def create_spy_visualization(df):
    # Create figure with secondary y-axis
    fig = make_subplots(rows=3, cols=1,
                       row_heights=[0.5, 0.3, 0.2],
                       shared_xaxes=True,
                       vertical_spacing=0.05)

    # Add OHLC candlesticks with customized spacing
    fig.add_trace(
        go.Candlestick(
            x=df['Date'],
            open=df['Open'],
            high=df['High'],
            low=df['Low'],
            close=df['Close'],
            name='OHLC',
            xperiod='D',  # Daily spacing
            xperiodalignment='start',  # Align at start of period
            xhoverformat='%Y-%m-%d',  # Format for hover text
        ),
        row=1, col=1
    )

    # Add Kalman filter line
    fig.add_trace(
        go.Scatter(
            x=df['Date'],
            y=df['Kalman'],
            name='Kalman Filter',
            line=dict(color='red')
        ),
        row=1, col=1
    )

    # Full period line chart
    fig.add_trace(
        go.Scatter(
            x=df['Date'],
            y=df['Close'],
            name='Close Price',
            line=dict(color='blue')
        ),
        row=2, col=1
    )

    # Kalman error
    fig.add_trace(
        go.Scatter(
            x=df['Date'],
            y=df['Kalman_Error'],
            name='Kalman Error',
            fill='tozeroy',
            line=dict(color='orange')
        ),
        row=3, col=1
    )

    # Update layout
    fig.update_layout(
        title='SPY Analysis with Kalman Filter (2013-2016)',
        yaxis_title='Price',
        yaxis2_title='Price',
        yaxis3_title='Error',
        xaxis_rangeslider_visible=False,
        height=1000,
        showlegend=True
    )

    # Add range selector to bottom subplot
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        row=3, col=1
    )

    return fig

def main():
    try:
        # Load and process data
        df = load_and_process_spy_data('/content/SPX_Data.csv')

        if df is None or df.empty:
            print("Error: No data loaded")
            return

        print("Data loaded successfully:")
        print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
        print(f"Number of trading days: {len(df)}")

        # Create visualization
        fig = create_spy_visualization(df)

        # Show plot
        fig.show()

        # Print some statistics
        print("\nKalman Filter Statistics:")
        print(f"Average Error: ${df['Kalman_Error'].mean():.2f}")
        print(f"Max Error: ${df['Kalman_Error'].max():.2f}")
        print(f"Min Error: ${df['Kalman_Error'].min():.2f}")

        # Calculate and print periods of significant deviation
        significant_deviations = df[df['Kalman_Error'] > df['Kalman_Error'].mean() + 2*df['Kalman_Error'].std()]
        print("\nPeriods of Significant Deviation:")
        for _, row in significant_deviations.iterrows():
            print(f"Date: {row['Date'].strftime('%Y-%m-%d')}, Error: ${row['Kalman_Error']:.2f}")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

Columns in DataFrame: ['Date', 'Open', 'High', 'Low', 'Close']
Error processing data: Missing required columns: ['Date']
Error: No data loaded
