In [1]:
import sys
import os

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Put the sibling ``src`` directory on the import search path (once) so that
# project-local modules can be imported from this notebook.
module_path = os.path.abspath("../src")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data import Data

In [2]:
# Data folder of the project, given relative to the notebook's working directory.
data_dir = "../data/"
# Absolute path of the "clean_data" subfolder located inside ``data_dir``.
clean_data_dir = os.path.join(os.path.abspath(data_dir), "clean_data")

In [2]:
def _partial_correlation_matrix(data):
    """Compute pairwise partial correlations between the numeric columns of ``data``.

    The partial correlation of two variables given all remaining ones is read
    off the precision matrix ``P`` (the inverse of the correlation matrix):
    ``rho_ij = -P_ij / sqrt(P_ii * P_jj)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, variables in columns. Non-numeric columns are
        ignored.

    Returns
    -------
    pandas.DataFrame
        Square, symmetric frame labelled by the numeric column names. The
        diagonal is 1. Rows/columns of variables that carry no usable
        information (all missing, or constant) are NaN.
    """
    numeric = data.select_dtypes(include="number")
    labels = numeric.columns
    partial = np.full((len(labels), len(labels)), np.nan)

    # Columns that are entirely missing would wipe out every row in the
    # complete-case filter below, so they are set aside first.
    has_data = numeric.notna().any(axis=0).to_numpy()
    candidates = np.flatnonzero(has_data)

    # Use one common set of complete rows for all pairs so that the
    # correlation matrix is internally consistent before it is inverted.
    corr = numeric.loc[:, has_data].dropna().corr().to_numpy(dtype=float)

    # A NaN on the diagonal marks a column without variance (or too few rows).
    usable = np.flatnonzero(np.isfinite(np.diag(corr)))
    if usable.size:
        # The pseudo-inverse keeps this well-defined for (near-)collinear inputs.
        # NOTE(review): for exactly collinear columns the values are not
        # statistically meaningful - inspect such inputs separately.
        precision = np.linalg.pinv(corr[np.ix_(usable, usable)])
        scale = np.sqrt(np.diag(precision))
        with np.errstate(divide="ignore", invalid="ignore"):
            block = -precision / np.outer(scale, scale)
        # Guard against values marginally outside [-1, 1] caused by rounding.
        block = np.clip(block, -1.0, 1.0)
        np.fill_diagonal(block, 1.0)
        target = candidates[usable]
        partial[np.ix_(target, target)] = block

    return pd.DataFrame(partial, index=labels, columns=labels)


def get_correlation_matrix(data):
    """Compute the partial correlation matrix of ``data`` and show it as a heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, variables in columns.

    Returns
    -------
    pandas.DataFrame
        The partial correlation matrix that was plotted (see
        ``_partial_correlation_matrix`` for how it is computed). Callers that
        only want the figure can ignore the return value.
    """
    partial_corr_matrix = _partial_correlation_matrix(data)

    # Plot the partial correlation matrix as a heatmap using Plotly
    fig = go.Figure(
        data=go.Heatmap(
            z=partial_corr_matrix.to_numpy(),
            x=list(partial_corr_matrix.columns),
            y=list(partial_corr_matrix.index),
            colorscale="RdBu",
            # Correlations live in [-1, 1]; pin the scale so 0 is the neutral colour.
            zmid=0,
            zmin=-1,
            zmax=1,
            hovertemplate="Variable 1: %{x}<br>Variable 2: %{y}<br>Partial Correlation: %{z:.2f}<extra></extra>",
        )
    )

    fig.update_layout(
        title="Partial Correlation Matrix of Hydrological Inflow Variables",
        xaxis_title="Variable 1",
        yaxis_title="Variable 2",
        xaxis=dict(side="top"),
        # Reverse the y axis so the matrix reads top-left to bottom-right.
        yaxis=dict(autorange="reversed"),
        width=800,
        height=800,
        margin=dict(t=100),
    )

    fig.show()
    return partial_corr_matrix

In [3]:
# Show the partial-correlation heatmap for every entry in the clean-data folder.
# NOTE(review): presumably ``Data`` uses its second argument as the name of the
# timestamp column - confirm against the ``Data`` class.
datetime = "Datetime"
# os.listdir returns entries in arbitrary order; sorting makes the sequence of
# figures reproducible across runs and machines.
for filename in sorted(os.listdir(clean_data_dir)):
    # Full path of the current file (``Data`` itself receives the bare filename).
    file_path = os.path.join(clean_data_dir, filename)
    d = Data(filename, datetime)
    data = d.get_data()
    get_correlation_matrix(data)

In [3]:
# NOTE(review): presumably the name of the timestamp column expected by ``Data`` - confirm.
datetime = "Datetime"
# Choose one file by its position in the directory listing.
# NOTE(review): os.listdir gives no ordering guarantee, so index 3 may refer to
# a different file on another machine; consider selecting the file by name.
filename = os.listdir(clean_data_dir)[3]
# Full path of the chosen file (``Data`` itself receives the bare filename).
file_path = os.path.join(clean_data_dir, filename)
d = Data(filename, datetime)

In [8]:
# Pull the "Flow_Kalltveit" entry out of the loaded data; the bare name on the
# last line makes the notebook render it as the cell's output.
target_value = d.get_data()["Flow_Kalltveit"]
target_value

Datetime
2018-11-04 21:00:00    2.68005
2018-11-04 22:00:00    2.68007
2018-11-04 23:00:00    2.64552
2018-11-05 00:00:00    2.65181
2018-11-05 01:00:00    2.63613
                        ...   
2020-09-22 04:00:00    2.55762
2020-09-22 05:00:00    2.54807
2020-09-22 06:00:00    2.54807
2020-09-22 07:00:00    2.54807
2020-09-22 08:00:00    2.52905
Name: Flow_Kalltveit, Length: 16500, dtype: float64

In [9]:
# Line plot of the selected series over its index.
# ``go`` (plotly.graph_objects) comes from the notebook's import cell, so it is
# not imported again here.

# Derive the labels from the series itself so this cell keeps working when a
# different column is selected as ``target_value``.
series_name = target_value.name

# Create a trace for the target values
trace = go.Scatter(
    x=target_value.index, y=target_value.values, mode="lines", name=series_name
)

# Create a layout
layout = go.Layout(
    title=f"{series_name} Time Series Data",
    xaxis=dict(title="Datetime"),
    yaxis=dict(title=f"{series_name} Value"),
)

# Create a Figure and add the trace to it
fig = go.Figure(data=[trace], layout=layout)

# Show the figure
fig.show()