In [1]:
import sys
import os

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Make the sibling ``../src`` directory importable from this notebook.
# The path is resolved to an absolute one and appended only once, so re-running
# this cell does not keep growing sys.path.
# NOTE(review): presumably ``data.py`` (imported just below) lives in ../src — confirm.
module_path = os.path.abspath(os.path.join("../src"))
if module_path not in sys.path:
    sys.path.append(module_path)

from data import Data

In [2]:
def _partial_correlation_matrix(data):
    """Compute the partial correlation between every pair of numeric columns.

    The partial correlation of columns i and j is their correlation after the
    linear effect of all remaining columns has been removed.  It is derived
    from the precision matrix P (inverse of the correlation matrix) as
    ``-P[i, j] / sqrt(P[i, i] * P[j, j])``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, variables in columns.  Non-numeric columns are
        ignored and rows containing missing values are dropped so that every
        pair is estimated on the same set of observations.

    Returns
    -------
    pandas.DataFrame
        Symmetric matrix labelled by the numeric column names, with 1.0 on
        the diagonal.  Rows/columns of zero-variance variables are NaN because
        their correlation is undefined.
    """
    numeric = data.select_dtypes(include="number").dropna()
    corr = numeric.corr()
    labels = corr.columns
    corr_values = corr.to_numpy(dtype=float)
    partial = np.full(corr_values.shape, np.nan)

    # Constant columns have an undefined (NaN) correlation, including on the
    # diagonal; leave them out of the inversion and keep them as NaN.
    usable = ~np.isnan(np.diag(corr_values))
    if usable.any():
        block = corr_values[np.ix_(usable, usable)]
        # The pseudo-inverse keeps this well defined when variables are
        # (nearly) collinear and the correlation matrix is singular.
        precision = np.linalg.pinv(block)
        with np.errstate(divide="ignore", invalid="ignore"):
            scale = np.sqrt(np.diag(precision))
            block_partial = -precision / np.outer(scale, scale)
        np.fill_diagonal(block_partial, 1.0)
        # Guard against round-off pushing values marginally outside [-1, 1].
        partial[np.ix_(usable, usable)] = np.clip(block_partial, -1.0, 1.0)

    return pd.DataFrame(partial, index=labels, columns=labels)


def get_correlation_matrix(data):
    """Plot the partial correlation matrix of ``data`` as a Plotly heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, variables in columns.

    Returns
    -------
    pandas.DataFrame
        The partial correlation matrix that was plotted, so callers can
        inspect or reuse the numbers behind the figure.
    """
    partial_corr_matrix = _partial_correlation_matrix(data)

    # Plot the partial correlation matrix as a heatmap using Plotly
    fig = go.Figure(data=go.Heatmap(
        z=partial_corr_matrix.to_numpy(),
        x=list(partial_corr_matrix.columns),
        y=list(partial_corr_matrix.index),
        colorscale='RdBu',
        zmid=0,
        zmin=-1,
        zmax=1,
        hovertemplate='Variable 1: %{x}<br>Variable 2: %{y}<br>Partial Correlation: %{z:.2f}<extra></extra>'
    ))

    fig.update_layout(
        title='Partial Correlation Matrix of Hydrological Inflow Variables',
        xaxis_title='Variable 1',
        yaxis_title='Variable 2',
        xaxis=dict(side='top'),
        # Reverse the y axis so the matrix reads top-left to bottom-right.
        yaxis=dict(autorange='reversed'),
        width=800,
        height=800,
        margin=dict(t=100)
    )

    fig.show()
    return partial_corr_matrix

In [3]:
data_dir = "../data/"
clean_data_dir = os.path.abspath(os.path.join(data_dir, "clean_data"))

# Name of the timestamp column passed to Data; it is the same for every file,
# so it is set once instead of on every loop iteration.
datetime = "Datetime"

# Plot one partial-correlation heatmap per cleaned CSV.  Sorting makes the
# order of the figures reproducible (os.listdir returns entries in arbitrary
# order), and non-CSV entries such as checkpoint folders are skipped.
for filename in sorted(os.listdir(clean_data_dir)):
    if not filename.lower().endswith(".csv"):
        continue
    # Data receives the bare file name, not a full path.
    # NOTE(review): presumably Data resolves the directory itself — confirm.
    d = Data(filename, datetime)
    data = d.get_data()
    get_correlation_matrix(data)

In [4]:
# Build a Data wrapper for one specific cleaned file.
# The second argument names the timestamp column.
datetime = "Datetime"
filename = 'cleaned_data_1.csv'
d = Data(filename, datetime)

In [5]:
# Fetch the frame held by the Data wrapper; later cells read this ``data`` variable.
data = d.get_data()

In [37]:
# Display the loaded frame (rendered truncated, head and tail, in the output below).
data

Unnamed: 0_level_0,Wind_Speed_Nilsebu,Air_Temperature_Nilsebu,Wind_Direction_Nilsebu,Relative_Humidity_Nilsebu,Air_Temperature_Fister,Precipitation_Fister,Flow_Lyngsvatn_Overflow,Flow_Tapping,Water_Level_Kalltveit,Flow_Kalltveit,Water_Temperature_Kalltveit_Kum,Precipitation_Nilsebu,Flow_HBV,Precipitation_HBV,Temperature_HBV,Flow_Without_Tapping_Kalltveit,Flow_Lyngsaana,Water_Temperature_Lyngsaana
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-01-01 00:00:00,6.4,1.5,200.0,97.3,7.0,1.800000,0.0,0.000000,0.55625,3.37060,5.5600,0.0,1.784105,0.808318,1.887160,3.370600,1.102521,2.000000
2015-01-01 01:00:00,5.8,1.9,197.5,95.0,7.0,2.000000,0.0,0.000000,0.55925,3.45242,5.5600,0.0,1.804824,0.000000,2.416214,3.452420,1.109938,2.100000
2015-01-01 02:00:00,6.1,1.6,205.3,96.7,7.3,2.200000,0.0,0.000000,0.56325,3.56260,5.5950,1.0,1.883905,0.808318,1.987160,3.562600,1.218400,2.100000
2015-01-01 03:00:00,6.8,1.7,202.7,98.1,7.5,2.400000,0.0,0.000000,0.56976,3.74423,5.5600,0.0,1.901547,0.000000,2.216214,3.744230,1.371236,2.000000
2015-01-01 04:00:00,9.0,2.4,225.2,94.1,7.6,0.500000,0.0,0.000000,0.57976,4.02901,5.5600,1.0,2.018551,0.808318,2.787160,4.029010,1.501314,2.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-21 19:00:00,11.3,-1.0,106.3,84.7,6.7,0.000000,0.0,0.911868,0.51450,2.43457,4.0125,0.0,1.569434,0.000000,-0.447506,1.572981,0.464860,1.323438
2022-03-21 20:00:00,11.7,-0.4,105.8,84.7,5.7,0.000000,0.0,0.913824,0.51413,2.42514,4.0750,0.0,1.553322,0.000000,0.152494,1.556832,0.451835,1.264649
2022-03-21 21:00:00,9.5,0.4,143.2,80.8,6.3,0.000000,0.0,0.914003,0.51375,2.41576,4.0750,0.0,1.537725,0.000000,0.952494,1.541402,0.450595,1.244922
2022-03-21 22:00:00,8.7,0.5,145.8,78.0,5.2,1.099694,0.0,0.915423,0.51338,2.40640,4.0500,0.1,1.523938,0.000000,1.052494,1.526564,0.433061,1.252735


In [44]:
import pandas as pd
import numpy as np

def generate_synthetic_data(row_count, source=None, seed=None):
    """Generate a synthetic frame that mimics the per-column statistics of a source frame.

    Each column is sampled independently of the others:

    * numeric columns (any integer/float width, excluding booleans) are drawn
      from a normal distribution with the column's mean and standard deviation;
    * object/string columns are drawn from the observed values with their
      observed relative frequencies (missing values are ignored);
    * columns of any other dtype are skipped.

    The index is made of timestamps drawn uniformly between the earliest and
    latest timestamp of the source index, so it is neither sorted nor
    regularly spaced.  Because columns are sampled independently, relationships
    between columns and physical bounds (e.g. non-negative values) are not
    preserved.

    Parameters
    ----------
    row_count : int
        Number of synthetic rows to generate.
    source : pandas.DataFrame, optional
        Frame to imitate.  Defaults to the notebook-level ``data`` variable,
        which keeps existing single-argument calls working.
    seed : int or None, optional
        Seed for the random generator; pass a value for reproducible output.

    Returns
    -------
    pandas.DataFrame
        ``row_count`` rows with the sampled columns and a DatetimeIndex.

    Raises
    ------
    ValueError
        If the source frame has no rows.
    """
    if source is None:
        # Backward-compatible default: the frame loaded earlier in the notebook.
        source = data
    if len(source) == 0:
        raise ValueError("source frame is empty; nothing to base synthetic data on")

    rng = np.random.default_rng(seed)
    dtype_checks = pd.api.types

    columns = {}
    for name in source.columns:
        col = source[name]
        dtype = col.dtype
        if dtype_checks.is_numeric_dtype(dtype) and not dtype_checks.is_bool_dtype(dtype):
            mean = col.mean()
            std = col.std()
            if pd.isna(mean):
                # Column has no observed values: nothing to estimate from.
                columns[name] = np.full(row_count, np.nan)
            else:
                # std is NaN when there is a single observation; fall back to
                # a constant column instead of propagating NaN.
                columns[name] = rng.normal(mean, 0.0 if pd.isna(std) else std, row_count)
        elif dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype):
            frequencies = col.dropna().value_counts(normalize=True)
            if frequencies.empty:
                columns[name] = np.full(row_count, None, dtype=object)
            else:
                columns[name] = rng.choice(
                    frequencies.index.to_numpy(),
                    size=row_count,
                    p=frequencies.to_numpy(),
                )
        # Any other dtype (booleans, datetimes, ...) is intentionally skipped.

    # Draw timestamps uniformly over the time span covered by the source index.
    timestamps = pd.to_datetime(source.index)
    start = timestamps.min()
    span_seconds = (timestamps.max() - start).total_seconds()
    offsets = rng.uniform(0, span_seconds, row_count)
    synthetic_index = start + pd.to_timedelta(offsets, unit="s")

    return pd.DataFrame(columns, index=synthetic_index)


In [49]:
# Number of synthetic rows to draw for the small sample set.
row_count = 1000
# The sampling is random, so the values differ on every run of this cell.
small_synthetic_data = generate_synthetic_data(row_count)

In [50]:
# Display the generated frame (rendered truncated in the output below).
small_synthetic_data

Unnamed: 0,Wind_Speed_Nilsebu,Air_Temperature_Nilsebu,Wind_Direction_Nilsebu,Relative_Humidity_Nilsebu,Air_Temperature_Fister,Precipitation_Fister,Flow_Lyngsvatn_Overflow,Flow_Tapping,Water_Level_Kalltveit,Flow_Kalltveit,Water_Temperature_Kalltveit_Kum,Precipitation_Nilsebu,Flow_HBV,Precipitation_HBV,Temperature_HBV,Flow_Without_Tapping_Kalltveit,Flow_Lyngsaana,Water_Temperature_Lyngsaana
2015-06-02 09:57:46.230879392,2.167470,-7.073374,247.675330,88.469022,7.341071,-0.464548,-0.510538,0.990881,0.538521,10.248347,8.419428,1.911861,16.835565,0.074002,4.903764,-7.201880,3.020403,12.637145
2018-11-01 23:07:08.066906096,3.391857,9.129103,218.616704,114.943670,4.471128,0.264441,0.301172,1.432227,0.609935,8.275816,6.516736,0.895657,6.914515,0.254637,0.236893,5.581154,4.953459,0.064799
2020-10-14 19:33:33.315285440,8.767340,-2.613183,188.403666,114.244197,27.791205,-0.417579,0.054702,0.378859,0.692034,-1.142199,4.559478,-0.578632,4.145863,0.392939,2.777033,12.314991,4.643666,-2.405215
2019-09-29 20:25:17.362266048,8.603707,-0.631860,267.342298,47.476763,12.452775,0.487869,0.986714,1.597552,0.751264,3.686866,4.917825,1.804098,1.103034,0.203196,5.717029,9.912102,-1.193943,2.882928
2021-12-29 03:18:36.461018400,8.906922,-0.673370,423.684125,68.872009,6.328068,1.662695,-0.863025,-0.264181,0.475234,-4.899517,7.166442,0.969351,7.682585,1.280458,11.770602,4.238953,4.239994,8.053818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-10-01 00:12:52.683059296,9.085889,10.200341,213.664608,72.049505,8.420769,-0.077373,-0.351113,-0.256065,0.527899,-1.094730,2.296041,-0.271526,1.505380,-1.050806,6.981974,0.463076,2.446150,1.894370
2020-05-19 20:34:21.754452928,6.969131,9.330294,274.913119,50.799326,4.293525,-0.034452,-0.286173,1.143258,0.887972,22.917619,3.072311,-0.171966,12.318057,0.609014,-13.683937,4.167981,3.131605,5.900053
2020-08-05 01:13:29.318682880,5.565166,-3.563099,148.024270,0.957644,10.685532,-0.092879,0.956385,2.422047,0.833507,9.141670,4.476774,-0.234747,10.125643,0.303080,8.344628,-3.411535,-1.399706,3.135298
2020-03-24 02:42:15.409936928,8.157384,-3.815964,326.433433,46.478175,5.123326,0.300603,1.412409,1.081604,0.669195,-7.418127,7.695077,-0.357075,3.274271,0.085809,10.633915,-3.742041,3.346650,1.981238


In [51]:
# Move the datetime index into a regular "Datetime" column so that it is
# written to the file even though the positional index is not exported.
# rename_axis is used so the column is named correctly whatever the index name.
small_synthetic_data_reset = small_synthetic_data.rename_axis('Datetime').reset_index()
csv_format_options = {
    'date_format': '%Y-%m-%d %H:%M:%S',
    'float_format': '%.2f',
    'index': False,
    'header': True,
    'sep': ','
}
# Write the reset frame: exporting the original frame with index=False would
# silently drop the timestamps from the CSV.
small_synthetic_data_reset.to_csv("../data/small_synthetic_data.csv", **csv_format_options)