In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import spearmanr
from itertools import combinations

In [26]:
def convert_df(path):
    dframe = pd.read_parquet(path)
    dframe.index = pd.to_datetime(dframe['datumEindeMeting'])
    dframe.drop(columns=['datumEindeMeting', 'datumBeginMeting'], inplace=True)
    dframe = dframe["hstWaarde"].astype(float)
    return dframe[:-1]

ammonium_df = convert_df("../../data/tank1/ammonium.parquet")
nitrate_df = convert_df("../../data/tank1/nitrate.parquet")
phosphate_df = convert_df("../../data/tank1/phosphate.parquet")
oxygen_df = convert_df("../../data/tank1/oxygen_a.parquet")
oxygen_df2 = convert_df("../../data/tank1/oxygen_b.parquet")
energy_df = convert_df("../../data/tank1/energy.parquet")
water_df = pd.read_csv("../../data/tank1/water.csv", delimiter=";")

# Preprocessing water data
water_df.index = pd.to_datetime(water_df['DateTime'], format='%d-%m-%Y %H:%M')
water_df['EDE_09902MTW_K100.MTW'] = water_df['EDE_09902MTW_K100.MTW'].str.replace(',', '.').replace('(null)', np.nan, regex=True).astype(float)
water_df['EDE_09902MTW_K100.MTW'] = water_df['EDE_09902MTW_K100.MTW'].interpolate()
water_df = water_df['EDE_09902MTW_K100.MTW']
water_df.index.name = None
water_df = water_df[water_df.index.isin(oxygen_df.index)]
water_df = water_df[~water_df.index.duplicated()]

# Synchronizing data
ammonium_df = ammonium_df[ammonium_df.index.isin(water_df.index)]
nitrate_df = nitrate_df[nitrate_df.index.isin(water_df.index)]
phosphate_df = phosphate_df[phosphate_df.index.isin(water_df.index)]
oxygen_df = oxygen_df[oxygen_df.index.isin(water_df.index)]
oxygen_df2 = oxygen_df2[oxygen_df2.index.isin(water_df.index)]
energy_df = energy_df[energy_df.index.isin(water_df.index)]

ammonium_df.name = "Ammonium"
nitrate_df.name = "Nitrate"
phosphate_df.name = "Phosphate"
oxygen_df.name = "Oxygen Sensor A"
oxygen_df2.name = "Oxygen Sensor B"
energy_df.name = "Energy"
water_df.name = "Water Flow"


In [28]:
'''
Spearman rank correlation
if p < 0.05 => significant correlation
'''

dataframes = [ammonium_df, nitrate_df, phosphate_df, oxygen_df, oxygen_df2, energy_df]

results = []
for df1, df2 in combinations(dataframes, 2):
    correlation, p_value = spearmanr(df1, df2)
    results.append({'DataFrame1': df1.name, 'DataFrame2': df2.name, 'Correlation': correlation, 'P-Value': p_value})

# Convert the results to a DataFrame for better visualization
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,DataFrame1,DataFrame2,Correlation,P-Value
0,Ammonium,Nitrate,0.255509,0.0
1,Ammonium,Phosphate,0.39399,0.0
2,Ammonium,Oxygen Sensor A,0.409609,0.0
3,Ammonium,Oxygen Sensor B,0.316948,0.0
4,Ammonium,Energy,-0.568094,0.0
5,Nitrate,Phosphate,-0.129561,0.0
6,Nitrate,Oxygen Sensor A,0.292268,0.0
7,Nitrate,Oxygen Sensor B,0.262069,0.0
8,Nitrate,Energy,-0.074864,0.0
9,Phosphate,Oxygen Sensor A,0.282068,0.0


In [34]:
hmap_df = results_df[['DataFrame1','DataFrame2','Correlation']]

unique_dfs = set(results_df['DataFrame1']).union(set(results_df['DataFrame2']))
unique_dfs = list(unique_dfs)
correlation_matrix = pd.DataFrame(index=unique_dfs, columns=unique_dfs)

# Populate the matrix with correlation values
for index, row in results_df.iterrows():
    df1, df2, corr = row['DataFrame1'], row['DataFrame2'], row['Correlation']
    correlation_matrix.at[df1, df2] = corr
    correlation_matrix.at[df2, df1] = corr

# Fill diagonal and NaN values
np.fill_diagonal(correlation_matrix.values, 1)
correlation_matrix = correlation_matrix.fillna(0)

# Create the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix.astype(float), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Spearman\'s Rank Correlation Heatmap')
plt.show()

ValueError: index cannot be a set