# Correlation Analysis

- Matrix Correlation
- Cross-Correlation

In [None]:
# Assumes the 'raw' DataFrame is defined in an earlier cell.
# Plots the pairwise correlation matrix of its columns as a lower-triangle heatmap.

corr = raw.corr()

# Boolean mask of the upper triangle (diagonal included). Those cells only
# mirror the lower triangle, so they are blanked out before plotting.
mask = np.triu(np.ones_like(corr, dtype=bool))
lower = corr.mask(mask)

# Pre-format the cell labels so hidden (and undefined) cells stay empty
cell_text = [
    ['' if np.isnan(value) else f'{value:.2f}' for value in row]
    for row in lower.values
]

# Create the heatmap trace
heatmap = go.Heatmap(
    z=lower.values,
    x=corr.columns.tolist(),
    y=corr.index.tolist(),
    # diverging red/blue scale, pinned to [-1, 1] through zmin/zmax below
    colorscale='RdBu',
    #colorscale='RdYlGn',
    # add the (pre-formatted) correlation value in each visible cell
    text=cell_text,
    texttemplate='%{text}',
    hovertemplate='Correlation: %{z:.2f}<br>%{y}<br>%{x}<extra></extra>',
    # masked cells are gaps; do not show a hover label for them
    hoverongaps=False,
    showscale=True,
    reversescale=False,
    zmin=-1,
    zmax=1,
    xgap=1,
    ygap=1,
)

# Create the layout
layout = go.Layout(
    #title='Correlation Heatmap',
    xaxis=dict(tickangle=-45),  # Rotate x-axis labels
    yaxis=dict(autorange='reversed'),  # Invert the y-axis
    height=900,
    width=900,
)

# Create the figure
fig = go.Figure(data=[heatmap], layout=layout)

# Show the plot
fig.show()

- Some level sensors correlate completely, but they measure the same thing
    - i.e. one increases and the other increases proportionally
- Position correlates negatively, because the gate is typically locked/low during heavy rain, e.g. especially for its own basin
- Rain gauges don't have so much correlation
    - maybe band would be interesting to investigate?

- Does it make sense to model based on close/clustered sensors?

In [None]:
# Flag the rows where both level sensors report exactly the same value
similar = raw['G80F11B_Level1'].eq(raw['G80F11B_Level2'])
print(f"Number of similar values: {similar.sum()} out of {similar.size}")

Number of similar values: 523597 out of 1052641


In [None]:
# Mean absolute difference between the two level sensors
mae = (raw['G80F11B_Level1'] - raw['G80F11B_Level2']).abs().mean()
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 0.004355312479119977


#### Cross-Correlation

In [None]:


def remove_center_zeros(df, base_variable, n=5):
    """Drop the rows in the middle of long non-positive runs of a column.

    A row is kept when ``df[base_variable]`` is greater than zero on that
    row, or when such a positive value occurs within ``n`` rows before or
    after it. All remaining rows (the "center" of a long run of zeros) are
    removed. Nothing is filled in: the surviving rows keep their original
    values and index labels.

    Args:
        df: DataFrame to filter.
        base_variable: Name of the column whose positive values decide
            which rows are kept.
        n: Number of rows to keep on each side of a positive value.
            ``0`` keeps only the positive rows; ``None`` places no limit
            on the distance.

    Returns:
        The filtered DataFrame (a row subset of ``df``).
    """
    positive = df[base_variable] > 0
    if n == 0:
        # ffill/bfill reject limit=0; no padding simply means "positive rows only"
        return df[positive]

    # 1.0 where the base variable is positive, NaN elsewhere, so that the
    # limited forward/backward fills can spread the "keep" marker to neighbours.
    marked = positive.astype(float).where(positive)
    nearby = marked.ffill(limit=n).bfill(limit=n).notna()
    return df[nearby]


# Function to compute cross-correlation for lags and plot them
def plot_cross_correlation(df, base_variable, lag_range=20, width=900, height=450):
    """Plot the lagged correlation of one column against every other column.

    For each lag in ``[-lag_range, lag_range]`` the base column is correlated
    with every other column shifted by that lag (``Series.shift``), and one
    line per column is drawn. Rows lying in long non-positive stretches of
    the base column are removed first via ``remove_center_zeros`` (with a
    window of ``2 * lag_range`` rows).

    Each line is shifted upwards by ``0.003 * column_position`` so that
    overlapping curves stay distinguishable; the hover label reports the
    unshifted correlation.

    Args:
        df: DataFrame holding the base column and the columns to compare.
        base_variable: Name of the column every other column is compared to.
        lag_range: Largest lag (in rows) evaluated in both directions.
        width: Figure width in pixels.
        height: Figure height in pixels.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    variables = df.columns

    # remove center zeros
    df = remove_center_zeros(df, base_variable, n=lag_range*2)

    # One colour per column, sampled from matplotlib's 'tab20' colormap
    rgb_rows = plt.get_cmap('tab20')(np.linspace(0, 1, len(variables)))[:, :3]
    colors = [f'rgba({int(r*255)},{int(g*255)},{int(b*255)},1)' for r, g, b in rgb_rows]

    lags = list(range(-lag_range, lag_range + 1))
    base_series = df[base_variable]

    # Initialize an empty Plotly figure
    fig = go.Figure()

    plotted = []
    for i, variable in enumerate(variables):
        if variable == base_variable:
            continue
        correlations = np.array(
            [base_series.corr(df[variable].shift(lag)) for lag in lags],
            dtype=float,
        )
        # small per-column offset to separate otherwise overlapping lines
        shifted = correlations + 0.003 * i
        fig.add_trace(go.Scatter(
            x=lags,
            y=shifted,
            # keep the true values around so the hover label is not distorted
            customdata=correlations,
            mode='lines',
            name=f'{variable}',
            line=dict(color=colors[i], width=2),
            showlegend=True,
            opacity=1,
            text=[variable]*len(lags),
            hovertemplate='%{text}<br>Lag: %{x}<br>Correlation: %{customdata:.2f}<extra></extra>'
        ))
        plotted.append(shifted)

    # y-axis range from the finite plotted values; NaN correlations (e.g. a
    # constant column) and the "no other column" case fall back gracefully.
    finite = np.concatenate(plotted) if plotted else np.array([])
    finite = finite[np.isfinite(finite)]
    if finite.size:
        y_low = float(finite.min()) - 0.05
        y_high = float(finite.max()) + 0.05
    else:
        y_low, y_high = -1.05, 1.05

    # Add horizontal and vertical reference lines
    fig.add_shape(type='line', x0=-lag_range, x1=lag_range, y0=0, y1=0,
                line=dict(color="Black", width=1))
    # the vertical line spans the correlation axis, not the raw data range
    fig.add_shape(type='line', x0=0, x1=0, y0=y_low, y1=y_high,
                line=dict(color="Black", width=1))

    # Update layout for titles, axes labels, and grid
    fig.update_layout(
        #title=f'Cross-Correlation of {base_variable}',
        xaxis_title='Lags',
        yaxis_title='Correlation Coefficient',
        legend=dict(x=1.05, y=1),  # Moves legend outside the plot area
        width=width, height=height,
        margin=dict(l=40, r=200, t=40, b=40),  # Adjust to accommodate legend
        plot_bgcolor='white'
    )

    # Add grid lines
    fig.update_xaxes(showgrid=True, gridcolor='LightGray')
    fig.update_yaxes(showgrid=True, gridcolor='LightGray', range=[y_low, y_high])

    # Show the figure
    fig.show()

In [None]:
# Sanity check: with n=2 only the single row in the very middle of the
# five-row False run should be dropped.
test = pd.DataFrame({'test': [True, True, False, False, False, False, False, True, True]})
remove_center_zeros(df=test, base_variable='test', n=2)

Unnamed: 0,test
0,True
1,True
2,False
3,False
5,False
6,False
7,True
8,True


#### Cross-Correlation with Rain

If a variable is constant, its cross-correlation line flattens, since there is no variation to correlate with
- thus, we remove rows where the base variable stays at zero for longer than the lags we are checking

Interpretation:

- Peak at Lag 3: This indicates that the second time series is significantly correlated with the first time series when shifted three time units (e.g., days, weeks, months) ahead.
    - pos: second leads the first
    - neg: first leads the second
- Negative Peak: A negative peak suggests an inverse relationship. When one series increases, the other decreases.


In [None]:
# Display the contents of `rain_gauges` (defined outside this excerpt)
rain_gauges

['5425', '5427']

In [None]:
# Cross-correlate every column of `raw` against rain gauge 5425, up to 120 lags each way
base_variable = '5425'
plot_cross_correlation(df=raw, base_variable=base_variable, lag_range=120)

- 5425 precedes 5427, general flow of weather follows this trend?
- 5425 precedes all

- explain modelling time window
- rain rolling window with cumsum

- single series -> cross corr behaviour
- multi, 1 rain, or 1 rain for each sensor?

In [None]:
# Cross-correlate every column of `raw` against rain gauge 5427, up to 120 lags each way
base_variable = '5427'
plot_cross_correlation(df=raw, base_variable=base_variable, lag_range=120)

- So 5425 precedes 5427, but the other variables do not precede 5427

#### Cross-Correlation: Descending downstream

In [None]:
# for i, structure in enumerate(natural_structure_order):
#     print(structure)
#     columns = list(raw.columns)
#     structure_vars = [col for col in columns if structure in col]
#     structure_idxs = [columns.index(var) for var in structure_vars]
#     min_index = min(structure_idxs)
#     for i, structure_var in enumerate(structure_vars):
#         print(structure_var)
#         plot_cross_correlation(raw.iloc[:, min_index:], structure_var, lag_range=120, width=900, height=300)


- Nothing interesting

#### Cross-Correlation: circular relationships

In [None]:
# circular_relationship = ['G80F11B', 'G80F66Y']
# cricular_relationship_vars = [col for col in columns if any([structure in col for structure in circular_relationship])]
# for base_variable in cricular_relationship_vars:
#     print(base_variable)
#     plot_cross_correlation(raw[cricular_relationship_vars], base_variable, lag_range=120)


- No interesting relation, just similar, and relation weakens for higher lags.
- All have a soft normal distribution shape

In [None]:
# Structures suspected of influencing each other; pick every column whose
# name contains one of these structure ids and cross-correlate them pairwise.
circular_relationship = ['G71F06R', 'G71F68Y']
cricular_relationship_vars = [
    col
    for col in columns
    if any(structure in col for structure in circular_relationship)
]
for base_variable in cricular_relationship_vars:
    print(base_variable)
    plot_cross_correlation(
        df=raw[cricular_relationship_vars],
        base_variable=base_variable,
        lag_range=120,
    )

G71F06R_LevelInlet


G71F68Y_LevelPS


G71F68Yp1


- Increased flow from pumping -> increased level, how come?

#### Cross-Correlation with storage pipe

In [None]:
# Columns of the structures around the storage pipe, plus both rain gauges
storage_pipe_relation = ['G71F04R', 'G71F05R', 'G71F06R', 'G71F68Y']
storage_pipe_relation_vars = [
    col
    for col in columns
    if any(structure in col for structure in storage_pipe_relation)
]
storage_pipe_relation_vars.extend(['5425', '5427'])

# Each G71F68Y column in turn serves as the base variable
storage_pipe_vars = [col for col in columns if 'G71F68Y' in col]

for base_variable in storage_pipe_vars:
    print(base_variable)
    plot_cross_correlation(
        df=raw[storage_pipe_relation_vars],
        base_variable=base_variable,
        lag_range=120,
    )

G71F68Y_LevelPS


G71F68Yp1


- ?

### TODO:
- auto correlation as well?
