# Analyse correlations between PlanetScope and Sentinel-2

Load and analyse the correlation coefficients between PlanetScope (PS) and Sentinel-2 (S2) imagery, broken down by the following variables:

* Planet satellite
* Land use
* Area

## Prepare libraries and plotting environment

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plot inline
%matplotlib inline
# Define figure size
plt.rcParams['figure.figsize'] = (10, 8)

## Load data

In [None]:
# Per-area correlation CSV files, keyed by a numeric area id starting at 1
coor_fn_list = dict(
    enumerate(
        [
            "./data/Izola_coeff.csv",
            "./data/Jesenice_coeff.csv",
            "./data/Kranj_coeff.csv",
            "./data/Radenci_coeff.csv",
        ],
        start=1,
    )
)

# Output path of the combined (all areas) table
corr_df_fn = "./data/corr_df.csv"

In [None]:
# Read every per-area CSV, tag its rows with the numeric area id,
# and stack everything into one table
li = []
for area, csv_path in coor_fn_list.items():
    df = pd.read_csv(
        csv_path,
        index_col=0,
        dtype={"band": "int"},
        parse_dates=["PS_Date", "S2_Date"],
    )
    # Remember which area each row came from
    df["Area"] = area
    li.append(df)
# ignore_index=True replaces the per-file indices with a fresh 0..n-1 index
corr_df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the combined table
corr_df.head()

In [None]:
# Inspect column dtypes (e.g. to check that PS_Date / S2_Date were parsed as dates)
corr_df.dtypes

In [None]:
# Split PS_filename at "_" and store the third field (index 2) as the
# categorical column PS_Sat
corr_df["PS_Sat"] = (
    corr_df["PS_filename"].str.split("_").str[2].astype("category")
)

In [None]:
# Split S2_filename at "_" and store the second field (index 1) as the
# categorical column S2_Sat
# NOTE(review): confirm that index 1 is the satellite field for these filenames
corr_df["S2_Sat"] = (
    corr_df["S2_filename"].str.split("_").str[1].astype("category")
)

In [None]:
# Preview the table again, now including the PS_Sat and S2_Sat columns
corr_df.head()

In [None]:
# Save the combined dataframe (all areas, including the derived satellite
# columns) to the aggregated CSV file
corr_df.to_csv(corr_df_fn)

### Analyse number of PS and S2 images

Analyse number of PS and S2 images, and satellites used for the analysis.

In [None]:
# Print the number of rows in the combined dataframe
# NOTE(review): the table has a "band" column, so rows look like one entry per
# band per image pair rather than one per pair — confirm the wording below
print(f'In total {len(corr_df)} image pairs are used.')

In [None]:
# Count the distinct values of PS_Sat
print(f'Number of unique PS satellites: {len(corr_df["PS_Sat"].unique())}')

In [None]:
# Count the distinct values of S2_Sat
print(f'Number of unique S2 satellites: {len(corr_df["S2_Sat"].unique())}')

In [None]:
# Count the non-null m values per PS_Sat.
# Selecting "m" before counting avoids computing counts for every other
# column only to discard them. PS_Sat is categorical, so observed=False is
# passed explicitly to keep the long-standing behaviour on newer pandas.
ps_sat_count = corr_df.groupby("PS_Sat", observed=False)["m"].count()

In [None]:
# Sort descending so the satellites with the most rows come first
ps_sat_count = ps_sat_count.sort_values(ascending=False)

In [None]:
# Show the satellites with the highest counts
ps_sat_count.head()

In [None]:
# Bar chart of the per-satellite counts; `order` keeps the bars in the
# (sorted) order of ps_sat_count
sat_ids = ps_sat_count.index
sns.barplot(x=sat_ids, y=ps_sat_count.values, order=sat_ids)

In [None]:
# Step plot of the per-satellite counts
count_ax = ps_sat_count.plot(drawstyle="steps-post")
count_ax.set_title("Number of PS images per satellite")
plt.show()

In [None]:
# Plot histogram of S2_Sat
sns.histplot(corr_df["S2_Sat"])

## Correlation analysis

Use only selected columns for analysis.

In [None]:
# Keep only the columns needed below: Area, PS_Date, band, PS_Sat and m
analysis_cols = ["Area", "PS_Date", "band", "PS_Sat", "m"]
corr_sub_df = corr_df[analysis_cols]

In [None]:
# Preview the reduced table
corr_sub_df.head()

### Plot correlation coefficients by time and band

In [None]:
# One line per band: weekly mean of m, smoothed with a 7-point rolling mean
for band in corr_sub_df["band"].unique():
    # Select only the rows of this band
    band_df = corr_sub_df[corr_sub_df["band"] == band]
    # Aggregate by week. Only "m" is averaged: the frame also holds the
    # categorical PS_Sat column, and averaging the whole frame raises a
    # TypeError on pandas >= 2.0.
    band_df_agg = band_df.groupby(pd.Grouper(key="PS_Date", freq="W"))[["m"]].mean()
    # Drop weeks without data before smoothing over 7 weekly values
    band_df_agg['m'].dropna().rolling(window=7).mean().plot(label=f"Band {band}")
plt.legend()
plt.title("Correlation coefficients by time and band by week")
plt.show()

In [None]:
# One line per band: quarterly mean of m
for band in corr_sub_df["band"].unique():
    # Select only the rows of this band
    band_df = corr_sub_df[corr_sub_df["band"] == band]
    # Aggregate by quarter. Only "m" is averaged: the frame also holds the
    # categorical PS_Sat column, and averaging the whole frame raises a
    # TypeError on pandas >= 2.0.
    band_df_agg = band_df.groupby(pd.Grouper(key="PS_Date", freq="Q"))[["m"]].mean()
    band_df_agg['m'].plot(label=f"Band {band}")
plt.legend()
plt.title("Correlation coefficients by time and band by quarter")
plt.show()

### Plot correlation coefficients by area, time and band

In [None]:
# For every area, draw m against PS_Date with one line per band
for area in np.unique(corr_sub_df["Area"]):
    # Rows of this area only; the Area column is no longer needed
    in_area = corr_sub_df["Area"] == area
    area_df = corr_sub_df.loc[in_area, :].drop(columns=["Area"])
    # Wide layout: one row per PS_Date, one column per band
    df = area_df.pivot_table(index=['PS_Date'], columns='band', values="m")
    # One figure per area
    fig, ax = plt.subplots(figsize=(10, 10))
    df.plot(ax=ax, alpha=0.5, marker='o')
    ax.set(title=f"Area {area}", xlabel="PS_Date", ylabel="m")

### Smooth data

In [None]:
# For every area, draw the smoothed m against PS_Date with one line per band
for area in np.unique(corr_sub_df["Area"]):
    # Rows of this area only; the Area column is no longer needed
    area_mask = corr_sub_df["Area"] == area
    area_df = corr_sub_df.loc[area_mask, :].drop(columns=["Area"])
    # Wide layout: one row per PS_Date, one column per band
    df = area_df.pivot_table(index=['PS_Date'], columns='band', values="m")
    # Rolling mean over 21 consecutive rows (dates) of the wide table
    df_smooth = df.rolling(window=21).mean()
    # One figure per area
    fig, ax = plt.subplots(figsize=(10, 10))
    df_smooth.plot(ax=ax, alpha=0.5, marker='o')
    ax.set(title=f"Area {area}", xlabel="PS_Date", ylabel="m")

### Mean m by band

In [None]:
# Reminder of the columns available for the per-band statistics
corr_sub_df.head()

In [None]:
# Compute the mean per band over all areas.
# Only the numeric columns are averaged: corr_sub_df also holds the
# categorical PS_Sat column, and averaging the whole frame raises a TypeError
# on pandas >= 2.0. Area is kept in the result for now.
mean_m_by_band = corr_sub_df.groupby("band")[["Area", "m"]].mean()

In [None]:
# Drop the Area column: Area holds numeric area ids, so its mean is meaningless
mean_m_by_band = mean_m_by_band.drop(columns=["Area"])

In [None]:
# Show the per-band means
mean_m_by_band.head()

In [None]:
# Line plot of the mean m for each band
fig, ax = plt.subplots(figsize=(10, 10))
mean_m_by_band.plot(ax=ax)
ax.set(title="Mean m by band", xlabel="Band", ylabel="m")
# Hide the legend
band_legend = ax.legend()
band_legend.set_visible(False)

In [None]:
# Violin plot of the distribution of m for each band
sns.violinplot(data=corr_sub_df, x="band", y="m")

### Statistics of m by area and band

In [None]:
# Number of rows per area
fig, ax = plt.subplots()
ax = sns.countplot(data=corr_sub_df, x="Area")
ax.set(xlabel="Area", ylabel="Count")
# Replace the numeric area ids on the x-axis with the area names
area_labels = ["Izola", "Jesenice", "Kranj", "Radenci"]
plt.xticks(list(range(4)), area_labels)
plt.show()

In [None]:
# Reminder of the columns available for the per-area statistics
corr_sub_df.head()

In [None]:
# Compute the mean m for every (band, Area) combination.
# Only "m" is averaged: corr_sub_df also holds the categorical PS_Sat column,
# and averaging the whole frame raises a TypeError on pandas >= 2.0.
mean_m_by_area_band = corr_sub_df.groupby(["band", "Area"])[["m"]].mean()

In [None]:
# Show the first (band, Area) means
mean_m_by_area_band.head()

In [None]:
# To wide: unstack() without arguments moves the innermost index level
# (Area, the last groupby key) into the columns, leaving band as the row index
mean_m_by_area_band_wide = mean_m_by_area_band.unstack()

In [None]:
# Show the wide table (rows: band, columns: one per area)
mean_m_by_area_band_wide.head()

In [None]:
# Plot mean m by band, one line per area.
# mean_m_by_area_band_wide is indexed by band and has one column per area, so
# the x-axis is the band. It must not be labelled with area names; the areas
# are identified in the legend instead.
area_names = {1: "Izola", 2: "Jesenice", 3: "Kranj", 4: "Radenci"}
fig, ax = plt.subplots(figsize=(10, 10))
mean_m_by_area_band_wide.plot(ax=ax)
ax.set_title("Mean m by band and area")
ax.set_xlabel("Band")
ax.set_ylabel("m")
# One tick per band value
ax.set_xticks(list(mean_m_by_area_band_wide.index))
# Name each line after its area (innermost column level holds the area id);
# unknown ids fall back to the id itself
area_ids = mean_m_by_area_band_wide.columns.get_level_values(-1)
ax.legend(
    ax.get_lines(),
    [area_names.get(area_id, area_id) for area_id in area_ids],
    title="Area",
)

In [None]:
# Violin plot of the distribution of m for each band, split by area via hue
sns.violinplot(data=corr_sub_df, x="band", y="m", hue="Area")