In [140]:
from greensight.utils import DIR_DATA, DIR_ROOT
import pandas as pd
import numpy as np
import json
from typing import Callable, Union
from pathlib import Path
import re
from datetime import datetime
from tqdm.notebook import tqdm
from itertools import product

In [141]:
# Read the JSON lookup table into memory; entries are keyed by shape code
# (e.g. "E07000197") and are read further down via their "GB_Name" field.
lookup_path = DIR_ROOT / "resources/id_lookup/id_lookup.json"
with open(lookup_path, mode="r") as lookup_file:
    D_lookup = json.load(lookup_file)

In [151]:

def load_sentinel_one_data_from_path(path: Union[str, Path], condition: Callable) -> pd.DataFrame:
    """
    Load the Sentinel-1 CSV files of one year directory into a single DataFrame.

    Every file in ``path`` for which ``condition(file)`` is truthy is read with
    ``pd.read_csv`` and the results are concatenated row-wise. Columns named
    ``<month>_<band>`` (numeric prefix before the first underscore) are
    reshaped so that the result has one row per month and one column per
    (greenbelt, shape, band) combination.

    Parameters
    ----------
    path : str or Path
        Directory holding the CSV files of a single year. The year is taken
        from the first four-digit run in the directory name (falling back to
        the full path).
    condition : Callable
        Predicate called with each entry of ``path``; only entries for which
        it returns a truthy value are loaded.

    Returns
    -------
    pd.DataFrame
        Index: first day of each month (named ``"date"``).
        Columns: MultiIndex with levels ``("greenbelt", "shape", "band")``.

    Raises
    ------
    AssertionError
        If ``path`` is not a directory.
    ValueError
        If no four-digit year can be found in ``path`` or if no file in
        ``path`` satisfies ``condition``.
    KeyError
        If an expected column (``LAD_CD`` or a month/band combination) or a
        shape code in the lookup table is missing.
    """
    path = Path(path)
    assert path.is_dir(), f"{path} is not a directory"

    # Extract the year. Look at the directory's own name first so that a
    # four-digit run in a parent directory cannot shadow it.
    match = re.search(r"\d{4}", path.name) or re.search(r"\d{4}", str(path))
    if match is None:
        raise ValueError(f"could not find a four-digit year in path: {path}")
    year = int(match.group(0))

    # get all files which meet the stipulated condition
    files = sorted(file for file in path.iterdir() if condition(file))
    if not files:
        raise ValueError(f"no files in {path} satisfy the given condition")

    # load and concatenate
    df = pd.concat([pd.read_csv(file) for file in files], axis=0)

    # columns of interest look like "<month>_<band>"
    month_band_cols = [
        col
        for col in df.columns.unique()
        if "_" in col and col.split("_")[0].isnumeric()
    ]

    # unique band identifiers, in alphabetical order
    band_inds = sorted({col.split("_")[1] for col in month_band_cols})

    # unique month identifiers, ordered numerically: a plain string sort
    # would put an unpadded "10" before "2" and mislabel the rows below
    month_inds = sorted({col.split("_")[0] for col in month_band_cols}, key=int)

    # index by shape code; all other non month/band columns are simply not
    # selected below, so no explicit drop is required
    df = df.set_index("LAD_CD")

    row_months = []
    for position, month in enumerate(month_inds):
        # select this month's columns by name, so any number of bands works
        month_cols = [f"{month}_{band}" for band in band_inds]

        # NOTE(review): gaps are filled with 0.0 as before; a 0.0 is then
        # indistinguishable from a real measurement - confirm this is intended.
        row = df[month_cols].fillna(0.0).stack().to_frame().T

        # (shape, "<month>_<band>") -> (shape, band)
        row.columns = pd.MultiIndex.from_tuples(
            [(code, col.split("_")[1]) for code, col in row.columns]
        )

        # Months are numbered by their 1-based position in numeric order.
        # This assumes the labels form a contiguous run starting at the
        # first month of the year - TODO confirm against the export script.
        row.index = [position + 1]
        row_months.append(row)

    df_out = pd.concat(row_months, axis=0)

    # Build the date index and name it in one step; assigning a new index
    # after setting ``index.name`` would silently discard the name.
    df_out.index = pd.DatetimeIndex(
        [datetime(year, int(month), 1) for month in df_out.index], name="date"
    )
    df_out.columns.names = ("shape", "band")

    # remove duplicates, replacing them with their mean - this should not be
    # needed, but is due to a mistake in the data
    df_out = df_out.T.groupby(level=[0, 1]).mean().T

    # add greenbelt information from json dict.
    lookup_path = DIR_ROOT / "resources/id_lookup/id_lookup.json"
    with open(lookup_path, "r") as in_file:
        lookup = json.load(in_file)
    greenbelts = [lookup[code]["GB_Name"] for code, _band in df_out.columns]

    # add greenbelts to column MultiIndex
    df_out.columns = pd.MultiIndex.from_tuples(
        [(gb, *shape_band) for gb, shape_band in zip(greenbelts, df_out.columns)],
        names=("greenbelt", "shape", "band"),
    )

    return df_out

In [144]:
# Directory whose entries are loaded one by one (one DataFrame each) below.
sentinel_data_path = DIR_DATA / "sentinel_data"
assert sentinel_data_path.is_dir(), f"missing data directory: {sentinel_data_path}"

In [152]:
# load data for each year
def condition(path: Union[str, Path]) -> bool:
    """
    Define condition for file in data dir to be included.

    A file qualifies when its stem (file name without the final suffix)
    contains both "mean" and "S1". Only the stem is inspected, so matching
    text in parent directory names is ignored.

    Parameters
    ----------
    path : str or Path
        File path to test; strings are converted to ``Path`` first.

    Returns
    -------
    bool
        True if the file should be loaded.
    """
    stem = Path(path).stem
    return "mean" in stem and "S1" in stem

# Path.iterdir() yields entries in arbitrary order, so sort them to get a
# deterministic (chronological, if directories are named by year) load order,
# and skip stray files that would trip the loader's directory check.
year_dirs = sorted(entry for entry in sentinel_data_path.iterdir() if entry.is_dir())

dfs = []
for path in tqdm(year_dirs):
    df = load_sentinel_one_data_from_path(path, condition=condition)
    dfs.append(df)

  0%|          | 0/9 [00:00<?, ?it/s]

In [153]:
# Stack the per-year frames on top of each other, then normalise the
# greenbelt level of the column index: hyphens become spaces and commas are
# removed (e.g. "Stoke-on-Trent" -> "Stoke on Trent").
df_comb = pd.concat(dfs, axis=0)
df_comb.columns = pd.MultiIndex.from_tuples(
    [
        (greenbelt.replace("-", " ").replace(",", ""), shape, band)
        for greenbelt, shape, band in df_comb.columns
    ],
    names=("greenbelt", "shape", "band"),
)
df_comb.head(5)

greenbelt,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,...,London,London,London,London,London,London,London,London,London,London
shape,E06000006,E06000006,E06000006,E06000006,E06000007,E06000007,E06000007,E06000007,E06000008,E06000008,...,E09000027,E09000027,E09000029,E09000029,E09000029,E09000029,E09000031,E09000031,E09000031,E09000031
band,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,...,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes
2016-01-01,-21.880295,-16.991904,-15.775903,-9.790099,-16.071707,-16.330632,-9.220326,-9.751699,-16.65234,-16.737056,...,-8.756936,-10.676782,-16.77137,-17.255673,-10.597951,-11.7237,-16.742193,-17.17476,-10.834776,-11.552687
2016-02-01,-20.987328,-17.073966,-14.334226,-9.872628,-17.203276,-16.601356,-10.011234,-9.807394,-18.008646,-17.417576,...,-9.768301,-11.732255,-17.869552,-18.684054,-11.592459,-12.890625,-17.622554,-18.572617,-11.384257,-12.543675
2016-03-01,-21.244966,-17.481998,-14.993133,-10.083542,-16.309546,-16.989151,-9.348067,-10.051062,-17.052957,-17.576936,...,-9.616047,-12.0703,-17.526683,-18.222637,-11.388633,-12.6142,-17.260247,-18.026064,-11.268428,-12.471184
2016-04-01,-23.155242,-17.578793,-16.93079,-9.725071,-16.492496,-16.878869,-9.33944,-9.714068,-16.769975,-16.911006,...,-8.943721,-11.705665,-17.18518,-17.619796,-10.872856,-11.681797,-16.307043,-17.45287,-10.085572,-11.418674
2016-05-01,-23.382046,-18.10371,-17.087475,-10.5317,-16.863572,-17.473715,-9.546583,-10.452768,-17.058511,-18.111202,...,-9.011251,-11.404164,-17.0954,-17.859541,-10.820224,-12.022884,-16.547483,-17.510375,-10.391498,-11.840899


In [154]:
# save DataFrame
output_path = DIR_DATA / "processed_data/sentinel_one_df.hdf"
# create the target directory if needed so the cell also works on a fresh checkout
output_path.parent.mkdir(parents=True, exist_ok=True)
df_comb.to_hdf(output_path, key="df", mode="w", format="table")

In [155]:

## check it is able to be loaded
df_loaded = pd.read_hdf(output_path, key="df")
# loading without an exception is not enough: make sure nothing was lost on the round trip
assert df_loaded.shape == df_comb.shape, "reloaded frame differs in shape from the saved one"
df_loaded.head(5)

greenbelt,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,...,London,London,London,London,London,London,London,London,London,London
shape,E06000006,E06000006,E06000006,E06000006,E06000007,E06000007,E06000007,E06000007,E06000008,E06000008,...,E09000027,E09000027,E09000029,E09000029,E09000029,E09000029,E09000031,E09000031,E09000031,E09000031
band,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,...,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes
2016-01-01,-21.880295,-16.991904,-15.775903,-9.790099,-16.071707,-16.330632,-9.220326,-9.751699,-16.65234,-16.737056,...,-8.756936,-10.676782,-16.77137,-17.255673,-10.597951,-11.7237,-16.742193,-17.17476,-10.834776,-11.552687
2016-02-01,-20.987328,-17.073966,-14.334226,-9.872628,-17.203276,-16.601356,-10.011234,-9.807394,-18.008646,-17.417576,...,-9.768301,-11.732255,-17.869552,-18.684054,-11.592459,-12.890625,-17.622554,-18.572617,-11.384257,-12.543675
2016-03-01,-21.244966,-17.481998,-14.993133,-10.083542,-16.309546,-16.989151,-9.348067,-10.051062,-17.052957,-17.576936,...,-9.616047,-12.0703,-17.526683,-18.222637,-11.388633,-12.6142,-17.260247,-18.026064,-11.268428,-12.471184
2016-04-01,-23.155242,-17.578793,-16.93079,-9.725071,-16.492496,-16.878869,-9.33944,-9.714068,-16.769975,-16.911006,...,-8.943721,-11.705665,-17.18518,-17.619796,-10.872856,-11.681797,-16.307043,-17.45287,-10.085572,-11.418674
2016-05-01,-23.382046,-18.10371,-17.087475,-10.5317,-16.863572,-17.473715,-9.546583,-10.452768,-17.058511,-18.111202,...,-9.011251,-11.404164,-17.0954,-17.859541,-10.820224,-12.022884,-16.547483,-17.510375,-10.391498,-11.840899


In [156]:
# Show the raw lookup entry for one shape code (note the hyphenated "GB_Name").
D_lookup["E07000197"]


{'GB_Name': 'Stoke-on-Trent', 'Area_ha': 9292.431604283165}

In [158]:
# Select the same shape from the reloaded frame. The greenbelt key uses spaces
# because hyphens were replaced when the yearly frames were combined.
df_loaded["Stoke on Trent"]["E07000197"]

band,VHAsc,VHDes,VVAsc,VVDes
2016-01-01,-15.844641,-16.044457,-10.030831,-10.206329
2016-02-01,-16.989927,-20.959983,-11.033302,-15.765055
2016-03-01,-16.329657,-21.749309,-10.452592,-16.817705
2016-04-01,-16.103968,-16.030307,-10.121864,-10.099585
2016-05-01,-16.534933,-17.048609,-10.433406,-10.802523
...,...,...,...,...
2024-06-01,-16.773918,-17.336234,-10.722856,-11.237809
2024-07-01,-16.842957,-17.310148,-10.746824,-11.131337
2024-08-01,-17.232763,-16.992074,-11.069896,-10.617870
2024-09-01,-17.014054,-17.491494,-10.727905,-10.898619
