In [1]:
from greensight.utils import DIR_DATA
import pandas as pd
import numpy as np
import json
from typing import Callable, Union
from pathlib import Path
import re
from datetime import datetime
from tqdm.notebook import tqdm

In [2]:

def load_sentinel_one_data_from_path(path: Union[str, Path], condition: Callable) -> pd.DataFrame:
    """""
    loads sentinel one data from file path and process into dataframe 
    """
    path = Path(path)
    assert path.is_dir()

    # Extract the year using regex
    match = re.search(r"\d{4}", str(path))
    year = int(match.group(0) if match else None)

    # get all files which meet the stipulated condition
    files = sorted([file for file in path.iterdir() if condition(file)])

    # load and concatenate
    data = [pd.read_csv(file) for file in files]
    df = pd.concat(data, axis=0)

    # get unique band identifiers
    band_inds = set(
        [i.split("_")[1] for i in df.columns.unique() if i.split("_")[0].isnumeric()]
    )

    # get month identifiers
    month_inds = set(
        [i.split("_")[0] for i in df.columns.unique() if i.split("_")[0].isnumeric()]
    )

    # index by shape code
    df.index = df["LAD_CD"]

    # drop unwanted columns
    df = df.drop(columns=["system:index", ".geo"])

    months = []
    inds = []
    for month in month_inds:
        # generate desired columns
        cols = [month + "_" + band for band in band_inds]

        # create df of desired columns
        df_month = df[cols].copy()

        # convert from a DataFrame of rows: shapes, columns: bands for a single month
        # to a single row of rows: month, columns: (shape, band)
        row_month = df_month.stack().to_frame().T

        # create multi-index for the columns (shape, band)
        new_cols = [(a, b.split("_")[1]) for a, b in row_month.columns]
        row_month.columns = pd.MultiIndex.from_tuples(new_cols)

        # add to stack
        months.append(row_month)
        # add month name to index.
        inds.append(month)

    df_month = pd.concat(months, axis=0)
    df_month.index = np.array(inds).astype(int) + 1

    df_month = df_month.sort_index()
    df_month.index.name = "date"
    df_month.index = [datetime(year, int(month), 1) for month in df_month.index]
    df_month.columns.names = ("shape", "band")

    # remove duplicate replacing with their mean- this should not be needed, but is due to a mistake in the data
    df_month = df_month.T.groupby(level=[0, 1]).mean().T

    # add greenbelt information from json dict.
    lookup_path = DIR_DATA / "id_lookup/id_lookup.json"
    with open(lookup_path, "r") as in_file:
        D_lookup = json.load(in_file)
    greenbelts = [D_lookup[code]["GB_Name"] for code, band in df_month.columns]

    # add greenbelts to column MultiIndex
    df_month.columns = pd.MultiIndex.from_tuples(
        [(gb, *cols) for gb, cols in zip(greenbelts, df_month.columns)]
    )
    df_month.columns.names = ("greenbelt", "shape", "band")

    return df_month

In [3]:
sentinel_data_path = DIR_DATA / "sentinel_data"
assert sentinel_data_path.is_dir()

In [4]:
# load data for each year
def condition(path: str) -> bool:
    """
    Define condition for file in data dir to be included.
    """
    return  "mean" in path.stem and "S1" in path.stem

dfs = []
for path in tqdm(list(sentinel_data_path.iterdir())): 
    df = load_sentinel_one_data_from_path(path, condition=condition)
    dfs.append(df)

  0%|          | 0/9 [00:00<?, ?it/s]

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [4]:
# combine years
df_comb = pd.concat(dfs, axis=0)
df_comb.columns = pd.MultiIndex.from_tuples([(i[0].replace("-", " ").replace(",", ""), i[1], i[2]) for i in df_comb.columns])
df_comb.columns.names =  ("greenbelt", "shape", "band")
df_comb.head(5)

greenbelt,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,...,London,London,London,London,London,London,London,London,London,London
shape,E06000006,E06000006,E06000006,E06000006,E06000007,E06000007,E06000007,E06000007,E06000008,E06000008,...,E09000027,E09000027,E09000029,E09000029,E09000029,E09000029,E09000031,E09000031,E09000031,E09000031
band,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,...,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes
2016-01-01,-21.880295,-16.991904,-15.775903,-9.790099,-16.071707,-16.330632,-9.220326,-9.751699,-16.65234,-16.737056,...,-8.756936,-10.676782,-16.77137,-17.255673,-10.597951,-11.7237,-16.742193,-17.17476,-10.834776,-11.552687
2016-02-01,-23.155242,-17.578793,-16.93079,-9.725071,-16.492496,-16.878869,-9.33944,-9.714068,-16.769975,-16.911006,...,-8.943721,-11.705665,-17.18518,-17.619796,-10.872856,-11.681797,-16.307043,-17.45287,-10.085572,-11.418674
2016-03-01,-23.382046,-18.10371,-17.087475,-10.5317,-16.863572,-17.473715,-9.546583,-10.452768,-17.058511,-18.111202,...,-9.011251,-11.404164,-17.0954,-17.859541,-10.820224,-12.022884,-16.547483,-17.510375,-10.391498,-11.840899
2016-04-01,-24.581625,-17.473175,-19.243807,-10.768544,-16.92985,-16.801029,-10.216895,-10.500869,-17.323229,-17.50371,...,-9.39143,-11.195927,-17.18128,-17.307712,-11.139923,-11.771096,-16.631947,-17.335087,-10.686377,-11.707105
2016-05-01,-23.044508,-18.52983,-18.120999,-12.751782,-17.919154,-17.971877,-12.141177,-12.285496,-18.045142,-18.14168,...,-9.288337,-11.733921,-17.733808,-18.18216,-11.964601,-12.63764,-17.542597,-18.446995,-11.333272,-12.681095


In [5]:
# save DataFrame
output_path = DIR_DATA / "processed_data/sentinel_one_df.hdf"
df_comb.to_hdf(output_path, key="df", mode="w", format="table")

In [6]:

## check it is able to be loaded
df_loaded = pd.read_hdf(output_path, key="df")
df_loaded.head(5)

greenbelt,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,Merseyside and Greater Manchester,...,London,London,London,London,London,London,London,London,London,London
shape,E06000006,E06000006,E06000006,E06000006,E06000007,E06000007,E06000007,E06000007,E06000008,E06000008,...,E09000027,E09000027,E09000029,E09000029,E09000029,E09000029,E09000031,E09000031,E09000031,E09000031
band,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,...,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes,VHAsc,VHDes,VVAsc,VVDes
2016-01-01,-21.880295,-16.991904,-15.775903,-9.790099,-16.071707,-16.330632,-9.220326,-9.751699,-16.65234,-16.737056,...,-8.756936,-10.676782,-16.77137,-17.255673,-10.597951,-11.7237,-16.742193,-17.17476,-10.834776,-11.552687
2016-02-01,-23.155242,-17.578793,-16.93079,-9.725071,-16.492496,-16.878869,-9.33944,-9.714068,-16.769975,-16.911006,...,-8.943721,-11.705665,-17.18518,-17.619796,-10.872856,-11.681797,-16.307043,-17.45287,-10.085572,-11.418674
2016-03-01,-23.382046,-18.10371,-17.087475,-10.5317,-16.863572,-17.473715,-9.546583,-10.452768,-17.058511,-18.111202,...,-9.011251,-11.404164,-17.0954,-17.859541,-10.820224,-12.022884,-16.547483,-17.510375,-10.391498,-11.840899
2016-04-01,-24.581625,-17.473175,-19.243807,-10.768544,-16.92985,-16.801029,-10.216895,-10.500869,-17.323229,-17.50371,...,-9.39143,-11.195927,-17.18128,-17.307712,-11.139923,-11.771096,-16.631947,-17.335087,-10.686377,-11.707105
2016-05-01,-23.044508,-18.52983,-18.120999,-12.751782,-17.919154,-17.971877,-12.141177,-12.285496,-18.045142,-18.14168,...,-9.288337,-11.733921,-17.733808,-18.18216,-11.964601,-12.63764,-17.542597,-18.446995,-11.333272,-12.681095
