In [1]:
from greensight.utils import DIR_DATA
import pandas as pd
import numpy as np
from datetime import datetime
import json
from typing import Union
from pathlib import Path
import re
from tqdm.notebook import tqdm

In [2]:
sentinel_data_path = DIR_DATA / "sentinel_data"
assert sentinel_data_path.is_dir()

In [3]:
year_paths = [path for path in sorted(list(sentinel_data_path.iterdir())) if "2016" not in path.name]

In [4]:

def load_sentinel_two_data_from_csv(path: Union[str, Path]) -> pd.DataFrame:

    """""
    loads sentinel two data from file path and process into dataframe 
    """

    path = Path(path)
    assert path.is_file()

    # Extract the year using regex
    match = re.search(r"\d{4}", str(path))
    year = int(match.group(0) if match else None)

    df = pd.read_csv(file)

    # drop unwanted columns
    df = df.drop(columns=["system:index", ".geo"])

    # check columns (month_band) for duplicates
    assert np.unique(df.columns).shape == df.columns.shape

    # get unique band identifiers
    band_inds = set(
        sorted([i.split("_")[1] for i in df.columns.unique() if i.split("_")[0].isnumeric()])
    )

    # get month identifiers
    month_inds = set(
        [i.split("_")[0] for i in df.columns.unique() if i.split("_")[0].isnumeric()]
    )

    # set index
    df = df.set_index("LAD_CD")

    months = []
    inds = []
    for month in month_inds:
        # generate desired columns
        cols = [month + "_" + band for band in band_inds]

        # create df of desired columns
        df_month = df[cols].copy()

        # convert from a DataFrame of rows: shapes, columns: bands for a single month to a single row of rows: month, columns: (shape, band)
        row_month = df_month.stack().to_frame().T

        # create multi-index for the columns (shape, band)
        new_cols = [(a, b.split("_")[1]) for a, b in row_month.columns]
        row_month.columns = pd.MultiIndex.from_tuples(new_cols)

        # add to stack
        months.append(row_month)
        # add month name to index.
        inds.append(month)


    # combine rows
    df_month = pd.concat(months, axis=0)

    # fix index to month value
    df_month.index = np.array(inds).astype(int) + 1

    # format index
    df_month = df_month.sort_index()
    df_month.index.name = "date"
    df_month.index = [datetime(year, int(month), 1) for month in df_month.index]
    df_month.columns.names = ("shape", "band")

    assert df_month.shape == (len(month_inds), len(band_inds)*df.shape[0])

    # add greenbelt information from json dict.
    lookup_path = DIR_DATA / "id_lookup/id_lookup.json"
    with open(lookup_path, "r") as in_file:
        D_lookup = json.load(in_file)
    greenbelts = [D_lookup[code]["GB_Name"] for code, _ in df_month.columns]

    # add greenbelts to column MultiIndex
    df_month.columns = pd.MultiIndex.from_tuples(
        [(gb, *cols) for gb, cols in zip(greenbelts, df_month.columns)]
    )
    df_month.columns.names = ("greenbelt", "shape", "band")

    return df_month

In [5]:
df_all = []
for year_path in tqdm(year_paths):
    year_files = [file for file in year_path.iterdir() if "mean" in file.stem and "S2" in file.stem]

    df_year = []
    for file in tqdm(year_files):
        out = load_sentinel_two_data_from_csv(file)
        df_year.append(out)
    df_year = pd.concat(df_year, axis=1)
    
    df_all.append(df_year)


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# remove additional columns
df_all2 = [df[df_all[0].columns] for df in df_all]

In [9]:
# remove duplicate columns
df_all_combined = []

for df in df_all2:
    unique_columns = ~df.columns.duplicated(keep='first')

    df_unique = df.loc[:, unique_columns]

    df_all_combined.append(df_unique)

In [10]:
for df in df_all_combined:
    print(df.shape)

(9, 2160)
(12, 2160)
(12, 2160)
(12, 2160)
(12, 2160)
(12, 2160)
(12, 2160)
(10, 2160)


In [11]:
df_out = pd.concat(df_all_combined, axis=0)

In [13]:
df_out.head()

greenbelt,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,...,York,York,York,York,York,York,York,York,York,York
shape,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,...,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167
band,B6,B11,B4,B9,B2,B5,B7,B8A,B12,B3,...,B4,B9,B2,B5,B7,B8A,B12,B3,B8,B1
2017-04-01,3108.089837,1894.865456,422.891561,4068.776515,311.27669,1094.207824,3732.298612,4035.477838,1020.251965,623.238776,...,,,,,,,,,,
2017-05-01,3312.17503,1909.530406,417.478271,4384.997329,302.160542,1125.957264,4058.373258,4353.98211,991.96232,635.020438,...,507.285913,3575.736009,343.276927,1045.571895,3335.098446,3544.669321,1182.111125,627.370944,3509.033587,249.211902
2017-06-01,3402.514986,2188.81041,551.242964,4549.696552,340.702544,1301.592959,4113.497425,4480.317738,1161.614115,724.435485,...,,,,,,,,,,
2017-07-01,3112.659109,2017.795291,482.459567,4043.190219,363.510906,1183.826676,3686.713608,3993.286431,1048.994736,677.887598,...,,,,,,,,,,
2017-08-01,,,,,,,,,,,...,,,,,,,,,,


In [15]:
# clean up greenbelt place names
df_out.columns = pd.MultiIndex.from_tuples([(i[0].replace("-", " ").replace(",", "") , i[1], i[2]) for i in df_out.columns])
df_out.columns.names = ["greenbelt", "shape", "band"]
df_out.head()

greenbelt,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,...,York,York,York,York,York,York,York,York,York,York
shape,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,...,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167
band,B6,B11,B4,B9,B2,B5,B7,B8A,B12,B3,...,B4,B9,B2,B5,B7,B8A,B12,B3,B8,B1
2017-04-01,3108.089837,1894.865456,422.891561,4068.776515,311.27669,1094.207824,3732.298612,4035.477838,1020.251965,623.238776,...,,,,,,,,,,
2017-05-01,3312.17503,1909.530406,417.478271,4384.997329,302.160542,1125.957264,4058.373258,4353.98211,991.96232,635.020438,...,507.285913,3575.736009,343.276927,1045.571895,3335.098446,3544.669321,1182.111125,627.370944,3509.033587,249.211902
2017-06-01,3402.514986,2188.81041,551.242964,4549.696552,340.702544,1301.592959,4113.497425,4480.317738,1161.614115,724.435485,...,,,,,,,,,,
2017-07-01,3112.659109,2017.795291,482.459567,4043.190219,363.510906,1183.826676,3686.713608,3993.286431,1048.994736,677.887598,...,,,,,,,,,,
2017-08-01,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# save DataFrame
output_path = DIR_DATA / "processed_data/sentinel_two_df.hdf"
df_out.to_hdf(output_path, key="df", mode="w", format="table")