In [1]:
from greensight.utils import DIR_DATA
import pandas as pd
import numpy as np
from datetime import datetime
import json
from typing import Union
from pathlib import Path
import re
from tqdm.notebook import tqdm

In [3]:
sentinel_data_path = DIR_DATA / "sentinel_data"
assert sentinel_data_path.is_dir()

In [4]:
year_paths = [path for path in sorted(list(sentinel_data_path.iterdir())) if "2016" not in path.name]

In [9]:

def load_sentinel_two_data_from_csv(path: Union[str, Path]) -> pd.DataFrame:

    """""
    loads sentinel two data from file path and process into dataframe 
    """
    from greensight.indices import INDEX_NAMES

    path = Path(path)
    assert path.is_file()

    # Extract the year using regex
    match = re.search(r"\d{4}", str(path))
    year = int(match.group(0) if match else None)

    df = pd.read_csv(path)

    # drop unwanted columns
    df = df.drop(columns=["system:index", ".geo"])

    # check columns (month_band) for duplicates
    assert np.unique(df.columns).shape == df.columns.shape

    # get unique band identifiers
    band_inds = list(set(
        sorted([i.split("_")[1] for i in df.columns.unique() if i.split("_")[0].isnumeric()])
    ))

    index_inds = INDEX_NAMES

    print("index inds: ", index_inds)
    band_inds += index_inds

    # get month identifiers
    month_inds = set(
        [i.split("_")[0] for i in df.columns.unique() if i.split("_")[0].isnumeric()]
    )

    # set index
    df = df.set_index("LAD_CD")

    months = []
    inds = []
    for month in month_inds:
        # generate desired columns
        cols = [month + "_" + band for band in band_inds]
        print(cols)

        # create df of desired columns
        df_month = df[cols].copy()

        # convert from a DataFrame of rows: shapes, columns: bands for a single month to a single row of rows: month, columns: (shape, band)
        row_month = df_month.stack().to_frame().T

        # create multi-index for the columns (shape, band)
        new_cols = [(a, b.split("_")[1]) for a, b in row_month.columns]
        row_month.columns = pd.MultiIndex.from_tuples(new_cols)

        # add to stack
        months.append(row_month)
        # add month name to index.
        inds.append(month)


    # combine rows
    df_month = pd.concat(months, axis=0)

    # fix index to month value
    df_month.index = np.array(inds).astype(int) + 1

    # format index
    df_month = df_month.sort_index()
    df_month.index.name = "date"
    df_month.index = [datetime(year, int(month), 1) for month in df_month.index]
    df_month.columns.names = ("shape", "band")

    assert df_month.shape == (len(month_inds), len(band_inds)*df.shape[0])

    # add greenbelt information from json dict.
    lookup_path = DIR_DATA / "id_lookup/id_lookup.json"
    with open(lookup_path, "r") as in_file:
        D_lookup = json.load(in_file)
    greenbelts = [D_lookup[code]["GB_Name"] for code, _ in df_month.columns]

    # add greenbelts to column MultiIndex
    df_month.columns = pd.MultiIndex.from_tuples(
        [(gb, *cols) for gb, cols in zip(greenbelts, df_month.columns)]
    )
    df_month.columns.names = ("greenbelt", "shape", "band")

    return df_month

In [5]:
df_all = []
for year_path in tqdm(year_paths):
    year_files = [file for file in year_path.iterdir() if "mean" in file.stem and "S2" in file.stem]

    df_year = []
    for file in tqdm(year_files):
        out = load_sentinel_two_data_from_csv(file)
        df_year.append(out)
    df_year = pd.concat(df_year, axis=1)
    
    df_all.append(df_year)


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# remove additional columns
df_all2 = [df[df_all[0].columns] for df in df_all]

In [7]:
# remove duplicate columns
df_all_combined = []

for df in df_all2:
    unique_columns = ~df.columns.duplicated(keep='first')

    df_unique = df.loc[:, unique_columns]

    df_all_combined.append(df_unique)

In [8]:
for df in df_all_combined:
    print(df.shape)

(9, 2160)
(12, 2160)
(12, 2160)
(12, 2160)
(12, 2160)
(12, 2160)
(12, 2160)
(10, 2160)


In [9]:
df_out = pd.concat(df_all_combined, axis=0)

In [10]:
df_out.head()

greenbelt,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,...,York,York,York,York,York,York,York,York,York,York
shape,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,...,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167
band,B2,B1,B9,B8A,B8,B11,B4,B12,B5,B3,...,B9,B8A,B8,B11,B4,B12,B5,B3,B7,B6
2017-04-01,311.27669,234.80582,4068.776515,4035.477838,3999.326764,1894.865456,422.891561,1020.251965,1094.207824,623.238776,...,,,,,,,,,,
2017-05-01,302.160542,216.930759,4384.997329,4353.98211,4197.78026,1909.530406,417.478271,991.96232,1125.957264,635.020438,...,3575.736009,3544.669321,3509.033587,1819.617433,507.285913,1182.111125,1045.571895,627.370944,3335.098446,2744.18825
2017-06-01,340.702544,252.662771,4549.696552,4480.317738,4226.873012,2188.81041,551.242964,1161.614115,1301.592959,724.435485,...,,,,,,,,,,
2017-07-01,363.510906,300.029123,4043.190219,3993.286431,3856.484979,2017.795291,482.459567,1048.994736,1183.826676,677.887598,...,,,,,,,,,,
2017-08-01,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# clean up greenbelt place names
df_out.columns = pd.MultiIndex.from_tuples([(i[0].replace("-", " ").replace(",", "") , i[1], i[2]) for i in df_out.columns])
df_out.columns.names = ["greenbelt", "shape", "band"]
df_out.head()

greenbelt,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,Bath and Bristol,...,York,York,York,York,York,York,York,York,York,York
shape,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,E06000022,...,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167,E07000167
band,B2,B1,B9,B8A,B8,B11,B4,B12,B5,B3,...,B9,B8A,B8,B11,B4,B12,B5,B3,B7,B6
2017-04-01,311.27669,234.80582,4068.776515,4035.477838,3999.326764,1894.865456,422.891561,1020.251965,1094.207824,623.238776,...,,,,,,,,,,
2017-05-01,302.160542,216.930759,4384.997329,4353.98211,4197.78026,1909.530406,417.478271,991.96232,1125.957264,635.020438,...,3575.736009,3544.669321,3509.033587,1819.617433,507.285913,1182.111125,1045.571895,627.370944,3335.098446,2744.18825
2017-06-01,340.702544,252.662771,4549.696552,4480.317738,4226.873012,2188.81041,551.242964,1161.614115,1301.592959,724.435485,...,,,,,,,,,,
2017-07-01,363.510906,300.029123,4043.190219,3993.286431,3856.484979,2017.795291,482.459567,1048.994736,1183.826676,677.887598,...,,,,,,,,,,
2017-08-01,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# save DataFrame
output_path = DIR_DATA / "processed_data/sentinel_two_df.hdf"
df_out.to_hdf(output_path, key="df", mode="w", format="table")

In [51]:
dfs = [load_sentinel_two_data_from_csv(p) for p in Path("/home/finley/GoogleDrive/Greenbelts_2020_test_full_year").iterdir()]

index inds:  ['arvi', 'bi', 'bi2', 'ci', 'dvi', 'gemi', 'gndvi', 'ipvi', 'ireci', 'mcari', 'mndwi', 'msavi', 'msavi2', 'mtci', 'ndi45', 'ndpi', 'ndti', 'ndvi', 'ndwi', 'ndwi2', 'pssra', 'pvi', 'reip', 'ri', 'rvi', 's2rep', 'savi', 'tndvi', 'tsavi', 'wdvi']
['0_B2', '0_B1', '0_B9', '0_B8A', '0_B8', '0_B11', '0_B4', '0_B12', '0_B5', '0_B3', '0_B7', '0_B6', '0_arvi', '0_bi', '0_bi2', '0_ci', '0_dvi', '0_gemi', '0_gndvi', '0_ipvi', '0_ireci', '0_mcari', '0_mndwi', '0_msavi', '0_msavi2', '0_mtci', '0_ndi45', '0_ndpi', '0_ndti', '0_ndvi', '0_ndwi', '0_ndwi2', '0_pssra', '0_pvi', '0_reip', '0_ri', '0_rvi', '0_s2rep', '0_savi', '0_tndvi', '0_tsavi', '0_wdvi']


KeyError: "['0_arvi', '0_bi', '0_bi2', '0_ci', '0_dvi', '0_gemi', '0_gndvi', '0_ipvi', '0_ireci', '0_mcari', '0_mndwi', '0_msavi', '0_msavi2', '0_mtci', '0_ndi45', '0_ndpi', '0_ndti', '0_ndvi', '0_ndwi', '0_ndwi2', '0_pssra', '0_pvi', '0_reip', '0_ri', '0_rvi', '0_s2rep', '0_savi', '0_tndvi', '0_tsavi', '0_wdvi'] not in index"

In [36]:
dfs[0]

greenbelt,York,York,York,York,York,York,York,York,York,York,York,York,York,York,York,York,York,York,York,York,York
shape,E07000164,E07000164,E07000164,E07000164,E07000164,E07000164,E07000164,E07000164,E07000164,E07000164,...,E06000014,E06000014,E06000014,E06000014,E06000014,E06000014,E06000014,E06000014,E06000014,E06000014
band,B2,B1,B9,B8A,B8,B11,B4,B12,B5,B3,...,B9,B8A,B8,B11,B4,B12,B5,B3,B7,B6
2020-01-01,582.605169,539.497921,2079.40904,2104.011433,2085.477096,1511.918514,639.062193,941.926838,1057.141741,663.683077,...,1987.023754,2024.111219,1986.872212,1374.575942,551.983751,857.788292,943.70249,595.841745,1842.714819,1645.063394
2020-02-01,567.551773,558.54386,2548.828697,2510.718208,2525.736822,1836.69663,785.918381,1143.515955,1267.672896,831.891762,...,2514.633354,2458.229316,2457.898778,1732.166397,691.984799,1099.288403,1163.118843,734.104211,2252.287328,2018.584163
2020-03-01,464.588671,331.798804,2405.904083,2375.215506,2385.769907,1730.120324,699.533352,1069.504545,1183.167496,703.527821,...,2415.649633,2388.705771,2404.849001,1708.529481,602.524943,1088.848458,1069.991621,617.399296,2180.361294,1919.727472
2020-04-01,541.601436,432.399102,3122.081151,3085.562914,3112.568017,2067.148072,744.864191,1372.901518,1319.482461,824.172368,...,3259.8271,3211.927947,3210.372696,2029.141944,665.551668,1306.119734,1242.533285,766.160383,2977.686909,2561.760876
2020-05-01,603.924724,504.760202,3321.128109,3288.965219,3182.959567,2221.092915,824.532192,1575.544854,1343.659275,865.138367,...,3565.533173,3507.609114,3420.397446,2028.694836,652.35165,1361.995646,1191.94689,777.922232,3272.848051,2683.428515
2020-06-01,463.077755,410.606256,3699.026568,3703.987917,3496.514534,2203.715178,727.089952,1427.206855,1348.742172,810.710828,...,3931.507871,3926.069293,3726.179952,2095.596839,622.039644,1296.058494,1287.377817,761.490224,3616.364621,2993.934739
2020-07-01,601.681829,451.949832,3423.630149,3470.80667,3355.050079,2293.22063,863.282441,1488.559454,1463.39653,915.003994,...,3680.541894,3722.202378,3564.296411,2325.326598,772.212558,1431.797757,1380.678591,836.352677,3406.548889,2856.930804
2020-08-01,829.571061,757.037036,3241.758366,3209.966192,3118.692276,2692.298743,1172.609182,1803.793315,1647.892931,1090.83047,...,3430.63083,3303.786305,3196.045516,2323.815094,900.627754,1515.575973,1393.895707,894.859859,2993.066253,2545.243064
2020-09-01,529.642189,381.949988,2582.384305,2563.10929,2488.027842,2275.860043,821.691504,1650.336606,1241.553598,768.021066,...,2875.529816,2851.816502,2751.611517,2169.774673,669.054844,1440.825606,1144.085314,688.820069,2598.110977,2226.09985
2020-10-01,548.883662,519.744941,2446.876332,2427.36983,2366.740054,2001.504966,778.117669,1375.717426,1189.092582,775.543789,...,2793.472134,2768.636912,2714.610141,1964.336298,672.809758,1269.363833,1145.494496,746.239164,2525.291287,2213.369301


In [37]:
dfs[0]["York"]["E06000014"]

band,B2,B1,B9,B8A,B8,B11,B4,B12,B5,B3,B7,B6
2020-01-01,558.674287,544.984362,1987.023754,2024.111219,1986.872212,1374.575942,551.983751,857.788292,943.70249,595.841745,1842.714819,1645.063394
2020-02-01,485.561922,488.04455,2514.633354,2458.229316,2457.898778,1732.166397,691.984799,1099.288403,1163.118843,734.104211,2252.287328,2018.584163
2020-03-01,384.288638,239.133611,2415.649633,2388.705771,2404.849001,1708.529481,602.524943,1088.848458,1069.991621,617.399296,2180.361294,1919.727472
2020-04-01,499.354253,427.837457,3259.8271,3211.927947,3210.372696,2029.141944,665.551668,1306.119734,1242.533285,766.160383,2977.686909,2561.760876
2020-05-01,529.477791,441.652858,3565.533173,3507.609114,3420.397446,2028.694836,652.35165,1361.995646,1191.94689,777.922232,3272.848051,2683.428515
2020-06-01,441.60989,400.625377,3931.507871,3926.069293,3726.179952,2095.596839,622.039644,1296.058494,1287.377817,761.490224,3616.364621,2993.934739
2020-07-01,533.845894,409.683463,3680.541894,3722.202378,3564.296411,2325.326598,772.212558,1431.797757,1380.678591,836.352677,3406.548889,2856.930804
2020-08-01,657.651117,567.398173,3430.63083,3303.786305,3196.045516,2323.815094,900.627754,1515.575973,1393.895707,894.859859,2993.066253,2545.243064
2020-09-01,461.241009,349.757132,2875.529816,2851.816502,2751.611517,2169.774673,669.054844,1440.825606,1144.085314,688.820069,2598.110977,2226.09985
2020-10-01,519.805142,495.4012,2793.472134,2768.636912,2714.610141,1964.336298,672.809758,1269.363833,1145.494496,746.239164,2525.291287,2213.369301


In [10]:
path = Path("/home/finley/GoogleDrive/Greenbelts_2020_polygon_bands_and_indices")
df = load_sentinel_two_data_from_csv(next(path.iterdir()))

index inds:  ['arvi', 'bi', 'bi2', 'ci', 'dvi', 'gemi', 'gndvi', 'ipvi', 'ireci', 'mcari', 'mndwi', 'msavi', 'msavi2', 'mtci', 'ndi45', 'ndpi', 'ndti', 'ndvi', 'ndwi', 'ndwi2', 'pssra', 'pvi', 'reip', 'ri', 'rvi', 's2rep', 'savi', 'tndvi', 'tsavi', 'wdvi']
['0_tndvi', '0_mndwi', '0_dvi', '0_pvi', '0_bi', '0_ndi45', '0_ndvi', '0_ndwi', '0_s2rep', '0_ipvi', '0_gndvi', '0_gemi', '0_arvi', '0_ndwi2', '0_msavi2', '0_bi2', '0_mtci', '0_rvi', '0_tsavi', '0_ci', '0_msavi', '0_savi', '0_ireci', '0_mcari', '0_pssra', '0_reip', '0_ndpi', '0_ri', '0_wdvi', '0_ndti', '0_arvi', '0_bi', '0_bi2', '0_ci', '0_dvi', '0_gemi', '0_gndvi', '0_ipvi', '0_ireci', '0_mcari', '0_mndwi', '0_msavi', '0_msavi2', '0_mtci', '0_ndi45', '0_ndpi', '0_ndti', '0_ndvi', '0_ndwi', '0_ndwi2', '0_pssra', '0_pvi', '0_reip', '0_ri', '0_rvi', '0_s2rep', '0_savi', '0_tndvi', '0_tsavi', '0_wdvi']
['10_tndvi', '10_mndwi', '10_dvi', '10_pvi', '10_bi', '10_ndi45', '10_ndvi', '10_ndwi', '10_s2rep', '10_ipvi', '10_gndvi', '10_gemi', '1

In [11]:
df

greenbelt,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford,Oxford
shape,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181,E07000181
band,tndvi,mndwi,dvi,pvi,bi,ndi45,ndvi,ndwi,s2rep,ipvi,...,pssra,pvi.1,reip,ri,rvi,s2rep.1,savi,tndvi.1,tsavi,wdvi
2020-01-01,0.520768,-0.409023,1363.604941,1822.185232,-836.068385,0.513508,0.520777,-0.513508,0.140169,0.434777,...,-0.140169,1822.185232,0.3066,4.571236,4.571236,0.140169,0.781074,0.520768,0.520753,1831.273494
2020-02-01,0.528037,-0.431238,1584.477242,2133.315104,-961.428597,0.52556,0.528054,-0.52556,0.13123,0.438174,...,-0.13123,2133.315104,0.323918,4.37416,4.37416,0.13123,0.791924,0.528037,0.528012,2143.955145
2020-03-01,0.533004,-0.443979,1851.913056,2504.929425,-1106.866438,0.523076,0.533019,-0.523076,0.114909,0.442015,...,-0.114909,2504.929425,0.343476,4.424587,4.424587,0.114909,0.799391,0.533004,0.532982,2517.422916
2020-04-01,0.561364,-0.452467,2219.746244,2912.197321,-1407.729777,0.542992,0.561376,-0.542992,0.139739,0.470912,...,-0.139739,2912.197321,0.369299,5.369434,5.369434,0.139739,0.84195,0.561364,0.561346,2926.722086
2020-05-01,0.637681,-0.447037,2789.720235,3419.818652,-2003.455939,0.60329,0.637691,-0.60329,0.240633,0.548955,...,-0.240633,3419.818652,0.443537,8.009982,8.009982,0.240633,0.956441,0.637681,0.637666,3436.87521
2020-06-01,0.591133,-0.46109,2306.950701,2955.766314,-1546.849538,0.573519,0.591146,-0.573519,0.179617,0.505397,...,-0.179617,2955.766314,0.399686,7.004075,7.004075,0.179617,0.886604,0.591133,0.591115,2970.508382
2020-07-01,0.544397,-0.485119,2186.059939,2912.74965,-1374.311841,0.543812,0.544408,-0.543812,0.111741,0.463499,...,-0.111741,2912.74965,0.364926,6.745796,6.745796,0.111741,0.816505,0.544397,0.544379,2927.27717
2020-08-01,0.504748,-0.486318,1843.865474,2581.043567,-1051.332954,0.515825,0.50476,-0.515825,0.065721,0.42431,...,-0.065721,2581.043567,0.336781,5.354651,5.354651,0.065721,0.757031,0.504748,0.504731,2593.916682
2020-09-01,0.522823,-0.463131,1948.633854,2635.694601,-1147.913757,0.512623,0.522835,-0.512623,0.09017,0.437188,...,-0.09017,2635.694601,0.349417,5.340641,5.340641,0.09017,0.784139,0.522823,0.522805,2648.840292
2020-10-01,0.500858,-0.308923,1602.068132,2103.122119,-952.234386,0.443684,0.500878,-0.443684,0.168985,0.408125,...,-0.168985,2103.122119,0.325302,4.794219,4.794219,0.168985,0.751128,0.500858,0.500827,2113.611572
