## Purpose
The original dataset was created from imagery over Thailand. Time was ignored; instead, observations were averaged over individual seasons. Here I repurpose the data to experiment with fitting a two-dimensional Gaussian. 

In [5]:
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import os
import censoring

# Seed NumPy's legacy global random number generator so that any code drawing
# from np.random.* gives the same values on every run.
# NOTE(review): PyMC sampling calls probably need RANDOM_SEED passed explicitly
# (e.g. via their random_seed argument) to be reproducible — confirm.
RANDOM_SEED = 100
np.random.seed(RANDOM_SEED)
# Use ArviZ's "arviz-darkgrid" style for all plots in this notebook.
az.style.use("arviz-darkgrid")

## Data Loading and Processing
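The Sentinel-1 and Sentinel-2 data are loaded separately, averaged over time, and then merged into a single table keyed on latitude, longitude, and (for training data) class. 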

In [3]:
def load_files(dir_list):
    """Read every file in a directory as CSV and stack the rows into one frame.

    Parameters
    ----------
    dir_list : str or os.PathLike
        Path of a single directory (despite the name, not a list). Every
        entry in it is passed to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The rows of all files concatenated along axis 0, in sorted
        file-name order. Each file keeps its own row index (the index is
        not renumbered).

    Raises
    ------
    ValueError
        If the directory contains no entries.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    file_list = sorted(os.listdir(dir_list))
    if not file_list:
        raise ValueError(f"no files found in directory: {dir_list!r}")

    # os.path.join picks the correct separator for the current OS instead of
    # a hard-coded backslash, which only works on Windows.
    frames = [pd.read_csv(os.path.join(dir_list, name)) for name in file_list]

    # Concatenate once rather than inside the loop to avoid re-copying the
    # accumulated frame for every file.
    return pd.concat(frames, axis=0)

def merge_dfs(s1, s2, sub_flag="no"):
    """Average each frame over time and inner-join the two results.

    Parameters
    ----------
    s1, s2 : pd.DataFrame
        Frames that each contain a 'time' column plus 'lat' and 'lon'
        (and 'class' when ``sub_flag == "no"``). Neither input is modified.
    sub_flag : str, default "no"
        "no" treats the inputs as training files and keeps 'class' as a
        grouping/merge key; any other value treats them as submission
        files, keyed on 'lat' and 'lon' only.

    Returns
    -------
    pd.DataFrame
        One row per key combination present in both inputs, holding the
        per-key mean of every remaining column from ``s1`` and ``s2``.
    """
    # Training files carry a label that has to survive the aggregation;
    # submission files are keyed on location alone.
    keys = ['lat', 'lon', 'class'] if sub_flag == "no" else ['lat', 'lon']

    # The time stamp is discarded before averaging.
    left, right = (frame.drop(columns='time') for frame in (s1, s2))

    # Collapse repeated observations of the same key to their mean.
    left_mean = left.groupby(keys).mean().reset_index()
    right_mean = right.groupby(keys).mean().reset_index()

    # Keep only keys observed in both inputs.
    return left_mean.merge(right_mean, how="inner", on=keys)

In [6]:
# Load every file found in the Sentinel-1 and Sentinel-2 input directories;
# the directory paths are read from the `censoring` module.
sent1_data = load_files(censoring.s1_input)
sent2_data = load_files(censoring.s2_input)

# Training mode ("no"): average over time and join the two sensors on
# lat / lon / class.
final_train = merge_dfs(s1=sent1_data, s2=sent2_data, sub_flag="no")

In [7]:
# Preview the first five rows of the merged training frame as a sanity check.
final_train.head(5)

Unnamed: 0,lat,lon,class,vh,vv,rvi,green,red,blue,ndvi,osavi,rdvi,mtvi1,evi
0,9.9449,105.521447,Rice,0.024088,0.114975,0.676813,1918.878094,1689.619048,1804.969548,0.510594,0.510574,33.511181,3598.597527,2.207659
1,9.946717,105.524172,Rice,0.030584,0.147283,0.698786,1568.410049,1339.704368,1426.481497,0.524069,0.524048,34.468099,3698.401132,1.320317
2,9.947171,105.517358,Rice,0.025577,0.132543,0.6516,1582.573002,1362.996035,1475.847617,0.48533,0.485309,30.715177,3198.024878,2.215421
3,9.94808,105.517358,Rice,0.023488,0.123932,0.676208,1574.642861,1335.136503,1442.862699,0.512604,0.512584,33.979766,3667.604974,1.843528
4,9.949897,105.518267,Rice,0.02592,0.127763,0.708171,1872.249544,1633.313349,1771.783806,0.495758,0.495739,32.794753,3553.858422,2.248219


## Helpful References Here
https://www.pymc-labs.io/blog-posts/spatial-gaussian-process-01/

This post describes how to write a custom class that measures the chordal distance between points specified by longitude and latitude. 