In [1]:
import os
import pandas as pd

In [2]:
# Root of the data tree.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
PATH='D:\\Deutschland\\FUB\\master_thesis\\data'
# Per-plot CSV files that the functions below read from.
INPUT_DIR = os.path.join(PATH, 'gee','extract_cloud30')
# Folder receiving the date-occurrence table and the filtered labels.
OUTPUT_DIR = os.path.join(PATH, 'gee', 'output')
# Folder receiving one monthly-mean CSV per input file.
DATA_DIR = os.path.join(OUTPUT_DIR, 'monthly_mean')

DATE_CSV = 'occurrence_30.csv'
MERGE_CSV = 'merged.csv'
LABEL_CSV = 'labels.csv'

# The raw labels live under 'ref'; the filtered copy is written to OUTPUT_DIR.
raw_label_path = os.path.join(PATH, 'ref', LABEL_CSV)
date_path = os.path.join(OUTPUT_DIR, DATE_CSV)
merge_path = os.path.join(OUTPUT_DIR, MERGE_CSV)
label_path = os.path.join(OUTPUT_DIR, LABEL_CSV)

# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order (and the log output) reproducible between runs and platforms.
files = sorted(os.listdir(INPUT_DIR))

1. Functions for CSV files

In [3]:
def load_csv_file(file_path:str) -> pd.DataFrame:
    """Read a comma-separated file into a frame indexed by its parsed ``date`` column.

    Rows containing at least one missing value are discarded before the
    frame is returned. The path is echoed to stdout once the file is loaded.

    Args:
        file_path: location of the CSV file; it must contain a ``date`` column.

    Returns:
        The loaded frame without incomplete rows.
    """
    frame = pd.read_csv(
        file_path,
        sep=',',
        header=0,
        parse_dates=['date'],
        index_col=['date'],
    )
    # keep only the dates for which every column holds a value
    complete_rows = frame.dropna(axis=0, how='any')
    print(f'import file {file_path}')
    return complete_rows

In [4]:
def export_csv_file(df:pd.DataFrame, file_path:str, index:bool) -> None:
    """Write ``df`` to ``file_path`` as CSV and echo the destination to stdout.

    Args:
        df: frame to serialise.
        file_path: destination of the CSV file.
        index: whether the frame's index is written as the leading column(s).
    """
    df.to_csv(path_or_buf=file_path, index=index)
    print(f'export file {file_path}')

In [5]:
def delete_file(file_path:str) -> None:
    """Remove the file at ``file_path`` from disk and echo its path to stdout.

    Any error raised by ``os.remove`` (for example a missing file) propagates
    to the caller; the message is only printed after a successful removal.
    """
    os.remove(file_path)
    print(f'delete file {file_path}')

2. Functions for Pandas DataFrame

In [18]:
def shuffle(df:pd.DataFrame, ref:pd.DataFrame) -> pd.DataFrame:
    """Align ``df`` to the dates of ``ref`` and strip the bookkeeping columns.

    A right join on ``date`` keeps every date of ``ref``; dates that ``df``
    does not contain end up as rows of missing values. The columns ``count``,
    ``spacecraft_id`` and ``id`` are removed from the joined result, so all of
    them must be present after the join.

    Args:
        df: observations of a single input file, keyed by ``date``.
        ref: reference table keyed by ``date`` (provides the ``count`` column).

    Returns:
        The joined frame without the three bookkeeping columns.
    """
    aligned = df.merge(ref, how='right', on='date')
    # incomplete rows are intentionally kept here (no dropna)
    return aligned.drop(columns=['count', 'spacecraft_id', 'id'])

In [16]:
def monthly_mean_interpolate(df:pd.DataFrame, first_month:int=5, last_month:int=9) -> pd.DataFrame:
    """Average ``df`` per calendar month and fill empty months from other years.

    The frame is resampled to month-end bins, restricted to the months
    ``first_month``..``last_month`` (inclusive), and every missing monthly
    value is replaced by the mean of the same calendar month over all years
    present in the frame. A month with no value in any year stays missing.

    Args:
        df: frame with a ``DatetimeIndex`` and numeric columns.
        first_month: first calendar month to keep (default 5, i.e. May).
        last_month: last calendar month to keep (default 9, i.e. September).

    Returns:
        One row per kept month-end with the (gap-filled) monthly means.
    """
    # An offset object is used instead of the 'M' string alias: the alias is
    # deprecated in recent pandas releases, the object works on every version.
    mean_df = df.resample(pd.offsets.MonthEnd()).mean()
    in_season = (mean_df.index.month >= first_month) & (mean_df.index.month <= last_month)
    mean_df = mean_df[in_season]
    if mean_df.empty:
        return mean_df
    # Mean of each calendar month across years, aligned row-by-row with mean_df.
    same_month_mean = mean_df.groupby(mean_df.index.month).transform('mean')
    # A single fillna returns a new frame instead of assigning column-by-column
    # into a filtered slice.
    return mean_df.fillna(same_month_mean)

In [8]:
def reshape(df:pd.DataFrame) -> pd.DataFrame:
    """Flatten a date-indexed frame into a single wide row.

    Every (date, column) pair becomes its own column named
    ``'<YYYYMMDD> <column>'``. The last column of ``df`` is not flattened;
    its first value, cast to ``int``, is stored in the ``id`` column instead.

    Args:
        df: non-empty frame with a datetime index whose last column holds the id.

    Returns:
        A one-row frame: ``id`` followed by one column per date/column pair.
    """
    column_names = list(df.keys())
    value_columns = column_names[:-1]
    wide = {'id': int(df.iat[0, -1])}
    for timestamp, row in df.iterrows():
        stamp = timestamp.strftime('%Y%m%d')
        # one single-element list per (date, column) pair -> one output row
        wide.update({f'{stamp} {name}': [row[name]] for name in value_columns})
    return pd.DataFrame(wide)

3. Count the available dates across all polygons

In [26]:
def count_dates() -> pd.DataFrame:
    """Count, over all input CSV files, how many rows exist for each date.

    Every ``.csv`` entry of ``files`` is loaded from ``INPUT_DIR``. Files that
    cannot be loaded, whose index cannot be formatted as dates, or that are
    empty after loading are deleted from ``INPUT_DIR`` (destructive!). The
    resulting table is written to ``date_path`` and returned.

    Returns:
        Frame with the columns ``date`` (``YYYYMMDD`` string) and ``count``,
        sorted by date.
    """
    occurrences = {}
    # read each csv file
    for file in files:
        if not file.endswith(".csv"):
            continue
        in_path = os.path.join(INPUT_DIR, file)
        # `files` is listed once; an earlier run may already have deleted
        # this entry, so skip it instead of failing on a missing file.
        if not os.path.isfile(in_path):
            continue
        try:
            df = load_csv_file(in_path)
            stamps = None if df.empty else df.index.strftime('%Y%m%d')
        except Exception as err:
            # Unreadable files are still removed, but the reason is reported
            # rather than silently discarded.
            print(f'unreadable file {in_path}: {err!r}')
            delete_file(in_path)
            continue
        if stamps is None:
            delete_file(in_path)
            continue
        # count date occurrence (one per row, duplicates included)
        for stamp in stamps:
            occurrences[stamp] = occurrences.get(stamp, 0) + 1
    # export output as csv
    output = pd.DataFrame({
        'date': list(occurrences.keys()),
        'count': list(occurrences.values()),
    })
    output.sort_values(by='date', ascending=True, inplace=True)
    export_csv_file(output, date_path, index=False)
    return output

# Build the date-occurrence table: it is written to date_path and the returned
# frame is shown as this cell's output.
count_dates()

import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_1.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_100.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_1000.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10000.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10002.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10003.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10004.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10005.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10006.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10007.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_

Unnamed: 0,date,count
0,20170510,10190
65,20170517,5247
47,20170527,5815
109,20170530,1246
101,20170616,3718
...,...,...
92,20210908,5570
102,20210913,2092
98,20210918,4432
121,20210921,959


4. Merge all data frames into one CSV file

In [20]:
def merge_data_frame() -> None:
    """Write a monthly-mean CSV to ``DATA_DIR`` for every input CSV file.

    Each ``.csv`` entry of ``files`` is loaded from ``INPUT_DIR``, aligned to
    the dates stored at ``date_path`` (``shuffle``), reduced to gap-filled
    monthly means (``monthly_mean_interpolate``) and exported without its
    index. The output name is the input name without its ``plot_`` prefix.

    NOTE: the wide-format step (``reshape`` followed by a concat written to
    ``merge_path``) is currently disabled, so this function neither creates
    ``merge_path`` nor returns a frame.
    """
    dates = load_csv_file(date_path)
    # the export below fails if the target folder does not exist yet
    os.makedirs(DATA_DIR, exist_ok=True)
    prefix = 'plot_'
    for file in files:
        if not file.endswith(".csv"):
            continue
        in_path = os.path.join(INPUT_DIR, file)
        # `files` is listed once; count_dates() may have deleted entries since.
        if not os.path.isfile(in_path):
            continue
        df = load_csv_file(in_path)
        df = shuffle(df, dates)
        df = monthly_mean_interpolate(df)
        # export each new csv file; only strip the prefix when it is present
        # instead of blindly cutting the first five characters
        out_name = file[len(prefix):] if file.startswith(prefix) else file
        out_path = os.path.join(DATA_DIR, out_name)
        export_csv_file(df, out_path, False)

# Export one monthly-mean CSV per input file into DATA_DIR.
merge_data_frame()

import file D:\Deutschland\FUB\master_thesis\data\gee\output\occurrence_30.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_1.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\monthly_mean\1.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\monthly_mean\10.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_100.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\monthly_mean\100.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_1000.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\monthly_mean\1000.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10000.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\monthly_mean\10000.csv
import file D:\Deutschland\FUB\master_thesis\data\gee\extract_cloud30\plot_10002.csv
export file

5. Export all available datasets and their corresponding labels

In [13]:
def export_dataset(n_label_columns:int=4) -> pd.DataFrame:
    """Export the labels of every id that also occurs in the merged data.

    The raw labels (``raw_label_path``) are inner-joined on ``id`` with the
    merged table (``merge_path``); only the first ``n_label_columns`` columns
    of the join are kept and written, together with the ``id`` index, to
    ``label_path``.

    Args:
        n_label_columns: number of leading columns to keep. Presumably these
            are the label columns; the original run kept four (Spruce, Beech,
            Coniferous, Deciduous). TODO confirm against the raw label file.

    Returns:
        The filtered label frame indexed by ``id``.
    """
    # filter labels
    labels = pd.read_csv(raw_label_path, sep=',', header=0, index_col=['id'])
    merged = pd.read_csv(merge_path, sep=',', header=0, index_col=['id'])
    output = pd.merge(labels, merged, on='id', how='inner')
    # Keep the leading columns by position instead of dropping the hard-coded
    # positions 4..278, which only worked for a join of exactly 279 columns.
    output = output.iloc[:, :n_label_columns]
    export_csv_file(output, label_path, True)
    return output

# Write the filtered labels to label_path; the returned frame is shown as this
# cell's output.
export_dataset()

export file D:\Deutschland\FUB\master_thesis\data\gee\output\labels.csv


Unnamed: 0_level_0,Spruce,Beech,Coniferous,Deciduous
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.10,0.55,0.35,0.00
2,0.20,0.05,0.70,0.05
3,0.10,0.00,0.60,0.30
4,0.80,0.05,0.15,0.00
5,0.40,0.05,0.30,0.25
...,...,...,...,...
11051,0.35,0.10,0.45,0.10
11052,0.30,0.50,0.15,0.05
11053,0.30,0.25,0.45,0.00
11054,0.60,0.05,0.30,0.05
