In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import codecs
from datetime import datetime

In [2]:
def china(file: Path, index: list):
    """Read a Chinese in-situ CSV file and index it by ``index``.

    Every column name is converted to lower case before the index is set,
    so ``index`` must list the lower-cased names.

    Parameters
    ----------
    file : Path
        CSV file to read with ``pandas.read_csv``.
    index : list
        Column names (lower case) that become the levels of the row index.

    Returns
    -------
    pandas.DataFrame
        The file contents with lower-cased column names, indexed by ``index``.
    """
    raw = pd.read_csv(file)
    lowered = {name: name.lower() for name in raw.columns}
    return raw.rename(columns=lowered).set_index(index)

In [3]:
def korea(file: Path, index: list):
    """Read a Korean in-situ CSV file and index it by ``index``.

    The layout is chosen from the file name:

    * names containing ``'red_tite'``: the ``'time(UTC)'`` column is renamed
      to ``'time'``; no other column is added or changed.
    * names containing ``'wave_glider'``: the ``Year``/``Month``/``Day``/
      ``Hour``/``Minute`` columns are combined into ``'date'`` (``M/D/Y``)
      and ``'time'`` (``H:MM``) text columns, ``'station'`` is filled with
      ``WG1, WG2, ...`` in row order, a constant ``'country'`` column
      (``'Korea'``) is added, the date-part columns and any ``Unnamed``
      columns are dropped, and ``Longitude``/``Latitude``/``Chlorophyll-a``
      are renamed to ``lon``/``lat``/``chla``.

    Parameters
    ----------
    file : Path
        CSV file to read; ``file.name`` selects the layout.
    index : list
        Column names that become the levels of the row index.

    Returns
    -------
    pandas.DataFrame
        The normalised table indexed by ``index``.

    Raises
    ------
    ValueError
        If the file name matches neither known layout (previously the
        function silently returned ``None`` in that case).
    """
    ds = pd.read_csv(file)

    if 'red_tite' in file.name:
        return ds.rename(columns={'time(UTC)': 'time'}).set_index(index)

    if 'wave_glider' in file.name:
        parts = ['Year', 'Month', 'Day', 'Hour', 'Minute']

        dates, times = [], []
        for y, m, d, h, mn in zip(*(ds[col] for col in parts)):
            # ':g' prints whole numbers without a trailing '.0'.
            dates.append(f'{m:g}/{d:g}/{y}')
            times.append(f'{h:g}:{mn:02}')

        drop_cols = parts + [col for col in ds.columns if 'Unnamed' in col]
        ds = ds.drop(columns=drop_cols)

        # Put the new columns first, in the same order as before.
        ds.insert(0, 'station', [f'WG{i + 1}' for i in range(ds.shape[0])])
        ds.insert(1, 'country', 'Korea')
        ds.insert(2, 'date', dates)
        ds.insert(3, 'time', times)

        ds = ds.rename(columns={'Longitude': 'lon',
                                'Latitude': 'lat',
                                'Chlorophyll-a': 'chla'})
        return ds.set_index(index)

    raise ValueError(
        f"Unrecognised Korean data file {file.name!r}: expected "
        "'red_tite' or 'wave_glider' in the file name"
    )

In [4]:
def japan(file: Path, index: list):
    """Read the ``'satdset'`` sheet of a Japanese workbook and index it.

    Rows with a missing ``'date'`` or ``'time'`` are dropped, the remaining
    rows are sorted by ``'date'``, both columns are rewritten as text via
    ``fmt_date`` / ``fmt_time``, and ``'Rrs'`` in any column name is
    replaced by ``'Rrs_'``.

    Parameters
    ----------
    file : Path
        Excel workbook to read with ``pandas.read_excel``.
    index : list
        Column names that become the levels of the row index.

    Returns
    -------
    pandas.DataFrame
        The normalised table indexed by ``index``.
    """
    ds = pd.read_excel(file, sheet_name='satdset', parse_dates=False)

    # Drop incomplete rows first so missing values never reach the sort.
    ds = ds.dropna(axis=0, subset=['date', 'time'])

    # The original sorted twice with the default (unstable) algorithm.
    # One stable sort keeps rows that share a date in their sheet order.
    ds = ds.sort_values('date', axis=0, ascending=True, kind='mergesort')

    # 'date' values are passed to fmt_date with '-' as the separator.
    ds['date'] = [fmt_date(date_str=d, sep='-') for d in ds['date']]
    ds['time'] = [fmt_time(time_str=t) for t in ds['time']]

    renamed = {col: col.replace('Rrs', 'Rrs_') for col in ds.columns}
    return ds.rename(columns=renamed).set_index(index)

In [5]:
def fmt_time(time_str: str, sep: str = ':'):
    """Format a time as ``H<sep>MM`` (hour without padding, 2-digit minute).

    Parameters
    ----------
    time_str : str or object with ``strftime``
        Either text such as ``'07:05'`` / ``'07:05:30'`` whose fields are
        separated by ``sep``, or a time-like object supporting
        ``strftime('%H...%M')``.
    sep : str, default ':'
        Separator used both to split the input and to join the output.

    Returns
    -------
    str
        For example ``'7:05'``. Any fields after the minutes are discarded.
    """
    if isinstance(time_str, str):
        # Accept 'H:M' as well as 'H:M:S'; only the first two fields are used.
        # (Unpacking exactly three fields used to reject 'H:M' input.)
        h, m = time_str.split(sep)[:2]
    else:
        h, m = time_str.strftime(f'%H{sep}%M').split(sep)
    return f'{int(h)}{sep}{int(m):02}'

In [6]:
def fmt_date(date_str: str, sep: str = '/'):
    """Format a date as ``M/D/Y`` with no zero padding.

    Parameters
    ----------
    date_str : str or object with ``strftime``
        Either text laid out as month, day, year separated by ``sep``, or a
        date-like object supporting ``strftime``.
    sep : str, default '/'
        Separator of the *input* fields. The output always uses ``'/'``.

    Returns
    -------
    str
        For example ``'5/2/2019'``.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which have no strftime method.
    if isinstance(date_str, str):
        m, d, y = date_str.split(sep)
    else:
        m, d, y = date_str.strftime(f'%m{sep}%d{sep}%Y').split(sep)
    return f'{int(m)}/{int(d)}/{int(y)}'

In [7]:
def russia(file: Path, index: list):
    """Read a Russian in-situ CSV file and index it by ``index``.

    Files whose name contains ``'Rrs'`` have every ``'date'`` value passed
    through ``fmt_date``; all other files are indexed as read.

    Parameters
    ----------
    file : Path
        CSV file to read; ``file.name`` decides whether dates are reformatted.
    index : list
        Column names that become the levels of the row index.

    Returns
    -------
    pandas.DataFrame
        The table indexed by ``index``.
    """
    frame = pd.read_csv(file)
    needs_date_fix = 'Rrs' in file.name
    if needs_date_fix:
        frame['date'] = [fmt_date(date_str=value) for value in frame['date']]
    return frame.set_index(index)

In [8]:
def merge_pd(left=None, right=None, top=None, bottom=None):
    """Combine two DataFrames either vertically or side by side.

    Exactly one pair of arguments is expected:

    * ``top`` / ``bottom``: the rows of ``bottom`` are appended below ``top``.
      A ``None`` member is ignored, so ``merge_pd(top=df, bottom=None)``
      returns a copy of ``df``.
    * ``left`` / ``right``: ``right`` is left-joined onto ``left`` using the
      row index of both frames.

    Returns
    -------
    pandas.DataFrame
        The combined frame.

    Raises
    ------
    ValueError
        If both ``left`` and ``top`` are supplied (previously this silently
        returned ``None``), or if neither ``top`` nor ``bottom`` is given in
        vertical mode (raised by ``pandas.concat``).
    """
    if left is None:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent and works on older versions too.
        frames = [frame for frame in (top, bottom) if frame is not None]
        return pd.concat(frames)
    if top is None:
        return left.merge(right, left_index=True, right_index=True, how='left')
    raise ValueError('pass either left/right or top/bottom, not both')

In [9]:
# !pip install openpyxl
# !python -m pip install --upgrade pip

In [10]:
def get_data(files: list, country: str):
    """Read every file of one country and stack the results row-wise.

    Parameters
    ----------
    files : list
        Paths of the files to read, in the order their rows should appear.
    country : str
        Country name, matched case-insensitively. ``'china'``, ``'korea'``
        and ``'russia'`` select the reader of the same name; any other value
        falls back to the ``japan`` reader.

    Returns
    -------
    pandas.DataFrame or None
        All files concatenated in order, indexed by
        ``date, time, lat, lon, station``; ``None`` when ``files`` is empty.
    """
    index = ['date', 'time', 'lat', 'lon', 'station']

    # Dispatch table; the japan fallback mirrors the original ternary chain.
    readers = {'china': china, 'korea': korea, 'russia': russia}
    func = readers.get(country.lower(), japan)

    # Read everything first and concatenate once, instead of appending to a
    # growing frame file by file (which re-copies the accumulated rows on
    # every pass and relied on the removed DataFrame.append).
    frames = [func(file=f, index=index) for f in files]
    if not frames:
        return None
    return pd.concat(frames)

In [11]:
# dstr = datetime.today().strftime('%Y%m%d')
# f = f'insitu_data_collection_{dstr}.csv'
# df.to_csv(f, float_format='%.8f', encoding='utf-8')

In [12]:
# Root folder of the raw data; files are looked up as <input_path>/<country>/<name>.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
input_path = Path('C:/Users/Eligio/Documents/NPEC/NEAT/02.DataCollection')
# The per-country CSV files are written to the current working directory.
output_path = Path.cwd()

In [13]:
# Input file names, one inner list per country, in the order
# China, Korea, Russia, Japan (entries are looked up by position).
flist = [
    ['testc1_modisa .csv'],  # the space before '.csv' is part of the file name
    [
        'red_tite_south_sea(2018)_UTC.csv',
        'red_tite_south_sea(2019)_UTC.csv',
    ],
    [
        'VKachur_chl_a_new.csv',
        'VKachur_chl_OC2.csv',
        'VKachur_Rrs.csv',
    ],
    ['Toyama_data2003-2021.xls'],
]

In [14]:
# For each country: resolve its input files, load and combine them, print
# the result, and write it to '<country>_dataset.csv' under output_path.
for j, country in enumerate(('China', 'Korea', 'Russia', 'Japan')):
    # flist[j] holds the file names for the j-th country.
    files = [input_path / country / f for f in flist[j]]

    # Echo the resolved paths, one per line.
    fls = '\n\t'.join(str(f) for f in files)
    print(f'{country}: {fls}')

    save = output_path / f'{country.lower()}_dataset.csv'

    df = get_data(files=files, country=country)

    print(df)
    # reset_index turns the index levels back into ordinary columns, so
    # they are written as data and no separate index column is emitted.
    df.reset_index().to_csv(save, encoding='utf-8', index=False)

China: C:\Users\Eligio\Documents\NPEC\NEAT\02.DataCollection\China\testc1_modisa .csv
                                        chla
date      time  lat   lon     station       
5/2/2019  0:50  38.35 120.470 B02      1.350
          7:55  38.33 119.450 B05      0.860
          12:33 38.31 118.960 B07      1.610
          17:09 38.83 118.970 B10      3.240
          21:06 39.16 119.400 B12      3.010
5/3/2019  3:10  39.14 120.320 B15      0.430
4/28/2019 22:10 37.00 123.010 H01      1.580
          18:35 37.02 123.500 H02      0.750
4/16/2019 21:45 36.00 121.090 H09      0.960
4/14/2019 4:45  34.97 120.550 H10      0.520
          11:35 35.00 121.730 H12      1.710
4/15/2019 22:15 35.00 123.250 H14      3.540
4/28/2019 14:45 37.50 123.530 N02      3.340
4/29/2019 1:55  37.49 123.010 N03      0.570
          4:55  37.75 123.240 N04      0.430
          9:55  38.24 123.740 N06      2.560
          18:15 39.24 123.760 N08      1.030
          21:20 39.27 123.260 N09      1.520
4/30/2019 16:2

Japan: C:\Users\Eligio\Documents\NPEC\NEAT\02.DataCollection\Japan\Toyama_data2003-2021.xls
                                                chla_0 ss_0 cdom_0  chla_02  \
date      time  lat       lon        station                                  
5/1/2003  21:40 36.790000 137.334000 滑川       0.700000  2.5    NaN      NaN   
          22:50 36.910000 137.398000 黒部       0.600000  3.3    NaN      NaN   
5/2/2003  0:00  36.989000 137.592000 朝日       0.500000  0.6    NaN      NaN   
          3:00  36.868000 137.015000 氷見       0.800000  0.6    NaN      NaN   
          4:20  36.804000 137.088000 小矢部川河口   1.100000  1.9    NaN      NaN   
...                                                ...  ...    ...      ...   
9/29/2021 10:49 36.769667 137.227000 10       4.949821  NaN    NaN      NaN   
          9:00  36.835700 137.230133 SP       0.867746  NaN    NaN      NaN   
          10:22 36.769667 137.227000 10       4.949821  NaN    NaN      NaN   
          9:36  36.803000 137.230333 8 