In [1]:
import sys
import os
import pandas as pd
import shutil
import re
import datetime

In [2]:
def sorted_alphanumeric(data):
    """Return the strings in ``data`` sorted in natural (human) order.

    Each string is split into alternating runs of non-digits and ASCII
    digits.  Digit runs are compared as integers and the remaining text is
    compared case-insensitively, so ``'file2'`` sorts before ``'file10'``.

    Parameters
    ----------
    data : iterable of str
        Strings to sort.

    Returns
    -------
    list of str
        A new sorted list; ``data`` itself is not modified.
    """
    def alphanum_key(key):
        # re.split with a capturing group always returns text at even
        # positions and the captured ASCII digit run at odd positions.
        # Converting by position (rather than by str.isdigit()) keeps every
        # key in the same str/int layout: isdigit() is also true for
        # non-ASCII digits, which either make int() fail or place an int
        # where other keys hold a str, breaking the comparison.
        chunks = re.split('([0-9]+)', key)
        return [int(chunk) if position % 2 else chunk.lower()
                for position, chunk in enumerate(chunks)]

    return sorted(data, key=alphanum_key)

In [3]:
def day_counter(theDay, howManyDays):
    """Shift a ``YYYYMMDD`` date string by a number of days.

    Parameters
    ----------
    theDay : str
        Reference date formatted as ``%Y%m%d``.
    howManyDays : int
        Offset in days; negative values move backwards in time.

    Returns
    -------
    str
        The shifted date, again formatted as ``%Y%m%d``.
    """
    date_format = "%Y%m%d"
    reference = datetime.datetime.strptime(theDay, date_format)
    shifted = reference + datetime.timedelta(days=howManyDays)
    return shifted.strftime(date_format)

In [4]:
def ERA5_PDF_training(theDay, filePath, Data_180_Path):
    """Copy one ``ERA5_T2m_EA_<date>_PDF.csv`` file into the folder of ``theDay``.

    ``<date>`` is the day 90 days before ``theDay``.  The file is read from
    ``filePath`` and copied, under the same name, into the folder
    ``Data_180_Path + theDay``.

    Parameters
    ----------
    theDay : str
        Target day as ``YYYYMMDD``.
    filePath : str
        Folder holding the source csv files.
    Data_180_Path : str
        Root of the per-day output folders.  It is joined to ``theDay`` by
        plain string concatenation, so it must end with a path separator.
    """
    # Plain concatenation (not os.path.join): see the note on Data_180_Path.
    target_dir = Data_180_Path + theDay
    # The source file is named after the date 90 days before theDay.
    pdf_name = 'ERA5_T2m_EA_' + day_counter(theDay, -90) + '_PDF.csv'
    shutil.copy2(
        os.path.join(filePath, pdf_name),
        os.path.join(target_dir, pdf_name),
    )

In [5]:
def CSFv2_PDF_training(theDay, filePath, Data_180_Path):
    """Copy ``CFSv2_T2m_EA_<theDay>_PDF.csv`` into the folder of ``theDay``.

    The file is read from ``filePath`` and copied, under the same name, into
    the folder ``Data_180_Path + theDay``.

    Parameters
    ----------
    theDay : str
        Target day as ``YYYYMMDD``; it is also the date in the file name.
    filePath : str
        Folder holding the source csv files.
    Data_180_Path : str
        Root of the per-day output folders.  It is joined to ``theDay`` by
        plain string concatenation, so it must end with a path separator.
    """
    # Plain concatenation (not os.path.join): see the note on Data_180_Path.
    target_dir = Data_180_Path + theDay
    pdf_name = 'CFSv2_T2m_EA_' + theDay + '_PDF.csv'
    shutil.copy2(
        os.path.join(filePath, pdf_name),
        os.path.join(target_dir, pdf_name),
    )

In [12]:
def ERA5_projection_training(theDay, filePath, Data_180_Path):
    """Write ``model_eraf.csv`` for ``theDay`` from the csv files in ``filePath``.

    Every file in ``filePath`` whose name satisfies ``name[9:11] == 'EA'``
    is read as a header-less csv in which columns 0, 1 and 2 are compared
    against a year, month and day.  In each file the first row dated 89
    days before ``theDay`` is located, and the values of column 3 for that
    row and the following 89 rows (90 rows in total) are collected.  The
    collected lists, one per file in natural file-name order, become the
    rows of ``model_eraf.csv`` inside ``Data_180_Path + theDay``.

    Parameters
    ----------
    theDay : str
        Target day as ``YYYYMMDD``.
    filePath : str
        Folder holding the source csv files.
    Data_180_Path : str
        Root of the per-day output folders.  It is joined to ``theDay`` by
        plain string concatenation, so it must end with a path separator.

    Raises
    ------
    IndexError
        If one of the files has no row for the start date.
    """
    oneDayTrainingSet = Data_180_Path + theDay
    modelFiles = [file_ for file_ in sorted_alphanumeric(os.listdir(filePath))
                  if file_[9:11] == 'EA']

    # The window starts 89 days before theDay and is 90 rows long.
    # NOTE(review): presumably one row per consecutive day, so the window
    # would end on theDay itself -- confirm against the data files.
    beforeTheDay = day_counter(theDay, -89)
    year = int(beforeTheDay[:4])
    month = int(beforeTheDay[4:6])
    day = int(beforeTheDay[6:])

    projectionList = []
    for file_ in modelFiles:
        # Read each file once; the same frame is used both to locate the
        # start row and to slice out the window.
        table = pd.read_csv(os.path.join(filePath, file_), header=None)
        matches = table[(table[0] == year) & (table[1] == month) & (table[2] == day)]
        if matches.empty:
            raise IndexError(
                'no row dated {} found in {}'.format(beforeTheDay, file_)
            )
        # header=None without index_col gives a default 0..n-1 index, so
        # the label of the matching row is also its position for iloc.
        beforeDayIndex = matches.index[0]
        # iloc slicing silently truncates if fewer than 90 rows remain.
        projectionList.append(
            table.iloc[beforeDayIndex:beforeDayIndex + 90, 3].tolist()
        )

    store = pd.DataFrame(data=projectionList)
    store.to_csv(os.path.join(oneDayTrainingSet, 'model_eraf.csv'), encoding='utf-8', index=True)

In [7]:
def CSFv2_projection_training(theDay, filePath, Data_180_Path):
    """Write ``model_csfv2.csv`` for ``theDay`` from the csv files in ``filePath``.

    Every file in ``filePath`` whose name satisfies ``name[10:12] == 'EA'``
    is read as a header-less csv in which columns 0, 1 and 2 are compared
    against the year, month and day of ``theDay``.  From the first matching
    row of each file, all values from column 3 onwards are collected.  The
    collected lists, one per file in natural file-name order, become the
    rows of ``model_csfv2.csv`` inside ``Data_180_Path + theDay``.

    Parameters
    ----------
    theDay : str
        Target day as ``YYYYMMDD``.
    filePath : str
        Folder holding the source csv files.
    Data_180_Path : str
        Root of the per-day output folders.  It is joined to ``theDay`` by
        plain string concatenation, so it must end with a path separator.

    Raises
    ------
    IndexError
        If one of the files has no row for ``theDay``.
    """
    oneDayTrainingSet = Data_180_Path + theDay
    modelFiles = [file_ for file_ in sorted_alphanumeric(os.listdir(filePath))
                  if file_[10:12] == 'EA']

    year = int(theDay[:4])
    month = int(theDay[4:6])
    day = int(theDay[6:])

    projectionList = []
    for file_ in modelFiles:
        # Read each file once; the same frame is used both to locate the
        # row and to extract its values.
        table = pd.read_csv(os.path.join(filePath, file_), header=None)
        matches = table[(table[0] == year) & (table[1] == month) & (table[2] == day)]
        if matches.empty:
            raise IndexError(
                'no row dated {} found in {}'.format(theDay, file_)
            )
        # header=None without index_col gives a default 0..n-1 index, so
        # the label of the matching row is also its position for iloc.
        theDayIndex = matches.index[0]
        projectionList.append(table.iloc[theDayIndex, 3:].tolist())

    store = pd.DataFrame(data=projectionList)
    store.to_csv(os.path.join(oneDayTrainingSet, 'model_csfv2.csv'), encoding='utf-8', index=True)

In [10]:
def training_assignment(Data_180_Path, PDF_ERA5_Path, PDF_CFSv2_Path, ERA5_past_initial_path, CFSV2_past_initial_path, theDay, filePath):
    """Build the complete training folder for one day.

    Makes sure the per-day folders exist, then runs the four steps that fill
    ``Data_180_Path + theDay``: the two PDF csv copies and the two
    projection csv exports.

    Parameters
    ----------
    Data_180_Path : str
        Root of the per-day output folders; must end with a path separator
        because it is joined to ``theDay`` by string concatenation.
    PDF_ERA5_Path : str
        Folder passed to ``ERA5_PDF_training``.
    PDF_CFSv2_Path : str
        Folder passed to ``CSFv2_PDF_training``.
    ERA5_past_initial_path : str
        Folder passed to ``ERA5_projection_training``.
    CFSV2_past_initial_path : str
        Folder passed to ``CSFv2_projection_training``.
    theDay : str
        Target day as ``YYYYMMDD``.
    filePath : str
        Root under which a folder named ``theDay`` is created; must end
        with a path separator.  The driver passes ``Data_180_Path`` here.
    """
    # All four steps below write into Data_180_Path + theDay, so that is the
    # folder that has to exist.  filePath + theDay is still created as well,
    # to keep the previous behaviour for callers passing a different root.
    # makedirs(exist_ok=True) also avoids the isdir/mkdir race and creates
    # missing parent folders.
    for root in (filePath, Data_180_Path):
        os.makedirs(root + theDay, exist_ok=True)

    ERA5_PDF_training(theDay, PDF_ERA5_Path, Data_180_Path)
    CSFv2_PDF_training(theDay, PDF_CFSv2_Path, Data_180_Path)
    ERA5_projection_training(theDay, ERA5_past_initial_path, Data_180_Path)
    CSFv2_projection_training(theDay, CFSV2_past_initial_path, Data_180_Path)

In [13]:
# Data locations
Data_180_Path = 'D:/weather/data_180/'
PDF_ERA5_Path = 'F:/for_ML/PDF_ans/'
PDF_CFSv2_Path = 'F:/for_ML/PDF_CFSv2/'
ERA5_past_initial_path = 'F:/for_ML/projection/ERA5_past_initial/'
CFSV2_past_initial_path = 'F:/for_ML/projection/CFSv2_forecast/'

# makedirs also creates missing parent folders (os.mkdir would fail if
# 'D:/weather' did not exist yet) and tolerates an existing folder.
os.makedirs(Data_180_Path, exist_ok=True)

# Inclusive range of days for which a training folder is built.
firstDay = '20110830'
lastDay = '20191231'

first = datetime.datetime.strptime(firstDay,"%Y%m%d")
last = datetime.datetime.strptime(lastDay,"%Y%m%d")
# +1 because both firstDay and lastDay are included.
howManyTrainingDatas = (last - first).days + 1
print('There are',howManyTrainingDatas,'training datas.')

for i in range(howManyTrainingDatas):
    theDay = day_counter(firstDay,i)
    # Data_180_Path is passed twice: as the output root used by the writers
    # and as the root under which the per-day folder is created.
    training_assignment(Data_180_Path, PDF_ERA5_Path, PDF_CFSv2_Path, ERA5_past_initial_path, CFSV2_past_initial_path, theDay, Data_180_Path)
    print(theDay,'was done.')

There are 3046 training datas.
0    2011.0000
1       6.0000
2       2.0000
3     100.6664
Name: 91, dtype: float64
0    2011.0000
1       6.0000
2       2.0000
3     180.2717
Name: 91, dtype: float64
0    2011.0000
1       6.0000
2       2.0000
3     820.5671
Name: 91, dtype: float64
0    2011.0000
1       6.0000
2       2.0000
3     446.4357
Name: 91, dtype: float64
0    2011.0000
1       6.0000
2       2.0000
3    -617.8529
Name: 91, dtype: float64
0    2011.0000
1       6.0000
2       2.0000
3      26.0978
Name: 91, dtype: float64
0    2011.0000
1       6.0000
2       2.0000
3    -379.3626
Name: 91, dtype: float64
0    2011.0000
1       6.0000
2       2.0000
3      36.1796
Name: 91, dtype: float64
0    2011.000
1       6.000
2       2.000
3    -502.374
Name: 91, dtype: float64
0    2011.0000
1       6.0000
2       2.0000
3      -4.6415
Name: 91, dtype: float64
20110830 was done.
