In [14]:
import pandas as pd, dill as pickle, os
from utils.calculateConstants import *
from utils.util import getTopLevelPath
from utils.elevation import getStationElevations, getStationElevationCircles
from datetime import date
from tqdm.notebook import tqdm, trange

In [2]:
def _newest_entry(directory, names):
    """Return `directory` + the entry of `names` with the latest modification time."""
    return directory + max(names, key = lambda name: os.path.getmtime(directory + name))

# Root of the data tree; every other path below is derived from it.
folder_path = getTopLevelPath() + 'data/'
stationsLonLatXY_path = folder_path + 'Measured/stationsLonLatXY.pkl'

# Inputs: the most recently modified entry in Measured/Processed/ and the most
# recently modified *.feather file in Reanalysis/.
measured_path = _newest_entry(
    folder_path + 'Measured/Processed/',
    os.listdir(folder_path + 'Measured/Processed/'),
)
reanalysis_path = _newest_entry(
    folder_path + 'Reanalysis/',
    [name for name in os.listdir(folder_path + 'Reanalysis/') if name.endswith('.feather')],
)
elevation_path = folder_path + "Elevation/IslandsDEMv1.0_20x20m_isn93_zmasl.tif"

# Lookup tables indexed by station id (used by addStationElevations / addElevationCircles).
se = getStationElevations()
ec = getStationElevationCircles()

# Outputs are stamped with the date on which the notebook is run.
today = date.today().strftime("%Y-%m-%d")
outputpath = folder_path + f'Model/data_{today}.feather'
outputpath_for_errors = folder_path + f'Model/Errors/error_{today}.feather'

In [3]:
def addLonLatXYtoMeasured(df, stationsLonLatXY_path = stationsLonLatXY_path):
    """Attach station coordinates to a measurement table.

    Loads the pickled mapping at `stationsLonLatXY_path` and, for every row,
    looks up `df.stod` in it. The four values found are written to the new
    columns 'lon', 'lat', 'X' and 'Y'; stations missing from the mapping get
    NaN in all four.

    Parameters
    ----------
    df : DataFrame with a 'stod' column. It is modified in place.
    stationsLonLatXY_path : path of the pickled station -> (lon, lat, X, Y) mapping.

    Returns
    -------
    The same DataFrame object, with the four coordinate columns added.
    """
    coordinate_columns = ['lon', 'lat', 'X', 'Y']
    missing = (np.nan, np.nan, np.nan, np.nan)

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project.
    with open(stationsLonLatXY_path, 'rb') as f:
        stationsLonLatXY = pickle.load(f)

    looked_up = df.stod.map(lambda stod: stationsLonLatXY.get(stod, missing))

    # Going through a DataFrame (rather than zip(*...) plus 4-way unpacking)
    # keeps this working when `df` has no rows.
    coords = pd.DataFrame(list(looked_up), index = df.index, columns = coordinate_columns)
    for column in coordinate_columns:
        # to_numpy() assigns positionally, so a non-unique index cannot trip alignment.
        df[column] = coords[column].to_numpy()
    return df

In [4]:
def addStationToReanalysis(df, stationnsLonLatXY_path = stationsLonLatXY_path):
    """Label every reanalysis row with the station located at its (lon, lat).

    Adds a 'LonLat' column holding the (lon, lat) tuple of each row and a
    'stod' column holding the station whose first two mapping values equal
    that tuple exactly; rows with no matching station get NaN.

    Parameters
    ----------
    df : DataFrame with 'lon' and 'lat' columns. It is modified in place.
    stationnsLonLatXY_path : path of the pickled station -> coordinates mapping.
        The misspelled parameter name is kept so existing keyword callers keep
        working; the body now actually reads it instead of silently falling
        back to the module-level path.

    Returns
    -------
    The same DataFrame object, with 'LonLat' and 'stod' added.
    """
    df['LonLat'] = list(zip(df.lon, df.lat))

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project.
    with open(stationnsLonLatXY_path, 'rb') as f:
        stationsLonLatXY = pickle.load(f)

    # Reverse lookup: (lon, lat) -> station id. The match is exact tuple
    # equality, so the coordinates must be bit-identical on both sides.
    station_by_lonlat = {v[:2]: k for k, v in stationsLonLatXY.items()}

    tqdm.pandas(desc = "Adding stations to reanalysis...")
    df['stod'] = df.LonLat.progress_map(lambda lonlat: station_by_lonlat.get(lonlat, np.nan))

    return df

In [5]:
def addElevationCircles(stod):
    """Look up `stod` in the module-level `ec` table and return the entry found."""
    circle_elevations = ec[stod]
    return circle_elevations

In [6]:
def addStationElevations(stod):
    """Look up `stod` in the module-level `se` table and return the entry found."""
    station_elevation = se[stod]
    return station_elevation

In [7]:
def addElevation(df):
    """Add station and landscape elevation columns to a measurement table.

    For every row, `df.stod` is looked up through `addStationElevations`
    (result stored in 'station_elevation') and through `addElevationCircles`
    (result stored in 'elevations'). Both helpers read module-level tables.

    Parameters
    ----------
    df : DataFrame with a 'stod' column. It is modified in place.

    Returns
    -------
    The same DataFrame object, with the two elevation columns added.
    """
    # The previous version also built an 'XYd' tuple column and immediately
    # dropped it, and re-ran getStationElevationCircles() into a local that
    # nothing read; both were pure overhead and have been removed.
    tqdm.pandas(desc = 'Adding station elevations...')
    df['station_elevation'] = df.stod.progress_map(addStationElevations)

    tqdm.pandas(desc = 'Adding landscape elevation...')
    df['elevations'] = df.stod.progress_map(addElevationCircles)

    return df

In [8]:
def prepareMeasurements(df, stationsLonLatXY_path, decimal_places = 4):
    """Clean a raw measurement table and enrich it with coordinates and elevations.

    Steps, in order: drop the 'dsdev' column, rename 'timi' to 'time', add the
    station coordinates, add the elevation columns, then round the frame to
    `decimal_places`.

    Parameters
    ----------
    df : raw measurement DataFrame containing at least 'dsdev', 'timi' and 'stod'.
    stationsLonLatXY_path : path of the pickled station -> coordinates mapping.
        It is now forwarded to `addLonLatXYtoMeasured`; previously it was
        accepted but ignored.
    decimal_places : number of decimals passed to `DataFrame.round`.

    Returns
    -------
    The prepared DataFrame.
    """
    df = df.drop(['dsdev'], axis = 1)
    df = df.rename(columns = {'timi':'time'})
    df = addLonLatXYtoMeasured(df, stationsLonLatXY_path)
    df = addElevation(df)
    df = df.round(decimal_places)
    return df

In [9]:
def prepareRenalysis(df, decimal_places = 4):
    """Reshape the long reanalysis table to one row per (lon, lat, time) and add derived columns.

    The frame is renamed to short variable names, de-duplicated on
    (lon, lat, time, height_level), pivoted so each height level becomes its
    own '<variable>_<level>' column, labelled with stations, rounded, and
    finally extended with the outputs of `rowRichardson` and
    `rowBruntVaisalaSquared`.

    Parameters
    ----------
    df : long-format reanalysis DataFrame.
    decimal_places : number of decimals passed to `DataFrame.round`.

    Returns
    -------
    The wide DataFrame, including the helper column 'cc' and the six derived columns.
    """
    short_names = {'Wind speed':'ws', 'Wind direction': 'wd', 'Pressure':'p', 'Temperature':'t'}
    df = df.rename(columns = short_names)
    df = df.drop_duplicates(subset=['lon', 'lat', 'time', 'height_level'])

    # Long -> wide: columns become (variable, height_level) pairs, then flat names.
    df = df.pivot(index = ['lon', 'lat', 'time'], columns = 'height_level')
    df = df.drop(columns='yr_month')
    df.columns = [f'{variable}_{level}' for variable, level in df.columns]

    df = addStationToReanalysis(df.reset_index())
    df['time'] = pd.to_datetime(df['time'])
    df = df.round(decimal_places)

    # One tuple per row, in the order the row* helpers receive it.
    tqdm.pandas(desc='Creating new column to calculate constants')
    profile_columns = ['t_15', 't_250', 't_500', 'p_15', 'p_250', 'p_500', 'ws_15', 'ws_250', 'ws_500']
    df['cc'] = list(zip(*(df[name] for name in profile_columns)))

    tqdm.pandas(desc='Calculating Richardson number...')
    richardson = df.cc.progress_map(rowRichardson)
    df[['Ri_01', 'Ri_12', 'Ri_02']] = richardson.to_list()

    tqdm.pandas(desc='Calculating Brunt Vaisala Squared...')
    brunt_vaisala = df.cc.progress_map(rowBruntVaisalaSquared)
    df[['N_01_squared', 'N_12_squared', 'N_02_squared']] = brunt_vaisala.to_list()
    return df

In [None]:
def merge_large_dataframes(df1, df2, chunk_size=int(5e5)):
    # Determine the total number of chunks based on the chunk size
    num_chunks = (len(df1) // chunk_size) + 1

    # Initialize an empty list to store merged chunks
    merged_chunks = []

    # Iterate over chunks of the first DataFrame
    for i in trange(num_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(df1))
        chunk_df1 = df1.iloc[start_idx:end_idx]

        # Merge the chunk of the first DataFrame with the second DataFrame
        merged_chunk = pd.merge(chunk_df1, df2, on=['stod', 'time'], how='inner', desc = 'Merging large dataframes...')

        # Append the merged chunk to the list
        merged_chunks.append(merged_chunk)

    # Concatenate all merged chunks into a single DataFrame
    merged_df = pd.DataFrame()
    for c_df in tqdm(merged_chunks, total = len(num_chunks), desc = 'Concating merged dataframe chunks...'):
        merged_df = pd.concat(merged_df, c_df, ignore_index=True)

    return merged_df

In [10]:
def merge(measured_path = measured_path, reanalysis_path = reanalysis_path):
    """Run the whole pipeline: load, prepare, merge, split off errors and save.

    Reads both feather inputs, prepares them, inner-joins them on
    ('stod', 'time') via `merge_large_dataframes`, then separates rows with
    `fg <= f` (written to `outputpath_for_errors`) from rows with `fg > f`
    (written to `outputpath`). Rows where either value is NaN satisfy neither
    comparison and therefore end up in neither file.

    Parameters
    ----------
    measured_path : feather file with the processed measurements.
    reanalysis_path : feather file with the reanalysis data.

    Returns
    -------
    DataFrame of the rows with `fg > f`, with a fresh 0..n-1 index.
    """
    measured_df = pd.read_feather(measured_path)
    reanalysis_df = pd.read_feather(reanalysis_path)
    measured_df = prepareMeasurements(measured_df, stationsLonLatXY_path)
    reanalysis_df = prepareRenalysis(reanalysis_df)
    print("Sorting dataframes...")
    measured_df = measured_df.sort_values(['stod', 'time'])
    reanalysis_df = reanalysis_df.sort_values(['stod', 'time'])
    merged_df = merge_large_dataframes(measured_df, reanalysis_df)
    print("Finished merging...")
    print("Splitting by if error (fg <= f) (should not happen)")

    # Boolean filtering leaves gaps in the index; reset it so to_feather also
    # works on pandas versions that only accept a default RangeIndex.
    errors = merged_df[merged_df.fg <= merged_df.f].reset_index(drop = True)
    merged_df = merged_df[merged_df.fg > merged_df.f].reset_index(drop = True)

    # Make sure the target directories exist before writing.
    for target in (outputpath, outputpath_for_errors):
        target_dir = os.path.dirname(target)
        if target_dir:
            os.makedirs(target_dir, exist_ok = True)

    merged_df.to_feather(outputpath)
    errors.to_feather(outputpath_for_errors)

    return merged_df

In [11]:
# Load each input table and pass it straight through its preparation step.
measured_df = prepareMeasurements(pd.read_feather(measured_path), stationsLonLatXY_path)
reanalysis_df = prepareRenalysis(pd.read_feather(reanalysis_path))

Adding station elevations...:   0%|          | 0/15494125 [00:01<?, ?it/s]

Adding landscape elevation...:   0%|          | 0/15494125 [00:00<?, ?it/s]

Adding stations to reanalysis...:   0%|          | 0/12385016 [00:00<?, ?it/s]

Calculating Richardson number...:   0%|          | 0/12385016 [00:00<?, ?it/s]

Calculating Brunt Vaisala Squared...:   0%|          | 0/12385016 [00:00<?, ?it/s]

In [12]:
# Order both tables by station, then by time, before merging.
measured_df, reanalysis_df = (
    frame.sort_values(['stod', 'time']) for frame in (measured_df, reanalysis_df)
)

In [22]:
def merge_large_dataframes(df1, df2, chunk_size=int(5e5)):
    # Determine the total number of chunks based on the chunk size
    num_chunks = (len(df1) // chunk_size) + 1

    # Initialize an empty list to store merged chunks
    merged_chunks = []

    # Iterate over chunks of the first DataFrame
    for i in trange(num_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(df1))
        chunk_df1 = df1.iloc[start_idx:end_idx]

        # Merge the chunk of the first DataFrame with the second DataFrame
        merged_chunk = pd.merge(chunk_df1, df2, on=['stod', 'time'], how='inner', desc = 'Merging large dataframes...')

        # Append the merged chunk to the list
        merged_chunks.append(merged_chunk)

    # Concatenate all merged chunks into a single DataFrame
    merged_df = pd.DataFrame()
    for c_df in tqdm(merged_chunks, total = len(num_chunks), desc = 'Concating merged dataframe chunks...'):
        merged_df = pd.concat(merged_df, c_df, ignore_index=True)

    return merged_df


In [23]:
# Join the prepared measurements (left) with the prepared reanalysis (right).
merged_df = merge_large_dataframes(df1 = measured_df, df2 = reanalysis_df)

  0%|          | 0/31 [00:00<?, ?it/s]