# Transform Data

This notebook contains functions for processing, merging, and saving climate data from the NASA Earth Exchange Global Daily Downscaled Projections (NEX-GDDP-CMIP6) and weather data from Oikolab, a provider of weather and climate datasets.

## Import Packages

In [1]:
import pandas as pd
import xarray as xr
from pathlib import Path
from scipy.constants import convert_temperature

## Define Functions

These functions map the spatial resolution of the weather dataset onto the climate dataset and save the results to an interim subdirectory. 

In [2]:
def map_resolution(ds_climate, ds_weather):
    """
    Regrid a climate dataset onto the latitude/longitude coordinates of a
    weather dataset.

    Both inputs are first passed through ``drop_duplicates(...)``. The
    de-duplicated climate dataset is then interpolated along ``lat`` and
    ``lon`` at the ``latitude`` and ``longitude`` values of the
    de-duplicated weather dataset.

    Parameters:
    - ds_climate: Climate dataset with ``lat`` and ``lon`` coordinates.
    - ds_weather: Weather dataset with ``latitude`` and ``longitude``
      coordinates.

    Returns:
    - The de-duplicated climate dataset interpolated at the weather dataset's
      latitude and longitude values.
    """
    # Ellipsis is forwarded as the dimension argument; per the xarray docs it
    # requests de-duplication along all dimensions.
    deduped_climate = ds_climate.drop_duplicates(...)
    target_grid = ds_weather.drop_duplicates(...)

    # Climate coordinate name -> weather coordinate values to interpolate at
    new_coords = {
        'lat': target_grid.latitude,
        'lon': target_grid.longitude,
    }

    return deduped_climate.interp(**new_coords)


In [3]:
def save_dataset(dataset, folderpath, filename):
    """
    Save the dataset to a netCDF file.
    The netCDF file is written to an 'interim' subdirectory of the specified
    folder path. The subdirectory (and any missing parents) is created first
    if it does not already exist.

    Parameters:
    - dataset: The xarray.Dataset to be saved.
    - folderpath: A string (or path-like) specifying the directory that
      contains, or will contain, the 'interim' subdirectory.
    - filename: The name of the netCDF file to save without an extension;
      '.nc' is appended.

    """
    # Build <folderpath>/interim/<filename>.nc
    filepath = Path(folderpath) / 'interim' / (filename + '.nc')

    # Ensure the target directory exists so the write does not fail on a
    # fresh checkout where 'interim' has not been created yet
    filepath.parent.mkdir(parents=True, exist_ok=True)

    dataset.to_netcdf(filepath)

These functions convert the climate and weather Xarray datasets into pandas DataFrames. 

In [4]:
def process_climate_data(dataset):
    """
    Flatten a climate Xarray Dataset into a wide pandas DataFrame of
    Fahrenheit minimum temperatures, one column per model-scenario pair.

    Rows containing missing values are dropped after the conversion to a
    DataFrame. The 'model' and 'scenario' columns are joined with '-' into a
    'model_scenario' label, 'time' is reduced to calendar dates, and 'tasmin'
    is converted from Kelvin to Fahrenheit before pivoting.

    Parameters:
    - dataset (xarray.Dataset): The input dataset containing climate data. Expected
      to have 'model', 'scenario', 'tasmin', 'lat', 'lon', and 'time' variables.

    Returns:
    - pandas.DataFrame: One row per ('lat', 'lon', 'time') combination, with
      those three as regular columns followed by one column per
      'model_scenario' value holding the minimum Fahrenheit temperature.
    """
    # Long-format table: one row per non-missing record in the dataset
    records = dataset.to_dataframe().dropna().reset_index()

    records = records.assign(
        # e.g. '<model>-<scenario>'
        model_scenario=records['model'] + '-' + records['scenario'],
        # Keep only the calendar date of each timestamp
        time=pd.to_datetime(records['time']).dt.date,
        # Kelvin -> Fahrenheit
        fahrenheit=convert_temperature(records['tasmin'], 'kelvin', 'fahrenheit'),
    )

    # Wide format: when several records share a (lat, lon, time, model_scenario)
    # cell, the smallest Fahrenheit value is kept
    wide = pd.pivot_table(
        records,
        index=['lat', 'lon', 'time'],
        columns='model_scenario',
        values='fahrenheit',
        aggfunc='min',
    )

    return wide.reset_index()


In [5]:
def process_weather_data(dataset):
    """
    Flatten a weather Xarray Dataset into a pandas DataFrame of Fahrenheit
    temperatures keyed by 'lat', 'lon', and 'time'.

    Rows containing missing values are dropped after the conversion to a
    DataFrame, 'time' is reduced to calendar dates, and 'temperature' is
    converted from Celsius to Fahrenheit. No pivoting or aggregation is
    performed: every remaining input record yields one output row.

    Parameters:
    - dataset (xarray.Dataset): The input dataset containing weather data. Expected
      to have 'time', 'latitude', 'longitude', and 'temperature' variables.

    Returns:
    - pandas.DataFrame: Columns 'lat', 'lon', 'time', and 'fahrenheit', in
      that order.
    """
    # Long-format table: one row per non-missing record in the dataset
    observations = dataset.to_dataframe().dropna().reset_index()

    observations = observations.assign(
        # Keep only the calendar date of each timestamp
        time=pd.to_datetime(observations['time']).dt.date,
        # Celsius -> Fahrenheit
        fahrenheit=convert_temperature(
            observations['temperature'], 'celsius', 'fahrenheit'
        ),
    )

    # Keep the join keys plus the converted temperature, using the same
    # 'lat' / 'lon' column names as the climate DataFrame
    selected = observations.loc[:, ['latitude', 'longitude', 'time', 'fahrenheit']]

    return selected.rename(columns={'latitude': 'lat', 'longitude': 'lon'})


In [6]:
def merge_dataframes(df_climate, df_weather):
    """
    Left-join weather data onto climate data and keep only complete rows.

    Parameters:
    - df_climate: Preprocessed climate data with 'lat', 'lon', and 'time' columns.
    - df_weather: Preprocessed weather data with 'lat', 'lon', and 'time' columns.

    Returns:
    - DataFrame with every climate row that has a matching weather row on
      ('lat', 'lon', 'time') and no missing value in any column.
    """
    join_keys = ['lat', 'lon', 'time']

    # Left join: every climate row is kept, weather columns are NaN when
    # there is no matching observation
    combined = df_climate.merge(df_weather, how='left', on=join_keys)

    # dropna removes rows with a NaN in any column, so unmatched rows and
    # rows that already had gaps are both discarded
    return combined.dropna()
    

In [7]:
def save_dataframe(folderpath, filename, dataframe):
    """
    Saves a given dataframe to a Parquet file with GZIP compression in a
    'processed' subdirectory of the specified directory. The subdirectory
    (and any missing parents) is created first if it does not already exist.

    Parameters:
    - folderpath: str (or path-like), the path to the main folder that
      contains, or will contain, the 'processed' subdirectory.
    - filename: str, the name of the file to be saved without an extension;
      '.parquet.gzip' is appended.
    - dataframe: DataFrame, the pandas DataFrame to save.
    """
    # Build <folderpath>/processed/<filename>.parquet.gzip
    filepath = Path(folderpath) / 'processed' / (filename + '.parquet.gzip')

    # Ensure the target directory exists so the write does not fail on a
    # fresh checkout where 'processed' has not been created yet
    filepath.parent.mkdir(parents=True, exist_ok=True)

    dataframe.to_parquet(filepath, compression='gzip')

## Execute Functions

In [8]:
# Open the four raw CMIP6 climate splits and the raw weather dataset,
# in the order the target names are listed
(
    ds_climate_train_raw,
    ds_climate_validate_raw,
    ds_climate_test_raw,
    ds_climate_project_raw,
    ds_weather,
) = map(
    xr.open_dataset,
    (
        '../data/raw/CMIP6_train_easternmountain.nc',
        '../data/raw/CMIP6_validate_easternmountain.nc',
        '../data/raw/CMIP6_test_easternmountain.nc',
        '../data/raw/CMIP6_project_easternmountain.nc',
        '../data/raw/oiko_easternmountain.nc',
    ),
)

In [9]:
# Regrid every raw climate split onto the weather dataset's
# latitude/longitude coordinates
ds_climate_train, ds_climate_validate, ds_climate_test, ds_climate_project = (
    map_resolution(raw_split, ds_weather)
    for raw_split in (
        ds_climate_train_raw,
        ds_climate_validate_raw,
        ds_climate_test_raw,
        ds_climate_project_raw,
    )
)

In [10]:
# save_dataset(ds_climate_train, '../data', 'CMIP6_train_easternmountain_interpolated')

In [11]:
# save_dataset(ds_climate_validate, '../data', 'CMIP6_validate_easternmountain_interpolated')

In [12]:
# save_dataset(ds_climate_test, '../data', 'CMIP6_test_easternmountain_interpolated')

In [13]:
# save_dataset(ds_climate_project, '../data', 'CMIP6_project_easternmountain_interpolated')

In [14]:
# Flatten each regridded climate split, then the weather dataset, into
# pandas DataFrames
df_climate_train, df_climate_validate, df_climate_test, df_climate_project = map(
    process_climate_data,
    (ds_climate_train, ds_climate_validate, ds_climate_test, ds_climate_project),
)
df_weather = process_weather_data(ds_weather)

In [15]:
# Join the weather observations onto the train, validate, and test climate
# splits (the projection split is not merged here)
df_train, df_validate, df_test = (
    merge_dataframes(climate_split, df_weather)
    for climate_split in (df_climate_train, df_climate_validate, df_climate_test)
)

In [16]:
# save_dataframe('../data', 'df_easternmountain_train', df_train)

In [17]:
# save_dataframe('../data', 'df_easternmountain_validate', df_validate)

In [18]:
# save_dataframe('../data', 'df_easternmountain_test', df_test)

In [19]:
# save_dataframe('../data', 'df_easternmountain_project', df_climate_project)

In [20]:
# save_dataframe('../data', 'df_easternmountain_weather', df_weather)