In [1]:
import pandas as pd
import numpy as np
import os
from os import path
import shutil
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
import re
import traceback
%matplotlib inline

### Add Exogenous Features to Processed data

In [2]:
def add_exogenous_features(main_dir, df_recursive_temp, folder_name, debugging=False, horizon=26):
    """ Function to add exogenous features to the monthly groundwater level

    Every file in <main_dir>/<folder_name> is read, missing temperatures are
    filled from `df_recursive_temp`, temperature / calendar / cyclic /
    interaction features are added and the result is written to
    <main_dir>/clean_processed_data/processed_<filename>.
    The output directory is deleted and recreated on every call.

    Input:
        main_dir(string): main directory that hosts the temp processed data
            folder; the output folder is created inside it

        df_recursive_temp(df): dataframe of temperature recursive results with
            a "date" column plus one column per file id (the token after the
            last "-" in the file name, before the first ".")

        folder_name(string): name of the temperature preprocessed data folder

        debugging(bool): if True, stop after the first file; if False
            (default), process all files in the folder

        horizon(int): number of trailing rows whose "gw-level" is replaced by
            the 0.0 placeholder so that they survive the final NaN drop
            (default 26)

    Output:
        None
    """

    val_col_name = "gw-level"

    # `os.path` is used explicitly (not the `path` alias) so the function keeps
    # working even if a later cell rebinds the name `path`
    folder_dir = os.path.join(main_dir, folder_name)

    print("> Processing --{}..".format(folder_name))

    # create new directory for clean processed data; a missing directory is the
    # only expected failure here, anything else (e.g. permissions) must surface
    output_dir = os.path.join(main_dir, "clean_processed_data")
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        pass
    os.mkdir(output_dir)

    # loop-invariant helpers
    recursive_by_date = df_recursive_temp.set_index("date")
    poly_transformer = PolynomialFeatures(
        degree           = 2,
        interaction_only = True,
        include_bias     = False
    ).set_output(transform="pandas")

    # sorted => deterministic order (matters when debugging stops after one file)
    for filename in sorted(os.listdir(folder_dir)):

        filepath = os.path.join(folder_dir, filename)

        # process only files (don't iterate over sub directories)
        if not os.path.isfile(filepath):
            continue

        mp_num = filename.split(".")[0].split("-")[-1]

        df = pd.read_csv(filepath)[["date", val_col_name, "temp"]]
        df_exogenous_features = _build_exogenous_frame(
            df,
            recursive_by_date[mp_num].to_dict(),
            val_col_name,
            horizon,
            poly_transformer,
        )

        # save processed data to file
        out_filename = "processed_{}".format(filename)
        df_exogenous_features.to_csv(os.path.join(output_dir, out_filename))

        # if debugging, process only one file
        if debugging:
            break

    print("\t- Done!")

    return None


def _build_exogenous_frame(df, recursive_temp_by_date, val_col_name, horizon, poly_transformer):
    """ Return a date-indexed copy of `df` with all exogenous features added """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # fill the temp nan with the recursive result of the same date
    df["temp"] = df["temp"].fillna(df["date"].map(recursive_temp_by_date))

    # add temp features (closed='left' keeps the current row out of its own window)
    df["temp_roll_mean_1_year"] = df["temp"].rolling(12, closed="left").mean()
    df["temp_roll_mean_2_year"] = df["temp"].rolling(24, closed="left").mean()
    df["temp_roll_max_1_year"] = df["temp"].rolling(12, closed="left").max()
    df["temp_roll_min_1_year"] = df["temp"].rolling(12, closed="left").min()

    # add calendar features
    df["month"] = df["date"].dt.month
    df["year"] = df["date"].dt.year
    df["quarter"] = df["date"].dt.quarter

    # add season features
    df["season"] = df["month"].apply(get_season).astype("category")
    df["weather"] = df["month"].apply(get_weather).astype("category")

    # cyclic calendar features: months repeat every 12 steps, quarters every 4
    month_cyclic = cyclical_encoded(df["month"], cycle_length=12)
    quarter_cyclic = cyclical_encoded(df["quarter"], cycle_length=4)
    features = pd.concat([df, month_cyclic, quarter_cyclic], axis=1)

    # pairwise interactions between the numeric exogenous variables; only
    # complete rows are transformed, the others get NaN and are dropped below
    poly_cols = features.columns.drop(["season", "weather", "date", val_col_name]).tolist()
    poly_features = poly_transformer.fit_transform(features[poly_cols].dropna())
    poly_features = poly_features.drop(columns=poly_cols)
    poly_features.columns = [f"poly_{col}".replace(" ", "_") for col in poly_features.columns]
    features = pd.concat([features, poly_features], axis=1)

    # Set the last `horizon` entries of the target column to 0.0 so those rows
    # are not removed by dropna (guard: index[-0:] would select every row)
    if horizon > 0:
        features.loc[features.index[-horizon:], val_col_name] = 0.0

    return (
        features.dropna()
        .round({"temp_roll_mean_1_year": 2, "temp_roll_mean_2_year": 2})
        .set_index("date")
    )

In [3]:
# Map a month number to its season label (used as an exogenous variable)
def get_season(month):
    """Return 'spring', 'summer', 'Fall' or 'winter' for the given month number.

    Months 3-5 are spring, 6-8 summer, 9-11 Fall; every other value
    (12, 1, 2 and anything unrecognised) is reported as winter.
    """
    season_table = (
        ("spring", (3, 4, 5)),
        ("summer", (6, 7, 8)),
        ("Fall", (9, 10, 11)),
    )
    for label, months in season_table:
        if month in months:
            return label
    return "winter"

# month -> typical weather label; information from www.weatherspark.com site
def get_weather(month):
    """Return the weather label associated with a month number.

    1, 2, 12 -> "freezing"; 3, 11 -> "cold"; 4, 10 -> "cool";
    5, 6, 9 -> "comfortable"; anything else (7 & 8) -> "warm".
    """
    weather_table = (
        ("freezing", (1, 2, 12)),
        ("cold", (3, 11)),
        ("cool", (4, 10)),
        ("comfortable", (5, 6, 9)),
    )
    for label, months in weather_table:
        if month in months:
            return label
    return "warm"

def cyclical_encoded(data, cycle_length):
    """ Encode a periodic feature as a sine / cosine pair.

    Returns a DataFrame with the columns "<data.name>_sin" and
    "<data.name>_cos" holding sin and cos of 2*pi*data/cycle_length.
    """
    angle = 2 * np.pi * data/cycle_length

    encoded = {}
    for suffix, trig in (("sin", np.sin), ("cos", np.cos)):
        encoded[f"{data.name}_{suffix}"] = trig(angle)

    return pd.DataFrame(encoded)


In [4]:
# Load the recursive temperature results; the unnamed first CSV column holds
# the dates, so it is renamed to "date" and parsed into datetime objects
recursive_path = "/users/azeez/water_prediction/ai4ls_2_water_prediction/data/raw/df_submission_temp.csv"
df_recursive = (
    pd.read_csv(recursive_path)
    .rename(columns={"Unnamed: 0": "date"})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

In [5]:
# Run configuration for add_exogenous_features
main_dir = "/users/azeez/water_prediction/ai4ls_2_water_prediction/data/raw/"
folder_name = "temp_processed_data"

# True  => stop after the first file (quick check)
# False => process all files
FILE_PROCESSING_DEBUGGING = False

In [6]:
import warnings

# Generate the processed files; warnings are silenced only for the duration
# of this call and the previous filters are restored when the block exits
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    add_exogenous_features(
        main_dir,
        df_recursive,
        folder_name,
        debugging=FILE_PROCESSING_DEBUGGING,
    )

> Processing --temp_processed_data..
	- Done!


In [7]:
# check if the data is 487 in total as given on the deliverables
# The folder is derived from main_dir (instead of repeating the absolute path)
# so this check always looks at the directory add_exogenous_features wrote to.
processed_data_dir = os.path.join(main_dir, "clean_processed_data")

# os.listdir returns entries in arbitrary order; sort for reproducible runs
files = sorted(os.listdir(processed_data_dir))
print(f" Number of processed data : {len(files)}")

 Number of processed data : 487


In [8]:
# check irregularity in the datetime rangeindex:
# a file is faulty when its number of rows differs from the number of month
# starts between its first and its last date
filenames_with_fault = []
for filename in files:
    # deliberately not named `path`: that would rebind the name imported by
    # `from os import path` and break every later `path.join(...)` call
    csv_path = os.path.join(processed_data_dir, filename)
    df = pd.read_csv(csv_path, parse_dates=["date"]).set_index("date")
    expected_months = pd.date_range(start=df.index.min(), end=df.index.max(), freq='MS')
    if len(df.index) != len(expected_months):
        filenames_with_fault.append(filename)

counter = len(filenames_with_fault)
print(counter)
filenames_with_fault

0


[]

In [None]:
# Copy the processed files into four folders of (almost) equal size that live
# next to processed_data_dir.
total_files = len(files)

# Parent folder of processed_data_dir, kept with a trailing separator.
# os.path.dirname is used instead of searching for the "/raw/" substring,
# which silently produced a wrong prefix whenever the marker was absent.
outward_dir = os.path.join(os.path.dirname(os.path.normpath(processed_data_dir)), "")

processed_data_1 = os.path.join(outward_dir, "clean_processed_data_part1")
processed_data_2 = os.path.join(outward_dir, "clean_processed_data_part2")
processed_data_3 = os.path.join(outward_dir, "clean_processed_data_part3")
processed_data_4 = os.path.join(outward_dir, "clean_processed_data_part4")
part_dirs = [processed_data_1, processed_data_2, processed_data_3, processed_data_4]

for part_dir in part_dirs:
    os.makedirs(part_dir, exist_ok=True)

# NOTE(review): existing part folders are not emptied first; files left over
# from an earlier run with a different split stay where they are.
# Sorting makes the assignment reproducible; `i * parts // total` spreads the
# files so that folder sizes differ by at most one.
for i, filename in enumerate(sorted(files)):
    part_index = i * len(part_dirs) // total_files
    filepath = os.path.join(processed_data_dir, filename)
    new_filepath = os.path.join(part_dirs[part_index], filename)
    shutil.copy(filepath, new_filepath)