Subset.py

In [26]:
import pandas as pd

def subset(file_paths=[]):

    dataframes = [pd.read_csv(file) for file in file_paths]
    combined_df = pd.concat(dataframes, ignore_index=True)

    return combined_df

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Dataset.py

In [27]:
from sklearn.model_selection import train_test_split
def dataset():
    file_paths = [
        '/content/drive/MyDrive/combined_features_with_date1.csv',
        # '/content/drive/MyDrive/combined_features_with_date2.csv',
        # '/content/drive/MyDrive/combined_features_with_date3.csv'
    ]

    Dataset = subset(file_paths=file_paths)

    TrainingData, TestData = train_test_split(Dataset, test_size=0.2, random_state=42)

    TrainingData.to_csv("TrainingData.csv", index=False)
    TestData.to_csv("TestData.csv", index=False)

    with open("features_info.txt", "w") as f:
        f.write("Features used:\n")
        for feature in Dataset.columns:
            f.write(f"{feature}\n")

    print("Datasets have been successfully split and saved as 'TrainingData.csv' and 'TestData.csv'.")
    print("Features info saved as 'features_info.txt'.")

    return TrainingData, TestData


neuralnet.py

In [29]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import os

def build_model(input_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_size,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

def prepare_dataset(data, batch_size=32, shuffle=True):
    X = data.drop(columns=["streamflow"]).values
    y = data["streamflow"].values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    dataset = tf.data.Dataset.from_tensor_slices((X, y))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(X))
    dataset = dataset.batch(batch_size)

    return dataset, scaler

def train_nn(TrainingData, epochs=10, batch_size=32, checkpoint_dir="/content/checkpoints"):
    os.makedirs(checkpoint_dir, exist_ok=True)


    checkpoint_path = os.path.join(checkpoint_dir, "model_epoch_{epoch:02d}.weights.h5")
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_weights_only=True,
        save_best_only=True,
        monitor='val_loss',
        mode='min',
        verbose=1
    )

    # Remove unnecessary columns and NaNs
    if 'date' in TrainingData.columns:
        TrainingData = TrainingData.drop(columns=['date'])
    TrainingData = TrainingData.dropna()

    train_dataset, scaler = prepare_dataset(TrainingData, batch_size=batch_size, shuffle=True)


    input_size = TrainingData.drop(columns=["streamflow"]).shape[1]
    model = build_model(input_size)


    model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=train_dataset.take(20),
        callbacks=[checkpoint_callback],
        verbose=1
    )

    print("Model training complete.")
    return model, scaler

if __name__ == "__main__":

    # train_data, validation_data, test_data = dataset
    train_data = pd.read_csv('/content/TrainingData.csv').sample(10000, random_state=1)
    NNModel, scaler = train_nn(train_data)


Epoch 1/10
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m2s[0m 1s/step - loss: 7.0806 - mae: 1.6215
Epoch 1: val_loss improved from inf to 8.09063, saving model to /content/checkpoints/model_epoch_01.weights.h5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - loss: 7.9026 - mae: 1.6776 - val_loss: 8.0906 - val_mae: 1.6326
Epoch 2/10
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 94ms/step - loss: 5.8985 - mae: 1.5911
Epoch 2: val_loss improved from 8.09063 to 7.47118, saving model to /content/checkpoints/model_epoch_02.weights.h5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 7.1574 - mae: 1.6028 - val_loss: 7.4712 - val_mae: 1.5684
Epoch 3/10
[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 84ms/step - loss: 1.6809 - mae: 0.9391
Epoch 3: val_loss improved from 7.47118 to 6.94973, saving model to /content/checkpoints/model_epoch_03.weights.h5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

Time Series Forecasting:

Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

csv_path = '/content/drive/MyDrive/combined_features_with_date1.csv'

df = pd.read_csv(csv_path)

df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

date_time = df.pop('date')

# print(df.head())
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
snow_depth_water_equivalent_mean,7891952.0,42.414401,80.804755,0.0,0.0,0.23,47.76,806.96
surface_net_solar_radiation_mean,7891952.0,118.645158,81.195738,1.05,48.33,100.12,181.74,345.15
surface_net_thermal_radiation_mean,7891952.0,-51.749863,26.108172,-126.44,-72.34,-52.76,-31.24,32.79
surface_pressure_mean,7891952.0,98.336203,1.839745,88.06,97.18,98.48,99.69,104.5
temperature_2m_mean,7891952.0,4.651566,11.296679,-47.08,-2.95,4.95,14.13,31.33
dewpoint_temperature_2m_mean,7891952.0,0.395755,11.085597,-49.9,-6.93,0.99,9.37,25.66
u_component_of_wind_10m_mean,7891952.0,0.930433,2.058297,-15.35,-0.37,1.02,2.2,19.48
v_component_of_wind_10m_mean,7891952.0,0.092764,2.041131,-21.17,-1.23,0.09,1.44,16.05
volumetric_soil_water_layer_1_mean,7891952.0,0.32858,0.076177,0.06,0.27,0.34,0.39,0.76
volumetric_soil_water_layer_2_mean,7891952.0,0.328568,0.076529,0.07,0.27,0.34,0.39,0.76


In [5]:
import numpy as np


timestamp_s = date_time.map(pd.Timestamp.timestamp)


year = 365.2425 * 24 * 60 * 60  # seconds in a year

# Add sine and cosine transforms for yearly periodicity
df['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
df['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))


print(df[['Year sin', 'Year cos']].head())

   Year sin  Year cos
0 -0.006752  0.999977
1  0.010450  0.999945
2  0.027650  0.999618
3  0.044841  0.998994
4  0.062019  0.998075
