In [3]:
# Colab Cell 1: Setup, Mounting Drive, and Installing Libraries
# This cell handles everything needed before model training.

import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted.")

# --- Define project directories in Google Drive ---
GOOGLE_DRIVE_PROJECT_ROOT = '/content/drive/MyDrive/Colab_Projects/LSTM_Water_Level_Project'
RAW_DATA_DIR = os.path.join(GOOGLE_DRIVE_PROJECT_ROOT, 'raw_data')
MODEL_SAVE_DIR = os.path.join(GOOGLE_DRIVE_PROJECT_ROOT, 'lstm_model')
os.makedirs(RAW_DATA_DIR, exist_ok=True)
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
print(f"Project root in Drive: {GOOGLE_DRIVE_PROJECT_ROOT}")

# --- Install Libraries ---
print("\nInstalling core libraries...")
!pip install -q tensorflow scikit-learn tqdm matplotlib pandas numpy
print("Required libraries installed/updated.")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted.
Project root in Drive: /content/drive/MyDrive/Colab_Projects/LSTM_Water_Level_Project

Installing core libraries...
Required libraries installed/updated.


In [4]:
# Colab Cell 2: Data Loading, Merging, and Preprocessing
# This cell loads the two CSV files, merges them, and prepares the data for the LSTM.

# --- Define the paths to your two CSV files ---
water_level_file = "water_level.csv"  # IMPORTANT: Use your actual file names
rainfall_file = "rainfall.csv"        # IMPORTANT: Use your actual file names

water_level_path = os.path.join(RAW_DATA_DIR, water_level_file)
rainfall_path = os.path.join(RAW_DATA_DIR, rainfall_file)

if not os.path.exists(water_level_path) or not os.path.exists(rainfall_path):
    raise FileNotFoundError(
        "ERROR: One or both of the data files were not found.\n"
        f"Please ensure '{water_level_file}' and '{rainfall_file}' are uploaded to {RAW_DATA_DIR}."
    )

# --- 1. Load both datasets ---
print("Loading water level and rainfall data...")
water_level_df = pd.read_csv(water_level_path, parse_dates=['Date'])
rainfall_df = pd.read_csv(rainfall_path, parse_dates=['Date'])
print("\nWater Level DataFrame Info:")
water_level_df.info()
print("\nRainfall DataFrame Info:")
rainfall_df.info()

# --- 2. Merge the two datasets on the 'Date' column ---
print("\nMerging data on 'Date' column...")
# We assume the 'Date' column is the common key.
# A left merge will keep all water level data and add rainfall where available.
df = pd.merge(water_level_df, rainfall_df, on='Date', how='left')

# Drop any rows with missing data after the merge
df.dropna(inplace=True)
print("Data merged and cleaned. Final DataFrame:")
print(df.head())
print("\nFinal DataFrame Info:")
df.info()

# --- 3. Data Cleaning and Feature Selection ---
# We'll use Rainfall as a feature to predict Water Level.
features = ['water_level_feet_NAVD88', '3A11 Rainfall (inches)']
data = df[features].values
print(f"\nSelected features for training: {features}")
print(f"Data shape: {data.shape}")

# --- 4. Normalization ---
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data)
print("\nData normalized using MinMaxScaler.")

# --- 5. Create sequences for LSTM ---
# LSTMs require data to be in a sequence format (look_back_window, features).
def create_sequences(data, look_back_window):
    X, y = [], []
    for i in range(len(data) - look_back_window):
        X.append(data[i:(i + look_back_window)])
        y.append(data[i + look_back_window, 0]) # We predict the Level value (index 0) of the next time step
    return np.array(X), np.array(y)

LOOK_BACK_WINDOW = 10 # Predict the next day's Level based on the last 10 days of data
X, y = create_sequences(scaled_data, LOOK_BACK_WINDOW)
print(f"\nCreated sequences:")
print(f"X shape (sequences): {X.shape}") # Expected: (samples, look_back_window, features)
print(f"y shape (labels): {y.shape}")    # Expected: (samples,)

# --- 6. Split data into training and validation sets ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

# --- 7. Create TensorFlow Datasets for efficient training ---
BATCH_SIZE = 8
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=len(X_train)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

print("\nData preprocessing complete. tf.data.Dataset objects are ready for model training.")

Loading water level and rainfall data...

Water Level DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5477 entries, 0 to 5476
Data columns (total 2 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     5477 non-null   object 
 1   water_level_feet_NAVD88  5477 non-null   float64
dtypes: float64(1), object(1)
memory usage: 85.7+ KB

Rainfall DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5441 entries, 0 to 5440
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    5441 non-null   object 
 1   3A11 Rainfall (inches)  5441 non-null   float64
dtypes: float64(1), object(1)
memory usage: 85.1+ KB

Merging data on 'Date' column...
Data merged and cleaned. Final DataFrame:
         Date  water_level_feet_NAVD88  3A11 Rainfall (inches)
0  08/08/2010      

In [5]:
# Colab Cell 3: LSTM Model Building and Training
# This cell defines, compiles, and trains the LSTM model.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow import keras

# --- Define the LSTM model architecture ---
def build_lstm_model(input_shape):
    model = keras.Sequential()
    # A single LSTM layer is a great start for this project
    model.add(LSTM(50, activation='relu', input_shape=input_shape))
    model.add(Dense(1))  # The output is a single value (the predicted water level)
    return model

# Define the input shape for the LSTM
input_shape = (X_train.shape[1], X_train.shape[2]) # (look_back_window, features)

# Build the model
model = build_lstm_model(input_shape)
model.summary()

# --- Compile the model ---
print("\n--- Compiling Model ---")
model.compile(optimizer='adam', loss='mean_squared_error')  # MSE is standard for regression/forecasting

# --- Training Callbacks ---
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, mode='min', restore_best_weights=True),
    ModelCheckpoint(
        filepath=os.path.join(MODEL_SAVE_DIR, 'lstm_checkpoint.keras'),
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=1
    )
]

# --- Train the model ---
print("\n--- Training Model ---")
EPOCHS = 100  # Train for more epochs, EarlyStopping will stop it when it's done learning
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=val_dataset,
    callbacks=callbacks
)

# --- Evaluate the model ---
print("\n--- Evaluating Model on Validation Set ---")
loss = model.evaluate(val_dataset, verbose=0)
print(f"Validation Loss (MSE): {loss:.4f}")

# Save the final model
model.save(os.path.join(MODEL_SAVE_DIR, 'lstm_model_final.keras'))
print(f"\nFinal LSTM model saved to: {os.path.join(MODEL_SAVE_DIR, 'lstm_model_final.keras')}")

  super().__init__(**kwargs)



--- Compiling Model ---

--- Training Model ---
Epoch 1/100
[1m537/543[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - loss: 0.0199
Epoch 1: val_loss improved from inf to 0.00041, saving model to /content/drive/MyDrive/Colab_Projects/LSTM_Water_Level_Project/lstm_model/lstm_checkpoint.keras
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 14ms/step - loss: 0.0197 - val_loss: 4.0791e-04
Epoch 2/100
[1m533/543[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - loss: 3.5676e-04
Epoch 2: val_loss improved from 0.00041 to 0.00032, saving model to /content/drive/MyDrive/Colab_Projects/LSTM_Water_Level_Project/lstm_model/lstm_checkpoint.keras
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 3.5685e-04 - val_loss: 3.1742e-04
Epoch 3/100
[1m542/543[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - loss: 3.5943e-04
Epoch 3: val_loss did not improve from 0.00032
[1m543/543[0m [32m━━━━━━━━━━━━

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Ensure model is trained in the previous cell
# Predict on validation set
y_val_pred = model.predict(X_val)

# Inverse scale predictions and true values
y_val_true_inv = scaler.inverse_transform(
    np.hstack((y_val.reshape(-1,1), np.zeros((len(y_val), 1))))
)[:, 0]

y_val_pred_inv = scaler.inverse_transform(
    np.hstack((y_val_pred.reshape(-1,1), np.zeros((len(y_val_pred), 1))))
)[:, 0]

# Compute metrics
rmse = np.sqrt(mean_squared_error(y_val_true_inv, y_val_pred_inv))
mae = mean_absolute_error(y_val_true_inv, y_val_pred_inv)
r2 = r2_score(y_val_true_inv, y_val_pred_inv)

print("📊 Validation Metrics:")
print(f"  RMSE: {rmse:.2f} ft")
print(f"  MAE: {mae:.2f} ft")
print(f"  R²: {r2:.3f}")


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
📊 Validation Metrics:
  RMSE: 0.04 ft
  MAE: 0.02 ft
  R²: 0.995


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# --- Load the saved scaler ---
# In your training notebook, you would save the scaler object after training.
# For this example, we'll create a new one, but in a real project,
# you would load the saved scaler to ensure consistency.
scaler = MinMaxScaler(feature_range=(0, 1))

# --- Simulate new data for the last 10 days ---
# In a real project, this would be a new CSV or API call.
new_data_raw = pd.DataFrame({
    'Date': pd.to_datetime(['2025-04-10', '2025-04-11', '2025-04-12', '2025-04-13', '2025-04-14', '2025-04-15', '2025-04-16', '2025-04-17', '2025-04-18', '2025-04-19']),
    'Level': [100.2, 101.5, 102.3, 102.8, 103.1, 102.9, 102.5, 101.9, 101.1, 100.5],
    'Rainfall': [5.1, 2.5, 0.0, 0.0, 1.2, 3.4, 0.1, 0.0, 0.5, 1.0]
})
features = ['Level', 'Rainfall']
new_data = new_data_raw[features].values

# Fit the scaler on your original training data, then transform the new data.
# This ensures the new data is scaled consistently.
# For this example, we'll just fit and transform on the new data, which is not ideal.
scaled_new_data = scaler.fit_transform(new_data)

# Reshape the data for the LSTM model: (1, look_back_window, features)
look_back_window = 10
X_new = scaled_new_data.reshape(1, look_back_window, len(features))

print(f"New input tensor shape: {X_new.shape}")
print(f"New input tensor data type: {X_new.dtype}")

New input tensor shape: (1, 10, 2)
New input tensor data type: float64


In [None]:
import os
import tensorflow as tf
from tensorflow.keras.models import load_model

# --- Load the saved model ---
MODEL_SAVE_DIR = '/content/drive/MyDrive/Colab_Projects/LSTM_Water_Level_Project/lstm_model'
model_path = os.path.join(MODEL_SAVE_DIR, 'lstm_model_final.keras')

try:
    loaded_model = load_model(model_path)
    print("Model loaded successfully.")
except Exception as e:
    print(f"ERROR: Failed to load the model. Ensure the path is correct and the file exists. Error: {e}")
    raise

# --- Make a prediction ---
# 'X_new' is the tensor you prepared in the previous step.
predicted_scaled_level = loaded_model.predict(X_new)[0][0]
print(f"\nPredicted scaled level: {predicted_scaled_level:.4f}")

# --- Inverse transform the prediction to get the real water level ---
# The scaler was trained on a specific number of features.
# You need to create a dummy array to inverse transform only the 'Level' value.
dummy_array = np.zeros((1, len(features)))
dummy_array[0, 0] = predicted_scaled_level
predicted_level = scaler.inverse_transform(dummy_array)[0][0]

print(f"\nPredicted water level for the next day: {predicted_level:.2f}")

Model loaded successfully.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 795ms/step

Predicted scaled level: 0.1960

Predicted water level for the next day: 100.77


In [None]:
# get chat gpt code and try to increase data range
# try to find indian data and train model