In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense



# Read the three groundwater-level CSV files in one pass. The paths point at
# the "/content" directory; update them if the files live on Google Drive.
daily_df, monthly_df, stations_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/gwl-daily.csv",
        "/content/gwl-monthly.csv",
        "/content/gwl-stations.csv",
    )
)

# Quick sanity check: (rows, columns) of each table.
print("Daily Groundwater Data:", daily_df.shape)
print("Monthly Groundwater Data:", monthly_df.shape)
print("Station Data:", stations_df.shape)

# Parse the MSMT_DATE column of both time-series tables into datetime values.
daily_df["MSMT_DATE"] = daily_df["MSMT_DATE"].pipe(pd.to_datetime)
monthly_df["MSMT_DATE"] = monthly_df["MSMT_DATE"].pipe(pd.to_datetime)

# Keep only the date, station id and WSE (the prediction target), drop rows
# with a missing value in any of the three, then order each station's
# readings by date.
columns_needed = ["MSMT_DATE", "STATION", "WSE"]
daily_data = (
    daily_df[columns_needed]
    .dropna()
    .sort_values(by=["STATION", "MSMT_DATE"])
)

# Create lag features (Time-Series Features)
def create_lagged_features(df, station_id, lags=10):
    """Build lagged-WSE predictor columns for a single station.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``STATION`` and ``WSE``. It is not modified.
    station_id
        Value matched against the ``STATION`` column to pick the rows to keep.
    lags : int, default 10
        Number of lag columns to create (``WSE_lag1`` ... ``WSE_lag{lags}``).

    Returns
    -------
    pandas.DataFrame
        The selected station's rows plus the lag columns, with every row that
        contains a missing value removed. This always removes the first
        ``lags`` rows, whose history is incomplete.

    Notes
    -----
    Lags are row offsets (``Series.shift``), so ``WSE_lagk`` is "k days ago"
    only if the rows are date-sorted with no missing days.
    NOTE(review): confirm the input has no date gaps.
    """
    history = df[df["STATION"] == station_id]
    wse = history["WSE"]
    # Dict insertion order fixes the column order: lag1, lag2, ..., lagN.
    lag_columns = {f"WSE_lag{k}": wse.shift(k) for k in range(1, lags + 1)}
    # assign() returns a new frame, so the caller's data is left untouched.
    return history.assign(**lag_columns).dropna()

# Model a single station: the one with the most rows in daily_data.
sample_station = daily_data["STATION"].value_counts().idxmax()
station_data = create_lagged_features(df=daily_data, station_id=sample_station)

# Predictors are every column whose name contains "lag"; the target is the
# current WSE reading.
features = list(filter(lambda name: "lag" in name, station_data.columns))
X, y = station_data[features], station_data["WSE"]

# Hold out the final 20% of rows. shuffle=False keeps the original row order,
# so the test set is the tail of the series rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Standardize the predictors using statistics learned from the training rows
# only, then apply the same transform to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Traditional Machine Learning Models ###

# Fixed seed for the stochastic estimator below, so that Restart & Run All
# reproduces the same metrics instead of a slightly different table each run.
RANDOM_SEED = 42

# 1. Random Forest Regressor
# random_state pins the random resampling used while growing the trees.
rf_model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

# 2. Support Vector Machine Regressor (library-default hyperparameters)
# NOTE(review): the inputs are standardized but y_train is not; if SVR
# underperforms, its default settings may not suit the target's scale.
svm_model = SVR()
svm_model.fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)
svm_mse = mean_squared_error(y_test, y_pred_svm)
svm_r2 = r2_score(y_test, y_pred_svm)

### Deep Learning Model: RNN (LSTM) ###

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialization and batch shuffling (and therefore the
# reported metrics) are repeatable across runs.
RANDOM_SEED = 42
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Reshape (samples, n_lags) -> (samples, n_lags, 1): each lag column becomes
# one timestep carrying a single feature.
# NOTE(review): columns are ordered lag1, lag2, ..., so timestep 0 is the most
# recent reading and the sequence runs newest -> oldest; confirm this ordering
# is intended.
X_train_rnn = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_rnn = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Build LSTM Model: one 50-unit LSTM layer feeding a single linear output.
lstm_model = Sequential([
    LSTM(50, activation='relu', input_shape=(X_train_rnn.shape[1], 1)),
    Dense(1)
])

# Train on mean squared error against the (unscaled) WSE target.
lstm_model.compile(optimizer='adam', loss='mse')
lstm_model.fit(X_train_rnn, y_train, epochs=20, verbose=0)

# Make LSTM predictions; flatten() turns the (n, 1) output into a 1-D array
# comparable with y_test.
y_pred_lstm = lstm_model.predict(X_test_rnn).flatten()
lstm_mse = mean_squared_error(y_test, y_pred_lstm)
lstm_r2 = r2_score(y_test, y_pred_lstm)

# Compare Model Performance: one row per model with its test-set MSE and R2.
results = pd.DataFrame({
    "Model": ["Random Forest", "SVM", "LSTM"],
    "MSE": [rf_mse, svm_mse, lstm_mse],
    "R2 Score": [rf_r2, svm_r2, lstm_r2]
})
print("\nModel Comparison:\n", results)

# Plot Predictions: every series is drawn against its position in the test
# set, so the x-axis is a sample index rather than a date.
plt.figure(figsize=(12, 6))
plt.plot(y_test.values, label="Actual", linestyle="dashed")
plt.plot(y_pred_rf, label="Random Forest Prediction")
plt.plot(y_pred_svm, label="SVM Prediction")
plt.plot(y_pred_lstm, label="LSTM Prediction")
# Axis labels let the figure stand alone when the notebook is skimmed.
plt.xlabel("Test sample index")
plt.ylabel("WSE")
plt.legend()
plt.title("Groundwater Level Predictions: ML vs LSTM")
plt.show()


In [None]:
# Improved LSTM Model
from tensorflow.keras.layers import Dropout

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialization, dropout masks and batch shuffling (and
# therefore the reported metrics) are repeatable across runs.
RANDOM_SEED = 42
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Reshape input for LSTM: (samples, n_lags) -> (samples, n_lags, 1), i.e. each
# lag column becomes one timestep carrying a single feature.
X_train_rnn = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_rnn = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Build the enhanced LSTM model. This rebinds lstm_model (and, below,
# y_pred_lstm / lstm_mse / lstm_r2), replacing the baseline LSTM's values.
# The first two LSTM layers use return_sequences=True so the next LSTM layer
# receives the full sequence; the last one emits only its final state.
lstm_model = Sequential([
    LSTM(128, activation='relu', return_sequences=True, input_shape=(X_train_rnn.shape[1], 1)),  # First LSTM layer
    Dropout(0.2),  # Dropout to prevent overfitting
    LSTM(128, activation='relu', return_sequences=True),  # Second LSTM layer
    Dropout(0.2),
    LSTM(128, activation='relu'),  # Third LSTM layer
    Dropout(0.2),
    Dense(1)  # Output layer
])

# Compile model
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

# Train LSTM model with more epochs
lstm_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32, verbose=1)

# Make Predictions; flatten() turns the (n, 1) output into a 1-D array.
y_pred_lstm = lstm_model.predict(X_test_rnn).flatten()

# Evaluate Performance
lstm_mse = mean_squared_error(y_test, y_pred_lstm)
lstm_r2 = r2_score(y_test, y_pred_lstm)

print(f"\nEnhanced LSTM Model: MSE = {lstm_mse:.5f}, R² = {lstm_r2:.5f}")

# Compare Model Performance: the Random Forest and SVM figures are the values
# already held in rf_mse / svm_mse / rf_r2 / svm_r2; only the LSTM row is new.
results = pd.DataFrame({
    "Model": ["Random Forest", "SVM", "Enhanced LSTM"],
    "MSE": [rf_mse, svm_mse, lstm_mse],
    "R2 Score": [rf_r2, svm_r2, lstm_r2]
})
print("\nUpdated Model Comparison:\n", results)

# Plot Predictions: every series is drawn against its position in the test
# set, so the x-axis is a sample index rather than a date.
plt.figure(figsize=(12, 6))
plt.plot(y_test.values, label="Actual", linestyle="dashed", color='black')
plt.plot(y_pred_rf, label="Random Forest Prediction", color='blue')
plt.plot(y_pred_svm, label="SVM Prediction", color='red')
plt.plot(y_pred_lstm, label="Enhanced LSTM Prediction", color='green')
# Axis labels let the figure stand alone when the notebook is skimmed.
plt.xlabel("Test sample index")
plt.ylabel("WSE")
plt.legend()
plt.title("Improved Groundwater Level Predictions: ML vs Enhanced LSTM")
plt.show()


In [None]:
 # Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout

# Read the daily, monthly and station CSV files from the "/content" directory.
daily_df, monthly_df, stations_df = map(
    pd.read_csv,
    (
        "/content/gwl-daily.csv",
        "/content/gwl-monthly.csv",
        "/content/gwl-stations.csv",
    ),
)

# Parse the MSMT_DATE column of both time-series tables into datetime values.
daily_df["MSMT_DATE"] = daily_df["MSMT_DATE"].pipe(pd.to_datetime)
monthly_df["MSMT_DATE"] = monthly_df["MSMT_DATE"].pipe(pd.to_datetime)

# Keep date, station id and WSE (the prediction target), drop rows with a
# missing value in any of them, then order each station's readings by date.
columns_needed = ["MSMT_DATE", "STATION", "WSE"]
daily_data = (
    daily_df[columns_needed]
    .dropna()
    .sort_values(by=["STATION", "MSMT_DATE"])
)

def create_lagged_features(df, station_id, lags=10):
    """Return one station's rows with ``WSE_lag1`` ... ``WSE_lag{lags}`` added.

    The rows of ``df`` whose ``STATION`` equals ``station_id`` are copied (the
    input frame is never modified). For each offset from 1 to ``lags`` a
    column ``WSE_lag<offset>`` is appended, holding the ``WSE`` value found
    that many rows earlier. Rows with a missing value in any column are then
    dropped, which always removes the first ``lags`` rows.

    The offsets are row positions, not calendar days.
    NOTE(review): they equal day offsets only if the data is date-sorted with
    no gaps; confirm.
    """
    station_rows = df.loc[df["STATION"] == station_id].copy()
    for step in range(lags):
        offset = step + 1
        station_rows[f"WSE_lag{offset}"] = station_rows["WSE"].shift(offset)
    complete_rows = station_rows.dropna(how="any")
    return complete_rows

# Pick the station with the most rows and build its lagged feature table.
sample_station = daily_data["STATION"].value_counts().idxmax()
station_data = create_lagged_features(daily_data, station_id=sample_station)

# Predictors: every column whose name contains "lag". Target: current WSE.
features = [name for name in station_data.columns if "lag" in name]
X, y = station_data[features], station_data["WSE"]

# Unshuffled 80/20 split: the test set is the tail of the series.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Fit the scaler on the training rows only, then transform both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Traditional Machine Learning Models

# Fixed seed for the stochastic estimator below, so that Restart & Run All
# reproduces the same predictions (and the same hybrid average downstream).
RANDOM_SEED = 42

# Random forest: random_state pins the random resampling used while growing
# the trees.
rf_model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

# Support vector regressor with library-default hyperparameters.
svm_model = SVR()
svm_model.fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialization, dropout masks and batch shuffling (and
# therefore the reported metrics) are repeatable across runs.
RANDOM_SEED = 42
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Reshape (samples, n_lags) -> (samples, n_lags, 1): each lag column becomes
# one timestep carrying a single feature.
X_train_rnn = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_rnn = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Bidirectional LSTM: the neural half of the hybrid. Only the network is built
# and trained here; the hybrid prediction is formed afterwards by averaging
# its output with the Random Forest's.
# The first two recurrent layers use return_sequences=True so the next
# recurrent layer receives the full sequence; the last emits a single vector.
lstm_model = Sequential([
    Bidirectional(LSTM(128, return_sequences=True, activation='relu'), input_shape=(X_train_rnn.shape[1], 1)),
    Dropout(0.2),
    Bidirectional(LSTM(64, return_sequences=True, activation='relu')),
    Dropout(0.2),
    Bidirectional(LSTM(32, activation='relu')),
    Dense(1)
])

# Train on mean squared error against the (unscaled) WSE target.
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')
lstm_model.fit(X_train_rnn, y_train, epochs=50, batch_size=32, verbose=0)

# flatten() turns the (n, 1) network output into a 1-D array.
y_pred_lstm = lstm_model.predict(X_test_rnn).flatten()

# Hybrid prediction: unweighted mean of the Random Forest and Bidirectional
# LSTM outputs, element by element.
hybrid_pred = (y_pred_rf + y_pred_lstm) / 2

# Score every model on the held-out test set (MSE and R2).
rf_mse, rf_r2 = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)
svm_mse, svm_r2 = mean_squared_error(y_test, y_pred_svm), r2_score(y_test, y_pred_svm)
lstm_mse, lstm_r2 = mean_squared_error(y_test, y_pred_lstm), r2_score(y_test, y_pred_lstm)
hybrid_mse, hybrid_r2 = mean_squared_error(y_test, hybrid_pred), r2_score(y_test, hybrid_pred)

# Summary table: one row per model.
results = pd.DataFrame(
    {
        "Model": ["Random Forest", "SVM", "Bidirectional LSTM", "Hybrid Model"],
        "MSE": [rf_mse, svm_mse, lstm_mse, hybrid_mse],
        "R2 Score": [rf_r2, svm_r2, lstm_r2, hybrid_r2],
    }
)
print("\nModel Comparison:\n", results)

# Plot Predictions: every series is drawn against its position in the test
# set, so the x-axis is a sample index rather than a date.
plt.figure(figsize=(12, 6))
plt.plot(y_test.values, label="Actual", linestyle="dashed")
plt.plot(y_pred_rf, label="Random Forest Prediction")
plt.plot(y_pred_svm, label="SVM Prediction")
plt.plot(y_pred_lstm, label="Bidirectional LSTM Prediction")
plt.plot(hybrid_pred, label="Hybrid Model Prediction", linestyle="dotted")
# Axis labels let the figure stand alone when the notebook is skimmed.
plt.xlabel("Test sample index")
plt.ylabel("WSE")
plt.legend()
plt.title("Groundwater Level Predictions: ML vs LSTM vs Hybrid")
plt.show()