<a href="https://colab.research.google.com/github/aaronraiftorres/BSCS1A-TORRES/blob/main/BALDA_50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import folium
from folium.plugins import HeatMap
from IPython.display import IFrame

In [None]:
# Load each CSV file, index it by date, and derive a monthly version of it.
def _load_indexed_and_monthly(csv_path):
    """Read one CSV and return ``(indexed, monthly)`` DataFrames.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file that has a 'date' column.

    Returns
    -------
    indexed : pandas.DataFrame
        The file contents with 'date' parsed to datetimes and used as index.
    monthly : pandas.DataFrame
        Month-end means of ``indexed``, with gaps filled by
        ``interpolate(method='time')``.
    """
    indexed = pd.read_csv(csv_path)
    indexed['date'] = pd.to_datetime(indexed['date'])
    indexed = indexed.set_index('date')
    # An explicit MonthEnd offset is the same rule as the 'M' string alias,
    # without relying on the alias itself, which newer pandas deprecates.
    monthly = indexed.resample(pd.offsets.MonthEnd()).mean().interpolate(method='time')
    return indexed, monthly


# The date-indexed frames are kept because later cells read hotspots_data;
# the *_resampled frames are what gets merged and modelled.
hotspots_data, hotspots_resampled = _load_indexed_and_monthly('hotspots.csv')
sst_data, sst_resampled = _load_indexed_and_monthly('sst.csv')
ssh_data, ssh_resampled = _load_indexed_and_monthly('ssh.csv')
chl_data, chl_resampled = _load_indexed_and_monthly('chl.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'hotspots.csv'

In [None]:
# Place the four monthly frames side by side; rows are aligned on the date index
data_resampled = pd.concat(
    [sst_resampled, ssh_resampled, chl_resampled, hotspots_resampled],
    axis='columns',
)

In [None]:
# Count the missing values in each column of the merged frame and report them
nan_counts = data_resampled.isnull().sum()
print("NaN counts after merging:", nan_counts, sep="\n")


In [None]:
# Fill NaNs with the backward fill method: each gap takes the next valid value
# further down the same column. DataFrame.bfill() replaces the deprecated
# fillna(method='bfill') spelling. NaNs at the very end of a column have no
# later value to copy from, so they are left untouched by this step.
data_resampled = data_resampled.bfill()


In [None]:
# Re-check for NaN values now that the backward fill has been applied.
# The label previously said "after merging", which described the earlier check.
nan_counts = data_resampled.isna().sum()
print("NaN counts after backward fill:")
print(nan_counts)

In [None]:
# Normalization: features and target get separate min-max scalers so that
# target_scaler alone can map predictions back to the original units later.
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Must match the column name in the data exactly
target_column = 'squid_abundance_per_kgs'

# Fail early with an explicit message when the target column is absent
if target_column not in data_resampled.columns:
    raise KeyError(f"The target column '{target_column}' does not exist in the data.")

# Features are every column except the target; the target stays a Series
X = data_resampled.drop(target_column, axis=1)
y = data_resampled[target_column]

# NOTE(review): both scalers are fitted on the full series here, i.e. before
# the train/test folds are created further down - confirm this is intended.
X_normalized = pd.DataFrame(
    feature_scaler.fit_transform(X),
    index=X.index,
    columns=X.columns,
)
y_normalized = pd.DataFrame(
    target_scaler.fit_transform(y.to_numpy().reshape(-1, 1)),
    index=y.index,
    columns=[target_column],
)

In [None]:
# Function to create sequences
def create_sequences(X, y, time_steps=1):
    """Turn aligned feature/target rows into sliding windows for an LSTM.

    Window ``i`` contains rows ``i .. i + time_steps - 1`` of ``X`` and is
    paired with row ``i + time_steps`` of ``y`` (the step right after the
    window).

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Feature rows, ordered in time.
    y : pandas.DataFrame, pandas.Series or array-like
        Target rows aligned with ``X``. A 1-D target (e.g. a Series) is
        treated as a single column.
    time_steps : int, default 1
        Number of consecutive rows in each window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows of shape ``(n, time_steps, n_features)`` and targets of shape
        ``(n, n_targets)`` where ``n = len(X) - time_steps``. When ``n <= 0``
        both arrays are empty (length 0).
    """
    feature_rows = np.asarray(X)
    target_rows = np.asarray(y)
    # A 1-D target would yield scalars per step; keep one row per step instead
    # so the result has the same (n, 1) layout a single-column DataFrame gives.
    if target_rows.ndim == 1:
        target_rows = target_rows.reshape(-1, 1)

    n_windows = len(feature_rows) - time_steps  # range() is empty when <= 0
    Xs = [feature_rows[start:start + time_steps] for start in range(n_windows)]
    ys = [target_rows[start + time_steps] for start in range(n_windows)]
    return np.array(Xs), np.array(ys)

time_steps = 9  # e.g., using past 9 months to predict the next month

In [None]:
# Cross-validation: walk the time-ordered folds and check that each one is
# long enough to produce at least one LSTM sequence.
tscv = TimeSeriesSplit(n_splits=5)

train_mse_scores = []
test_mse_scores = []
y_train_pred_all = []
y_train_actual_all = []
y_test_pred_all = []
y_test_actual_all = []

for train_index, test_index in tscv.split(X_normalized):
    X_train, X_test = X_normalized.iloc[train_index], X_normalized.iloc[test_index]
    y_train, y_test = y_normalized.iloc[train_index], y_normalized.iloc[test_index]
    # The per-fold slice of `coordinates` that used to be here was removed:
    # `coordinates` is only defined in a later cell, so this loop raised a
    # NameError on a fresh kernel, and the sliced values were never used.

    X_train_lstm, y_train_lstm = create_sequences(X_train, y_train, time_steps)
    X_test_lstm, y_test_lstm = create_sequences(X_test, y_test, time_steps)

    # Ensure that sequences are not empty
    if len(X_train_lstm) == 0 or len(X_test_lstm) == 0:
        print("Skipping due to insufficient data to create sequences.")
        continue


In [None]:
# Cross-validation split over time-ordered folds
tscv = TimeSeriesSplit(n_splits=5)

# Accumulators for scores and predictions; the evaluation cell appends to them
train_mse_scores, test_mse_scores = [], []
y_train_pred_all, y_train_actual_all = [], []
y_test_pred_all, y_test_actual_all = [], []

for train_index, test_index in tscv.split(X_normalized):
    # Positional row selection for this fold
    X_train = X_normalized.iloc[train_index]
    X_test = X_normalized.iloc[test_index]
    y_train = y_normalized.iloc[train_index]
    y_test = y_normalized.iloc[test_index]

    # Window the fold into (samples, time_steps, features) / (samples, 1)
    X_train_lstm, y_train_lstm = create_sequences(X_train, y_train, time_steps=time_steps)
    X_test_lstm, y_test_lstm = create_sequences(X_test, y_test, time_steps=time_steps)

# NOTE(review): every iteration overwrites the *_lstm arrays, so what is
# printed here (and used by the following cells) comes from the final fold.
print("Shape of X_train_lstm:", X_train_lstm.shape)
print("Shape of y_train_lstm:", y_train_lstm.shape)
print("Shape of X_test_lstm:", X_test_lstm.shape)
print("Shape of y_test_lstm:", y_test_lstm.shape)



In [None]:
# Build the model using LSTM: two stacked 64-unit LSTM layers, each followed
# by dropout, then a 32-unit ReLU dense layer and a single linear output.
# Every weighted layer except the output carries an L2 kernel penalty of 0.01.
model = Sequential([
    LSTM(
        units=64,
        return_sequences=True,  # hand the full sequence to the next LSTM
        input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]),
        kernel_regularizer=l2(0.01),
    ),
    Dropout(0.3),
    LSTM(units=64, return_sequences=False, kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(units=32, activation='relu', kernel_regularizer=l2(0.01)),
    Dense(units=1),
])

model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Train the model on the windowed training data; validation_split=0.2 asks
# Keras to reserve a fifth of these samples for validation.
model.fit(
    X_train_lstm,
    y_train_lstm,
    batch_size=16,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)


In [None]:
 # Predictions
# Model outputs are still on the normalized (scaler) scale at this point
y_train_pred = model.predict(X_train_lstm)
y_pred = model.predict(X_test_lstm)

# Inverse transform predictions and ground truth back to original units
y_train_pred_inv, y_test_pred_inv, y_train_inv, y_test_inv = (
    target_scaler.inverse_transform(scaled)
    for scaled in (y_train_pred, y_pred, y_train_lstm, y_test_lstm.reshape(-1, 1))
)

# Store predictions for visualization
# NOTE(review): these appends accumulate, so re-running this cell without
# re-running the split cell adds duplicate entries to the lists.
y_train_pred_all.append(y_train_pred_inv)
y_train_actual_all.append(y_train_inv)
y_test_pred_all.append(y_test_pred_inv)
y_test_actual_all.append(y_test_inv)

# Calculate MSE in original units and record it
train_mse = mean_squared_error(y_true=y_train_inv, y_pred=y_train_pred_inv)
test_mse = mean_squared_error(y_true=y_test_inv, y_pred=y_test_pred_inv)

train_mse_scores.append(train_mse)
test_mse_scores.append(test_mse)

# Average MSE scores over everything recorded so far
average_train_mse = np.mean(train_mse_scores)
average_test_mse = np.mean(test_mse_scores)

print(f'Average Train MSE: {average_train_mse}')
print(f'Average Test MSE: {average_test_mse}')

In [None]:
# Future Predictions
def predict_future(model, X, n_steps, time_steps, target_col=0):
    """Roll ``model`` forward ``n_steps`` times, starting from the tail of ``X``.

    The last ``time_steps`` rows of ``X`` seed a sliding window. At each step
    the window is passed to ``model.predict`` as a batch of shape
    ``(1, time_steps, n_features)``; element ``[0, 0]`` of the result is
    recorded. The window then drops its oldest row and gains a new row that
    copies the most recent row, optionally with the prediction written into
    one column.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch)`` that returns a 2-D array.
    X : pandas.DataFrame or 2-D array-like
        Historical rows, ordered in time.
    n_steps : int
        Number of future steps to generate.
    time_steps : int
        Window length fed to the model (must be >= 1).
    target_col : int or None, default 0
        Column of the new row that receives the prediction. The default (0)
        keeps the original behaviour. Pass ``None`` when ``X`` contains no
        column that corresponds to the predicted quantity; the new row is
        then an unchanged copy of the latest row.

    Returns
    -------
    numpy.ndarray
        1-D array holding the ``n_steps`` predictions, on the model's scale.

    Raises
    ------
    ValueError
        If ``time_steps`` is below 1 or ``X`` has fewer than ``time_steps`` rows.
    """
    history = np.asarray(X)
    if time_steps < 1 or len(history) < time_steps:
        raise ValueError(
            f"Need at least time_steps={time_steps} rows (and time_steps >= 1) "
            f"to seed the forecast; got {len(history)} rows."
        )

    n_features = history.shape[1]  # number of features in the dataset
    window = history[-time_steps:]  # the last sequence from the dataset
    future_preds = []

    for _ in range(n_steps):
        pred = model.predict(window.reshape(1, time_steps, n_features))
        # Pull out the scalar explicitly: assigning the whole (1, 1) array to
        # a single element relies on an array-to-scalar conversion that NumPy
        # has deprecated.
        next_value = pred[0, 0]
        future_preds.append(next_value)

        # Slide the window forward: start from a float copy of the latest row
        # so the remaining features are carried over unchanged.
        next_row = np.array(window[-1], dtype=float)
        if target_col is not None:
            next_row[target_col] = next_value

        window = np.vstack((window[1:], next_row))  # drop oldest, append newest

    return np.array(future_preds)

# Coordinates are taken from the date-indexed hotspots frame, on the
# assumption that every dataset shares the same coordinates.
coordinates = hotspots_data.loc[:, ['latitude', 'longitude']]

# Number of months to predict into the future
n_future_steps = 9

# Roll the trained model forward from the end of the normalized features.
# NOTE(review): X_normalized was built without the target column, while
# predict_future writes each prediction into the first column of the new
# window row - confirm that overwriting the first feature is intended.
future_predictions = predict_future(
    model,
    X_normalized,
    n_steps=n_future_steps,
    time_steps=time_steps,
)

# Inverse transform the future predictions back to the target's original units
future_predictions_inv = target_scaler.inverse_transform(
    future_predictions.reshape(-1, 1)
)


In [None]:
# Create future dates: month-end stamps that follow the last observed month.
# The range is anchored at last_date and its first element is dropped, so
# exactly n_future_steps dates remain. A MonthEnd offset object is the same
# rule as the 'M' string alias, without relying on the alias itself, which
# newer pandas releases deprecate.
last_date = data_resampled.index[-1]
future_dates = pd.date_range(
    start=last_date,
    periods=n_future_steps + 1,
    freq=pd.offsets.MonthEnd(),
)[1:]

# Calculate mean and standard deviation of coordinates
mean_lat = coordinates['latitude'].mean()
mean_lon = coordinates['longitude'].mean()
std_lat = coordinates['latitude'].std()
std_lon = coordinates['longitude'].std()

In [None]:
# Generate varied future coordinates by sampling a normal distribution around
# the mean observed position; the fixed seed keeps the draw reproducible.
# NOTE(review): these locations are random draws, not model outputs.
np.random.seed(42)
future_lats = np.random.normal(mean_lat, std_lat, n_future_steps)
future_lons = np.random.normal(mean_lon, std_lon, n_future_steps)

# One row per forecast month: sampled position, predicted value, and date
future_geo_predictions = pd.DataFrame(
    {
        'latitude': future_lats,
        'longitude': future_lons,
        'prediction': future_predictions_inv.ravel(),
        'date': future_dates,
    }
)

# Map centered on the average observed latitude and longitude
m = folium.Map(location=[mean_lat, mean_lon], zoom_start=6)

# Heat map points are [lat, lon, weight], with the prediction as the weight
heat_data = [
    [point.latitude, point.longitude, point.prediction]
    for point in future_geo_predictions.itertuples(index=False)
]
HeatMap(heat_data).add_to(m)

# Save the map as an HTML file
map_filename = 'future_predictions_heatmap.html'
m.save(map_filename)

# Display the saved map inline in the notebook
IFrame(map_filename, width=700, height=500)

In [None]:
## (Translated author note, approximate: "this part is no longer here".)
## NOTE(review): this cell rebinds future_geo_predictions and m, and writes to
## the same HTML filename as the previous heat map cell.

# Coordinates come from the hotspots frame, on the assumption that every
# dataset shares the same coordinates.
coordinates = hotspots_data.loc[:, ['latitude', 'longitude']]

# Every forecast row reuses the last recorded latitude/longitude
future_geo_predictions = pd.DataFrame(
    {
        'latitude': [coordinates['latitude'].iloc[-1]] * n_future_steps,
        'longitude': [coordinates['longitude'].iloc[-1]] * n_future_steps,
        'prediction': future_predictions_inv.ravel(),
        'date': future_dates,
    }
)

# Map centered around the average latitude and longitude of the forecast rows
m = folium.Map(
    location=[
        future_geo_predictions['latitude'].mean(),
        future_geo_predictions['longitude'].mean(),
    ],
    zoom_start=6,
)

# Heat map points are [lat, lon, weight], with the prediction as the weight
heat_data = [
    [lat, lon, weight]
    for lat, lon, weight in zip(
        future_geo_predictions['latitude'],
        future_geo_predictions['longitude'],
        future_geo_predictions['prediction'],
    )
]
HeatMap(heat_data).add_to(m)

# Save the map as an HTML file
map_filename = 'future_predictions_heatmap.html'
m.save(map_filename)

# Display the saved map inline in the notebook
IFrame(map_filename, width=700, height=500)