<a href="https://colab.research.google.com/github/25Iqbalhossain/Co2_series_injection/blob/main/LSTM_test_with_C02_injections.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV  # <-- Added GridSearchCV import
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from scipy.fftpack import fft
from scipy.signal import savgol_filter
import matplotlib.pyplot as plt
from google.colab import drive

# Make the Google Drive contents visible under /content/drive (Colab only).
drive.mount('/content/drive')

# Read the training workbook and the separate test workbook from Drive.
train_df = pd.read_excel('/content/drive/MyDrive/Co2_Injection_Ml_Data_Xl/CO2_Injection_rate train.xlsx')
test_df = pd.read_excel('/content/drive/MyDrive/Co2_Injection_Ml_Data_Xl/CO2_Injection_rate test exam.xlsx')

# Header clean-up: remove leading/trailing whitespace from every column name
# so that later lookups such as 'Date Time' and 'inj_diff' match exactly.
for _frame in (train_df, test_df):
    _frame.columns = _frame.columns.str.strip()

# Feature Engineering: Signal Processing + Lag Features
def _add_signal_features(df, columns, window=5, polyorder=2):
    """Return *df* extended with lag, rolling, FFT and smoothed feature columns.

    For every name in *columns* that exists in *df*, the following columns are
    derived (in this order):

    * ``<col>_lag_1`` / ``<col>_lag_2`` -- the column shifted by 1 and 2 rows.
    * ``<col>_rolling_mean`` / ``<col>_rolling_std`` -- rolling statistics over
      *window* rows.
    * ``<col>_fft`` -- magnitude of the FFT of the numeric, non-NaN values,
      zero-padded at the end to the frame length. Skipped (with a printed
      message) when the column has no numeric values at all.
    * ``<col>_smooth`` -- Savitzky-Golay filtered column (NaN replaced by 0
      before filtering).

    All derived columns are collected first and attached with a single
    ``pd.concat``; inserting them one at a time fragments the frame and makes
    pandas emit a PerformanceWarning once enough columns have been added.

    Args:
        df: Source frame; it is not modified.
        columns: Names of the columns to derive features from. Names missing
            from *df* are ignored.
        window: Row count for the rolling statistics and the Savitzky-Golay
            window length.
        polyorder: Polynomial order of the Savitzky-Golay filter.

    Returns:
        A new DataFrame containing the columns of *df* followed by the derived
        feature columns.
    """
    derived = {}
    for col in columns:
        if col not in df.columns:
            continue
        series = df[col]

        # Lag and rolling-window features. Stored as plain arrays so they are
        # attached positionally, exactly like the FFT/smoothing outputs below.
        derived[f'{col}_lag_1'] = series.shift(1).to_numpy()
        derived[f'{col}_lag_2'] = series.shift(2).to_numpy()
        rolling = series.rolling(window=window)
        derived[f'{col}_rolling_mean'] = rolling.mean().to_numpy()
        derived[f'{col}_rolling_std'] = rolling.std().to_numpy()

        # FFT magnitude of whatever can be parsed as a number; the spectrum is
        # shorter than the frame when values were dropped, so pad with zeros.
        numeric = pd.to_numeric(series, errors='coerce').dropna()
        if not numeric.empty:
            spectrum = np.abs(fft(numeric.to_numpy()))
            derived[f'{col}_fft'] = np.pad(spectrum, (0, len(series) - len(spectrum)), mode='constant')
        else:
            print(f'Skipping FFT for column {col} due to insufficient data.')

        # Savitzky-Golay smoothing (NaN treated as 0).
        derived[f'{col}_smooth'] = savgol_filter(series.fillna(0), window_length=window, polyorder=polyorder)

    features = pd.DataFrame(derived, index=df.index)
    # If a derived name already exists in df, the freshly computed column wins
    # (avoids duplicate column labels after the concat).
    base = df.drop(columns=[name for name in features.columns if name in df.columns])
    return pd.concat([base, features], axis=1)


# Every column except the timestamp and the target is used as a signal source.
numerical_features = [col for col in train_df.columns if col not in ['Date Time', 'inj_diff']]

# Apply the identical feature recipe to both frames.
train_df = _add_signal_features(train_df, numerical_features)
test_df = _add_signal_features(test_df, numerical_features)

# Discard every training row that still contains a NaN; the shift and rolling
# features leave gaps in the first rows of the frame.
train_df = train_df.dropna()

# Target vs. predictors (the timestamp column is not used as a feature).
y = train_df['inj_diff']
X = train_df.drop(columns=['inj_diff', 'Date Time'])

# 80/20 hold-out split with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default; with lag/rolling
# features built from neighbouring rows this may leak information between the
# two splits -- confirm whether a chronological split is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# LSTM input layout is (samples, time steps, features): insert a single
# time-step axis in the middle.
X_train_lstm = X_train_scaled[:, np.newaxis, :]
X_test_lstm = X_test_scaled[:, np.newaxis, :]

# Build LSTM Model: two stacked LSTM layers with dropout, one linear output.
n_features = X_train.shape[1]

lstm_model = Sequential()
lstm_model.add(LSTM(64, return_sequences=True, input_shape=(1, n_features)))
lstm_model.add(Dropout(0.3))
lstm_model.add(LSTM(32))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

# Mean-squared-error regression objective, MAE reported alongside.
adam_optimizer = Adam(learning_rate=0.001)
lstm_model.compile(optimizer=adam_optimizer, loss='mse', metrics=['mae'])

# Train LSTM. Training stops after 10 epochs without val_loss improvement and
# the best weights seen are restored.
# NOTE(review): the hold-out split doubles as the early-stopping validation
# set, so metrics later computed on it are likely optimistic -- confirm.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lstm_model.fit(
    X_train_lstm,
    y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_test_lstm, y_test),
    verbose=1,
    callbacks=[early_stopping],
)

# XGBoost Hyperparameter tuning with GridSearchCV.
# Exhaustive grid: 3 values for each of 5 parameters, 3-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 6, 9],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

base_regressor = xgb.XGBRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=base_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)
# The search is run on the raw (unscaled) training features.
grid_search.fit(X_train, y_train)

# Best XGBoost Model: the estimator for the best-scoring parameter combination.
best_xgb_model = grid_search.best_estimator_

# Hold-out predictions from both models. The LSTM sees the scaled 3-D input,
# XGBoost sees the raw feature frame it was fitted on.
lstm_preds = lstm_model.predict(X_test_lstm).flatten()
xgb_preds = best_xgb_model.predict(X_test)

# Hybrid Prediction: fixed-weight blend of the two models.
lstm_weight, xgb_weight = 0.6, 0.4
final_preds = lstm_weight * lstm_preds + xgb_weight * xgb_preds

# Score the blend on the hold-out split. The "percentage" is simply R² * 100.
mae = mean_absolute_error(y_test, final_preds)
r2 = r2_score(y_test, final_preds)
accuracy_percentage = r2 * 100
print(f'Validation R² Score: {r2:.4f} ({accuracy_percentage:.2f}%)')
print(f'Mean Absolute Error (MAE): {mae:.4f}')

# Predicting on Test Data
# Select exactly the feature columns the models were fitted on, in the same order.
test_features_raw = test_df.drop(columns=['Date Time', 'inj_diff'], errors='ignore')[X.columns]

# The shift/rolling features leave NaN in the first rows of test_df, and unlike
# train_df those rows are never dropped. A NaN input makes the LSTM output NaN,
# so fill the gaps from the nearest valid row (backward first for the warm-up
# rows, then forward for anything left) to get a prediction for every row.
test_features_raw = test_features_raw.bfill().ffill()

# The LSTM was trained on standardized inputs shaped (samples, 1, features).
test_features = scaler.transform(test_features_raw)
test_features_lstm = test_features.reshape(test_features.shape[0], 1, test_features.shape[1])

lstm_test_preds = lstm_model.predict(test_features_lstm).flatten()
# XGBoost was fitted and validated on the raw, unscaled features, so it must
# receive raw features here as well -- not the scaler output.
xgb_test_preds = best_xgb_model.predict(test_features_raw)
# Same 0.6 / 0.4 blend as used for the validation predictions.
test_df['inj_diff_pred'] = 0.6 * lstm_test_preds + 0.4 * xgb_test_preds

# Saving Results: timestamp plus blended prediction, one row per test row.
test_df[['Date Time', 'inj_diff_pred']].to_csv('CO2_injection_predictions.csv', index=False)
print("Predictions saved successfully.")


Mounted at /content/drive


  train_df[f'{col}_lag_2'] = train_df[col].shift(2)
  train_df[f'{col}_rolling_mean'] = train_df[col].rolling(window=5).mean()
  train_df[f'{col}_rolling_std'] = train_df[col].rolling(window=5).std()
  train_df[f'{col}_fft'] = fft_result_padded
  train_df[f'{col}_smooth'] = savgol_filter(train_df[col].fillna(0), window_length=5, polyorder=2)
  test_df[f'{col}_lag_2'] = test_df[col].shift(2)
  test_df[f'{col}_rolling_mean'] = test_df[col].rolling(window=5).mean()
  test_df[f'{col}_rolling_std'] = test_df[col].rolling(window=5).std()
  test_df[f'{col}_fft'] = fft_result_padded_test
  test_df[f'{col}_smooth'] = savgol_filter(test_df[col].fillna(0), window_length=5, polyorder=2)
  train_df[f'{col}_lag_1'] = train_df[col].shift(1)
  train_df[f'{col}_lag_2'] = train_df[col].shift(2)
  train_df[f'{col}_rolling_mean'] = train_df[col].rolling(window=5).mean()
  train_df[f'{col}_rolling_std'] = train_df[col].rolling(window=5).std()
  train_df[f'{col}_fft'] = fft_result_padded
  train_df[f'{col}_

Epoch 1/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - loss: 18.5254 - mae: 1.3661 - val_loss: 17.2542 - val_mae: 1.4711
Epoch 2/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 17.1325 - mae: 1.3550 - val_loss: 17.0273 - val_mae: 1.5011
Epoch 3/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 22.1185 - mae: 1.6394 - val_loss: 16.8389 - val_mae: 1.4969
Epoch 4/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 15.6100 - mae: 1.3248 - val_loss: 16.4902 - val_mae: 1.6564
Epoch 5/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 17.5974 - mae: 1.5377 - val_loss: 16.2823 - val_mae: 1.6696
Epoch 6/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 13.5266 - mae: 1.3940 - val_loss: 15.7753 - val_mae: 1.6368
Epoch 7/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9

  test_df['inj_diff_pred'] = 0.6 * lstm_test_preds + 0.4 * xgb_test_preds
