<a href="https://colab.research.google.com/github/Anwesha-code/PBL_Project/blob/main/DPRNN_CEP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import files
uploaded = files.upload()

Saving data_center_carbon_emission_dataset_COPY.xlsx to data_center_carbon_emission_dataset_COPY.xlsx


In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_excel('data_center_carbon_emission_dataset_COPY.xlsx')
print(df.shape)

(75000, 18)


In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy import stats
import numpy as np

# Load data
df = pd.read_excel('data_center_carbon_emission_dataset_COPY.xlsx')

# Convert timestamp to datetime and chronological sort
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp').reset_index(drop=True)

# Handle missing values
df.ffill(inplace=True)

# Remove duplicates
df.drop_duplicates(inplace=True)

# Feature engineering: hour and day of week
df['hour'] = df['timestamp'].dt.hour
df['dayofweek'] = df['timestamp'].dt.dayofweek

# Outlier removal (z-score, 3-sigma rule)
z_scores = np.abs(stats.zscore(df.select_dtypes(include=[float, int])))
df = df[(z_scores < 3).all(axis=1)]

# Scaling numerical features
num_cols = [
    'voltage', 'current', 'power', 'frequency', 'energy', 'power_factor',
    'esp32_temperature', 'cpu_temperature', 'gpu_temperature',
    'cpu_usage_percent', 'cpu_power_watts', 'gpu_usage_percent',
    'gpu_power_watts', 'ram_usage_percent', 'ram_power_watts'
]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Categorical encoding with one-hot (MAC, weekday)
df = pd.get_dummies(df, columns=['MAC', 'weekday'], drop_first=True)

# Check for missing values and duplicates (again, after all steps)
print("Missing values per column:\n", df.isnull().sum())
df = df.drop_duplicates()

# Verify sum of one-hot columns for weekdays is 0 or 1 per row (because of drop_first=True)
weekday_cols = [col for col in df.columns if col.startswith('weekday_')]
assert (((df[weekday_cols].sum(axis=1) == 1) | (df[weekday_cols].sum(axis=1) == 0)).all()), "Error: Weekday encoding not correct"

# Temporal lag features (e.g., for voltage)
df = df.sort_values('timestamp').reset_index(drop=True)
df['voltage_lag1'] = df['voltage'].shift(1)

# Drop NA values from lag creation (first row)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

# Chronological split for train, validation, test (70/15/15 split)
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.15)
train = df[:train_size]
val = df[train_size:train_size + val_size]
test = df[train_size + val_size:]

print("Train shape:", train.shape)
print("Validation shape:", val.shape)
print("Test shape:", test.shape)

Missing values per column:
 timestamp                0
voltage                  0
current                  0
power                    0
frequency                0
energy                   0
power_factor             0
esp32_temperature        0
cpu_temperature          0
gpu_temperature          0
cpu_usage_percent        0
cpu_power_watts          0
gpu_usage_percent        0
gpu_power_watts          0
ram_usage_percent        0
ram_power_watts          0
hour                     0
dayofweek                0
MAC_4D:62:06:13:97:31    0
MAC_5E:63:07:14:98:32    0
MAC_6F:64:08:15:99:33    0
weekday_1                0
weekday_2                0
weekday_3                0
weekday_4                0
weekday_5                0
weekday_6                0
dtype: int64
Train shape: (51961, 28)
Validation shape: (11134, 28)
Test shape: (11135, 28)


In [6]:
# --- 1. Sequence Preparation ---
def create_sequences(data, target, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(target[i + seq_length])
    return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)

sequence_length = 10  # you can change this (e.g., 10-20)
target_col = 'energy'
feature_cols = [col for col in train.columns if col not in ['timestamp', target_col]]

# Arrays for each split
X_train_arr = train[feature_cols].values
y_train_arr = train[target_col].values
X_val_arr   = val[feature_cols].values
y_val_arr   = val[target_col].values
X_test_arr  = test[feature_cols].values
y_test_arr  = test[target_col].values

# Create sequence data
X_train_seq, y_train_seq = create_sequences(X_train_arr, y_train_arr, sequence_length)
X_val_seq,   y_val_seq   = create_sequences(X_val_arr,   y_val_arr,   sequence_length)
X_test_seq,  y_test_seq  = create_sequences(X_test_arr,  y_test_arr,  sequence_length)

print("Train sequence shape:", X_train_seq.shape)
print("Validation sequence shape:", X_val_seq.shape)
print("Test sequence shape:", X_test_seq.shape)

# --- 2. DPRNN Model Definition ---
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization

seq_len, num_feats = X_train_seq.shape[1], X_train_seq.shape[2]
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(seq_len, num_feats)),
    Dropout(0.2),
    BatchNormalization(),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()



Train sequence shape: (51951, 10, 26)
Validation sequence shape: (11124, 10, 26)
Test sequence shape: (11125, 10, 26)


  super().__init__(**kwargs)


In [7]:
# --- 3. Training ---
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-4)

history = model.fit(
    X_train_seq, y_train_seq,
    epochs=100,
    batch_size=32,
    validation_data=(X_val_seq, y_val_seq),
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# --- 4. Evaluate Model ---
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_val_pred = model.predict(X_val_seq)
y_test_pred = model.predict(X_test_seq)

val_mae  = mean_absolute_error(y_val_seq, y_val_pred)
val_rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
test_mae  = mean_absolute_error(y_test_seq, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test_seq, y_test_pred))

print(f'DPRNN Validation MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}')
print(f'DPRNN Test MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}')


Epoch 1/100
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 11ms/step - loss: 0.5260 - mae: 0.6147 - val_loss: 2.2306 - val_mae: 1.4777 - learning_rate: 0.0010
Epoch 2/100
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - loss: 0.4676 - mae: 0.5861 - val_loss: 2.1711 - val_mae: 1.4546 - learning_rate: 0.0010
Epoch 3/100
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 11ms/step - loss: 0.4399 - mae: 0.5681 - val_loss: 2.3827 - val_mae: 1.5218 - learning_rate: 0.0010
Epoch 4/100
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - loss: 0.4112 - mae: 0.5502 - val_loss: 2.4465 - val_mae: 1.5397 - learning_rate: 0.0010
Epoch 5/100
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - loss: 0.3723 - mae: 0.5206 - val_loss: 2.2461 - val_mae: 1.4751 - learning_rate: 0.0010
Epoch 6/100
[1m1624/1624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - lo