Stock 

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
file_path = 'C:\\Users\\Shadow\\Desktop\\GIT_PE\\survey_data\\survey_data.csv'
data = pd.read_csv(file_path)

# Drop the 'Meter No' column as it's not relevant
data.drop('Meter No', axis=1, inplace=True)

# Define appliance columns and target variable
appliance_columns = ['Rooms', 'Fans', 'Lights', 'Freezes', 'Microwave Oven', 'Geyser', 'AC', 'Washing Machine', 'TV', 'PC', 'Induction Stove']
target_variable = 'pre_23_dec_unit'

# Impute missing values with zero for appliance counts
data[appliance_columns] = data[appliance_columns].fillna(0)

# Neighboring Average for Monthly Units
unit_columns = [
    'post_21_july_unit', 'post_21_august_unit', 'post_21_sep_unit', 'post_21_oct_unit',
    'post_21_nov_unit', 'post_21_dec_unit', 'post_22_jan_unit', 'post_22_feb_unit',
    'pre_23_jan_unit', 'pre_23_feb_unit', 'pre_23_mar_unit',
    'pre_23_apr_unit', 'pre_23_may_unit', 'pre_23_june_unit', 'pre_23_jul_unit',
    'pre_23_aug_unit', 'pre_23_sep_unit', 'pre_23_oct_unit', 'pre_23_nov_unit'
]
for i in range(1, len(unit_columns)-1):
    data[unit_columns[i]] = data[unit_columns[i]].fillna(
        (data[unit_columns[i-1]] + data[unit_columns[i+1]]) / 2
    )

# Normalize the dataset
feature_scaler = MinMaxScaler(feature_range=(0, 1))
target_scaler = MinMaxScaler(feature_range=(0, 1))

selected_features = appliance_columns + unit_columns
data_scaled = feature_scaler.fit_transform(data[selected_features])
target_scaled = target_scaler.fit_transform(data[[target_variable]])

# Combine scaled features and target
data_scaled = np.hstack((data_scaled, target_scaled))

# Creating sequences with a length of 30
seq_length = 30
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length, :-1])
        y.append(data[i + seq_length, -1])
    return np.array(X), np.array(y)

X, y = create_sequences(data_scaled, seq_length)

# Define the LSTM model with the best hyperparameters
def create_model():
    model = Sequential()
    model.add(LSTM(100, return_sequences=True, input_shape=(seq_length, X.shape[2])))
    model.add(Dropout(0.3))
    model.add(LSTM(100))
    model.add(Dropout(0.3))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss='mean_squared_error')
    return model

# K-Fold Cross Validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
mse_scores = []
mae_scores = []
rmse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    model = create_model()
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=150, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0)
    
    y_val_pred = model.predict(X_val)
    y_val_pred_rescaled = target_scaler.inverse_transform(y_val_pred.reshape(-1, 1))
    y_val_rescaled = target_scaler.inverse_transform(y_val.reshape(-1, 1))
    
    mse = mean_squared_error(y_val_rescaled, y_val_pred_rescaled)
    mae = mean_absolute_error(y_val_rescaled, y_val_pred_rescaled)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val_rescaled, y_val_pred_rescaled)
    
    mse_scores.append(mse)
    mae_scores.append(mae)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

# Calculate average metrics
average_mse = np.mean(mse_scores)
average_mae = np.mean(mae_scores)
average_rmse = np.mean(rmse_scores)
average_r2 = np.mean(r2_scores)

print(f'Average MSE across {k} folds: {average_mse}')
print(f'Average MAE across {k} folds: {average_mae}')
print(f'Average RMSE across {k} folds: {average_rmse}')
print(f'Average R-squared across {k} folds: {average_r2}')


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 176ms/step
Average MSE across 5 folds: 18189.464484325676
Average MAE across 5 folds: 93.84074255371092
Average RMSE across 5 folds: 133.79381707867384
Average R-squared across 5 folds: 0.14664976828108423


Use optimized one 

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'C:\\Users\\Shadow\\Desktop\\GIT_PE\\survey_data\\survey_data.csv'
data = pd.read_csv(file_path)

# Drop the 'Meter No' column as it's not relevant
data.drop('Meter No', axis=1, inplace=True)

# Define appliance columns and target variable
appliance_columns = ['Rooms', 'Fans', 'Lights', 'Freezes', 'Microwave Oven', 'Geyser', 'AC', 'Washing Machine', 'TV', 'PC', 'Induction Stove']
target_variable = 'pre_23_dec_unit'

# Impute missing values with zero for appliance counts
data[appliance_columns] = data[appliance_columns].fillna(0)

# Neighboring Average for Monthly Units
unit_columns = [
    'post_21_july_unit', 'post_21_august_unit', 'post_21_sep_unit', 'post_21_oct_unit',
    'post_21_nov_unit', 'post_21_dec_unit', 'post_22_jan_unit', 'post_22_feb_unit',
    'pre_23_jan_unit', 'pre_23_feb_unit', 'pre_23_mar_unit',
    'pre_23_apr_unit', 'pre_23_may_unit', 'pre_23_june_unit', 'pre_23_jul_unit',
    'pre_23_aug_unit', 'pre_23_sep_unit', 'pre_23_oct_unit', 'pre_23_nov_unit'
]
for i in range(1, len(unit_columns)-1):
    data[unit_columns[i]] = data[unit_columns[i]].fillna(
        (data[unit_columns[i-1]] + data[unit_columns[i+1]]) / 2
    )

# Normalize the dataset
feature_scaler = MinMaxScaler(feature_range=(0, 1))
target_scaler = MinMaxScaler(feature_range=(0, 1))

selected_features = appliance_columns + unit_columns
data_scaled = feature_scaler.fit_transform(data[selected_features])
target_scaled = target_scaler.fit_transform(data[[target_variable]])

# Combine scaled features and target
data_scaled = np.hstack((data_scaled, target_scaled))

# Creating sequences with a length of 30
seq_length = 30
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length, :-1])
        y.append(data[i + seq_length, -1])
    return np.array(X), np.array(y)

X, y = create_sequences(data_scaled, seq_length)

# Define the LSTM model with the best parameters
def create_model(neurons=100, dropout_rate=0.3, optimizer='rmsprop'):
    model = Sequential()
    model.add(Input(shape=(seq_length, X.shape[2])))
    model.add(LSTM(neurons, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(neurons))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# K-Fold Cross Validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
mse_scores = []
mae_scores = []
rmse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    model = create_model(neurons=100, dropout_rate=0.3, optimizer='rmsprop')
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=150, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0)
    
    y_val_pred = model.predict(X_val)
    y_val_pred_rescaled = target_scaler.inverse_transform(y_val_pred.reshape(-1, 1))
    y_val_rescaled = target_scaler.inverse_transform(y_val.reshape(-1, 1))
    
    mse = mean_squared_error(y_val_rescaled, y_val_pred_rescaled)
    mae = mean_absolute_error(y_val_rescaled, y_val_pred_rescaled)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val_rescaled, y_val_pred_rescaled)
    
    mse_scores.append(mse)
    mae_scores.append(mae)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

# Calculate average metrics
average_mse = np.mean(mse_scores)
average_mae = np.mean(mae_scores)
average_rmse = np.mean(rmse_scores)
average_r2 = np.mean(r2_scores)

print(f'Average MSE across {k} folds: {average_mse}')
print(f'Average MAE across {k} folds: {average_mae}')
print(f'Average RMSE across {k} folds: {average_rmse}')
print(f'Average R-squared across {k} folds: {average_r2}')

# Visualization of the metrics
metrics = ['MSE', 'MAE', 'RMSE', 'R-squared']
scores = [average_mse, average_mae, average_rmse, average_r2]

plt.figure(figsize=(10, 6))
plt.bar(metrics, scores, color=['blue', 'green', 'red', 'purple'])
plt.title('Average Performance Metrics Across K-Folds')
plt.ylabel('Score')
plt.savefig('k_fold.png')
plt.show()
