<a href="https://colab.research.google.com/github/Farwa-01/SEG_Training_Results_60_Site_DKA_M18_A_Phase/blob/main/Feature_Selection_by_RFE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Linear Regression**

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error, explained_variance_score
from sklearn.model_selection import train_test_split
import time

# Mount Google Drive to access files
drive.mount('/content/drive')

# Load the dataset from Google Drive
file_path = '/content/drive/MyDrive/SEG Forecast/60-Site_DKA-M18_A-Phase.csv'
df = pd.read_csv(file_path)

# Select only the necessary columns
df = df[['Active_Power', 'Global_Horizontal_Radiation', 'Weather_Temperature_Celsius',
         'Weather_Relative_Humidity', 'Diffuse_Horizontal_Radiation', 'Wind_Speed',
         'Performance_Ratio']]

# Convert the relevant columns to numeric (if not already)
numeric_columns = ['Active_Power', 'Global_Horizontal_Radiation', 'Weather_Temperature_Celsius',
                   'Weather_Relative_Humidity', 'Diffuse_Horizontal_Radiation', 'Wind_Speed',
                   'Performance_Ratio']

# Use .loc to avoid SettingWithCopyWarning
df.loc[:, numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Handle missing values by filling them with the mean of the column
df.fillna(df.mean(), inplace=True)

# Define features (X) and target (y) for Linear Regression
X = df[['Global_Horizontal_Radiation', 'Weather_Temperature_Celsius',
                   'Weather_Relative_Humidity',  'Wind_Speed']]
y = df['Active_Power']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Polynomial Features Transformation
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Initialize and train Linear Regression model
model = LinearRegression()
start_time = time.time()
model.fit(X_train_poly, y_train)
end_time = time.time()
training_time = end_time - start_time

# Predict using the trained model
y_pred = model.predict(X_test_poly)

# Calculate and display model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

print(f"Polynomial Linear Regression Model (Degree 2)")
print(f"Training Time: {training_time:.4f} seconds")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Median Absolute Error: {medae}")
print(f"Explained Variance Score: {evs}")


Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')


Polynomial Linear Regression Model (Degree 2)
Training Time: 0.9884 seconds
Mean Squared Error: 1.0984412126968792
R-squared: 0.504850541672511
Median Absolute Error: 0.10756436868040309
Explained Variance Score: 0.504852662410101


In [None]:
# Separate cell for Huber Loss
from sklearn.metrics import make_scorer
from sklearn.linear_model import HuberRegressor

# Initialize and train Huber Regressor
huber_model = HuberRegressor()
start_time = time.time()
huber_model.fit(X_train_poly, y_train)
end_time = time.time()
training_time_huber = end_time - start_time

# Predict using the trained Huber model
y_pred_huber = huber_model.predict(X_test_poly)

# Calculate Huber Loss
def huber_loss(y_true, y_pred, delta=1.0):
    error = y_true - y_pred
    is_small_error = np.abs(error) <= delta
    squared_loss = np.square(error) / 2
    linear_loss = delta * (np.abs(error) - delta / 2)
    return np.where(is_small_error, squared_loss, linear_loss).mean()

huber_loss_value = huber_loss(y_test, y_pred_huber)

print(f"\nHuber Regression Model")
print(f"Training Time: {training_time_huber:.4f} seconds")
print(f"Huber Loss: {huber_loss_value}")


Huber Regression Model
Training Time: 30.5854 seconds
Huber Loss: 0.38904652624891756


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Calculate and display Lasso Regression model performance

rmse_lasso = np.sqrt(mse_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)



print(f"Root Mean Squared Error (RMSE): {rmse_lasso}")
print(f"Mean Absolute Error (MAE): {mae_lasso}")

Root Mean Squared Error (RMSE): 1.04819903961482
Mean Absolute Error (MAE): 0.6209463466643363


**Random Forest**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, explained_variance_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import HuberRegressor
import time

# Load your dataset
data = pd.read_csv('/content/drive/MyDrive/SEG Forecast/60-Site_DKA-M18_A-Phase.csv')  # Replace with your actual file path

# Impute missing values for both features and the target variable
imputer = SimpleImputer(strategy='median')

# Selecting features and target before imputation
features = data[['Global_Horizontal_Radiation', 'Weather_Relative_Humidity',
        'Weather_Temperature_Celsius', 'Wind_Speed']]
target = data['Active_Power']

# Applying imputation
features_imputed = imputer.fit_transform(features)
target_imputed = imputer.fit_transform(target.values.reshape(-1, 1)).ravel()

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.3, random_state=42)

# Feature scaling (Random Forest does not necessarily require feature scaling, but doing so can be beneficial in many scenarios)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model building
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust the number of trees and other parameters

# Measure training time
start_time = time.time()
rf_model.fit(X_train_scaled, y_train)
end_time = time.time()
training_time_rf = end_time - start_time

# Predicting the test results
y_pred = rf_model.predict(X_test_scaled)

# Model evaluation
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
epsilon = 1e-8  # Small number to prevent division by zero
mape = np.mean(np.abs((y_test - y_pred) / (y_test + epsilon))) * 100  # Modified MAPE calculation

# Calculate Huber Loss using Huber Regressor
huber_model = HuberRegressor()
start_time = time.time()
huber_model.fit(X_train_scaled, y_train)
end_time = time.time()
training_time_huber = end_time - start_time

y_pred_huber = huber_model.predict(X_test_scaled)
def huber_loss(y_true, y_pred, delta=1.0):
    error = y_true - y_pred
    is_small_error = np.abs(error) <= delta
    squared_loss = np.square(error) / 2
    linear_loss = delta * (np.abs(error) - delta / 2)
    return np.where(is_small_error, squared_loss, linear_loss).mean()
huber_loss_value = huber_loss(y_test, y_pred_huber)

print(f'Random Forest Regression Model')
print(f'Training Time: {training_time_rf:.4f} seconds')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-Squared: {r2}')
print(f'Mean Absolute Error: {mae}')
print(f'Median Absolute Error: {medae}')
print(f'Explained Variance Score: {evs}')
print(f'Mean Absolute Percentage Error: {mape:.2f}%')

print(f'\nHuber Regression Model')
print(f'Training Time: {training_time_huber:.4f} seconds')
print(f'Huber Loss: {huber_loss_value}')



Random Forest Regression Model
Training Time: 918.5411 seconds
Mean Squared Error: 0.7089243044047953
Root Mean Squared Error: 0.8419764274638544
R-Squared: 0.6832478875433896
Mean Absolute Error: 0.4038921952682804
Median Absolute Error: 0.000220666654931837
Explained Variance Score: 0.6832498311711465
Mean Absolute Percentage Error: 533104652.18%

Huber Regression Model
Training Time: 6.9867 seconds
Huber Loss: 0.39231556156515984


**MLP**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, explained_variance_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import HuberRegressor
import time

# Load your dataset
data = pd.read_csv('/content/drive/MyDrive/SEG Forecast/60-Site_DKA-M18_A-Phase.csv')  # Replace with your actual file path

# Impute missing values for both features and the target variable
imputer = SimpleImputer(strategy='median')

# Selecting features and target before imputation
features = data[['Global_Horizontal_Radiation', 'Weather_Relative_Humidity',
                 'Weather_Temperature_Celsius', 'Wind_Speed']]
target = data['Active_Power']

# Applying imputation
features_imputed = imputer.fit_transform(features)
target_imputed = imputer.fit_transform(target.values.reshape(-1,1)).ravel()

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_imputed, target_imputed, test_size=0.3, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model building
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)  # Adjust hidden_layer_sizes and other parameters as needed
mlp_model.fit(X_train_scaled, y_train)

# Measure training time
start_time = time.time()
mlp_model.fit(X_train_scaled, y_train)
end_time = time.time()
training_time_mlp = end_time - start_time

# Predicting the test results
y_pred = mlp_model.predict(X_test_scaled)

# Model evaluation
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
epsilon = 1e-8  # Small number to prevent division by zero
mape = np.mean(np.abs((y_test - y_pred) / (y_test + epsilon))) * 100  # Modified MAPE calculation

# Calculate Huber Loss using Huber Regressor
huber_model = HuberRegressor()
start_time = time.time()
huber_model.fit(X_train_scaled, y_train)
end_time = time.time()
training_time_huber = end_time - start_time

y_pred_huber = huber_model.predict(X_test_scaled)
def huber_loss(y_true, y_pred, delta=1.0):
    error = y_true - y_pred
    is_small_error = np.abs(error) <= delta
    squared_loss = np.square(error) / 2
    linear_loss = delta * (np.abs(error) - delta / 2)
    return np.where(is_small_error, squared_loss, linear_loss).mean()
huber_loss_value = huber_loss(y_test, y_pred_huber)

print(f'MLP Model')
print(f'Training Time: {training_time_mlp:.4f} seconds')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-Squared: {r2}')
print(f'Mean Absolute Error: {mae}')
print(f'Median Absolute Error: {medae}')
print(f'Explained Variance Score: {evs}')
print(f'Mean Absolute Percentage Error: {mape:.2f}%')

print(f'\nHuber Regression Model')
print(f'Training Time: {training_time_huber:.4f} seconds')
print(f'Huber Loss: {huber_loss_value}')

MLP Model
Training Time: 398.4803 seconds
Mean Squared Error: 0.8805599090135041
Root Mean Squared Error: 0.9383815370165294
R-Squared: 0.6065599534511581
Mean Absolute Error: 0.5130662470765822
Median Absolute Error: 0.06813604249471095
Explained Variance Score: 0.6066545571042821
Mean Absolute Percentage Error: 1023008001.32%

Huber Regression Model
Training Time: 4.0028 seconds
Huber Loss: 0.39231556156515984
