### Instructions

- Import your data and perform a basic data exploration phase
   - Display general information about the dataset
   - Create a pandas profiling report to gain insights into the dataset
   - Handle Missing and corrupted values
   - Remove duplicates, if they exist
   - Handle outliers, if they exist
- Encode categorical features
- Select your target variable and the features
- Split your dataset into training and test sets
- Based on your data exploration phase, select an ML regression algorithm and train it on the training set
- Assess your model performance on the test set using relevant evaluation metrics
- Discuss with your cohort alternative ways to improve your model performance

In [None]:
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides DeprecationWarning / FutureWarning messages
# emitted by the libraries used below.
warnings.filterwarnings(action='ignore')

In [None]:
import pandas as pd

In [None]:
# Load the dataset from a CSV file; the path is relative to the working directory.
energy_df = pd.read_csv(filepath_or_buffer="5G_energy_consumption_dataset.csv")

In [None]:
# Preview the first five rows of the dataset.
energy_df.head(n=5)

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.
energy_df.info()

In [None]:
# Convert the 'Time' column to pandas datetime values (format is inferred by pandas).
energy_df['Time'] = pd.to_datetime(arg=energy_df['Time'])

In [None]:
# Show descriptive statistics using pandas' default column selection.
energy_df.describe()

In [None]:
# Count how many rows share each distinct 'Time' value.
energy_df.loc[:, 'Time'].value_counts()

In [None]:
# Count how many rows share each distinct 'BS' value.
energy_df.loc[:, 'BS'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
from ydata_profiling import ProfileReport

In [None]:
# Build the profiling report for the dataset and save it as a standalone HTML file.
profile = ProfileReport(
    energy_df,
    title="Energy Consumption Report",
    explorative=True,
)
profile.to_file("energy_profile_report.html")

In [None]:
# Render the profiling report inline in the notebook.
profile.to_notebook_iframe()

In [None]:
# Remove the 'ESMODE' column.
# NOTE(review): presumably dropped based on the profiling report — confirm the rationale.
energy_df = energy_df.drop(columns=["ESMODE"])

In [None]:
# Remove the 'Time' column so it is not part of the modelling data.
energy_df = energy_df.drop(columns=["Time"])

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode the 'BS' column: every distinct value is mapped to one code.
# The fitted encoder is kept in `le` so the mapping can be inspected or inverted.
le = LabelEncoder()
le.fit(energy_df["BS"])
energy_df["BS"] = le.transform(energy_df["BS"])

In [None]:
%matplotlib inline
# Draw one box plot per numeric column to inspect each distribution for outliers.
numerical_features = energy_df.select_dtypes(include='number').columns
plt.figure(figsize=(25, 25))
for position, feature in enumerate(numerical_features, start=1):
    # Fixed 10x4 grid of axes: room for up to 40 numeric columns.
    plt.subplot(10, 4, position)
    sns.boxplot(x=energy_df[feature], palette='viridis')
    plt.title(feature, fontsize=30)
    plt.xlabel(' ')
# Adjust the spacing once, after all axes exist, instead of on every iteration.
plt.tight_layout()

In [None]:
# Import necessary libraries for data manipulation and visualization
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
from scipy import stats  # Import stats module for Z-score
import matplotlib.pyplot as plt  # For plotting data
import seaborn as sns  # For enhanced data visualizations
from scipy import stats # For statistics

# Import libraries for machine learning models and evaluation
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # For scaling numerical data and encoding categorical data
from sklearn.linear_model import LinearRegression, ElasticNet  # For linear Regression
from sklearn.tree import DecisionTreeRegressor  # For Decision Tree Regression
from sklearn.ensemble import RandomForestRegressor  # For Random Forest Regression
from sklearn.svm import SVR  # For Support Vector Regression 
import xgboost as xgb # For XGBoost Regression
from sklearn.metrics import mean_squared_error, r2_score, make_scorer  # For model evaluation metrics

In [None]:
# Target is the 'Energy' column; the features are every remaining column.
y = energy_df['Energy']
X = energy_df.drop('Energy', axis=1)

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training features only, so that no
# information from the test set leaks into the preprocessing step.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a linear regression baseline on the standardized training features
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)

# Predicting on test data
y_pred_lr = lin_reg.predict(X_test_scaled)

# Model evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so this form works on both old and new releases.
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print("Linear Regression R2 Score:", r2_score(y_test, y_pred_lr))

In [None]:
from sklearn.linear_model import ElasticNet

# Initialize the Elastic Net model (equal mix of L1 and L2 penalties)
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)

# Fit the model
elastic_net.fit(X_train_scaled, y_train)

# Predicting on test data
y_pred_en = elastic_net.predict(X_test_scaled)

# Model evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so this form works on both old and new releases.
print("Elastic Net RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_en)))
print("Elastic Net R2 Score:", r2_score(y_test, y_pred_en))

In [None]:
# Initialize a depth-limited decision tree; the seed keeps the fit reproducible
tree_reg = DecisionTreeRegressor(max_depth=5, min_samples_split=10, random_state=42)

# Fit the model
tree_reg.fit(X_train_scaled, y_train)

# Predicting on test data
y_pred_tree = tree_reg.predict(X_test_scaled)

# Model evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so this form works on both old and new releases.
print("Decision Tree RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_tree)))
print("Decision Tree R2 Score:", r2_score(y_test, y_pred_tree))

In [None]:
# Initialize a random forest of 100 depth-limited trees; the seed keeps the fit reproducible
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=5, random_state=42)

# Fit the model
rf_reg.fit(X_train_scaled, y_train)

# Predicting on test data
y_pred_rf = rf_reg.predict(X_test_scaled)

# Model evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so this form works on both old and new releases.
print("Random Forest RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print("Random Forest R2 Score:", r2_score(y_test, y_pred_rf))

In [None]:
# Initialize an RBF-kernel support vector regressor
svr_reg = SVR(kernel='rbf', C=100, epsilon=0.1)

# Fit the model
svr_reg.fit(X_train_scaled, y_train)

# Predicting on test data
y_pred_svr = svr_reg.predict(X_test_scaled)

# Model evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so this form works on both old and new releases.
print("SVR RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_svr)))
print("SVR R2 Score:", r2_score(y_test, y_pred_svr))

In [None]:
# Initialize the gradient-boosted model; random_state is fixed so the run is
# reproducible, consistent with the other XGBoost cells in this notebook
xgb_reg = xgb.XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, random_state=42)

# Fit the model
xgb_reg.fit(X_train_scaled, y_train)

# Predicting on test data
y_pred_xgb = xgb_reg.predict(X_test_scaled)

# Model evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so this form works on both old and new releases.
print("XGBoost RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))
print("XGBoost R2 Score:", r2_score(y_test, y_pred_xgb))

In [None]:
# Import the necessary functions from scikit-learn
from sklearn.metrics import make_scorer  # To create custom scoring functions
from sklearn.model_selection import cross_validate  # To perform cross-validation

In [None]:
# Initialize the XGBoost model
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, random_state=42)


def _rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    # sqrt(MSE) replaces `mean_squared_error(..., squared=False)`: that argument
    # was deprecated in scikit-learn 1.4 and later removed.
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Create custom scorers for RMSE and R²
# `make_scorer` wraps a metric function so it can be used in cross-validation.
# With the default greater_is_better=True the RMSE is reported as a positive
# number; that is fine here because it is only reported, never optimized.
rmse_scorer = make_scorer(_rmse)
r2_scorer = make_scorer(r2_score)

# Dictionary of scoring metrics
scoring = {'RMSE': rmse_scorer, 'R2': r2_scorer}

# Perform cross-validation
# `cross_validate` splits the data into folds, trains and tests the model, and calculates the scores
cv_results = cross_validate(model, X, y, scoring=scoring, cv=5, return_train_score=True)
# Note that we are using X and y and not X_train and y_train

# Output the results
# `cv_results` contains the scores for each fold
print("RMSE scores:", cv_results['test_RMSE'])  # RMSE scores for each fold
print("R² scores:", cv_results['test_R2'])  # R² scores for each fold
print("Average RMSE:", cv_results['test_RMSE'].mean())  # Average RMSE across all folds
print("Average R²:", cv_results['test_R2'].mean())  # Average R² score across all folds


In [None]:
# We'll use XGBoost and tune the n_estimators, learning_rate, and max_depth.
#
# Candidates are ranked on a validation split carved out of the training data.
# The test set is used only once, after the best configuration has been chosen,
# so the reported test scores are not biased by having been used for selection.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Start from the worst possible score so the first candidate always replaces it
best_val_rmse = float('inf')
best_params = {}

# Iterate over different values for n_estimators, learning_rate, and max_depth
for n_estimators in [50, 100, 200]:  # Number of boosting rounds
    for learning_rate in [0.01, 0.1, 0.2]:  # Step size at each iteration
        for max_depth in range(3, 10, 2):  # Maximum depth of each tree
            params = {
                'n_estimators': n_estimators,
                'learning_rate': learning_rate,
                'max_depth': max_depth
            }

            # Train a candidate on the reduced training split
            candidate = xgb.XGBRegressor(random_state=42, **params)
            candidate.fit(X_fit, y_fit)

            # Score the candidate on the validation split.
            # sqrt(MSE) replaces `squared=False`, which was deprecated in
            # scikit-learn 1.4 and later removed.
            val_rmse = np.sqrt(mean_squared_error(y_val, candidate.predict(X_val)))

            # Keep the configuration with the lowest validation RMSE
            if val_rmse < best_val_rmse:
                best_val_rmse = val_rmse
                best_params = params

# Refit the winning configuration on the full training set so that `xgb_reg`
# and `y_pred_xgb` refer to the selected model, then evaluate it once on the
# held-out test set.
xgb_reg = xgb.XGBRegressor(random_state=42, **best_params)
xgb_reg.fit(X_train, y_train)
y_pred_xgb = xgb_reg.predict(X_test)
best_rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
best_r2 = r2_score(y_test, y_pred_xgb)

# Print the best hyperparameters and the corresponding scores
print("Best Parameters for XGBoost:", best_params)
print("Validation RMSE for best parameters:", best_val_rmse)
print("Best RMSE for XGBoost:", best_rmse)
print("Best R² Score for XGBoost:", best_r2)