In [1]:
!pip install category_encoders



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from scipy.stats import uniform, randint
from category_encoders import LeaveOneOutEncoder

# Supervised Learning Regression Checkpoint

### 5G-Energy consumption

In this checkpoint, I am going to work on the '5G-Energy consumption' dataset that was provided by the international telecommunication union (ITU) in 2023 as part of a global challenge or competition for data scientists all over the world to solve the 5G energy consumption modelling using machine learning techniques.

Checkpoint problematic : Network operational expenditure (OPEX) already accounts for around 25 percent of the total telecom operator’s cost, and 90 percent of it is spent on large energy bills. More than 70 percent of this energy is estimated to be consumed by the radio access network (RAN), particularly by the base stations (BSs). Thus, the objective is to build and train a ML model to estimate the energy consumed by different 5G base stations taking into consideration the impact of various engineering configurations, traffic conditions, and energy-saving methods.

Dataset description : This dataset is derived from the original copy and simplified for learning purposes. It includes cell-level traffic statistics of 4G/5G sites collected on different days.


In [3]:
energy = pd.read_csv('5G_energy_consumption_dataset.csv')

In [4]:
energy.head()

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719


In [5]:
energy.isnull().sum()

Time       0
BS         0
Energy     0
load       0
ESMODE     0
TXpower    0
dtype: int64

In [6]:
energy.duplicated().sum()

0

In [7]:
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     92629 non-null  object 
 1   BS       92629 non-null  object 
 2   Energy   92629 non-null  float64
 3   load     92629 non-null  float64
 4   ESMODE   92629 non-null  float64
 5   TXpower  92629 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.2+ MB


### Change Time Column Data Type to Date Time

In [8]:
energy['Time'] = pd.to_datetime(energy['Time'])

### Extract Hour since in your correlation it had a weak positive correlation

In [9]:
energy['Hour'] = pd.to_datetime(energy['Time']).dt.hour

### Extract Month, and Day to check correlation

In [10]:
energy['Month'] = pd.to_datetime(energy['Time']).dt.month
energy['Day'] = pd.to_datetime(energy['Time']).dt.day

In [11]:
energy.head()

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower,Hour,Month,Day
0,2023-01-01 01:00:00,B_0,64.275037,0.487936,0.0,7.101719,1,1,1
1,2023-01-01 02:00:00,B_0,55.904335,0.344468,0.0,7.101719,2,1,1
2,2023-01-01 03:00:00,B_0,57.698057,0.193766,0.0,7.101719,3,1,1
3,2023-01-01 04:00:00,B_0,55.156951,0.222383,0.0,7.101719,4,1,1
4,2023-01-01 05:00:00,B_0,56.053812,0.175436,0.0,7.101719,5,1,1


#### Encoding BS Column

In [12]:
encoder = LeaveOneOutEncoder()
energy['BS Encoded'] = encoder.fit_transform(energy['BS'], energy['Energy'])

In [13]:
energy.head()

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower,Hour,Month,Day,BS Encoded
0,2023-01-01 01:00:00,B_0,64.275037,0.487936,0.0,7.101719,1,1,1,73.170243
1,2023-01-01 02:00:00,B_0,55.904335,0.344468,0.0,7.101719,2,1,1,73.244982
2,2023-01-01 03:00:00,B_0,57.698057,0.193766,0.0,7.101719,3,1,1,73.228966
3,2023-01-01 04:00:00,B_0,55.156951,0.222383,0.0,7.101719,4,1,1,73.251655
4,2023-01-01 05:00:00,B_0,56.053812,0.175436,0.0,7.101719,5,1,1,73.243647


## Correlation Matrix

In [14]:
corr = energy.drop(columns = ['BS']).corr()
corr

Unnamed: 0,Time,Energy,load,ESMODE,TXpower,Hour,Month,Day,BS Encoded
Time,1.0,0.030404,0.066937,-0.032436,-0.038777,0.132592,,0.987834,0.011958
Energy,0.030404,1.0,0.643022,-0.271783,0.45076,0.203709,,-0.001294,0.9233
load,0.066937,0.643022,1.0,-0.208473,0.200221,0.323958,,0.016688,0.486274
ESMODE,-0.032436,-0.271783,-0.208473,1.0,0.060017,-0.234626,,0.004096,-0.13822
TXpower,-0.038777,0.45076,0.200221,0.060017,1.0,0.00063,,-0.03921,0.487278
Hour,0.132592,0.203709,0.323958,-0.234626,0.00063,1.0,,-0.023159,0.000269
Month,,,,,,,,,
Day,0.987834,-0.001294,0.016688,0.004096,-0.03921,-0.023159,,1.0,0.012019
BS Encoded,0.011958,0.9233,0.486274,-0.13822,0.487278,0.000269,,0.012019,1.0


Features to Select 
 - Load : with a correlation of 0.643022 is a moderately strong correlation
 - TXPower : with a correlation of 0.450760 is a moderate correlation
 - Hour : with a correlation of 0.203709 althogh is is weakly positive it can still be used
 - BS Encoded : woth the stromgesr correlation of 0.923300 is definately a factor when choosing Features

### Splitting My Data

In [15]:
X = energy.drop(columns = ['Time', 'BS', 'ESMODE', 'Month', 'Day', 'Energy'])
y = energy['Energy']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

Using Randomized Search to pick out my Best Hyperparameters and models based on a list of models

In [17]:
model_params = {
    'GradientBoostingRegressor' : {
        'model' : GradientBoostingRegressor(),
        'params' : {
            'n_estimators' : randint(100, 200),
            'max_depth' : randint(3, 10),
            'learning_rate' : uniform(0.01, 0.2)
                                     
            }
        },

    'svr' : {
        'model' : SVR(),
        'params' : {
            'C' : uniform(10, 100),
            'kernel' : ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma' : ['scale'],
          
        }
    }
}
    


In [None]:
for model_name, mp in model_params.items() :
    print(f' Running randome Search For {model_name}')
    random_search = RandomizedSearchCV( mp['model'], mp['params'], cv = 3, n_iter = 10, n_jobs = -1, error_score = 'raise')
    random_search.fit(X_train,y_train)
    
    print(f'The Best Parameters for {model_name} : {random_search.best_params_}')
    print(f'The Best Score for {model_name} : {random_search.best_score_}')
    
    

 Running randome Search For GradientBoostingRegressor
The Best Parameters for GradientBoostingRegressor : {'learning_rate': 0.10672687983266597, 'max_depth': 9, 'n_estimators': 178}
The Best Score for GradientBoostingRegressor : 0.9766344085737529
 Running randome Search For svr
