In [35]:
import pandas as pd
import numpy as np


#Models
from sklearn.ensemble import RandomForestRegressor

#Metrics evaluation
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

from sklearn.preprocessing import LabelEncoder

# Spliting data set
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

### 1. Import your data and perform basic data exploration phase


In [2]:
df = pd.read_csv('5G_energy_consumption_dataset.csv')

df.head()

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719


### Display general information about the dataset


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     92629 non-null  object 
 1   BS       92629 non-null  object 
 2   Energy   92629 non-null  float64
 3   load     92629 non-null  float64
 4   ESMODE   92629 non-null  float64
 5   TXpower  92629 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.2+ MB


### Create a pandas profiling reports to gain insights into the dataset.

In [17]:
df.describe()

Unnamed: 0,BS,Energy,load,ESMODE,TXpower
count,92629.0,92629.0,92629.0,92629.0,92629.0
mean,423.585907,28.138997,0.244705,0.081361,6.765427
std,244.022127,13.934645,0.234677,0.382317,0.309929
min,0.0,0.747384,0.0,0.0,5.381166
25%,213.0,18.236173,0.05737,0.0,6.427504
50%,423.0,24.06577,0.16555,0.0,6.875934
75%,629.0,35.724963,0.363766,0.0,6.875934
max,922.0,100.0,0.993957,4.0,8.375336


### Remove duplicates, if they exist

In [9]:
df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
92624    False
92625    False
92626    False
92627    False
92628    False
Length: 92629, dtype: bool

In [None]:
# No Duplicates

### Handle Missing and corrupted values

In [15]:
df.isnull().sum()

Time       0
BS         0
Energy     0
load       0
ESMODE     0
TXpower    0
dtype: int64

### Encode categorical features

In [4]:
enc = LabelEncoder()

df["BS"] = enc.fit_transform(df["BS"])

### 2. Select your target variable and the features

In [21]:
features = df.drop(['Energy', 'Time'], axis=1)

labels = df["Energy"]

### 3. Split your dataset to training and test sets

In [25]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, train_size=0.8, random_state=10)

y_test.shape
x_test.shape

(18526, 4)

### 4. Based on your data exploration phase select a ML regression algorithm and train it on the training set

In [27]:
rf = RandomForestRegressor()

rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
# make Predictions 
y_pred = rf.predict(x_test)

In [30]:
y_pred

array([15.93572496, 24.64424514,  9.51718984, ..., 20.52017937,
       18.38565023, 11.20017439], shape=(18526,))

In [31]:
print(f"Model predictions: {y_pred}\n\n actual profit: {y_test}")

Model predictions: [15.93572496 24.64424514  9.51718984 ... 20.52017937 18.38565023
 11.20017439]

 actual profit: 40400    16.143498
55421    30.194320
4073      8.071749
91459    47.234679
88781    37.668161
           ...    
66127    37.070254
5907     27.503737
25015    19.730942
89165    17.638266
4232      8.370703
Name: Energy, Length: 18526, dtype: float64


### 5. Assess your model performance on the test set using relevant evaluation metrics


In [32]:
mae = mean_absolute_error(y_test, y_pred)

print(f"The model's mean absolute error is {mae:.2f}")

The model's mean absolute error is 2.03


In [33]:
mse = mean_squared_error(y_test, y_pred)

rmse = np.sqrt(mse)

print(f"The Model's Root Mean Squared error is {rmse:.2f}")

The Model's Root Mean Squared error is 3.21


In [34]:
r_score = r2_score(y_test, y_pred)

print(f"The Model's r2 score is {r_score:.2f}")

The Model's r2 score is 0.95
