#  Assignment Overview
You are going to work on the '5G-Energy consumption' dataset that was provided by the International Telecommunication Union (ITU) in 2023 as part of a global challenge (competition) for data scientists all over the world to model 5G energy consumption using machine learning techniques.

Checkpoint problem statement: Network operational expenditure (OPEX) already accounts for around 25 percent of a telecom operator’s total cost, and 90 percent of it is spent on large energy bills. More than 70 percent of this energy is estimated to be consumed by the radio access network (RAN), particularly by the base stations (BSs). Thus, the objective is to build and train an ML model to estimate the energy consumed by different 5G base stations, taking into consideration the impact of various engineering configurations, traffic conditions, and energy-saving methods.

Dataset description : This dataset is derived from the original copy and simplified for learning purposes. It includes cell-level traffic statistics of 4G/5G sites collected on different days.

# Import necessary libraries

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.preprocessing import RobustScaler

# Load the data

In [None]:
df = pd.read_csv("5G_energy_consumption_dataset.csv")

In [None]:
df.head(10)

In [None]:
df.info()

In [None]:
#checking descriptive statistics
df.describe()

In [None]:
# checking for missing values
df.isnull().sum()

In [None]:
df.columns

In [None]:
# Convert all column names to lowercase
df.columns = df.columns.str.lower()
df.columns

In [None]:
# Parse the time column once, then extract the hour and the day of the month
timestamps = pd.to_datetime(df['time'])
df['hour'] = timestamps.dt.hour
df['day'] = timestamps.dt.day

In [None]:
df.head(20)

In [None]:
df.describe()

In [None]:
# Box plots of the raw data, one subplot per plotted column, to spot outliers
df.plot(
    kind="box",
    subplots=True,
    figsize=(18, 15),
    layout=(3, 4),
)
# To keep a copy of the figure, call plt.savefig(...) before plt.show()
plt.show()

In [None]:
# Histogram with a KDE overlay showing how the energy column is distributed
sns.histplot(df['energy'], kde=True)

For the energy column the distribution is skewed to the left, so we will use the Interquartile Range (IQR) method to deal with outliers.

In [None]:
# Histogram with a KDE overlay of the load column, before any outlier handling
sns.histplot(df['load'], kde=True)

The load column also has a left-skewed distribution

In [None]:
# Histogram with a KDE overlay of the esmode column, before any outlier handling
sns.histplot(df['esmode'], kde=True)

In [None]:
# Histogram with a KDE overlay of the txpower column, before any outlier handling
sns.histplot(df['txpower'], kde=True)

# Removing the outliers using the Inter Quartile Range

In [None]:
# Function to remove outliers using the IQR method
def remove_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``df[column]``.
    The input frame is not modified and surviving rows keep their index labels.
    """
    first_quartile, third_quartile = df[column].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile

    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    # ``between`` is inclusive on both ends by default
    within_fences = df[column].between(fence_low, fence_high)
    return df.loc[within_fences]

# Columns that are filtered with the IQR rule
columns = ['energy', 'txpower', 'load']

# Filter one column at a time; each pass works on the rows kept by the previous one
for col in columns:
    df = remove_outliers_iqr(df, col)

In [None]:
# Check the shape of the dataframe after removing outliers
df.shape

In [None]:
df.head()

In [None]:
# Box plots again, after the IQR filtering, to see which outliers remain
df.plot(
    kind="box",
    subplots=True,
    figsize=(18, 15),
    layout=(3, 4),
)
# To keep a copy of the figure, call plt.savefig(...) before plt.show()
plt.show()

## Capping the remaining outliers in the load and energy columns

In [None]:
# Function to cap outliers using the IQR method
def cap_outliers_iqr(df, column):
    """Cap the values of ``column`` at the IQR fences and return the result.

    Values below ``Q1 - 1.5 * IQR`` are raised to that lower limit and values
    above ``Q3 + 1.5 * IQR`` are lowered to that upper limit, where Q1 and Q3
    are the 25th and 75th percentiles of ``df[column]``. Missing values are
    left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str
        Name of the numeric column to cap.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``column`` has been capped.
    """
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)

    # Calculate IQR and the fences derived from it
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr

    # Work on a copy so the caller's frame (possibly a filtered slice of
    # another frame) is never written to; this also avoids pandas'
    # SettingWithCopyWarning.
    capped = df.copy()

    # A single vectorised clip replaces two row-by-row ``apply`` passes
    capped[column] = capped[column].clip(lower=lower_limit, upper=upper_limit)

    return capped

# Columns whose remaining outliers are capped at the IQR fences
columns_to_cap = ['energy', 'load']

# Cap one column at a time, carrying the result forward
for col in columns_to_cap:
    df = cap_outliers_iqr(df, col)

In [None]:
# Check the dataframe after capping outliers
df[columns_to_cap].describe()

In [None]:
# Box plots once more, after capping, to confirm the effect on energy and load
df.plot(
    kind="box",
    subplots=True,
    figsize=(18, 15),
    layout=(3, 4),
)
# To keep a copy of the figure, call plt.savefig(...) before plt.show()
plt.show()

## removing outliers in the esmode column using imputation

In [None]:
# Calculate the mode of the 'esmode' column
#esmode_mode = df['esmode'].mode()[0]
#esmode_mode

In [None]:
# Replace outliers (values outside the range 0-4) with the mode
#df['esmode'] = df['esmode'].apply(lambda x: esmode_mode if x < 0 or x > 4 else x)

In [None]:
# Check the unique values to verify no outliers remain
#print(df['esmode'].unique())

In [None]:
df.head()

Since the esmode column holds discrete values, we use a countplot to check for outliers.

## Check the correlation between the features and target variable

In [None]:
# Target and candidate features whose pairwise correlations are inspected
columns_to_correlate = ['energy', 'load', 'esmode', 'txpower', 'hour', 'day']

# Pairwise correlation of the selected columns
correlation_matrix = df.loc[:, columns_to_correlate].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
heatmap_axes = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
)
heatmap_axes.set_title("Correlation Matrix for Selected Columns")
plt.show()

In [None]:
# Energy against hour of day (seaborn aggregates repeated x values)
plt.figure(figsize=(10, 6))
hour_axes = sns.lineplot(data=df, x='hour', y='energy')
hour_axes.set_title('Energy Consumption vs Time of Day')
hour_axes.set_xlabel('Hour of Day')
hour_axes.set_ylabel('Energy Consumption')
plt.show()


In [None]:
# Energy against day of the month (seaborn aggregates repeated x values)
plt.figure(figsize=(10, 6))
day_axes = sns.lineplot(data=df, x='day', y='energy')
day_axes.set_title('Energy Consumption vs day')
day_axes.set_xlabel('Day')
day_axes.set_ylabel('Energy Consumption')
plt.show()


# 2. Selecting target variable and the features

In [None]:
# Select the feature column. The double brackets keep X as a 2-D DataFrame of
# shape (n_samples, 1); scikit-learn estimators reject a 1-D Series as X.
X = df[['load']]

# Select the target column (a 1-D Series is what the estimator expects for y)
y = df['energy']

# 3. Split the dataset into training and test sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the shape of every split to confirm features and targets line up
for label, part in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(label, part.shape)

# 4. Linear Regression model

In [None]:
# Create a linear regression model
model = LinearRegression()

In [None]:
# Train the model on the training data
# NOTE(review): scikit-learn expects X_train to be 2-D (n_samples, n_features);
# confirm that X was selected as a DataFrame rather than a single Series.
model.fit(X_train, y_train)

In [None]:
# Make predictions on the testing data
y_pred = model.predict(X_test)

# 5. Evaluate the model

In [None]:
# Evaluate the predictions on the held-out test set
mse = mean_squared_error(y_test, y_pred)  # mean squared error
rmse = np.sqrt(mse)  # root mean squared error, in the units of the target
r2 = r2_score(y_test, y_pred)  # coefficient of determination

# Report every metric with its label
for label, value in (
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    (" r squared (r2)", r2),
):
    print(f"{label}: {value}")

In [None]:
# Scatter of actual against predicted values; points on the dashed diagonal
# are perfect predictions
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color="blue", label="Predictions")

# Reference diagonal spanning the observed range of the actual values
actual_min, actual_max = min(y_test), max(y_test)
diagonal = [actual_min, actual_max]
plt.plot(diagonal, diagonal, color="red", linestyle="--", label="Perfect Fit (y = x)")

plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values")
plt.legend()
plt.grid(True)

plt.show()