In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline

In [29]:
BSinfo = pd.read_csv('Dataset/BSinfo.csv')
CLdata = pd.read_csv('Dataset/CLdata.csv')
ECdata = pd.read_csv('Dataset/ECdata.csv')
sample_submission = pd.read_csv('Dataset/SampleSubmission.csv')

In [30]:
# Define the WMAPE function
def wmape(y_true, y_pred):
    weights = np.abs(y_true)
    wmape = np.sum(np.abs(y_true - y_pred) / weights) * 100.0 / np.sum(weights)
    return wmape

In [31]:
sample_submission.drop(['w'], axis=1, inplace=True)

sample_submission['Time'] = pd.to_datetime(sample_submission['Time'])
ECdata['Time'] = pd.to_datetime(ECdata['Time'])

In [32]:
#merge ECdata and sample_submssion
FullData = pd.concat([ECdata, sample_submission], axis=0, ignore_index=True)
# FullData.drop(['w'], axis=1, inplace=True)
FullData.sort_values(by=['BS','Time'], inplace=True)
FullData.reset_index(drop=True, inplace=True)
# FullData.set_index('Time', inplace=True)
FullData

Unnamed: 0,Time,BS,Energy
0,2023-01-01 01:00:00,B_0,64.275037
1,2023-01-01 02:00:00,B_0,55.904335
2,2023-01-01 03:00:00,B_0,57.698057
3,2023-01-01 04:00:00,B_0,55.156951
4,2023-01-01 05:00:00,B_0,56.053812
...,...,...,...
118763,2023-01-02 19:00:00,B_999,
118764,2023-01-02 20:00:00,B_999,
118765,2023-01-02 21:00:00,B_999,10.014948
118766,2023-01-02 22:00:00,B_999,


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Load your DataFrame (replace 'data.csv' with your file)
# FullData = pd.read_csv('data.csv')

# Assuming your DataFrame has columns 'Time', 'BS', and 'Energy'

# Handle missing values for each base station
# FullData['Energy'] = FullData.groupby('BS')['Energy'].fillna(method='ffill')


# Convert 'Time' column to datetime if not already
# FullData['Time'] = pd.to_datetime(FullData['Time'])

# Group by 'BS' and 'Time' and aggregate 'Energy'
grouped_data = FullData.groupby(['BS', 'Time'])['Energy'].sum()
models = {}
models_fit = {}
# Iterate through each base station and train ARIMA model
for base_station, data in grouped_data.groupby('BS'):
    print(f"Training ARIMA model for Base Station: {base_station}")
    
    # # Plot the time series data for the current base station
    # data.plot(figsize=(10, 6))
    # plt.xlabel('Time')
    # plt.ylabel('Energy')
    # plt.title(f'Time Series Data for Base Station {base_station}')
    # plt.show()
    
    # Plot ACF and PACF to determine ARIMA parameters
    # plot_acf(data)
    # plot_pacf(data, lags  =10)
    # plt.show()
    
    # Train ARIMA model for the current base station
    order = (1, 1, 1)  # Example ARIMA order (p, d, q)
    models[base_station] = ARIMA(data, order=order)
    models_fit[base_station] = models[base_station].fit()
    
    # Summary of the model
    # print(model_fit.summary())
    
    # # Plot model residuals
    # residuals = pd.Series(model_fit.resid)
    # residuals.plot(kind='hist', bins=20, density=True)
    # plt.xlabel('Residuals')
    # plt.ylabel('Density')
    # plt.title('Histogram of Model Residuals')
    # plt.show()



In [8]:
sample_submission.head()

Unnamed: 0,Time,BS,Energy
0,2023-01-01 06:00:00,B_0,
1,2023-01-01 11:00:00,B_0,
2,2023-01-01 12:00:00,B_0,
3,2023-01-01 13:00:00,B_0,
4,2023-01-01 23:00:00,B_0,


In [11]:
models_fit

{'B_0': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aecab4f0>,
 'B_1': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aeca8970>,
 'B_10': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aecd5c40>,
 'B_100': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aece0190>,
 'B_1000': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aece6640>,
 'B_1001': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aeceba30>,
 'B_1002': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aeceddf0>,
 'B_1003': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aecf9490>,
 'B_1004': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aecfe820>,
 'B_1005': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aed03880>,
 'B_1006': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aed08c70>,
 'B_1007': <statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x229aed0df70>,
 'B_1008': <statsmodels.tsa.arima.model.ARIMAResultsWrapper a

In [43]:
# Specify the time for which you want to predict the energy value
desired_time = pd.Timestamp('2023-01-01 6:00:00')

# Number of steps ahead to forecast (can be adjusted based on your preference)
forecast_steps = 20

# Forecast using the ARIMA model
forecast = models_fit['B_10'].forecast(steps=forecast_steps)

# Convert the forecast result to a DataFrame with a time index
forecast_df = pd.DataFrame({'Forecasted Energy': forecast})
# , index=[desired_time]
# Print the forecasted value
print(forecast_df)


     Forecasted Energy
142          22.649469
143          24.412999
144          24.550310
145          24.561002
146          24.561834
147          24.561899
148          24.561904
149          24.561904
150          24.561904
151          24.561904
152          24.561904
153          24.561904
154          24.561904
155          24.561904
156          24.561904
157          24.561904
158          24.561904
159          24.561904
160          24.561904
161          24.561904


  return get_prediction_index(
  return get_prediction_index(


In [35]:
models_fit['B_0'].summary()

0,1,2,3
Dep. Variable:,Energy,No. Observations:,132.0
Model:,"ARIMA(1, 1, 1)",Log Likelihood,-621.147
Date:,"Sat, 26 Aug 2023",AIC,1248.295
Time:,09:02:16,BIC,1256.92
Sample:,0,HQIC,1251.8
,- 132,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.1063,0.100,1.067,0.286,-0.089,0.302
ma.L1,-0.9999,7.429,-0.135,0.893,-15.561,13.561
sigma2,742.2535,5510.519,0.135,0.893,-1.01e+04,1.15e+04

0,1,2,3
Ljung-Box (L1) (Q):,0.09,Jarque-Bera (JB):,60.61
Prob(Q):,0.77,Prob(JB):,0.0
Heteroskedasticity (H):,1.13,Skew:,-1.55
Prob(H) (two-sided):,0.69,Kurtosis:,4.22


In [None]:

merged_df = merged_df[['ID','Energy']]

output_csv_path = 'SampleSubmission_25.csv'
merged_df.to_csv(output_csv_path, index=False)

In [25]:
FullData.head(15)

Unnamed: 0,Time,BS,Energy
0,2023-01-01 01:00:00,B_0,64.275037
1,2023-01-01 02:00:00,B_0,55.904335
2,2023-01-01 03:00:00,B_0,57.698057
3,2023-01-01 04:00:00,B_0,55.156951
4,2023-01-01 05:00:00,B_0,56.053812
5,2023-01-01 06:00:00,B_0,56.053812
6,2023-01-01 07:00:00,B_0,82.959641
7,2023-01-01 08:00:00,B_0,91.03139
8,2023-01-01 09:00:00,B_0,78.176383
9,2023-01-01 10:00:00,B_0,72.64574


In [26]:
FullData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118768 entries, 0 to 118767
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   Time    118768 non-null  datetime64[ns]
 1   BS      118768 non-null  object        
 2   Energy  115614 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 2.7+ MB


In [27]:
# give the entries with missing values
FullData[FullData.isnull().any(axis=1)]

Unnamed: 0,Time,BS,Energy
406,2023-01-01 01:00:00,B_100,
547,2023-01-02 00:00:00,B_1000,
548,2023-01-02 01:00:00,B_1000,
549,2023-01-02 02:00:00,B_1000,
550,2023-01-02 03:00:00,B_1000,
...,...,...,...
118265,2023-01-02 22:00:00,B_984,
118266,2023-01-02 23:00:00,B_984,
118624,2023-01-02 00:00:00,B_994,
118720,2023-01-02 00:00:00,B_998,


In [None]:
FullData.