In [1]:
# reading data

import gdown
import pandas as pd
import re

# Shareable Google Drive link to the dataset CSV.
link = "https://drive.google.com/file/d/1J0HUaU9Esevlw36KmdhfbmWdDWEgQRL-/view?usp=sharing"

# Extract the file id from the ".../d/<id>/..." part of the share link and
# fail with a readable message (instead of a bare IndexError) if it is absent.
id_match = re.search(r"/d/([a-zA-Z0-9_-]+)", link)
if id_match is None:
    raise ValueError(f"Could not extract a Google Drive file id from link: {link}")
file_id = id_match.group(1)

# Direct-download endpoint that gdown can fetch.
url = f"https://drive.google.com/uc?id={file_id}"
output = "dataset.csv"

# NOTE(review): some gdown releases return None on a failed download instead of
# raising; guard against that so read_csv never runs on a missing/stale file.
downloaded_path = gdown.download(url, output, quiet=False)
if downloaded_path is None:
    raise RuntimeError(f"Download failed for {url}")

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-dtype warning that
# read_csv raised on this dataset (presumably from sparsely filled columns).
df = pd.read_csv(output, low_memory=False)
print("Dataset loaded successfully with shape:", df.shape)


Downloading...
From (original): https://drive.google.com/uc?id=1J0HUaU9Esevlw36KmdhfbmWdDWEgQRL-
From (redirected): https://drive.google.com/uc?id=1J0HUaU9Esevlw36KmdhfbmWdDWEgQRL-&confirm=t&uuid=42c54542-e461-4792-97b9-626fb6eb6769
To: /content/dataset.csv
100%|██████████| 330M/330M [00:02<00:00, 114MB/s]
  df = pd.read_csv(output)


Dataset loaded successfully with shape: (1040000, 24)


In [10]:
import pandas as pd
import numpy as np
from statsmodels.tsa.api import VAR

# Numeric telemetry columns: modelled jointly by the VAR and reused as
# classifier features elsewhere in the notebook.
numeric_cols = [
    'Signal_RSRP_dBm', 'SINR_dB', 'Throughput_DL_Mbps', 'Latency_ms', 'Drop_Rate_pct',
    'Temperature_C', 'Voltage_V', 'Power_W', 'Battery_pct', 'Active_Users', 'Traffic_GBph', 'Humidity_pct'
]

# Train one VAR(1) model per antenna; antennas that cannot be fitted map to None.
var_models = {}
# Heuristic floor on observations per antenna: 2 * (number of series + 1).
min_data_points = 2 * (len(numeric_cols) + 1)

# A single groupby pass replaces re-scanning the full frame with a boolean mask
# for every antenna. sort=False keeps the first-appearance order that
# df["Antenna_ID"].unique() produced. Rows with a missing Antenna_ID are not
# grouped (they could never be matched by the equality filter either).
for antenna_id, df_ant in df.groupby("Antenna_ID", sort=False):
    df_ant = df_ant.copy()
    df_ant["Datetime"] = pd.to_datetime(df_ant["Datetime"])
    df_ant = df_ant.sort_values("Datetime").set_index("Datetime")

    if len(df_ant) < min_data_points:
        print(f"Skipping {antenna_id}: insufficient data ({len(df_ant)} rows)")
        var_models[antenna_id] = None
        continue

    series = df_ant[numeric_cols]

    # statsmodels warns on every fit when a DatetimeIndex carries no frequency.
    # If the timestamps are regular, attach the inferred frequency; infer_freq
    # only returns a value for a unique, evenly spaced index, so asfreq cannot
    # introduce gap rows here. Irregular data is fitted exactly as before.
    try:
        inferred_freq = pd.infer_freq(series.index)
    except (TypeError, ValueError):
        inferred_freq = None
    if inferred_freq is not None:
        series = series.asfreq(inferred_freq)

    try:
        model = VAR(series)
        fitted_model = model.fit(1)
        var_models[antenna_id] = fitted_model
        print(f"VAR model trained for {antenna_id}")
    except Exception as e:
        # Best effort: one bad antenna must not abort training for the others.
        print(f"Failed to fit VAR model for {antenna_id}: {e}")
        var_models[antenna_id] = None


  self._init_dates(dates, freq)


VAR model trained for A39
VAR model trained for A29


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A15
VAR model trained for A43


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A8
VAR model trained for A21


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A19
VAR model trained for A23


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A11
VAR model trained for A24


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A36
VAR model trained for A40


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A3
VAR model trained for A22


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A2
VAR model trained for A44


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A30
VAR model trained for A38


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A33


  self._init_dates(dates, freq)


VAR model trained for A12
VAR model trained for A25


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A49


  self._init_dates(dates, freq)


VAR model trained for A27


  self._init_dates(dates, freq)


VAR model trained for A42
VAR model trained for A28


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A16
VAR model trained for A47


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A37
VAR model trained for A7


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A9
VAR model trained for A18


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A4
VAR model trained for A14


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A50
VAR model trained for A26


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A20
VAR model trained for A35


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A17
VAR model trained for A6


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A34
VAR model trained for A10


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A31
VAR model trained for A48


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A45
VAR model trained for A41


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A1
VAR model trained for A5


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


VAR model trained for A13
VAR model trained for A32


  self._init_dates(dates, freq)


VAR model trained for A46


  self._init_dates(dates, freq)


In [3]:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder

# Categorical feature columns; cast to pandas "category" so LightGBM can treat
# them as native categorical features.
categorical_cols = [
    "Firmware_Version", "Technology_Type", "Antenna_Type", "Backhaul_Type", "Power_Source"
]
for col in categorical_cols:
    df[col] = df[col].astype("category")


def _labelled_xy(frame, label_col):
    """Return (features, labels) restricted to rows where `label_col` is present.

    Filtering features and labels with the same mask keeps them aligned;
    dropping NaNs from the labels alone would leave the two with different
    lengths whenever a label is missing.
    """
    has_label = frame[label_col].notna()
    features = frame.loc[has_label, numeric_cols + categorical_cols]
    labels = frame.loc[has_label, label_col]
    return features, labels


# --- Step 1: Status Classification (All Antennas) ---
X = df[numeric_cols + categorical_cols]
y_status = df["Status"]

# Encode the string status labels as integers for LightGBM.
le_status = LabelEncoder()
y_status_encoded = le_status.fit_transform(y_status)

# Train on the full dataset (no hold-out split is made in this cell).
clf_status = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=63,
    random_state=42,
    class_weight="balanced"
)
clf_status.fit(X, y_status_encoded, categorical_feature=categorical_cols)

# --- Step 2a: Warning Subtype Classification ---
df_warning = df[df["Status"] == "Warning"].copy()
X_warn, y_warn = _labelled_xy(df_warning, "Warning_Type")
if len(X_warn) > 0:
    le_warn = LabelEncoder()
    y_warn_encoded = le_warn.fit_transform(y_warn)
    clf_warning = LGBMClassifier(random_state=42, class_weight="balanced")
    clf_warning.fit(X_warn, y_warn_encoded, categorical_feature=categorical_cols)
else:
    # No labelled warning rows: downstream code checks for None before predicting.
    clf_warning, le_warn = None, None

# --- Step 2b: Fail Subtype Classification ---
df_fail = df[df["Status"] == "Fail"].copy()
X_fail, y_fail = _labelled_xy(df_fail, "Fail_Type")
if len(X_fail) > 0:
    le_fail = LabelEncoder()
    y_fail_encoded = le_fail.fit_transform(y_fail)
    clf_fail = LGBMClassifier(random_state=42, class_weight="balanced")
    clf_fail.fit(X_fail, y_fail_encoded, categorical_feature=categorical_cols)
else:
    # No labelled fail rows: downstream code checks for None before predicting.
    clf_fail, le_fail = None, None


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2978
[LightGBM] [Info] Number of data points in the train set: 1040000, number of used features: 17
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030864 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2983
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 17
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Sta

In [25]:
def hybrid_forecast(df_input, forecast_steps=5, antenna_id=None, original_df=None):
    """
    Roll an antenna's VAR(1) model forward and classify every forecasted step.

    Each step feeds the latest numeric values into the antenna's pre-trained
    VAR model (looked up in the module-level ``var_models``), then passes the
    forecast plus the antenna's categorical attributes to the module-level
    LightGBM classifiers (``clf_status``, and ``clf_warning`` / ``clf_fail``
    for the subtype).

    Parameters
    ----------
    df_input : pandas.DataFrame
        Recent observations. Must contain "Antenna_ID", "Datetime" and the
        columns in ``numeric_cols``; categorical columns are used if present.
    forecast_steps : int, default 5
        Number of one-minute steps to forecast. Values <= 0 give an empty frame.
    antenna_id : optional
        Antenna to forecast. Defaults to the first "Antenna_ID" in ``df_input``.
    original_df : optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Datetime", with the forecasted numeric columns followed by
        "Status", "Warning_Type" and "Fail_Type".

    Raises
    ------
    ValueError
        If no VAR model exists for the antenna, or ``df_input`` has no rows
        for it.
    """
    import warnings

    if antenna_id is None:
        antenna_id = df_input["Antenna_ID"].iloc[0]

    # Filter and prepare input
    history = df_input[df_input["Antenna_ID"] == antenna_id].copy()
    history["Datetime"] = pd.to_datetime(history["Datetime"])
    history = history.sort_values("Datetime").set_index("Datetime")

    # Get pre-trained VAR model
    var_model = var_models.get(antenna_id)
    if var_model is None:
        raise ValueError(f"No VAR model available for antenna {antenna_id}")
    if history.empty:
        raise ValueError(f"No rows for antenna {antenna_id} in df_input")

    current_time = history.index[-1]

    # Capture the categorical attributes once, from the last observed row.
    # Forecasted rows only carry numeric values, so re-reading these from a
    # growing history would yield NaN for every step after the first.
    static_categoricals = {
        col: (history[col].iloc[-1] if col in history.columns else None)
        for col in categorical_cols
    }

    # One-row frame holding the most recent numeric state; replaced each step
    # instead of concatenating onto the history inside the loop.
    latest = history[numeric_cols].iloc[[-1]]

    forecast_records = []
    for step in range(forecast_steps):
        try:
            # VAR(1) needs only the single most recent observation.
            last_1 = latest.values.reshape(1, -1)
            forecasted_vals = var_model.forecast(y=last_1, steps=1)[0]
            preds = {col: forecasted_vals[i] for i, col in enumerate(numeric_cols)}
        except Exception as exc:
            # Best-effort fallback: repeat the latest values, but say so.
            warnings.warn(
                f"VAR forecast failed for antenna {antenna_id} at step {step + 1}; "
                f"repeating the last values instead ({exc})",
                RuntimeWarning,
                stacklevel=2,
            )
            preds = latest.iloc[-1].to_dict()

        # Steps are spaced one minute apart from the last observed timestamp.
        new_time = current_time + pd.Timedelta(minutes=1) * (step + 1)

        # --- Classification ---
        clf_input = pd.DataFrame([preds])
        for col in categorical_cols:
            clf_input[col] = static_categoricals[col]

        # Match the training column order and categorical dtype.
        clf_input = clf_input.reindex(columns=numeric_cols + categorical_cols)
        for col in categorical_cols:
            clf_input[col] = clf_input[col].astype("category")

        # Predict status
        status_pred_enc = clf_status.predict(clf_input)[0]
        status_pred = le_status.inverse_transform([status_pred_enc])[0]

        # Predict the subtype only for the branch the status selects.
        warning_type, fail_type = None, None
        if status_pred == "Warning" and clf_warning is not None:
            warn_pred = clf_warning.predict(clf_input)[0]
            warning_type = le_warn.inverse_transform([warn_pred])[0]
        elif status_pred == "Fail" and clf_fail is not None:
            fail_pred = clf_fail.predict(clf_input)[0]
            fail_type = le_fail.inverse_transform([fail_pred])[0]

        forecast_records.append({
            "Datetime": new_time,
            **preds,
            "Status": status_pred,
            "Warning_Type": warning_type,
            "Fail_Type": fail_type
        })

        # The forecast becomes the input state for the next iteration.
        latest = pd.DataFrame([preds], columns=numeric_cols)

    # Explicit columns keep the schema stable even when no step was produced
    # (an empty record list would otherwise have no "Datetime" to index on).
    result_columns = ["Datetime", *numeric_cols, "Status", "Warning_Type", "Fail_Type"]
    return pd.DataFrame(forecast_records, columns=result_columns).set_index("Datetime")

In [27]:
# --- Test the Pipeline ---
# Ask for an antenna id; strip stray whitespace so " A18 " still matches.
antenna_id_to_test = input("Enter the Antenna ID to forecast for: ").strip()

# Seed the forecast with the antenna's most recent observation by timestamp,
# not whichever row happens to be last in file order. The stable sort keeps
# file order among equal timestamps; unparseable timestamps are sorted first
# so they are never picked as "latest".
df_antenna = df[df["Antenna_ID"] == antenna_id_to_test]
df_last = (
    df_antenna
    .assign(Datetime=pd.to_datetime(df_antenna["Datetime"]))
    .sort_values("Datetime", kind="stable", na_position="first")
    .tail(1)
)

if df_last.empty:
    print(f"Antenna ID '{antenna_id_to_test}' not found in the dataset.")
else:
    forecast_result = hybrid_forecast(
        df_input=df_last,
        forecast_steps=10,
        antenna_id=antenna_id_to_test,
        original_df=df
    )
    print(forecast_result[["Status", "Warning_Type", "Fail_Type"]])

Enter the Antenna ID to forecast for: A18
Datetime                                                 
2023-02-25 12:38:00   Fail         None  Hardware_Failure
2023-02-25 12:39:00   Fail         None  Hardware_Failure
2023-02-25 12:40:00   Fail         None  Hardware_Failure
2023-02-25 12:41:00   Fail         None  Hardware_Failure
2023-02-25 12:42:00   Fail         None  Hardware_Failure
2023-02-25 12:43:00   Fail         None  Hardware_Failure
2023-02-25 12:44:00   Fail         None  Hardware_Failure
2023-02-25 12:45:00   Fail         None  Hardware_Failure
2023-02-25 12:46:00   Fail         None  Hardware_Failure
2023-02-25 12:47:00   Fail         None  Hardware_Failure
