In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error

In [3]:
# 1. DATA LOADING & CLEANING
df = pd.read_csv("/content/AlarmList_HistoricalAlarms_1_DEC_2024_1.csv")
df = df.drop('Alarmed Object Source System', axis=1)
df = df.dropna(subset=['Alarm Name','Site Name'])
df['Additional Text'].fillna('Unknown', inplace=True)
df['Is Service Affecting'].fillna(1, inplace=True)
df['is_active'] = df['Last Time Cleared'].isnull().astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Additional Text'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Is Service Affecting'].fillna(1, inplace=True)


In [4]:
# Merge rare causes for "Probable Cause"

cause_counts = df['Probable Cause'].value_counts()
cutoff = 100
common_causes = set(cause_counts[cause_counts >= cutoff].index)
df['Probable Cause Merged'] = df['Probable Cause'].apply(lambda cause: cause if cause in common_causes else 'Other')


In [5]:
# Feature engineering: timestamps
df['First Time Detected Clean'] = pd.to_datetime(df['First Time Detected'].str.slice(0, 19), format="%Y/%m/%d %H:%M:%S")
df['hour'] = df['First Time Detected Clean'].dt.hour
df['dayofweek'] = df['First Time Detected Clean'].dt.dayofweek

# Label encoding for categorical columns
categorical_cols = [
    'Severity', 'Site Name', 'Source System', 'Alarm Name',
    'Alarmed Object Name', 'Alarmed Object Type', 'Alarm Type',
    'Probable Cause Merged', 'Specific Problem', 'Previous Severity'
]

In [6]:
# Label encoding for categorical columns
categorical_cols = [
    'Severity', 'Site Name', 'Source System', 'Alarm Name',
    'Alarmed Object Name', 'Alarmed Object Type', 'Alarm Type',
    'Probable Cause Merged', 'Specific Problem', 'Previous Severity'
]
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

In [7]:
df['Life Span (minutes)'] = pd.to_numeric(df['Life Span (minutes)'], errors='coerce').fillna(0)
df['Number Of Occurrences'] = pd.to_numeric(df['Number Of Occurrences'], errors='coerce').fillna(1)
df['Is Service Affecting'] = pd.to_numeric(df['Is Service Affecting'], errors='coerce').fillna(1)
df['is_active'] = pd.to_numeric(df['is_active'], errors='coerce').fillna(0)

In [8]:
df['Life Span (minutes)'] = pd.to_numeric(df['Life Span (minutes)'], errors='coerce').fillna(0)
df['Number Of Occurrences'] = pd.to_numeric(df['Number Of Occurrences'], errors='coerce').fillna(1)
df['Is Service Affecting'] = pd.to_numeric(df['Is Service Affecting'], errors='coerce').fillna(1)
df['is_active'] = pd.to_numeric(df['is_active'], errors='coerce').fillna(0)

# Drop unnecessary columns for modeling
drop_cols = ['Unnamed: 0', 'Alarm ID', 'First Time Detected', 'Last Time Cleared', 'Last Time Detected', 'Additional Text', 'Probable Cause']
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')


In [9]:
# ----------------- TIMESTAMP TARGET CREATION -----------------
df = df.sort_values('First Time Detected Clean').reset_index(drop=True)
df['Next Alarm Timestamp'] = df['First Time Detected Clean'].shift(-1)
df['Next Alarm Unix'] = df['Next Alarm Timestamp'].astype(np.int64) // 10**9
df['This Alarm Unix'] = df['First Time Detected Clean'].astype(np.int64) // 10**9
df_time = df[df['Next Alarm Unix'].notnull()]
df_time = df_time[df_time['Next Alarm Unix'] > df_time['This Alarm Unix']]

In [10]:
# Drop unnecessary columns for modeling
drop_cols = ['Unnamed: 0', 'Alarm ID', 'First Time Detected', 'Last Time Cleared',
             'Last Time Detected', 'Additional Text', 'Probable Cause']
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

# -- Alarm Type
input_features_alarm_type = [
    'Source System', 'Alarm Name', 'Alarmed Object Name', 'Alarmed Object Type',
    'Previous Severity', 'Is Service Affecting', 'Number Of Occurrences', 'is_active', 'hour', 'dayofweek'
]
target_alarm_type = 'Alarm Type'

# -- Probable Cause
input_features_cause = input_features_alarm_type + ['Alarm Type']
target_cause = 'Probable Cause Merged'

# -- Duration
input_features_duration = input_features_cause + ['Probable Cause Merged']
target_duration = 'Life Span (minutes)'

# -- Severity
input_features_severity = input_features_duration + ['Life Span (minutes)']
target_severity = 'Severity'

In [11]:
# -- Timestamp (independent regression)
input_features_timestamp = [
    'Severity', 'Site Name', 'Source System', 'Probable Cause Merged',
    'Alarmed Object Name', 'Alarmed Object Type', 'Previous Severity',
    'Is Service Affecting', 'Number Of Occurrences', 'is_active',
    'Alarm Type', 'Life Span (minutes)', 'hour', 'dayofweek'
]
target_timestamp = 'Next Alarm Unix'

In [12]:
# ----------------- TRAIN-TEST SPLIT (CHAINED + INDEPENDENT) -----------------
# Chained models - common test set
X = df[input_features_alarm_type]
y_alarm_type = df[target_alarm_type]
X_train, X_test, y_train_alarm_type, y_test_alarm_type = train_test_split(
    X, y_alarm_type, test_size=0.2, random_state=42, stratify=y_alarm_type
)
idxs = X_test.index[:10]  # For top 10 chained predictions

# Timestamp - independent split
X_time = df_time[input_features_timestamp]
y_time = df_time[target_timestamp]
X_time_train, X_time_test, y_time_train, y_time_test = train_test_split(
    X_time, y_time, test_size=0.2, random_state=42
)
time_test_idxs = X_time_test.index[:10]


In [13]:

# ----------------- TRAIN ALL MODELS -----------------
# 1. Alarm Type
alarm_type_clf = XGBClassifier(tree_method="hist", use_label_encoder=False, eval_metric='mlogloss')
alarm_type_clf.fit(X_train, y_train_alarm_type)

# 2. Probable Cause
X_cause = df[input_features_cause]
y_cause = df[target_cause]
cause_clf = XGBClassifier(tree_method="hist", use_label_encoder=False, eval_metric='mlogloss')
cause_clf.fit(X_cause, y_cause)

# 3. Duration
X_duration = df[input_features_duration]
y_duration = df[target_duration]
duration_reg = XGBRegressor(tree_method="hist")
duration_reg.fit(X_duration, y_duration)

# 4. Severity
X_severity = df[input_features_severity]
y_severity = df[target_severity]
severity_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced')
severity_clf.fit(X_severity, y_severity)

# 5. Next Alarm Timestamp (independent)
reg_time = XGBRegressor(tree_method="hist")
reg_time.fit(X_time_train, y_time_train)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [14]:
# ----------------- CHAINED PREDICTION FOR TOP 10 TEST SAMPLES -----------------
results = []
for idx in idxs:
    row = X_test.loc[[idx]].copy()
    # 1. Predict Alarm Type
    pred_alarm_type = alarm_type_clf.predict(row)[0]
    row_cause = row.copy()
    row_cause['Alarm Type'] = pred_alarm_type
    # 2. Predict Probable Cause
    pred_cause = cause_clf.predict(row_cause)[0]
    row_duration = row_cause.copy()
    row_duration['Probable Cause Merged'] = pred_cause
    # 3. Predict Duration
    pred_duration = duration_reg.predict(row_duration)[0]
    row_sev = row_duration.copy()
    row_sev['Life Span (minutes)'] = pred_duration
    # 4. Predict Severity
    pred_severity = severity_clf.predict(row_sev)[0]

    # Decode predictions for human-readable output
    decoded_alarm_type = encoders['Alarm Type'].inverse_transform([pred_alarm_type])[0]
    decoded_cause = encoders['Probable Cause Merged'].inverse_transform([pred_cause])[0]
    decoded_severity = encoders['Severity'].inverse_transform([pred_severity])[0]

    results.append({
        'row': int(idx),
        'Predicted Alarm Type': decoded_alarm_type,
        'Predicted Probable Cause': decoded_cause,
        'Predicted Duration (minutes)': float(pred_duration),
        'Predicted Severity': decoded_severity
    })

result_df = pd.DataFrame(results)
print("\nTop 10 Chained Predictions (for Visualization/Export):")
print(result_df)


Top 10 Chained Predictions (for Visualization/Export):
      row Predicted Alarm Type Predicted Probable Cause  \
0  285421       equipmentAlarm    SNCP Protection Event   
1    7007            EQUIPMENT    SoftRerouteInProgress   
2   76167       COMMUNICATIONS                      SSF   
3  220612       COMMUNICATIONS                  PWRSUSP   
4   37853       COMMUNICATIONS              URU-OCH-LOS   
5  164588       equipmentAlarm                REMOVEMOD   
6  221302       equipmentAlarm                INSERTMOD   
7   72367       equipmentAlarm                    SSF-T   
8  222300       COMMUNICATIONS    SoftRerouteInProgress   
9  131969   QUALITY OF SERVICE        T-FEUAS-OTU-1-DAY   

   Predicted Duration (minutes) Predicted Severity  
0                   1426.254761      indeterminate  
1                     72.636749            cleared  
2                     55.019115            cleared  
3                     11.748476            cleared  
4                     74.4857

In [15]:
y_time_pred = reg_time.predict(X_time_test)
mae_sec = mean_absolute_error(y_time_test, y_time_pred)
mae_min = mae_sec / 60
predicted_times = pd.to_datetime(y_time_pred, unit='s')
actual_times = pd.to_datetime(y_time_test.values, unit='s')
timestamp_result_df = pd.DataFrame({
    "Actual Next Alarm Time": actual_times[:10].values,
    "Predicted Next Alarm Time": predicted_times[:10].values
})
print(f"\nNext Alarm Timestamp MAE: {mae_min:.2f} minutes")
print("\nTop 10 Next Alarm Timestamp Predictions (Independent):")
print(timestamp_result_df.head(10))



Next Alarm Timestamp MAE: 33.10 minutes

Top 10 Next Alarm Timestamp Predictions (Independent):
  Actual Next Alarm Time Predicted Next Alarm Time
0    2024-12-01 03:17:21       2024-12-01 03:29:04
1    2024-12-01 04:22:59       2024-12-01 04:20:16
2    2024-11-19 10:51:58       2024-11-19 08:23:28
3    2024-12-01 01:28:34       2024-12-01 01:29:36
4    2024-12-01 11:39:33       2024-12-01 11:29:04
5    2024-12-01 09:13:51       2024-12-01 09:29:36
6    2024-12-01 07:46:06       2024-12-01 07:28:00
7    2024-12-01 09:37:11       2024-12-01 09:33:52
8    2024-12-01 03:37:04       2024-12-01 03:26:56
9    2024-12-01 07:49:29       2024-12-01 07:51:28
