In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error

In [6]:
# 1. DATA LOADING & CLEANING
df = pd.read_csv("/content/AlarmList_HistoricalAlarms_1_DEC_2024_1.csv")
df = df.drop('Alarmed Object Source System', axis=1)
df = df.dropna(subset=['Alarm Name','Site Name'])
df['Additional Text'].fillna('Unknown', inplace=True)
df['Is Service Affecting'].fillna(1, inplace=True)
df['is_active'] = df['Last Time Cleared'].isnull().astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Additional Text'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Is Service Affecting'].fillna(1, inplace=True)


In [7]:
# Merge rare causes for "Probable Cause"

cause_counts = df['Probable Cause'].value_counts()
cutoff = 100
common_causes = set(cause_counts[cause_counts >= cutoff].index)
df['Probable Cause Merged'] = df['Probable Cause'].apply(lambda cause: cause if cause in common_causes else 'Other')


In [8]:
# Feature engineering: timestamps
df['First Time Detected Clean'] = pd.to_datetime(df['First Time Detected'].str.slice(0, 19), format="%Y/%m/%d %H:%M:%S")
df['hour'] = df['First Time Detected Clean'].dt.hour
df['dayofweek'] = df['First Time Detected Clean'].dt.dayofweek

# Label encoding for categorical columns
categorical_cols = [
    'Severity', 'Site Name', 'Source System', 'Alarm Name',
    'Alarmed Object Name', 'Alarmed Object Type', 'Alarm Type',
    'Probable Cause Merged', 'Specific Problem', 'Previous Severity'
]

In [9]:
# Label encoding for categorical columns
categorical_cols = [
    'Severity', 'Site Name', 'Source System', 'Alarm Name',
    'Alarmed Object Name', 'Alarmed Object Type', 'Alarm Type',
    'Probable Cause Merged', 'Specific Problem', 'Previous Severity'
]
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

In [10]:
df['Life Span (minutes)'] = pd.to_numeric(df['Life Span (minutes)'], errors='coerce').fillna(0)
df['Number Of Occurrences'] = pd.to_numeric(df['Number Of Occurrences'], errors='coerce').fillna(1)
df['Is Service Affecting'] = pd.to_numeric(df['Is Service Affecting'], errors='coerce').fillna(1)
df['is_active'] = pd.to_numeric(df['is_active'], errors='coerce').fillna(0)

In [11]:
df['Life Span (minutes)'] = pd.to_numeric(df['Life Span (minutes)'], errors='coerce').fillna(0)
df['Number Of Occurrences'] = pd.to_numeric(df['Number Of Occurrences'], errors='coerce').fillna(1)
df['Is Service Affecting'] = pd.to_numeric(df['Is Service Affecting'], errors='coerce').fillna(1)
df['is_active'] = pd.to_numeric(df['is_active'], errors='coerce').fillna(0)

# Drop unnecessary columns for modeling
drop_cols = ['Unnamed: 0', 'Alarm ID', 'First Time Detected', 'Last Time Cleared', 'Last Time Detected', 'Additional Text', 'Probable Cause']
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')


In [12]:
# ----------------- TIMESTAMP TARGET CREATION -----------------
df = df.sort_values('First Time Detected Clean').reset_index(drop=True)
df['Next Alarm Timestamp'] = df['First Time Detected Clean'].shift(-1)
df['Next Alarm Unix'] = df['Next Alarm Timestamp'].astype(np.int64) // 10**9
df['This Alarm Unix'] = df['First Time Detected Clean'].astype(np.int64) // 10**9
df_time = df[df['Next Alarm Unix'].notnull()]
df_time = df_time[df_time['Next Alarm Unix'] > df_time['This Alarm Unix']]

In [13]:
# Drop unnecessary columns for modeling
drop_cols = ['Unnamed: 0', 'Alarm ID', 'First Time Detected', 'Last Time Cleared',
             'Last Time Detected', 'Additional Text', 'Probable Cause']
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

# -- Alarm Type
input_features_alarm_type = [
    'Source System', 'Alarm Name', 'Alarmed Object Name', 'Alarmed Object Type',
    'Previous Severity', 'Is Service Affecting', 'Number Of Occurrences', 'is_active', 'hour', 'dayofweek'
]
target_alarm_type = 'Alarm Type'

# -- Probable Cause
input_features_cause = input_features_alarm_type + ['Alarm Type']
target_cause = 'Probable Cause Merged'

# -- Duration
input_features_duration = input_features_cause + ['Probable Cause Merged']
target_duration = 'Life Span (minutes)'

# -- Severity
input_features_severity = input_features_duration + ['Life Span (minutes)']
df.drop(columns='Previous Severity',errors='ignore')
target_severity = 'Previous Severity'

In [14]:
# -- Timestamp (independent regression)
input_features_timestamp = [
    'Severity', 'Site Name', 'Source System', 'Probable Cause Merged',
    'Alarmed Object Name', 'Alarmed Object Type', 'Previous Severity',
    'Is Service Affecting', 'Number Of Occurrences', 'is_active',
    'Alarm Type', 'Life Span (minutes)', 'hour', 'dayofweek'
]
target_timestamp = 'Next Alarm Unix'

In [15]:
# ----------------- TRAIN-TEST SPLIT (CHAINED + INDEPENDENT) -----------------
# Chained models - common test set
X = df[input_features_alarm_type]
y_alarm_type = df[target_alarm_type]
X_train, X_test, y_train_alarm_type, y_test_alarm_type = train_test_split(
    X, y_alarm_type, test_size=0.2, random_state=42, stratify=y_alarm_type
)
idxs = X_test.index[:10]  # For top 10 chained predictions

# Timestamp - independent split
X_time = df_time[input_features_timestamp]
y_time = df_time[target_timestamp]
X_time_train, X_time_test, y_time_train, y_time_test = train_test_split(
    X_time, y_time, test_size=0.2, random_state=42
)
time_test_idxs = X_time_test.index[:10]


In [16]:

# ----------------- TRAIN ALL MODELS -----------------
# 1. Alarm Type
alarm_type_clf = XGBClassifier(tree_method="hist", use_label_encoder=False, eval_metric='mlogloss')
alarm_type_clf.fit(X_train, y_train_alarm_type)

# 2. Probable Cause
X_cause = df[input_features_cause]
y_cause = df[target_cause]
cause_clf = XGBClassifier(tree_method="hist", use_label_encoder=False, eval_metric='mlogloss')
cause_clf.fit(X_cause, y_cause)

# 3. Duration
X_duration = df[input_features_duration]
y_duration = df[target_duration]
duration_reg = XGBRegressor(tree_method="hist")
duration_reg.fit(X_duration, y_duration)

# 4. Severity
X_severity = df[input_features_severity]
y_severity = df[target_severity]
severity_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced')
severity_clf.fit(X_severity, y_severity)

# 5. Next Alarm Timestamp (independent)
reg_time = XGBRegressor(tree_method="hist")
reg_time.fit(X_time_train, y_time_train)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [17]:
# 5. CHAINED PREDICTION FOR TOP 10 TEST SAMPLES (WITH CONFIDENCE)
results = []
for idx in idxs:
    row = X_test.loc[[idx]].copy()
    # 1. Predict Alarm Type
    alarm_type_proba = alarm_type_clf.predict_proba(row)
    pred_alarm_type = alarm_type_clf.predict(row)[0]
    alarm_type_conf = alarm_type_proba[0][pred_alarm_type]
    row_cause = row.copy()
    row_cause['Alarm Type'] = pred_alarm_type

    # 2. Predict Probable Cause
    cause_proba = cause_clf.predict_proba(row_cause)
    pred_cause = cause_clf.predict(row_cause)[0]
    cause_conf = cause_proba[0][pred_cause]
    row_duration = row_cause.copy()
    row_duration['Probable Cause Merged'] = pred_cause

    # 3. Predict Duration
    pred_duration = duration_reg.predict(row_duration)[0]
    row_sev = row_duration.copy()
    row_sev['Life Span (minutes)'] = pred_duration

    # 4. Predict Severity
    sev_proba = severity_clf.predict_proba(row_sev)
    pred_severity = severity_clf.predict(row_sev)[0]
    sev_conf = sev_proba[0][pred_severity]

    # Decode predictions for human-readable output
    decoded_alarm_type = encoders['Alarm Type'].inverse_transform([pred_alarm_type])[0]
    decoded_cause = encoders['Probable Cause Merged'].inverse_transform([pred_cause])[0]
    decoded_severity = encoders['Previous Severity'].inverse_transform([pred_severity])[0]

    results.append({
        'row': int(idx),
        'Predicted Alarm Type': decoded_alarm_type,
        'Alarm Type Confidence': alarm_type_conf,
        'Predicted Probable Cause': decoded_cause,
        'Cause Confidence': cause_conf,
        'Predicted Duration (minutes)': float(pred_duration),
        'Predicted Severity': decoded_severity,
        'Severity Confidence': sev_conf
    })

result_df = pd.DataFrame(results)
print("\nTop 10 Chained Predictions (for Visualization/Export):")
print(result_df)



Top 10 Chained Predictions (for Visualization/Export):
      row Predicted Alarm Type  Alarm Type Confidence  \
0  285421       equipmentAlarm               0.999997   
1    7007            EQUIPMENT               0.999994   
2   76167       COMMUNICATIONS               0.999995   
3  220612       COMMUNICATIONS               0.937987   
4   37853       COMMUNICATIONS               0.999993   
5  164588       equipmentAlarm               0.999993   
6  221302       equipmentAlarm               0.999987   
7   72367       equipmentAlarm               0.999995   
8  222300       COMMUNICATIONS               0.999992   
9  131969   QUALITY OF SERVICE               0.999999   

  Predicted Probable Cause  Cause Confidence  Predicted Duration (minutes)  \
0    SNCP Protection Event          1.000000                   1426.254761   
1    SoftRerouteInProgress          0.549384                     72.636749   
2                      SSF          1.000000                     55.019115   
3   

In [18]:
import joblib

# Save models
joblib.dump(alarm_type_clf, 'alarm_type_clf.pkl')
joblib.dump(cause_clf, 'cause_clf.pkl')
joblib.dump(duration_reg, 'duration_reg.pkl')
joblib.dump(severity_clf, 'severity_clf.pkl')

# Save encoders (as a dict)
joblib.dump(encoders, 'encoders.pkl')



['encoders.pkl']

In [19]:
from google.colab import files
files.download("/content/alarm_type_clf.pkl")
files.download("/content/cause_clf.pkl")
files.download("/content/duration_reg.pkl")
files.download("/content/severity_clf.pkl")
files.download("/content/encoders.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>