## Load the DATASET

In [3]:
import pandas as pd
merged=pd.read_csv("/home/sohanx1/Downloads/6th sem/SE/nokia/codes/gshare/merged_dataset.csv")

## Details of Dataset

In [None]:
row_counts = merged.count()
print(row_counts)
merged.isnull().sum()

## Cleaning

In [None]:
## Dropping Alarmed Object Source System entirely as it is totally empty
merged.drop('Alarmed Object Source System', axis=1, inplace=True)
merged = merged.dropna(subset=['Alarm Name','Site Name'])

In [None]:
merged['Additional Text'].fillna('Unknown', inplace=True)
merged['Is Service Affecting'].fillna(1, inplace=True)


## Handling missing values for last_time_cleared columns 

- ### If null/empty that means it has not yet been cleared

In [None]:
merged['is_active'] = merged['Last Time Cleared'].isnull().astype(int)  # 1 = active, 0 = cleared

## Removing unique rows--->columns where a records occured only once

In [6]:

def remove_rare_classes(merged, target_col, min_count=2):
    """
    Removes rows where the target_col occurs less than min_count times.
    Returns a cleaned DataFrame.
    """
    counts = merged[target_col].value_counts()
    good_classes = counts[counts >= min_count].index
    return merged[merged[target_col].isin(good_classes)].copy()

for col in merged.columns:
# Remove rare classes from Probable Cause
    merged = remove_rare_classes(merged, target_col=col, min_count=2)

# (Repeat for any other target columns if needed, e.g. Alarm Type)
# merged = remove_rare_classes(merged, target_col='Alarm Type', min_count=2)


## Get the Unique values from each columns

In [7]:
unique_values = {col: merged[col].unique() for col in merged.columns}
max_len = max(len(v) for v in unique_values.values())
for k in unique_values:
    unique_values[k] = list(unique_values[k]) + [None] * (max_len - len(unique_values[k]))

unique_df = pd.DataFrame(unique_values)
unique_df.to_csv('unique_values.csv', index=False)


## Print No of Unique values in each columns
for col in merged.columns:
    print(f"{col}: {merged[col].nunique()} unique values")

Unnamed: 0: 0 unique values
Severity: 0 unique values
Site Name: 0 unique values
Source System: 0 unique values
Life Span (minutes): 0 unique values
Alarm Name: 0 unique values
First Time Detected: 0 unique values
Last Time Cleared: 0 unique values
Alarmed Object Name: 0 unique values
Last Time Detected: 0 unique values
Alarmed Object Type: 0 unique values
Alarmed Object Source System: 0 unique values
Alarm Type: 0 unique values
Probable Cause: 0 unique values
Specific Problem: 0 unique values
Is Service Affecting: 0 unique values
Alarm ID: 0 unique values
Previous Severity: 0 unique values
Number Of Occurrences: 0 unique values
Additional Text: 0 unique values


## Getting only 400k rows for training due to computing constraint

In [None]:
dff=pd.read_csv("/home/sohanx1/Downloads/6th sem/SE/nokia/codes/gshare/cleaned_merged.csv",nrows=400000)
dff.to_csv("400k_merged_cleaned_data.csv",index=False)

## Label Encoding

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import classification_report, mean_absolute_error
import numpy as np

# 1. Load your cleaned dataset
df = pd.read_csv("/content/400k_merged_cleaned_data.csv")




In [None]:
# 2. Label Encode Categorical Features
categorical_cols = [
    'Severity', 'Site Name', 'Source System', 'Alarm Name',
    'Alarmed Object Name', 'Alarmed Object Type', 'Alarm Type',
    'Probable Cause', 'Specific Problem', 'Previous Severity'
]
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

# Ensure numeric
df['Life Span (minutes)'] = pd.to_numeric(df['Life Span (minutes)'], errors='coerce')
df['Number Of Occurrences'] = pd.to_numeric(df['Number Of Occurrences'], errors='coerce')
df['Is Service Affecting'] = pd.to_numeric(df['Is Service Affecting'], errors='coerce')
df['is_active'] = pd.to_numeric(df['is_active'], errors='coerce')

# Drop unneeded columns
drop_cols = ['Unnamed: 0', 'Alarm ID', 'First Time Detected', 'Last Time Cleared', 'Last Time Detected', 'Additional Text']
df = df.drop(columns=drop_cols, errors='ignore')

# --------------------------
# 3. Step 1: Predict Next Alarm Type (Classification)
# --------------------------
# Inputs (exclude Alarm Type and targets to be predicted later)
input_features_alarm_type = [
    'Severity', 'Site Name', 'Source System', 'Alarm Name',
    'Alarmed Object Name', 'Alarmed Object Type',
    'Previous Severity', 'Is Service Affecting', 'Number Of Occurrences', 'is_active'
]
target_alarm_type = 'Alarm Type'

X1 = df[input_features_alarm_type]
y1 = df[target_alarm_type]
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42, stratify=y1)

alarm_type_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced')
alarm_type_clf.fit(X1_train, y1_train)
y1_pred = alarm_type_clf.predict(X1_test)

print("\n--- [Step 1] Alarm Type Classification Report ---")
print(classification_report(y1_test, y1_pred))

# --------------------------
# 4. Step 2: Predict Alarm Duration (Regression)
# --------------------------
# Inputs: Same as above + predicted alarm type (use true during training, predicted at test/inference)
input_features_duration = input_features_alarm_type + ['Alarm Type']
target_duration = 'Life Span (minutes)'

X2 = df[input_features_duration]
y2 = df[target_duration]
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

duration_reg = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
duration_reg.fit(X2_train, y2_train)
y2_pred = duration_reg.predict(X2_test)

print("\n--- [Step 2] Duration Regression ---")
print("MAE:", mean_absolute_error(y2_test, y2_pred))

# --------------------------
# 5. Step 3: Predict Probable Cause (Classification)
# --------------------------
# Inputs: All above + predicted duration (true during training, predicted at test/inference)



In [None]:
# 5. Step 3: Predict Probable Cause (Classification)
# Inputs: All above + predicted duration (true during training, predicted at test/inference)



input_features_cause = input_features_alarm_type + ['Alarm Type', 'Life Span (minutes)']
target_cause = 'Probable Cause'
# ----Removing rare class-------
value_counts = df[target_cause].value_counts()
df_filtered = df[df[target_cause].isin(value_counts[value_counts > 1].index)]


X3 = df_filtered[input_features_cause]
y3 = df_filtered[target_cause]
le_cause = LabelEncoder()
y3 = le_cause.fit_transform(y3)
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.4, random_state=42, stratify=y3
)


cause_clf = XGBClassifier(tree_method="hist", use_label_encoder=False, eval_metric='mlogloss')
cause_clf.fit(X3_train, y3_train)
y3_pred = cause_clf.predict(X3_test)

print("\n--- [Step 3] Probable Cause Classification Report ---")
print(classification_report(y3_test, y3_pred))

# 6. How to Chain at Inference (on new sample)
# Let's say you have a new incoming alarm row: new_alarm_row (pd.DataFrame with same columns)
# You would do:
#   1. Predict Alarm Type with alarm_type_clf
#   2. Add that to features, predict Duration with duration_reg
#   3. Add both, predict Probable Cause with cause_clf


# Decode predictions (optional, for interpretability)
decoded_alarm_type = encoders['Alarm Type'].inverse_transform([pred_alarm_type])[0]
decoded_cause = encoders['Probable Cause'].inverse_transform([pred_cause])[0]

print("\n--- Chained Inference Example ---")
print(f"Predicted Next Alarm Type: {decoded_alarm_type}")
print(f"Predicted Duration: {pred_duration:.2f} minutes")
print(f"Predicted Probable Cause: {decoded_cause}")

