## Load the DATASET

In [None]:
import pandas as pd
# Read the merged dataset CSV into a DataFrame. The absolute path is specific
# to the author's machine and has to be adjusted when running elsewhere.
merged = pd.read_csv(
    filepath_or_buffer="/home/sohanx1/Downloads/6th sem/SE/nokia/codes/gshare/merged_dataset.csv"
)

## Details of Dataset

In [None]:
# Number of non-null entries in every column.
row_counts = merged.count()
print(row_counts)

# Complementary view: number of missing entries in every column. Left as the
# last expression so the notebook displays it as the cell output.
merged.isna().sum()

## Cleaning

In [None]:
# 'Alarmed Object Source System' is totally empty, so the whole column is
# removed. Rows lacking an 'Alarm Name' or a 'Site Name' are discarded too.
merged = (
    merged
    .drop(columns=['Alarmed Object Source System'])
    .dropna(subset=['Alarm Name', 'Site Name'])
)

In [None]:
# Fill the remaining gaps by assigning the filled column back to the frame.
# fillna(inplace=True) on a column selection acts on an intermediate object:
# recent pandas versions warn about this chained assignment, and with
# Copy-on-Write enabled `merged` itself would be left unchanged.
merged['Additional Text'] = merged['Additional Text'].fillna('Unknown')
# Missing 'Is Service Affecting' flags are filled with 1.
merged['Is Service Affecting'] = merged['Is Service Affecting'].fillna(1)


## Handling missing values in the `Last Time Cleared` column

- ### If the value is null/empty, the alarm has not yet been cleared

In [None]:
# Flag alarms that are still open: a missing 'Last Time Cleared' value gives
# 1 (active), a present value gives 0 (cleared).
merged['is_active'] = merged['Last Time Cleared'].isna().astype(int)

## Get the unique values from each column

In [None]:
# Collect the distinct values of every column as plain lists.
unique_values = {col: list(merged[col].unique()) for col in merged.columns}

# Columns have different numbers of distinct values; pad the shorter lists
# with None so that all of them fit into one rectangular DataFrame.
max_len = max(map(len, unique_values.values()))
for k, column_values in unique_values.items():
    column_values.extend([None] * (max_len - len(column_values)))

# One CSV column per dataset column, listing that column's distinct values.
unique_df = pd.DataFrame(unique_values)
unique_df.to_csv('unique_values.csv', index=False)


## Print the number of unique values in each column
# Report how many distinct values each column holds. The frame in scope at
# this point of the notebook is `merged`; `df` is only created further down,
# so referring to it here fails with a NameError on a top-to-bottom run.
for col in merged.columns:
    print(f"{col}: {merged[col].nunique()} unique values")

## Keeping only the first 400k rows for training due to compute constraints

In [None]:
# Read only the first 400,000 rows of the cleaned dataset (nrows takes rows
# from the top of the file, it does not sample) and store them as a smaller
# CSV in the current working directory.
dff = pd.read_csv(
    "/home/sohanx1/Downloads/6th sem/SE/nokia/codes/gshare/cleaned_merged.csv",
    nrows=400000,
)
dff.to_csv("400k_merged_cleaned_data.csv", index=False)

## Feature Engineering and Training

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import seaborn as sns

# 1. Load Data
# Reads the 400k-row subset; the /content/ path must match where the file
# actually lives in the runtime executing this cell.
df = pd.read_csv("/content/400k_merged_cleaned_data.csv")

# 2. Basic EDA
print("\n--- Head ---\n", df.head())
print("\n--- Info ---\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()
print("\n--- Describe ---\n", df.describe(include='all'))
print("\n--- Null Counts ---\n", df.isnull().sum())

# 3. Visualizations
# Each plot gets its own figure; labels are set through the Axes object that
# the pandas plotting call returns.

# How often each severity level occurs.
plt.figure(figsize=(12, 6))
severity_ax = df['Severity'].value_counts().plot.bar(title='Severity Distribution')
severity_ax.set_xlabel('Severity')
severity_ax.set_ylabel('Count')
plt.show()

# The 20 most frequent alarm types.
plt.figure(figsize=(12, 6))
alarm_type_ax = df['Alarm Type'].value_counts().head(20).plot.bar(title='Top 20 Alarm Types')
alarm_type_ax.set_xlabel('Alarm Type')
alarm_type_ax.set_ylabel('Count')
plt.show()

# Histogram of alarm life spans in minutes, using 30 bins.
plt.figure(figsize=(12, 6))
life_span_ax = df['Life Span (minutes)'].hist(bins=30)
life_span_ax.set_title('Distribution of Life Span (minutes)')
life_span_ax.set_xlabel('Minutes')
life_span_ax.set_ylabel('Count')
plt.show()

# Share of active vs. cleared alarms.
# The 0/1 flag is mapped to its label *before* counting, and the pie uses the
# resulting index as wedge labels. value_counts() orders its result by
# frequency, so a fixed labels=['Cleared', 'Active'] list would be attached to
# the wrong wedges whenever active alarms outnumber cleared ones, and would
# not match the data at all if only one of the two states is present.
status_counts = df['is_active'].map({0: 'Cleared', 1: 'Active'}).value_counts()

plt.figure(figsize=(6, 6))
status_counts.plot.pie(autopct='%1.1f%%')
plt.title('Active vs Cleared Alarms')
# Suppress the y-axis label that pandas adds to pie charts by default.
plt.ylabel('')
plt.show()

# 4. Encoding Categorical Features
# Columns whose values are replaced by integer codes.
categorical_cols = [
    'Severity', 'Site Name', 'Source System', 'Alarm Name',
    'Alarmed Object Name', 'Alarmed Object Type', 'Alarm Type',
    'Probable Cause', 'Specific Problem', 'Previous Severity'
]

# One dedicated encoder per column, kept in `encoders` so the integer codes
# can be translated back to the original labels later on.
encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in encoders.items():
    # Values are cast to str first, so every entry (missing ones included)
    # is encoded as a string category.
    df[col] = le.fit_transform(df[col].astype(str))

# Ensure numeric columns
# errors='coerce' turns every value that cannot be parsed as a number into NaN
# instead of raising.
numeric_cols = [
    'Life Span (minutes)',
    'Number Of Occurrences',
    'Is Service Affecting',
    'is_active',
]
for numeric_col in numeric_cols:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# 5. Drop Unused Columns
# Identifier, timestamp and free-text columns are not used as model inputs.
# errors='ignore' skips any listed column that is not present in the frame.
unused_columns = [
    'Unnamed: 0',
    'Alarm ID',
    'First Time Detected',
    'Last Time Cleared',
    'Last Time Detected',
    'Additional Text',
]
df = df.drop(columns=unused_columns, errors='ignore')

# 6. Feature & Target Mapping
# The model predicts `target_column` (multi-class classification); every other
# column still present in df is used as an input feature.
target_column = 'Severity'
input_features = [col for col in df.columns if col != target_column]

# The banner is built from target_column so it always names the real target
# (it previously announced "Alarm Type" while the target was 'Severity').
print(f"\n--- Feature Mapping for {target_column} Prediction ---")
print("Target:", target_column)
print("Input Features:", input_features)

# If you want to try other targets (e.g., Alarm Type, Site Name, Is Service Affecting), just change target_column

# 7. Train/Test Split
X, y = df[input_features], df[target_column]

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions of the target the same in both parts, and the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 8. Model Training: Random Forest Classifier
# 100 trees on all available cores; class_weight='balanced' re-weights the
# classes inversely to their frequency in the training data.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
).fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("\n--- Random Forest Classification Report ---\n")
print(classification_report(y_test, y_pred))

# Confusion matrix of the hold-out predictions, drawn as an un-annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
cm_ax = sns.heatmap(cm, annot=False, cmap='Blues')
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# 9. Model Training: XGBoost (Optional)
# Histogram-based tree construction, evaluated with multi-class log loss.
# NOTE(review): use_label_encoder looks like a legacy option; confirm it is
# still accepted by the installed xgboost version.
xgb_params = {
    'tree_method': "hist",
    'use_label_encoder': False,
    'eval_metric': 'mlogloss',
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

print("\n--- XGBoost Classification Report ---\n")
print(classification_report(y_test, y_pred_xgb))

# 10. Cross-validation Scores
# Both models are scored with the same 5-fold setup and weighted F1 on the
# full X / y (not only the training split); the mean over the folds is shown.
cv_settings = {'cv': 5, 'scoring': 'f1_weighted'}

rf_cv = cross_val_score(rf, X, y, **cv_settings)
print("\nRandom Forest CV F1 (weighted):", rf_cv.mean())

xgb_cv = cross_val_score(xgb, X, y, **cv_settings)
print("XGBoost CV F1 (weighted):", xgb_cv.mean())

# 11. Feature Importance Plot (Random Forest)
# Pair every input column with the importance reported by the fitted forest
# and order the table from most to least important.
importances = rf.feature_importances_
feat_names = X.columns
imp_df = (
    pd.DataFrame({'feature': feat_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 15 highest-ranked features.
top_features = imp_df.head(15)
plt.figure(figsize=(12, 6))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title("Top 15 Feature Importances (Random Forest)")
plt.show()
