<a href="https://colab.research.google.com/github/Bhanugt/incident-impact-prediction/blob/main/incident_impact_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, classification_report

# 🔹 Load dataset
df = pd.read_csv("/content/sample_data/Incident_Event_Log.csv")  # Ensure correct filename

# 🔹 Handle Missing Values
df.fillna(df.mode().iloc[0], inplace=True)  # Fill missing values with mode

# 🔹 Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoders for later use

# 🔹 Define Features (X) and Target (y)
target_col = "impact"  # Change if needed
X = df.drop(columns=[target_col])
y = df[target_col]

# 🔹 Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔹 Hyperparameter Tuning using GridSearchCV
param_grid = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# 🔹 Best Model
best_model = grid_search.best_estimator_
print(f"✅ Best Model Parameters: {grid_search.best_params_}")

# 🔹 Train Final Model on Full Data
best_model.fit(X_train, y_train)

# 🔹 Predict on Test Data
y_pred = best_model.predict(X_test)

# 🔹 Calculate Performance Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
f2 = fbeta_score(y_test, y_pred, beta=2, average='weighted')  # F2-score

# 🔹 Print the Metrics
print("✅ Model Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"F2-Score: {f2:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# 🔹 Apply K-Fold Cross-Validation on Best Model
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(best_model, X, y, cv=kfold, scoring='accuracy')

# 🔹 Print Cross-Validation Scores
print("\n✅ Cross-Validation Results:")
print("CV Scores:", cv_scores)
print("Average CV Accuracy:", np.mean(cv_scores))


✅ Best Model Parameters: {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 10}
✅ Model Performance Metrics:
Accuracy: 0.3313
Precision: 0.3312
Recall: 0.3313
F1-Score: 0.3305
F2-Score: 0.3308

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.29      0.31      6649
           1       0.34      0.33      0.33      6779
           2       0.33      0.37      0.35      6642

    accuracy                           0.33     20070
   macro avg       0.33      0.33      0.33     20070
weighted avg       0.33      0.33      0.33     20070


✅ Cross-Validation Results:
CV Scores: [0.33129048 0.32919781 0.33761834 0.32939711 0.33597409]
Average CV Accuracy: 0.3326955655206777


In [2]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# 🔹 Load dataset for feature reference
df = pd.read_csv("Incident_Event_Log.csv")

# 🔹 Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 🔹 Define Features (X) and Target (y)
target_col = "impact"  # Change if needed
X = df.drop(columns=[target_col])
y = df[target_col]

# 🔹 Train a new Decision Tree model
model = DecisionTreeClassifier()
model.fit(X, y)

# 🔹 Streamlit UI
st.title("Incident Impact Prediction")

# Generate input fields dynamically based on dataset columns
input_data = {}
for col in X.columns:
    input_data[col] = st.number_input(f"Enter {col}", min_value=0, value=int(df[col].mean()))

# Convert input into DataFrame
input_df = pd.DataFrame([input_data])

# Encode categorical variables
for col in label_encoders:
    if col in input_df:
        input_df[col] = label_encoders[col].transform(input_df[col].astype(str))

# Predict Impact
if st.button("Predict Impact"):
    prediction = model.predict(input_df)
    st.success(f"Predicted Impact: {prediction[0]}")


Writing app.py


In [3]:
from google.colab import files
files.download("app.py")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>