In [6]:
pip install xgboost scikit-learn pandas joblib

Note: you may need to restart the kernel to use updated packages.


    stone (>=2.*)
           ~~~~^

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

# Load the dataset
df = pd.read_csv("k8Dataset .csv")

# Drop unnecessary column
df_cleaned = df.drop(columns=["Slow Response Likelihood (%)"])

# One-Hot Encode categorical column 'Event Type'
encoder = OneHotEncoder(sparse_output=False)
encoded_event_type = encoder.fit_transform(df_cleaned[['Event Type']])
encoded_event_df = pd.DataFrame(encoded_event_type, columns=[f"Event_Type_{i}" for i in range(encoded_event_type.shape[1])])

# Merge encoded data and drop 'Event Type' column
df_cleaned = df_cleaned.drop(columns=['Event Type']).reset_index(drop=True)
df_preprocessed = pd.concat([df_cleaned, encoded_event_df], axis=1)

# Define features and target variable
X = df_preprocessed.drop(columns=['Failure Type'])
y = df_preprocessed['Failure Type']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [8]:
# Define the XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=100,  # Number of trees
    max_depth=6,  # Depth of each tree
    learning_rate=0.1,  # Step size
    eval_metric="mlogloss",  # Multi-class log loss
    
)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on test data
y_pred = xgb_model.predict(X_test)


In [9]:
import pandas as pd

# Load the dataset
df = pd.read_csv("k8Dataset .csv")

# Drop unnecessary column
if "Slow Response Likelihood (%)" in df.columns:
    df = df.drop(columns=["Slow Response Likelihood (%)"])

# One-Hot Encode 'Event Type'
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
encoded_event_type = encoder.fit_transform(df[['Event Type']])
encoded_event_df = pd.DataFrame(encoded_event_type, columns=[f"Event_Type_{i}" for i in range(encoded_event_type.shape[1])])

# Merge encoded data and drop the original categorical column
df = df.drop(columns=['Event Type']).reset_index(drop=True)
df = pd.concat([df, encoded_event_df], axis=1)

# Define Features (X) and Target Variable (y)
X = df.drop(columns=['Failure Type'])  # Features
y = df['Failure Type']  # Target variable


In [10]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df_preprocessed.drop(columns=['Failure Type'])  # Features
y = df_preprocessed['Failure Type']  # Target

# Split data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [11]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on test set
y_pred = xgb_model.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Model Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print(classification_report(y_test, y_pred))


XGBoost Model Accuracy: 97.85%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        38
           1       0.98      1.00      0.99      1743
           2       1.00      0.99      0.99        99
           3       0.99      0.97      0.98       120

    accuracy                           0.98      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.96      0.98      0.97      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, zero_division=1))  # Fix warning


              precision    recall  f1-score   support

           0       1.00      0.00      0.00        38
           1       0.98      1.00      0.99      1743
           2       1.00      0.99      0.99        99
           3       0.99      0.97      0.98       120

    accuracy                           0.98      2000
   macro avg       0.99      0.74      0.74      2000
weighted avg       0.98      0.98      0.97      2000



In [13]:
import numpy as np

unique_classes_train, train_counts = np.unique(y_train, return_counts=True)
unique_classes_test, test_counts = np.unique(y_test, return_counts=True)

print("Class distribution in training data:", dict(zip(unique_classes_train, train_counts)))
print("Class distribution in testing data:", dict(zip(unique_classes_test, test_counts)))


Class distribution in training data: {0: 153, 1: 6971, 2: 398, 3: 478}
Class distribution in testing data: {0: 38, 1: 1743, 2: 99, 3: 120}


In [14]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Train the model again with balanced data
xgb_model.fit(X_train_balanced, y_train_balanced)


In [15]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Compute Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Model Accuracy: {accuracy * 100:.2f}%")

# Print Detailed Classification Report
print(classification_report(y_test, y_pred, zero_division=1))


XGBoost Model Accuracy: 94.60%
              precision    recall  f1-score   support

           0       0.03      0.05      0.04        38
           1       0.98      0.96      0.97      1743
           2       1.00      0.98      0.99        99
           3       0.95      0.99      0.97       120

    accuracy                           0.95      2000
   macro avg       0.74      0.75      0.74      2000
weighted avg       0.96      0.95      0.95      2000



In [16]:
pip install xgboost scikit-learn pandas joblib imbalanced-learn optuna

Note: you may need to restart the kernel to use updated packages.


    stone (>=2.*)
           ~~~~^

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to balance training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Print class distribution after balancing
import numpy as np
unique_classes, class_counts = np.unique(y_train_balanced, return_counts=True)
print("Class Distribution After SMOTE:", dict(zip(unique_classes, class_counts)))


Class Distribution After SMOTE: {0: 6971, 1: 6971, 2: 6971, 3: 6971}


In [18]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Define parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 500],      # More trees = better learning
    'max_depth': [6, 8, 10],              # Higher depth = better pattern recognition
    'learning_rate': [0.01, 0.05, 0.1],   # Smaller values = better accuracy
    'gamma': [0, 0.1, 0.2],               # Controls regularization
    'colsample_bytree': [0.6, 0.8, 1.0]   # Feature selection
}

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(eval_metric="mlogloss")

# Perform GridSearchCV for best parameters
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_balanced, y_train_balanced)

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")

# Train final model with best parameters
best_xgb_model = grid_search.best_estimator_


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500}


In [21]:
# Train XGBoost with best parameters from GridSearch
best_xgb_model.fit(X_train_balanced, y_train_balanced)

# Predict on Test Set
y_pred = best_xgb_model.predict(X_test)

# Evaluate Performance
from sklearn.metrics import classification_report, accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized XGBoost Accuracy: {accuracy * 100:.2f}%")

# Print detailed classification report
print(classification_report(y_test, y_pred, zero_division=1))


Optimized XGBoost Accuracy: 96.15%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        38
           1       0.98      0.98      0.98      1743
           2       1.00      0.98      0.99        99
           3       0.95      0.99      0.97       120

    accuracy                           0.96      2000
   macro avg       0.73      0.74      0.73      2000
weighted avg       0.96      0.96      0.96      2000



In [22]:
import joblib
joblib.dump(best_xgb_model, "optimized_xgboost_model.pkl")
print("Optimized model saved successfully!")

Optimized model saved successfully!
