In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest,RandomForestRegressor

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score,precision_score, recall_score, f1_score,mean_squared_error, mean_absolute_percentage_error
import datetime 

import lightgbm as lgb
from skopt import BayesSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from scipy.stats.mstats import winsorize
from pmdarima.arima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import RandomOverSampler 

# Ignore all undefined‐metric warnings from sklearn
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

## Data Loading

In [2]:
file_path="Desk Ticket Analysis.csv"

In [3]:
df=pd.read_csv(file_path,low_memory=False)

In [4]:
print(f"No of Rows : {df.shape[0]}, No of Columns : {df.shape[1]}")

No of Rows : 46606, No of Columns : 25


In [5]:
df.head()

Unnamed: 0,Asset_ID,Asset_Group,Asset_Subtype,Budget_Code,Ticket_ID,Ticket_State,Business_Impact,Time_Criticality,Response_Priority,Client_Contact_Score,...,Reinitiated_At,Solution_Completed_At,Final_Closure_At,Effort_Duration_Hrs,Closure_Rationale,Interaction_Tally,Interaction_Ref,Incident_Link_Count,Change_Link_Count,Change_Ref_ID
0,SUB000508,subapplication,Web Based Application,WBS000162,IM0000004,Closed,4,4,4.0,0.601292,...,,04-11-2013 13:50,04-11-2013 13:51,3871691111,Other,1.0,SD0000007,2.0,,
1,WBA000124,application,Web Based Application,WBS000088,IM0000005,Closed,3,3,3.0,0.41505,...,02-12-2013 12:31,02-12-2013 12:36,02-12-2013 12:36,4354786389,Software,1.0,SD0000011,1.0,,
2,DTA000024,application,Desktop Application,WBS000092,IM0000006,Closed,NS,3,,0.517551,...,,13-01-2014 15:12,13-01-2014 15:13,4843119444,No error - works as designed,1.0,SD0000017,,,
3,WBA000124,application,Web Based Application,WBS000088,IM0000011,Closed,4,4,4.0,0.642927,...,,14-11-2013 09:31,14-11-2013 09:31,4321833333,Operator error,1.0,SD0000025,,,
4,WBA000124,application,Web Based Application,WBS000088,IM0000012,Closed,4,4,4.0,0.345258,...,,08-11-2013 13:55,08-11-2013 13:55,3383903333,Other,1.0,SD0000029,,,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46606 entries, 0 to 46605
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Asset_ID               46606 non-null  object 
 1   Asset_Group            46495 non-null  object 
 2   Asset_Subtype          46495 non-null  object 
 3   Budget_Code            46606 non-null  object 
 4   Ticket_ID              46606 non-null  object 
 5   Ticket_State           46606 non-null  object 
 6   Business_Impact        46606 non-null  object 
 7   Time_Criticality       46606 non-null  object 
 8   Response_Priority      45226 non-null  float64
 9   Client_Contact_Score   46606 non-null  float64
 10  Issue_Type             46606 non-null  object 
 11  Knowledge_Link_ID      46606 non-null  object 
 12  Monitoring_Origin      46606 non-null  object 
 13  Team_Reassignments     46605 non-null  float64
 14  Logged_At              46606 non-null  object 
 15  Re

#### Removing Meta Columns

In [7]:
"""
Meta Columns like Asset ID, etc wont help us, instead it will cause a deviation
"""

drop_cols = [
        'Asset_ID', 'Ticket_ID', 'Budget_Code',
        'Knowledge_Link_ID', 'Interaction_Ref', 'Change_Ref_ID',
    ]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

## Data Preprocessing

##### Handling NULL Values

In [8]:
def missing_summary(df):
    miss = pd.DataFrame({
        'NonNullCount': df.count(),
        'TotalCount': len(df),
        'MissingCount': df.isna().sum(),
        'MissingPct': df.isna().mean() * 100
    })
    print("Missing Value Summary:\n", miss)
missing_summary(df)

Missing Value Summary:
                        NonNullCount  TotalCount  MissingCount  MissingPct
Asset_Group                   46495       46606           111    0.238167
Asset_Subtype                 46495       46606           111    0.238167
Ticket_State                  46606       46606             0    0.000000
Business_Impact               46606       46606             0    0.000000
Time_Criticality              46606       46606             0    0.000000
Response_Priority             45226       46606          1380    2.960992
Client_Contact_Score          46606       46606             0    0.000000
Issue_Type                    46606       46606             0    0.000000
Monitoring_Origin             46606       46606             0    0.000000
Team_Reassignments            46605       46606             1    0.002146
Logged_At                     46606       46606             0    0.000000
Reinitiated_At                 2284       46606         44322   95.099343
Solution_Compl

In [9]:
"""
As Response Priority Has 5 values, we can fill mode
"""
mode_prio = df['Response_Priority'].mode()[0]
df['Response_Priority'] = df['Response_Priority'].fillna(mode_prio)

In [10]:
"""
Filling 0 would me mean that no Reassignments
"""
df['Team_Reassignments']=df['Team_Reassignments'].fillna(0)

In [11]:
#Interaction_Tally: ~0.2% null → impute with median and cast to integer
median_interactions = df['Interaction_Tally'].median()
df['Interaction_Tally'] = df['Interaction_Tally'].fillna(median_interactions).astype(int)

In [12]:
"""
Null Values Means, Never they got any links, so we can fill 0
"""
df['Incident_Link_Count'] = df['Incident_Link_Count'].fillna(0).astype(int)
df['Change_Link_Count']   = df['Change_Link_Count'].fillna(0).astype(int)

In [13]:
df['Effort_Duration_Hrs'] = (
    df['Effort_Duration_Hrs']
    .astype(str)
    .str.replace(r"[^\d.]", "", regex=True)
    .replace('', np.nan)
    .astype(float)
)
# Impute missing with median
median_effort = df['Effort_Duration_Hrs'].median()
df['Effort_Duration_Hrs'] = df['Effort_Duration_Hrs'].fillna(median_effort)

In [14]:
"""
Closure Rationale has many values, so lets consider null as Other
"""
df['Closure_Rationale'] = df['Closure_Rationale'].fillna('Other')

##### No Duplicated Values Detected

In [15]:
df.duplicated().sum()

0

In [16]:
def type_unique_summary(df):
    tu = pd.DataFrame({
        'DataType': df.dtypes.astype(str),
        'UniqueValues': df.nunique(dropna=False)
    })
    print("Data Types & Unique Values:\n", tu)
type_unique_summary(df)

Data Types & Unique Values:
                       DataType  UniqueValues
Asset_Group             object            13
Asset_Subtype           object            65
Ticket_State            object             2
Business_Impact         object             6
Time_Criticality        object             6
Response_Priority      float64             5
Client_Contact_Score   float64         46606
Issue_Type              object             4
Monitoring_Origin       object             1
Team_Reassignments     float64            41
Logged_At               object         34636
Reinitiated_At          object          2245
Solution_Completed_At   object         33628
Final_Closure_At        object         34528
Effort_Duration_Hrs    float64         30406
Closure_Rationale       object            14
Interaction_Tally        int32            49
Incident_Link_Count      int32            25
Change_Link_Count        int32             5


#### Feature Engineering

In [None]:
# 1. Asset_Group: keep top 5, collapse the rest into "Other"
top5_groups = df['Asset_Group'].value_counts().nlargest(5).index.tolist()

df['Asset_Group_Cat'] = (
    df['Asset_Group']
    .where(df['Asset_Group'].isin(top5_groups), other='Other')
    .astype('category')
)
loo
# 2. Asset_Subtype: keep top 5, collapse the rest into "Other"
top5_subtypes = df['Asset_Subtype'].value_counts().nlargest(5).index.tolist()

df['Asset_Subtype_Cat'] = (
    df['Asset_Subtype']
    .where(df['Asset_Subtype'].isin(top5_subtypes), other='Other')
    .astype('category')
)

In [18]:
# 1. Business_Impact → integer, map 'NS' (Not Specified) to 0
df['Business_Impact_Clean'] = (
    df['Business_Impact']
    .replace('NS', 0)                # map NS → 0
    .astype(int)                     # cast all to int
)

# 2. Time_Criticality → integer (strip any trailing text like ' - Very Low')
df['Time_Criticality_Clean'] = (
    df['Time_Criticality']
    .astype(str)
    .str.extract(r'(\d+)')           # grab numeric portion
    .astype(int)
)
# 3b. Cast to integer
df['Response_Priority'] = df['Response_Priority'].astype(int)

# 3c. Extract binary feature for “high” (P1 or P2) vs others
df['Is_High_Priority'] = (df['Response_Priority'] <= 2).astype(int)

In [19]:
"""
Converting into reuqired Format
"""
date_cols = ['Logged_At', 'Reinitiated_At', 'Solution_Completed_At', 'Final_Closure_At']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], format='%d-%m-%Y %H:%M', errors='coerce')

In [20]:
"""
Since it has many unique values, we are taking top 5 and rest as others
"""
counts = df['Closure_Rationale'].value_counts()
rare = counts[counts < 100].index  # e.g. anything under 100 occurrences
df['Closure_Rationale_Cat'] = (
    df['Closure_Rationale']
    .where(~df['Closure_Rationale'].isin(rare), other='Other')
    .astype('category')
)


#### Removal After Feature Engineering

In [21]:
drop_cols=[ # Only one value left
           'Monitoring_Origin',
            # Raw categoricals replaced by cleaned versions
            'Asset_Group','Asset_Subtype','Business_Impact','Time_Criticality',
            # Raw text field
            'Closure_Rationale'
]

In [22]:
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46606 entries, 0 to 46605
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Ticket_State            46606 non-null  object        
 1   Response_Priority       46606 non-null  int32         
 2   Client_Contact_Score    46606 non-null  float64       
 3   Issue_Type              46606 non-null  object        
 4   Team_Reassignments      46606 non-null  float64       
 5   Logged_At               46606 non-null  datetime64[ns]
 6   Reinitiated_At          2284 non-null   datetime64[ns]
 7   Solution_Completed_At   44826 non-null  datetime64[ns]
 8   Final_Closure_At        46606 non-null  datetime64[ns]
 9   Effort_Duration_Hrs     46606 non-null  float64       
 10  Interaction_Tally       46606 non-null  int32         
 11  Incident_Link_Count     46606 non-null  int32         
 12  Change_Link_Count       46606 non-null  int32 

### Encoding

In [24]:
# Columns to one-hot encode
onehot_cols = [
    'Asset_Group_Cat',
    'Asset_Subtype_Cat',
    'Closure_Rationale_Cat',
    'Issue_Type'
]

# Instantiate OneHotEncoder
# drop='first' to avoid multicollinearity (optional)
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

# Fit and transform the dataframe
ohe_array = ohe.fit_transform(df[onehot_cols])

# Generate feature names and create a DataFrame
ohe_cols = ohe.get_feature_names_out(onehot_cols)
df_ohe = pd.DataFrame(ohe_array, columns=ohe_cols, index=df.index)

# Concatenate back to original df and drop original columns
df = pd.concat([df.drop(columns=onehot_cols), df_ohe], axis=1)

# Label encode Ticket_State
ticket_le = LabelEncoder()
df['Ticket_State_Label'] = ticket_le.fit_transform(df['Ticket_State'])

# Drop the original Ticket_State column if desired
df.drop(columns=['Ticket_State'], inplace=True)

In [26]:
df.shape

(46606, 39)

#### Train_Test_split

In [28]:
# Removing cols that may mismatch patterns
leak_cols = [
    'Logged_At', 
    'Reinitiated_At', 
    'Solution_Completed_At', 
    'Final_Closure_At',
    'Effort_Duration_Hrs',
]
df_ml = df.drop(columns=leak_cols)

In [29]:
df_ml['Is_High_Priority'] = (df_ml['Response_Priority'] <= 2).astype(int)
# Features & target
X = df_ml.drop(columns=['Is_High_Priority', 'Response_Priority','Business_Impact_Clean' ])
y = df_ml['Is_High_Priority']

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42
)

# Machine Leaning

### High-Priority Ticket Prediction (P1, P2)

In [30]:
# Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
prob_rf = rf.predict_proba(X_test)[:,1]

print("RF Classification Report\n", classification_report(y_test, pred_rf))
print("RF ROC AUC:", roc_auc_score(y_test, prob_rf))

# XGBoost
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
prob_xgb = xgb.predict_proba(X_test)[:,1]

print("\nXGB Classification Report\n", classification_report(y_test, pred_xgb))
print("XGB ROC AUC:", roc_auc_score(y_test, prob_xgb))


RF Classification Report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     13772
           1       0.98      1.00      0.99       210

    accuracy                           1.00     13982
   macro avg       0.99      1.00      0.99     13982
weighted avg       1.00      1.00      1.00     13982

RF ROC AUC: 0.9998091365503506

XGB Classification Report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     13772
           1       0.98      1.00      0.99       210

    accuracy                           1.00     13982
   macro avg       0.99      1.00      0.99     13982
weighted avg       1.00      1.00      1.00     13982

XGB ROC AUC: 0.9999355144323195


In [31]:
print(f" Random Forest F1 Score: {f1_score(y_test, pred_rf)*100}")
print(f" XG Boost F1 Score: {f1_score(y_test, pred_xgb)*100}")

 Random Forest F1 Score: 98.81796690307328
 XG Boost F1 Score: 98.81796690307328


### Automated Classification of Tickets

In [34]:
df_priority = df_ml.copy()
drop_cols=['Is_High_Priority','Response_Priority','Time_Criticality_Clean']
X_prio = df_priority.drop(columns=drop_cols, errors='ignore')
y_prio = df_priority['Response_Priority'].astype(int) - 1


# Train/test split: stratify by priority
X_train_pr, X_test_pr, y_train_pr, y_test_pr = train_test_split(
    X_prio, y_prio,
    test_size=0.2,
    stratify=y_prio,
    random_state=42
)

# 2) XGBoost for Priority
xgb_prio = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
xgb_prio.fit(X_train_pr, y_train_pr)
y_pred_xgb_prio = xgb_prio.predict(X_test_pr)
print("=== Priority Classification: XGBoost ===")
print(classification_report(y_test_pr, y_pred_xgb_prio, digits=4))

# 3) Random Forest for Priority
rf_prio = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf_prio.fit(X_train_pr, y_train_pr)
y_pred_rf_prio = rf_prio.predict(X_test_pr)
print("=== Priority Classification: Random Forest ===")
print(classification_report(y_test_pr, y_pred_rf_prio, digits=4))


=== Priority Classification: XGBoost ===
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     1.0000    0.9928    0.9964       139
           2     0.9962    0.9784    0.9872      1065
           3     0.9952    0.9900    0.9926      4820
           4     0.9859    0.9997    0.9928      3297

    accuracy                         0.9921      9322
   macro avg     0.7955    0.7922    0.7938      9322
weighted avg     0.9920    0.9921    0.9920      9322

=== Priority Classification: Random Forest ===
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.8804    0.5827    0.7013       139
           2     0.9951    0.9446    0.9692      1065
           3     0.9764    0.9867    0.9815      4820
           4     0.9848    1.0000    0.9923      3297

    accuracy                         0.9805      9322
   macro avg     0.7673    0.7028    0.7289

### RFC and Misconfiguration Risk Prediction

#### Request for Change

In [35]:
""" 
Only one value is avaialbale it won't be helpful, so trying with a different feature

"""
df_ml['Issue_Type_request for change'].value_counts()

Issue_Type_request for change
0.0    46605
1.0        1
Name: count, dtype: int64

In [36]:
"""
As we the actual feature has only 1 value, which wont help us, lets try with differnt feature

"""
df_ml['Will_Require_Change'] = (df_ml['Change_Link_Count'] > 0).astype(int)

In [37]:
"""
Here Also less count of 1's, using SMOT for oversampling
"""
df_ml['Will_Require_Change'].value_counts()

Will_Require_Change
0    46046
1      560
Name: count, dtype: int64

In [38]:
X = df_ml.drop(columns=['Will_Require_Change','Change_Link_Count'])
y = df_ml['Will_Require_Change']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    stratify=y,
    random_state=42)

In [39]:
# === BALANCING STEP (CHANGED) ===

sm = SMOTE(
    sampling_strategy='auto',
    random_state=0,
    k_neighbors=5,
)
X_res, y_res = sm.fit_resample(X, y)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

print("After oversampling, training class distribution:", np.bincount(y_train_bal))

# 3) Random Forest with balanced data
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_bal, y_train_bal)  # CHANGED: fit on balanced data
pred_rf = rf.predict(X_test)
prob_rf = rf.predict_proba(X_test)[:,1]
print("=== Random Forest (with oversampling) ===")
print(classification_report(y_test, pred_rf))
print("ROC AUC:", roc_auc_score(y_test, prob_rf))

# 4) XGBoost with scale_pos_weight and balanced data
# Compute new scale_pos_weight after oversampling
scale_pos = (y_train_bal == 0).sum() / (y_train_bal == 1).sum()
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos,  # CHANGED: recomputed after oversampling
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train_bal, y_train_bal)  # CHANGED: fit on balanced data
pred_xgb = xgb.predict(X_test)
prob_xgb = xgb.predict_proba(X_test)[:,1]
print("=== XGBoost (with oversampling) ===")
print(classification_report(y_test, pred_xgb))
print("ROC AUC:", roc_auc_score(y_test, prob_xgb))

After oversampling, training class distribution: [36836 36836]
=== Random Forest (with oversampling) ===
              precision    recall  f1-score   support

           0       0.99      0.76      0.86      9210
           1       0.03      0.64      0.06       112

    accuracy                           0.76      9322
   macro avg       0.51      0.70      0.46      9322
weighted avg       0.98      0.76      0.85      9322

ROC AUC: 0.7779412905227238
=== XGBoost (with oversampling) ===
              precision    recall  f1-score   support

           0       0.99      0.87      0.93      9210
           1       0.04      0.46      0.08       112

    accuracy                           0.87      9322
   macro avg       0.52      0.67      0.50      9322
weighted avg       0.98      0.87      0.92      9322

ROC AUC: 0.7372120753838995


#### HyperParameter Tuning For RFC ( Scoring= roc_auc )

In [40]:
## Random Forest pipeline
rf_pipe = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

## Bayesian search space for RF
rf_search_space = {
    'rf__n_estimators': (50, 300),
    'rf__max_depth': (3, 12),
    'rf__min_samples_leaf': (1, 20)
}

bayes_rf = BayesSearchCV(
    rf_pipe,
    rf_search_space,
    n_iter=20,
    scoring='roc_auc', # Changed to roc_auc
    cv=5,
    random_state=42,
    n_jobs=-1
)
print("Starting Bayesian search for RF pipeline...")
bayes_rf.fit(X_train, y_train)
print("Best RF params:", bayes_rf.best_params_)

y_pred_rf = bayes_rf.predict(X_test)
y_proba_rf = bayes_rf.predict_proba(X_test)[:, 1] # Get probabilities for ROC AUC

print("=== Tuned Random Forest (SMOTE + BayesianSearch) ===")
print(classification_report(y_test, y_pred_rf))
print(f"Random Forest ROC AUC: {roc_auc_score(y_test, y_proba_rf):.4f}") # Print ROC AUC

## XGBoost pipeline
xgb_pipe = Pipeline([
    ('smote', SMOTE(
        sampling_strategy='auto',
        random_state=0,
        k_neighbors=5,)),
    ('xgb', XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ))
])

## Bayesian search space for XGB
xgb_search_space = {
    'xgb__n_estimators': (100, 500),
    'xgb__max_depth': (3, 12),
    'xgb__learning_rate': (1e-3, 1.0, 'log-uniform'),
    'xgb__subsample': (0.5, 1.0, 'uniform'),
    'xgb__colsample_bytree': (0.5, 1.0, 'uniform')
}

bayes_xgb = BayesSearchCV(
    xgb_pipe,
    xgb_search_space,
    n_iter=20,
    scoring='roc_auc',  # Changed to roc_auc
    cv=5,
    random_state=42,
    n_jobs=-1
)
print("Starting Bayesian search for XGBoost pipeline...")
bayes_xgb.fit(X_train, y_train)
print("Best XGB params:", bayes_xgb.best_params_)

y_pred_xgb = bayes_xgb.predict(X_test)
y_proba_xgb = bayes_xgb.predict_proba(X_test)[:, 1] # Get probabilities for ROC AUC

print("=== Tuned XGBoost (SMOTE + BayesianSearch) ===")
print(classification_report(y_test, y_pred_xgb))
print(f"XGBoost ROC AUC: {roc_auc_score(y_test, y_proba_xgb):.4f}") # Print ROC AUC

Starting Bayesian search for RF pipeline...
Best RF params: OrderedDict({'rf__max_depth': 12, 'rf__min_samples_leaf': 20, 'rf__n_estimators': 300})
=== Tuned Random Forest (SMOTE + BayesianSearch) ===
              precision    recall  f1-score   support

           0       0.99      0.79      0.88      9210
           1       0.03      0.60      0.06       112

    accuracy                           0.79      9322
   macro avg       0.51      0.69      0.47      9322
weighted avg       0.98      0.79      0.87      9322

Random Forest ROC AUC: 0.7923
Starting Bayesian search for XGBoost pipeline...
Best XGB params: OrderedDict({'xgb__colsample_bytree': 1.0, 'xgb__learning_rate': 0.00979199861122808, 'xgb__max_depth': 12, 'xgb__n_estimators': 500, 'xgb__subsample': 0.5})
=== Tuned XGBoost (SMOTE + BayesianSearch) ===
              precision    recall  f1-score   support

           0       0.99      0.87      0.93      9210
           1       0.04      0.45      0.08       112

    acc

### Misconfiguration Risk Prediction

In [41]:
# 2. Instantiate IsolationForest with tuned hyperparameters
iso = IsolationForest(
    n_estimators=300,        # number of trees in the ensemble
    max_samples=0.7,         # use 70% of samples for each tree (reduces variance)
    contamination=0.05,      # assume ~5% of tickets are high‐risk anomalies
    max_features=0.7,        # use 70% of features when building each tree
    bootstrap=False,         # sample without replacement
    n_jobs=-1,               # parallelize across all CPUs
    random_state=42,         # reproducible results
    verbose=0
)

# 3. Fit on all available tickets (unsupervised)
iso.fit(df_ml)

# 4. Predict anomaly scores: -1 for anomalous (high misconfiguration risk), 1 for normal
anom_labels = iso.predict(df_ml)

# 5. Count how many tickets are flagged as high risk
high_risk_count = np.sum(anom_labels == -1)
print("=== Isolation Forest Misconfiguration Risk ===")
print(f"Total tickets flagged as high risk (~top 5%): {high_risk_count}")

# 6. (Optional) Attach risk label back to DataFrame
df_ml['Iso_Risk_Flag'] = (anom_labels == -1).astype(int)

=== Isolation Forest Misconfiguration Risk ===
Total tickets flagged as high risk (~top 5%): 2331


In [42]:
"""
The IT team can evaluate 2331 records, which are misconfigured, this saves a lot of time 
"""

'\nThe IT team can evaluate 2331 records, which are misconfigured\n'