In [7]:
# 1. Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

# 2. Load data
df_data = pd.read_csv('C:/Users/ammar/SHAP_ML/datasets/secom.data', sep=' ', header=None)
df_labels = pd.read_csv('C:/Users/ammar/SHAP_ML/datasets/secom_labels.data', sep=' ', header=None)
df_data.columns = [f'feature_{i}' for i in range(df_data.shape[1])]
df_data['label'] = df_labels[0]

# 3. Clean data
missing = df_data.isna().sum()
df_clean = df_data.loc[:, missing < df_data.shape[0] * 0.5].copy()
df_clean = df_clean.fillna(df_clean.mean())

# 4. Prepare features and labels
X = df_clean.drop(columns=['label'])
y = df_clean['label'].replace(-1, 0)

# 5. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train XGBoost model
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)

# 7. Evaluation
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

# 8. Save model (optional)
import joblib
joblib.dump(xgb_model, 'C:/Users/ammar/SHAP_ML/models/secom_xgboost.pkl')

              precision    recall  f1-score   support

           0       0.93      0.99      0.95       290
           1       0.20      0.04      0.07        24

    accuracy                           0.91       314
   macro avg       0.56      0.51      0.51       314
weighted avg       0.87      0.91      0.89       314



['C:/Users/ammar/SHAP_ML/models/secom_xgboost.pkl']