In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, hamming_loss, precision_score, recall_score,
    f1_score, classification_report
)
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np


file_path = r"final_merged.csv"
df = pd.read_csv(file_path)


X = df.iloc[:, 92:110].values
y = df.iloc[:, 1:92].values


y = np.nan_to_num(y, nan=0)

X = np.nan_to_num(X, nan=0)


X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize OneVsRestClassifier with Logistic Regression
logistic_model = OneVsRestClassifier(
    LogisticRegression(max_iter=500, solver='lbfgs', n_jobs=-1)
)

# Train the model
logistic_model.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(X, y, dataset_name):
    y_pred = logistic_model.predict(X)

    # Compute evaluation metrics
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro', zero_division=1)
    recall = recall_score(y, y_pred, average='micro', zero_division=1)
    f1 = f1_score(y, y_pred, average='micro', zero_division=1)
    subset_accuracy = accuracy_score(y, y_pred)

    # Classification report
    report = classification_report(y, y_pred, zero_division=1)

    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print("\nClassification Report:\n")
    print(report)
    print("=" * 80)

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")




Training Hamming Loss: 0.04660440552805752
Training Precision: 0.6769247854854925
Training Recall: 0.26836367006917006
Training F1 Score: 0.384352561858125
Training Subset Accuracy: 0.003978006652873196

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.88      0.90     16068
           1       0.85      0.05      0.10       201
           2       1.00      0.00      0.00       193
           3       0.00      0.00      0.00      1081
           4       0.40      0.02      0.03       866
           5       1.00      0.00      0.00       186
           6       0.60      0.02      0.04      6670
           7       0.50      0.04      0.08     12940
           8       0.51      0.05      0.09      5940
           9       0.08      0.00      0.00      1233
          10       0.00      0.00      0.00      1882
          11       0.00      0.00      0.00       367
          12       0.15      0.00      0.01     11988
          13   

In [3]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, hamming_loss, precision_score, recall_score,
    f1_score, classification_report
)
from sklearn.multioutput import MultiOutputClassifier
import pandas as pd
import numpy as np

# Load the data
file_path = r"final_merged.csv"
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:110].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)

# Fill NaN values in target columns with 0
y = np.nan_to_num(y, nan=0)


# First split: Separate test set (30%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize MultiOutputClassifier with XGBoost
xgb_model = MultiOutputClassifier(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
)

# Train the model
xgb_model.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(X, y, dataset_name):
    y_pred = xgb_model.predict(X)

    # Compute evaluation metrics
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro', zero_division=1)
    recall = recall_score(y, y_pred, average='micro', zero_division=1)
    f1 = f1_score(y, y_pred, average='micro', zero_division=1)
    subset_accuracy = accuracy_score(y, y_pred)

    # Classification report
    report = classification_report(y, y_pred, zero_division=1)

    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print("\nClassification Report:\n")
    print(report)
    print("=" * 80)

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Training Hamming Loss: 0.007577281145255905
Training Precision: 0.9502085960997381
Training Recall: 0.9077894541704805
Training F1 Score: 0.9285147986845613
Training Subset Accuracy: 0.6164995827665436

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16068
           1       1.00      1.00      1.00       201
           2       1.00      0.99      1.00       193
           3       1.00      1.00      1.00      1081
           4       1.00      1.00      1.00       866
           5       1.00      0.99      1.00       186
           6       0.98      0.85      0.91      6670
           7       0.95      0.69      0.80     12940
           8       0.99      0.87      0.93      5940
           9       1.00      1.00      1.00      1233
          10       1.00      0.94      0.97      1882
          11       1.00      1.00      1.00       367
          12       0.96      0.74      0.84     11988
          13    