In [9]:
# Applied class weight for class imbalance
# Applied threashold of 0.4
# used multi-output classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np

# Load the data
file_path = r"C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\final_combined_data.csv"
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:120].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)

# First split: Separate test set (30%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Compute class weights for each label
class_weights = []
for i in range(y.shape[1]):  # Loop through each label column
    unique_classes = np.unique(y_train[:, i])
    if len(unique_classes) > 1:  # Avoid errors with single-class labels
        class_weight = compute_class_weight(class_weight='balanced', classes=unique_classes, y=y_train[:, i])
        class_weights.append(dict(zip(unique_classes, class_weight)))
    else:
        class_weights.append(None)  # No weight needed if only one class is present

# Initialize MultiOutputClassifier with RandomForestClassifier
multi_output_model = MultiOutputClassifier(RandomForestClassifier(random_state=42), n_jobs=-1)

# Fit the model
multi_output_model.fit(X_train, y_train)

# Function to apply a probability threshold (0.4) and evaluate
def evaluate_model(X, y, dataset_name, threshold=0.4):
    y_pred_probs = []
    
    # Loop through each label classifier
    for i, clf in enumerate(multi_output_model.estimators_):
        try:
            proba = clf.predict_proba(X)  # Get probabilities for label i
            if proba.shape[1] == 2:  # Binary classification check
                y_pred_probs.append(proba[:, 1] >= threshold)  # Apply threshold for class 1
            else:
                y_pred_probs.append(proba.argmax(axis=1))  # Use argmax if multi-class
        except AttributeError:
            y_pred_probs.append(clf.predict(X))  # Fallback for classifiers without predict_proba
    
    y_pred = np.array(y_pred_probs).T.astype(int)  # Convert to int and transpose

    # Calculate evaluation metrics
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro', zero_division=0)
    recall = recall_score(y, y_pred, average='micro', zero_division=0)
    f1 = f1_score(y, y_pred, average='micro', zero_division=0)
    subset_accuracy = accuracy_score(y, y_pred)

    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print()

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")


Training Hamming Loss: 3.5790660427161534e-05
Training Precision: 0.9993629655117898
Training Recall: 0.9999860416424334
Training F1 Score: 0.9996744064895438
Training Subset Accuracy: 0.9980923577992323

Validation Hamming Loss: 0.01468281982641476
Validation Precision: 0.8853058608237838
Validation Recall: 0.8418350373963314
Validation F1 Score: 0.8630233853205868
Validation Subset Accuracy: 0.5436600714194322

Testing Hamming Loss: 0.01472026412995337
Testing Precision: 0.8850608842129306
Testing Recall: 0.8413543221604017
Testing F1 Score: 0.8626543574902081
Testing Subset Accuracy: 0.5420952639435473



In [10]:
# Applied class weight for class imbalance
# Applied threashold of 0.5
# used multi-output classifier



from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np

# Load the data
file_path = r"C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\final_combined_data.csv"
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:120].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)

# First split: Separate test set (30%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Compute class weights for each label
class_weights = []
for i in range(y.shape[1]):  # Loop through each label column
    unique_classes = np.unique(y_train[:, i])
    if len(unique_classes) > 1:  # Avoid errors with single-class labels
        class_weight = compute_class_weight(class_weight='balanced', classes=unique_classes, y=y_train[:, i])
        class_weights.append(dict(zip(unique_classes, class_weight)))
    else:
        class_weights.append(None)  # No weight needed if only one class is present

# Initialize MultiOutputClassifier with RandomForestClassifier
multi_output_model = MultiOutputClassifier(RandomForestClassifier(random_state=42), n_jobs=-1)

# Fit the model
multi_output_model.fit(X_train, y_train)

# Function to apply a probability threshold (0.4) and evaluate
def evaluate_model(X, y, dataset_name, threshold=0.5):
    y_pred_probs = []
    
    # Loop through each label classifier
    for i, clf in enumerate(multi_output_model.estimators_):
        try:
            proba = clf.predict_proba(X)  # Get probabilities for label i
            if proba.shape[1] == 2:  # Binary classification check
                y_pred_probs.append(proba[:, 1] >= threshold)  # Apply threshold for class 1
            else:
                y_pred_probs.append(proba.argmax(axis=1))  # Use argmax if multi-class
        except AttributeError:
            y_pred_probs.append(clf.predict(X))  # Fallback for classifiers without predict_proba
    
    y_pred = np.array(y_pred_probs).T.astype(int)  # Convert to int and transpose

    # Calculate evaluation metrics
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro', zero_division=0)
    recall = recall_score(y, y_pred, average='micro', zero_division=0)
    f1 = f1_score(y, y_pred, average='micro', zero_division=0)
    subset_accuracy = accuracy_score(y, y_pred)

    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print()

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")


Training Hamming Loss: 1.712838749014159e-05
Training Precision: 0.9998697153279916
Training Recall: 0.9998185413516343
Training F1 Score: 0.999844127685017
Training Subset Accuracy: 0.9995696173083634

Validation Hamming Loss: 0.015945192865317093
Validation Precision: 0.9265006010791468
Validation Recall: 0.7709576485093811
Validation F1 Score: 0.8416027020678184
Validation Subset Accuracy: 0.47914994591199356

Testing Hamming Loss: 0.015978281455279693
Testing Precision: 0.9262761219595752
Testing Recall: 0.770522458949654
Testing F1 Score: 0.8412507685811437
Testing Subset Accuracy: 0.4760483104898901



In [11]:
# Applied class weight for class imbalance
# No threashold
# used multi-output classifier


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np

# Load the data
file_path = r"C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\final_combined_data.csv"
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:120].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)

# First split: Separate test set (30%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Compute class weights for each label
class_weights = []
for i in range(y.shape[1]):  # Loop through each label column
    class_weight = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train[:, i]),
        y=y_train[:, i]
    )
    class_weights.append(dict(enumerate(class_weight)))

# Create a list of RandomForestClassifiers with corresponding class weights
rf_models = [
    RandomForestClassifier(random_state=42, class_weight=class_weights[i])
    for i in range(y.shape[1])
]

# Initialize MultiOutputClassifier with the list of weighted RandomForestClassifiers
multi_output_model = MultiOutputClassifier(estimator=RandomForestClassifier(random_state=42), n_jobs=-1)

# Fit the MultiOutputClassifier (weights are applied within individual classifiers)
multi_output_model.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(X, y, dataset_name):
    y_pred = multi_output_model.predict(X)
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro')
    recall = recall_score(y, y_pred, average='micro')
    f1 = f1_score(y, y_pred, average='micro')
    subset_accuracy = accuracy_score(y, y_pred)
    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print()

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")


Training Hamming Loss: 1.815097778806049e-05
Training Precision: 0.9998697129030757
Training Recall: 0.9997999302082121
Training F1 Score: 0.9998348203380367
Training Subset Accuracy: 0.9994998255205304

Validation Hamming Loss: 0.016191889905118783
Validation Precision: 0.9296866142089389
Validation Recall: 0.7630154354375313
Validation F1 Score: 0.8381454218824067
Validation Subset Accuracy: 0.46719242535273525

Testing Hamming Loss: 0.016214644359575828
Testing Precision: 0.9295927009037859
Testing Recall: 0.7626570769439543
Testing F1 Score: 0.8378910268452173
Testing Subset Accuracy: 0.4641606730899715

