In [1]:



from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np

# Updated file path
file_path = r"/content/final_combined_data.csv"

# Load the data
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:120].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)



# First split: Separate test set (30%) from the rest (70% for train + validate)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train set (50%) and validation set (20% of the remaining 70%)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize RandomForestClassifier with class balancing
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Use OneVsRestClassifier for multi-label classification
model = OneVsRestClassifier(rf_model)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the training, validation, and testing sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate metrics for training data
train_hamming = hamming_loss(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred, average='micro')
train_recall = recall_score(y_train, y_train_pred, average='micro')
train_f1 = f1_score(y_train, y_train_pred, average='micro')
train_subset_accuracy = accuracy_score(y_train, y_train_pred)  # Subset accuracy for training

# Calculate metrics for validation data
val_hamming = hamming_loss(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred, average='micro')
val_recall = recall_score(y_val, y_val_pred, average='micro')
val_f1 = f1_score(y_val, y_val_pred, average='micro')
val_subset_accuracy = accuracy_score(y_val, y_val_pred)  # Subset accuracy for validation

# Calculate metrics for testing data
test_hamming = hamming_loss(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')
test_subset_accuracy = accuracy_score(y_test, y_test_pred)  # Subset accuracy for testing

# Print overall metrics
print(f"Training Hamming Loss: {train_hamming}")
print(f"Training Precision: {train_precision}")
print(f"Training Recall: {train_recall}")
print(f"Training F1 Score: {train_f1}")
print(f"Training Subset Accuracy: {train_subset_accuracy}")
print()

print(f"Validation Hamming Loss: {val_hamming}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1 Score: {val_f1}")
print(f"Validation Subset Accuracy: {val_subset_accuracy}")
print()

print(f"Testing Hamming Loss: {test_hamming}")
print(f"Testing Precision: {test_precision}")
print(f"Testing Recall: {test_recall}")
print(f"Testing F1 Score: {test_f1}")
print(f"Testing Subset Accuracy: {test_subset_accuracy}")






Training Hamming Loss: 1.8534449149780078e-05
Training Precision: 0.9997325177061649
Training Recall: 0.999930208212167
Training F1 Score: 0.999831353187134
Training Subset Accuracy: 0.9993718739095033

Validation Hamming Loss: 0.016816174020347522
Validation Precision: 0.9300840859545313
Validation Recall: 0.7503506996545347
Validation F1 Score: 0.8306055678091478
Validation Subset Accuracy: 0.43691477358644193

Testing Hamming Loss: 0.01688138161200992
Testing Precision: 0.9288986123853596
Testing Recall: 0.7501804858189713
Testing F1 Score: 0.8300283328653731
Testing Subset Accuracy: 0.4352422309675668
