In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, hamming_loss, precision_score, recall_score, 
    f1_score, classification_report
)
from sklearn.multioutput import MultiOutputClassifier
import pandas as pd
import numpy as np

# Load the data
file_path = r"C:\Users\Muralish\Desktop\GemAppraisal-DSGP\notebook\Notebook-Norman\Datset used\final_merged.csv"
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:110].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)

# Fill NaN values in target columns with 0
y = np.nan_to_num(y, nan=0)

# First split: Separate test set (30%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize MultiOutputClassifier with RandomForest 
multi_output_model = MultiOutputClassifier(
    estimator=RandomForestClassifier(random_state=42), n_jobs=-1
)

# Train the model
multi_output_model.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(X, y, dataset_name):
    y_pred = multi_output_model.predict(X)
    
    # Compute evaluation metrics
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro', zero_division=1)
    recall = recall_score(y, y_pred, average='micro', zero_division=1)
    f1 = f1_score(y, y_pred, average='micro', zero_division=1)
    subset_accuracy = accuracy_score(y, y_pred)

    # Classification report
    report = classification_report(y, y_pred, zero_division=1)
    
    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print("\nClassification Report:\n")
    print(report)
    print("=" * 80)

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")


Training Hamming Loss: 0.0001336553959409208
Training Precision: 0.9993921184209
Training Recall: 0.9981415611349918
Training F1 Score: 0.9987664483218364
Training Subset Accuracy: 0.9968107360455413

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16068
           1       1.00      1.00      1.00       201
           2       1.00      0.99      1.00       193
           3       1.00      1.00      1.00      1081
           4       1.00      1.00      1.00       866
           5       1.00      0.99      1.00       186
           6       1.00      1.00      1.00      6670
           7       1.00      1.00      1.00     12940
           8       1.00      1.00      1.00      5940
           9       1.00      1.00      1.00      1233
          10       1.00      1.00      1.00      1882
          11       1.00      1.00      1.00       367
          12       1.00      1.00      1.00     11988
          13      

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, hamming_loss, precision_score, recall_score, 
    f1_score, classification_report
)
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np

# Load the data
file_path = r"C:\Users\Muralish\Desktop\GemAppraisal-DSGP\notebook\Notebook-Norman\Datset used\final_merged.csv"
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:110].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)

# Fill NaN values in target columns with 0
y = np.nan_to_num(y, nan=0)

# First split: Separate test set (30%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize OneVsRestClassifier with RandomForest
ovr_model = OneVsRestClassifier(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1)
)

# Train the model
ovr_model.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(X, y, dataset_name):
    y_pred = ovr_model.predict(X)
    
    # Compute evaluation metrics
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro', zero_division=1)
    recall = recall_score(y, y_pred, average='micro', zero_division=1)
    f1 = f1_score(y, y_pred, average='micro', zero_division=1)
    subset_accuracy = accuracy_score(y, y_pred)

    # Classification report
    report = classification_report(y, y_pred, zero_division=1)
    
    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print("\nClassification Report:\n")
    print(report)
    print("=" * 80)

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")




Training Hamming Loss: 0.0001336553959409208
Training Precision: 0.9993921184209
Training Recall: 0.9981415611349918
Training F1 Score: 0.9987664483218364
Training Subset Accuracy: 0.9968107360455413

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16068
           1       1.00      1.00      1.00       201
           2       1.00      0.99      1.00       193
           3       1.00      1.00      1.00      1081
           4       1.00      1.00      1.00       866
           5       1.00      0.99      1.00       186
           6       1.00      1.00      1.00      6670
           7       1.00      1.00      1.00     12940
           8       1.00      1.00      1.00      5940
           9       1.00      1.00      1.00      1233
          10       1.00      1.00      1.00      1882
          11       1.00      1.00      1.00       367
          12       1.00      1.00      1.00     11988
          13      

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, hamming_loss, precision_score, recall_score, 
    f1_score, classification_report
)
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np

# Load the data
file_path = r"C:\Users\Muralish\Desktop\GemAppraisal-DSGP\notebook\Notebook-Norman\Datset used\final_merged.csv"
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:110].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)

# Fill NaN values in target columns with 0
y = np.nan_to_num(y, nan=0)

# First split: Separate test set (30%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Compute class weights for each label
class_weights = []
for i in range(y_train.shape[1]):
    unique_classes = np.unique(y_train[:, i])
    
    # Skip labels with only one unique class (either all 0s or all 1s)
    if len(unique_classes) == 1:
        class_weights.append(None)  # No weighting for this label
    else:
        weights = compute_class_weight(
            class_weight='balanced',
            classes=unique_classes,
            y=y_train[:, i]
        )
        class_weights.append({unique_classes[j]: weights[j] for j in range(len(unique_classes))})

# Initialize MultiOutputClassifier with class-weighted RandomForest
estimators = []
for i in range(y_train.shape[1]):
    if class_weights[i] is None:
        # No class weighting for this label
        clf = RandomForestClassifier(random_state=42, n_jobs=-1)
    else:
        # Apply class weights
        clf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight=class_weights[i])
    
    estimators.append(clf)

# Train the model
multi_output_model = MultiOutputClassifier(estimator=RandomForestClassifier(random_state=42, n_jobs=-1))
multi_output_model.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(X, y, dataset_name):
    y_pred = multi_output_model.predict(X)
    
    # Compute evaluation metrics
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro', zero_division=1)
    recall = recall_score(y, y_pred, average='micro', zero_division=1)
    f1 = f1_score(y, y_pred, average='micro', zero_division=1)
    subset_accuracy = accuracy_score(y, y_pred)

    # Classification report
    report = classification_report(y, y_pred, zero_division=1)
    
    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print("\nClassification Report:\n")
    print(report)
    print("=" * 80)

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")


Training Hamming Loss: 0.0001336553959409208
Training Precision: 0.9993921184209
Training Recall: 0.9981415611349918
Training F1 Score: 0.9987664483218364
Training Subset Accuracy: 0.9968107360455413

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16068
           1       1.00      1.00      1.00       201
           2       1.00      0.99      1.00       193
           3       1.00      1.00      1.00      1081
           4       1.00      1.00      1.00       866
           5       1.00      0.99      1.00       186
           6       1.00      1.00      1.00      6670
           7       1.00      1.00      1.00     12940
           8       1.00      1.00      1.00      5940
           9       1.00      1.00      1.00      1233
          10       1.00      1.00      1.00      1882
          11       1.00      1.00      1.00       367
          12       1.00      1.00      1.00     11988
          13      