In [2]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = r'C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\combined_data.csv'  # Update with the correct file path
data = pd.read_csv(file_path)

# Step 1: Remove rows with NaN values (if any)
data_clean = data.dropna()

# Extract the 'ItemID' column and features for PCA (excluding 'ItemID')
itemid = data_clean['ItemID']  # Assuming 'ItemID' is the correct column name
features = data_clean.drop(columns=['ItemID'])  # Drop the 'ItemID' column

# Step 2: Normalize the features (Standardization)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Step 3: Perform PCA (retain 95% of variance)
pca = PCA(n_components=30)  # Retain 95% of the variance
features_pca = pca.fit_transform(features_scaled)

# Convert the PCA results to a DataFrame with the original 'ItemID'
pca_df = pd.DataFrame(features_pca)
pca_df['ItemID'] = itemid  # Add 'ItemID' back to the DataFrame

# Save the PCA results to a CSV file in the same directory as the input file
output_file = r'C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\updated_final_result.csv'
pca_df.to_csv(output_file, index=False)

print(f"PCA results saved to {output_file}")


PCA results saved to C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\updated_final_result.csv


In [4]:
import pandas as pd

# Paths to the input files
encoded_csv_path = r'C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\updated_encoded.csv'
combined_pca_path = r'C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\updated_PCA.csv'

# Path to save the final combined file
output_path = r'C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\updated_final_combined_data.csv'

# Load the encoded CSV and combined PCA CSV into DataFrames
encoded_df = pd.read_csv(encoded_csv_path)
combined_pca_df = pd.read_csv(combined_pca_path)

# Ensure the 'Item ID' column exists in both DataFrames
if 'ItemID' in encoded_df.columns and 'ItemID' in combined_pca_df.columns:
    # Merge the DataFrames on 'Item ID'
    merged_df = pd.merge(encoded_df, combined_pca_df, on='ItemID', how='inner')  # Use 'inner' for items present in both files
    
    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_path, index=False)
    print(f"Data successfully merged and saved to: {output_path}")
else:
    print("Error: 'Item ID' column not found in one or both files.")


Data successfully merged and saved to: C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\updated_final_combined_data.csv


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np
import joblib
import os

# Load the data
file_path = r"C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\updated_final_combined_data.csv"
df = pd.read_csv(file_path)

# Split data into features (X) and target (y)
X = df.iloc[:, 92:122].values  # Features
y = df.iloc[:, 1:92].values  # Targets (binary multi-label data)

# First split: Separate test set (30%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: Separate train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Compute class weights for each label
class_weights = []
for i in range(y.shape[1]):  # Loop through each label column
    class_weight = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train[:, i]),
        y=y_train[:, i]
    )
    class_weights.append(dict(enumerate(class_weight)))

# Initialize MultiOutputClassifier with RandomForest
multi_output_model = MultiOutputClassifier(estimator=RandomForestClassifier(random_state=42), n_jobs=-1)

# Train the model
multi_output_model.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(X, y, dataset_name):
    y_pred = multi_output_model.predict(X)
    hamming = hamming_loss(y, y_pred)
    precision = precision_score(y, y_pred, average='micro')
    recall = recall_score(y, y_pred, average='micro')
    f1 = f1_score(y, y_pred, average='micro')
    subset_accuracy = accuracy_score(y, y_pred)
    print(f"{dataset_name} Hamming Loss: {hamming}")
    print(f"{dataset_name} Precision: {precision}")
    print(f"{dataset_name} Recall: {recall}")
    print(f"{dataset_name} F1 Score: {f1}")
    print(f"{dataset_name} Subset Accuracy: {subset_accuracy}")
    print()

# Evaluate on training, validation, and test sets
evaluate_model(X_train, y_train, "Training")
evaluate_model(X_val, y_val, "Validation")
evaluate_model(X_test, y_test, "Testing")

# Save the model
save_path = r"C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\rf_model.joblib"
joblib.dump(multi_output_model, save_path)

print(f"Model saved at: {save_path}")


Training Hamming Loss: 1.8406625362540218e-05
Training Precision: 0.9998766902738871
Training Recall: 0.9997882982435733
Training F1 Score: 0.9998324923051153
Training Subset Accuracy: 0.9994649296266139

Validation Hamming Loss: 0.016398706387190876
Validation Precision: 0.93006314852739
Validation Recall: 0.7585860348256971
Validation F1 Score: 0.8356181058021134
Validation Subset Accuracy: 0.45972479091786767

Testing Hamming Loss: 0.01640626980564557
Testing Precision: 0.9296146928081975
Testing Recall: 0.7588628036368571
Testing F1 Score: 0.8356048731140442
Testing Subset Accuracy: 0.4586511059845298

Model saved at: C:\Users\Muralish\Desktop\Sapphires_Cleaned\combine\rf_model.joblib
