In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Read the preprocessed product table and discard every row that has no
# value in the 'price_SAR' column.
file_path = "Dataset/processed_file.csv"
df = pd.read_csv(file_path).dropna(subset=['price_SAR'])

# Categorize products based on notable effects and product type
# Ordered (keywords, label) rules. The first rule with a matching keyword
# wins, so a product that is both hydrating and anti-aging is labelled
# "Hydrating/Moisturizing".
_EFFECT_CATEGORY_RULES = (
    (('hydrating', 'moisturizing'), 'Hydrating/Moisturizing'),
    (('acne-free', 'acne-spot'), 'Acne Treatment'),
    (('anti-aging',), 'Anti-Aging'),
    (('brightening', 'black-spot'), 'Brightening'),
    (('oil-control',), 'Oil Control'),
    (('pore-care',), 'Pore Care'),
    (('skin-barrier', 'soothing'), 'Sensitive Skin Care'),
    (('uv-protection',), 'Sunscreen/UV Protection'),
)


def _is_missing(value):
    """Return True when *value* is None or a float NaN."""
    # NaN is the only float that is not equal to itself.
    return value is None or (isinstance(value, float) and value != value)


def categorize_product(row):
    """Build a category label from a row's notable effects and product type.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'notable_effects' and 'product_type'.

    Returns:
        '<effect group> <product_type>', where the effect group is the label
        of the first rule in ``_EFFECT_CATEGORY_RULES`` with a keyword found
        in ``notable_effects``, or 'General' when no rule matches or
        ``notable_effects`` is missing (None/NaN). When ``product_type`` is
        missing, only the effect group is returned.
    """
    notable_effects = row['notable_effects']
    product_type = row['product_type']

    label = 'General'
    # A missing cell does not support the `in` operator; treat it as
    # "no known effects" instead of raising TypeError.
    if not _is_missing(notable_effects):
        for keywords, candidate in _EFFECT_CATEGORY_RULES:
            if any(keyword in notable_effects for keyword in keywords):
                label = candidate
                break

    # A missing product type cannot be concatenated to the label.
    if _is_missing(product_type):
        return label
    return label + ' ' + product_type

# The label to predict: the rule-based category assigned to every row.
target = 'product_category'
df[target] = df.apply(categorize_product, axis=1)

# Model inputs, grouped by what the column names describe.
# NOTE(review): these look like one-hot encodings of the same product type and
# notable effects that the target is derived from - confirm this is intended,
# since it would make the label almost fully determined by the features.
skin_type_columns = ['Sensitive', 'Combination', 'Oily', 'Dry', 'Normal']
product_type_columns = [
    'product_type_' + name
    for name in ('face wash', 'moisturizer', 'serum', 'sunscreen', 'toner')
]
notable_effect_columns = [
    'notable_effects_' + name
    for name in (
        'acne-free', 'acne-spot', 'anti-aging', 'balancing',
        'black-spot', 'brightening', 'hydrating', 'moisturizing',
        'no-whitecast', 'oil-control', 'pore-care', 'refreshing',
        'skin-barrier', 'soothing', 'uv-protection',
    )
]
features = skin_type_columns + product_type_columns + notable_effect_columns

# Replace the category strings with integer class ids; label_encoder keeps
# the fitted mapping.
label_encoder = LabelEncoder()
df[target] = label_encoder.fit_transform(df[target])

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest with no depth limit on the training split.
# The fixed seed keeps the fitted model reproducible.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
).fit(X_train, y_train)

# Predict class ids for the held-out rows.
y_pred = rf_model.predict(X_test)

# Compute evaluation metrics, averaged across classes weighted by support.
# zero_division=0 is used for all three scores: a class that is never
# predicted then contributes 0 precision (not a free 1.0, which would inflate
# the weighted average), and precision, recall and F1 treat undefined cases
# the same way.
# Note: support-weighted recall is mathematically equal to accuracy.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Per-class (one-vs-rest) specificity from the confusion matrix, where rows
# are true classes and columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
tp = np.diag(conf_matrix)
fn = conf_matrix.sum(axis=1) - tp  # true class, predicted as something else
fp = conf_matrix.sum(axis=0) - tp  # predicted as class, actually something else
tn = conf_matrix.sum() - (tp + fn + fp)

# specificity = TN / (TN + FP); classes with no negatives get 0 instead of
# dividing by zero.
negatives = tn + fp
specificities = np.divide(
    tn,
    negatives,
    out=np.zeros(len(tn), dtype=float),
    where=negatives > 0,
).tolist()

# Unweighted (macro) mean of the per-class specificities.
specificity = np.mean(specificities)

# Collect the scores under their display names, then lay them out as a
# two-column table (metric name, value).
metric_values = {
    "Accuracy": accuracy,
    "Recall": recall,
    "Precision": precision,
    "F1 Score": f1,
    "Specificity": specificity,
}
metrics_df = pd.DataFrame({
    "Metric": list(metric_values.keys()),
    "Value": list(metric_values.values()),
})

# Render the table with the notebook's rich display.
from IPython.display import display
print("\n📊 Model Performance Metrics:\n")
display(metrics_df)



📊 Model Performance Metrics:



Unnamed: 0,Metric,Value
0,Accuracy,0.983402
1,Recall,0.983402
2,Precision,0.985028
3,F1 Score,0.980423
4,Specificity,0.999344
