In [None]:
# ============================================
# STEP 1: Load Data
import pandas as pd
import numpy as np

# Read the preprocessed product table and echo its column names so the
# feature-building steps below can be checked against what is available.
df = pd.read_csv("Dataset/processed_file.csv")
print(df.columns.to_list())

# ============================================
# STEP 2: Add Informative Features

# --- Price Range Feature (Bucketed)
def map_price_range(price, budget_max=50, mid_max=150):
    """Bucket a single numeric price into a coarse price tier.

    Args:
        price: Price of one product (the script applies this to ``price_SAR``).
        budget_max: Exclusive upper bound of the 'budget' tier.
        mid_max: Exclusive upper bound of the 'mid' tier.

    Returns:
        'budget' when ``price < budget_max``, 'mid' when
        ``budget_max <= price < mid_max``, 'luxury' when ``price >= mid_max``,
        and 'unknown' when the price is missing (``None`` or NaN).
    """
    # NaN is the only value that differs from itself. Every ordering comparison
    # with NaN is False, so without this guard a missing price would fall
    # through both checks and be mislabelled as 'luxury'.
    if price is None or price != price:
        return 'unknown'
    if price < budget_max:
        return 'budget'
    if price < mid_max:
        return 'mid'
    return 'luxury'

# Bucket every product's price_SAR value into a price tier.
df['price_range'] = df['price_SAR'].apply(map_price_range)

# --- Product Texture Feature (One-hot encode)
# Example values: cream, gel, spray, serum
# The processed CSV is not guaranteed to contain a 'product_texture' column;
# indexing a missing column raises KeyError. When it is absent, fall back to a
# constant 'unknown' texture so the one-hot encoding step can still run.
if 'product_texture' in df.columns:
    # Treat missing textures as their own 'unknown' category.
    df['product_texture'] = df['product_texture'].fillna('unknown')
else:
    df['product_texture'] = 'unknown'

# ============================================
# STEP 3: Feature Encoding & Scaling

# Assemble the clustering feature matrix from four column groups, in order:
# brand indicators, the existing effect / skin-type columns, price-tier
# indicators and texture indicators.
brand_dummies = pd.get_dummies(df[['brand']], prefix='brand')

# Columns already present in df that are copied over as-is.
notable_effects_columns = [
    name for name in df.columns if name.startswith('notable_effects_')
]
skin_type_columns = ['Sensitive', 'Combination', 'Oily', 'Dry', 'Normal']
passthrough_columns = notable_effects_columns + skin_type_columns

price_dummies = pd.get_dummies(df['price_range'], prefix='price')
texture_dummies = pd.get_dummies(df['product_texture'], prefix='texture')

X = pd.concat(
    [brand_dummies, df[passthrough_columns], price_dummies, texture_dummies],
    axis=1,
)

# Rescale every feature column with a min-max scaler before clustering.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

# ============================================
# STEP 4: KMeans Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Partition the scaled feature matrix into 5 clusters.
# n_init is passed explicitly: scikit-learn changed its default from 10 to
# 'auto' (and emitted a FutureWarning during the transition), so relying on the
# default makes the clustering depend on the installed version. 10 restarts
# matches the historical default.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
# Keep each product's cluster label next to its original attributes.
df['cluster'] = clusters

# ============================================
# STEP 5: Evaluation
# Report two quality figures for the fitted clustering: the silhouette score
# of the assigned labels and the model's inertia (printed as within-cluster
# sum of squares).
score = silhouette_score(X_scaled, labels=clusters)
print(
    f"Silhouette Score after adding features: {score:.2f}",
    f"Within-Cluster Sum of Squares: {kmeans.inertia_:.2f}",
    sep="\n",
)

# ============================================
# STEP 6: PCA Visualization
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Reduce the scaled features to two principal components so the cluster
# assignment can be inspected in a 2-D scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
component_1, component_2 = X_pca[:, 0], X_pca[:, 1]

# Draw on an explicit figure/axes pair; points are coloured by cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=component_1,
    y=component_2,
    hue=clusters,
    palette='viridis',
    s=60,
    ax=ax,
)
ax.set_title("Clusters After Adding Informative Features")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.legend(title="Cluster")
ax.grid(True)
fig.tight_layout()
plt.show()


['Unnamed: 0', 'product_href', 'product_name', 'product_type', 'brand', 'notable_effects', 'skintype', 'price_SAR', 'description', 'picture_src', 'labels', 'Sensitive', 'Combination', 'Oily', 'Dry', 'Normal', 'product_type_face wash', 'product_type_moisturizer', 'product_type_serum', 'product_type_sunscreen', 'product_type_toner', 'notable_effects_acne-free', 'notable_effects_acne-spot', 'notable_effects_anti-aging', 'notable_effects_balancing', 'notable_effects_black-spot', 'notable_effects_brightening', 'notable_effects_hydrating', 'notable_effects_moisturizing', 'notable_effects_no-whitecast', 'notable_effects_oil-control', 'notable_effects_pore-care', 'notable_effects_refreshing', 'notable_effects_skin-barrier', 'notable_effects_soothing', 'notable_effects_uv-protection']


KeyError: 'price'