In [1]:
# 📦 Importing the necessary libraries

# Data manipulation and analysis
import pandas as pd
import numpy as np

# For visualizing trees and data distributions
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning models and utilities
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [2]:
# 🧠 Step 1: Load product metadata

# In this first step, I load product_info.csv, which contains structured metadata about each product.
# It includes key features like brand_name, price_usd, primary_category, and loves_count,
# which will help me analyze what makes a product "popular."

# pandas is already imported as `pd` in the imports cell at the top of the notebook,
# so it is not imported again here.

# Location of the product info CSV (relative to the notebook's working directory)
PRODUCT_INFO_PATH = '../data/product_info.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(PRODUCT_INFO_PATH)

# Preview the first rows to confirm the file loaded properly and to check
# the structure of the DataFrame before doing any analysis
df.head()


Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,,,,...,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,85.0,30.0
2,P473662,Rainbow Bar Eau de Parfum,6342,19-69,3253,4.25,16.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
3,P473660,Kasbah Eau de Parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
4,P473658,Purple Haze Eau de Parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0


In [3]:
# 🧪 Step 2: Define popularity based on loves_count

# Here I build the binary target column `is_popular` from `loves_count`:
# products at or above the 70th percentile of loves_count (roughly the top 30%)
# are labeled 1 (popular); every other product is labeled 0.
# That turns the problem into a clear yes/no classification task for the decision tree.

# Keep only the modeling columns and drop every row with a missing value in any of them
df = df.loc[:, ['brand_name', 'price_usd', 'primary_category', 'loves_count']].dropna()

# Cut-off for "popular": the 70th percentile of loves_count
threshold = df['loves_count'].quantile(q=0.7)

# Attach the binary target as a new column: 1 = popular, 0 = not popular
df = df.assign(is_popular=df['loves_count'].ge(threshold).astype(int))

# Preview the labeled data
df.head()


Unnamed: 0,brand_name,price_usd,primary_category,loves_count,is_popular
0,19-69,35.0,Fragrance,6320,0
1,19-69,195.0,Fragrance,3827,0
2,19-69,195.0,Fragrance,3253,0
3,19-69,195.0,Fragrance,3018,0
4,19-69,195.0,Fragrance,2691,0


In [4]:
# Count how many products fall into each class of the target (0 = not popular, 1 = popular)
df.loc[:, 'is_popular'].value_counts()


is_popular
0    5946
1    2548
Name: count, dtype: int64

In [5]:
# 🧼 Step 3: Preprocess features for modeling

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Target vector first, then the raw (not yet encoded) feature columns
y = df['is_popular']
X_raw = df.loc[:, ['brand_name', 'primary_category', 'price_usd']]

# Column-wise preprocessing:
# - 'cat': one-hot encode the two categorical columns; categories not seen
#          during fit are ignored at transform time instead of raising
# - 'num': standardize the price column
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['brand_name', 'primary_category']),
        ('num', StandardScaler(), ['price_usd']),
    ]
)

# Fit the preprocessing on the raw features and transform them in one step
X_processed = preprocessor.fit_transform(X_raw)

# Report the size of the encoded + scaled feature matrix
print("Final feature matrix shape:", X_processed.shape)


Final feature matrix shape: (8494, 314)


In [6]:
# 🌳 Step 4: Train a decision tree classifier

# DecisionTreeClassifier, train_test_split, classification_report and accuracy_score
# are already imported in the imports cell at the top of the notebook.

# Split the *raw* features first, so the preprocessing is learned from training rows only.
# Splitting a matrix that was encoded/scaled on the full dataset would leak test-set
# information (price mean/std, the set of known brands) into training.
# Stratifying on y keeps the popular / not-popular ratio the same in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessor on the training split only, then reuse the fitted transform
# for the test split. This re-fits the `preprocessor` object from Step 3, so its
# learned categories and feature names now describe the training split, and
# X_train may have fewer columns than X_processed.
# Brands/categories that occur only in the test split get all-zero one-hot columns,
# because the encoder was created with handle_unknown='ignore'.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# I initialize the model — I use a small max_depth to avoid overfitting
tree = DecisionTreeClassifier(max_depth=5, random_state=42)

# Train the model
tree.fit(X_train, y_train)

# Predict on the test set
y_pred = tree.predict(X_test)

# Print evaluation results.
# The target is roughly 70% / 30% by construction, so accuracy alone can look
# reasonable even when few popular products are found; the per-class
# precision/recall in the report is the more informative view.
print(f"🧠 Test Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print("📋 Classification Report:")
print(classification_report(y_test, y_pred))


🧠 Test Accuracy: 0.7234

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.99      0.83      1189
           1       0.79      0.11      0.19       510

    accuracy                           0.72      1699
   macro avg       0.75      0.55      0.51      1699
weighted avg       0.74      0.72      0.64      1699

