# Perfume Segmentation K-Means

## Setup Database

In [162]:
# Import libraries
import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load variables from a .env file (if one is found) into the process
# environment, so the os.getenv() lookups in the next cell can see them.
load_dotenv()

True

In [163]:
# Collect the database connection settings from the environment.
# A variable that is not set yields None for its entry.
database_config = dict(
    hostname=os.environ.get('DB_HOSTNAME'),
    port=os.environ.get('DB_PORT'),
    username=os.environ.get('DB_USERNAME'),
    password=os.environ.get('DB_PASSWORD'),
    database=os.environ.get('DB_DATABASE'),
)

In [164]:
# Create a connection to the database.
# Credentials are percent-encoded so that characters such as '@', ':' or '/'
# in the username/password cannot corrupt the URL; unset values become ''.
db_user = quote_plus(database_config['username'] or '')
db_password = quote_plus(database_config['password'] or '')
db_location = f"{database_config['hostname']}:{database_config['port']}/{database_config['database']}"
connection_str = f"mysql+pymysql://{db_user}:{db_password}@{db_location}"

# Debug output: never echo the real password into the saved notebook output.
masked_password = '***' if db_password else ''
print("Connection String:", f"mysql+pymysql://{db_user}:{masked_password}@{db_location}")
engine = create_engine(connection_str)

Connection String: mysql+pymysql://root:@10.2.18.133:3306/perfume_segmentation


In [165]:
# Load every sales record together with the names of the perfume and the
# profession it references (LEFT JOINs keep sales whose lookup row is missing).
# ORDER BY makes the row order deterministic: without it the database may
# return rows in any order, and the preview below as well as any row-order
# dependent step later in the notebook would not be reproducible.
query = """
SELECT
    s.id,
    s.age,
    s.gender,
    s.profession_id,
    s.perfume_id,
    p.name AS perfume_name,
    pr.name AS profession_name
FROM sales AS s
LEFT JOIN perfumes AS p ON s.perfume_id = p.id
LEFT JOIN professions AS pr ON s.profession_id = pr.id
ORDER BY s.id
"""
df_sales = pd.read_sql(query, engine)

df_sales.head()


Unnamed: 0,id,age,gender,profession_id,perfume_id,perfume_name,profession_name
0,1,54,0,0,0,Bacarat,Ibu Rumah Tangga
1,2,49,0,0,1,D&G,Ibu Rumah Tangga
2,3,26,0,1,2,Paris Hilton,Pegawai Negeri
3,4,40,0,2,1,D&G,Karyawan
4,5,52,0,2,0,Bacarat,Karyawan


## Preprocessing and K-Means Clustering

In [167]:
from sklearn.cluster import KMeans

In [168]:
# Features used for clustering (single source of truth for X and the centroids).
# NOTE(review): the features are unscaled and the *_id columns are category
# codes, so Euclidean distance is presumably dominated by 'age' -- consider
# scaling / one-hot encoding and confirm this is intended.
feature_columns = ['age', 'gender', 'profession_id', 'perfume_id']
X = df_sales[feature_columns].copy()

# Specify the sales IDs to use as initial centroids
initial_centroid_ids = [5, 10, 15, 20, 25]

# Look the seed rows up by sales id. Unlike an isin() filter, .loc with a list
# returns the rows in the order of `initial_centroid_ids` (independent of the
# frame's row order) and raises KeyError if any id is missing, instead of
# silently producing fewer centroids.
initial_centroids = (
    df_sales.set_index('id')
    .loc[initial_centroid_ids, feature_columns]
    .to_numpy()
)

# One cluster per seed row, so the two can never drift apart.
k = len(initial_centroid_ids)
if initial_centroids.shape[0] != k:
    # More rows than ids means some sales id is not unique in df_sales.
    raise ValueError(
        f"Expected {k} initial centroids, got {initial_centroids.shape[0]}; "
        "check for duplicate sales ids"
    )

# n_init=1 because the starting centroids are given explicitly.
kmeans = KMeans(n_clusters=k, init=initial_centroids, n_init=1)

# Train the KMeans model on the selected features
kmeans.fit(X)

# Retrieve the cluster labels and add them to the DataFrame
labels = kmeans.labels_
df_sales['cluster'] = labels

# Display the first few cluster assignments
df_sales['cluster'].head()


0    0
1    0
2    4
3    3
4    0
Name: cluster, dtype: int32

In [171]:
# Centroid coordinates of the fitted model
centroids = kmeans.cluster_centers_

# Profile each cluster: mean of every feature over the rows assigned to it
cluster_feature_order = ['age', 'perfume_id', 'gender', 'profession_id']
df_sales.groupby('cluster')[cluster_feature_order].mean()

Unnamed: 0_level_0,age,perfume_id,gender,profession_id
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,50.269231,1.423077,0.230769,1.538462
1,57.47619,1.571429,0.380952,2.285714
2,18.882979,1.680851,0.5,3.265957
3,40.875,1.5625,0.5625,1.59375
4,29.409091,1.545455,0.431818,2.386364
