In [2]:
import os

# Thread-count environment variables are read by the native OpenMP/BLAS
# runtimes when they are first loaded, so this must be set BEFORE importing
# pandas (which pulls in numpy) or scikit-learn for it to reliably take effect.
os.environ['OMP_NUM_THREADS'] = '1'

import pandas as pd

# Path to the customer dataset, relative to the notebook's directory.
DADOS = '../dados/Mall_Customers_without_CustomerID.csv'

# Load the raw data and preview the first rows.
df = pd.read_csv(DADOS)
df.head()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [7]:
# importar as bibliotecas de preparação dos dados
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Build the data-preparation step: standardize the numeric columns and
# one-hot encode the categorical one.
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
categorical_columns = ['Gender']

preprocessing = ColumnTransformer(
    transformers=[
        ('standard', StandardScaler(), numeric_columns),
        ('onehot', OneHotEncoder(), categorical_columns),
    ]
)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Single seed shared by every stochastic step so the fit is reproducible.
RANDOM_STATE = 42

# Individual estimators, built up front so the pipeline definition stays short.
pca_step = PCA(n_components=3, random_state=RANDOM_STATE)
kmeans_step = KMeans(n_clusters=5, n_init=10, random_state=RANDOM_STATE)

# Full workflow: preprocessing -> PCA -> KMeans clustering.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('pca', pca_step),
        ('clustering', kmeans_step),
    ]
)

# Fit every step on the loaded customer data.
pipeline.fit(df)

In [9]:
import joblib

# Destination of the serialized, fitted pipeline (relative to the notebook).
MODEL_PATH = '../modelos/pipeline_preprocessing_pca_clustering.pkl'

# Create the target directory if it is missing so a fresh checkout does not
# fail with FileNotFoundError; exist_ok makes re-running the cell harmless.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Persist the fitted pipeline; joblib.dump returns the list of written files.
joblib.dump(pipeline, MODEL_PATH)

['../modelos/pipeline_preprocessing_pca_clustering.pkl']

In [10]:
# Inspect the names of the features produced by the fitted pipeline. As the
# output below shows, there is one name per KMeans cluster (kmeans0..kmeans4).
pipeline.get_feature_names_out()

array(['kmeans0', 'kmeans1', 'kmeans2', 'kmeans3', 'kmeans4'],
      dtype=object)

In [12]:
# Cluster label assigned to each row of the data the pipeline was fitted on.
pipeline.named_steps['clustering'].labels_

array([1, 3, 1, 3, 1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 1, 3, 1, 3, 0, 3, 1, 3,
       0, 3, 0, 3, 0, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 1, 3, 0, 3, 0, 1,
       0, 3, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 4, 1, 4, 2, 4, 2, 4, 2, 4,
       1, 4, 2, 4, 2, 4, 2, 4, 2, 4, 1, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4,
       2, 4, 2, 4, 2, 4, 0, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4,
       2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4,
       2, 4], dtype=int32)

In [None]:
# New frame with the cluster label appended as a 'cluster' column; the
# original `df` is left untouched.
df_clustered = df.assign(cluster=pipeline.named_steps['clustering'].labels_)

df_clustered.head()
