# Customer Segmentation 

- Reference: [Mastering Customer Segmentation with LLM](https://towardsdatascience.com/mastering-customer-segmentation-with-llm-3d9008235f41)
- Dataset: [Banking Dataset - Marketing Targets](https://www.kaggle.com/datasets/prakharrathi25/banking-dataset-marketing-targets)

## Introduction
- A customer segmentation project can be approached in multiple ways:
    - K-means
    - K-Prototypes
    - LLM + K-means

In [4]:
import pandas as pd # dataframe manipulation
import numpy as np # linear algebra

# data visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import shap

In [23]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [8]:
# Load the training split of the banking marketing dataset.
# The file uses ';' as its field separator rather than a comma.
df = pd.read_csv(
    '../../data/banking_marketing_targets/train.csv',
    delimiter=';',
)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [18]:
# Partition the columns by dtype: numeric columns on one side, everything else
# (treated as categorical by the preprocessing below) on the other.
# NOTE(review): the non-numeric group also picks up the target column `y`
# (it appears as `cat_pipeline__y` in the transformed frame) — confirm that it
# is meant to be used as a segmentation feature.
num_cols = list(df.select_dtypes(include=np.number).columns)
cat_cols = list(df.select_dtypes(exclude=np.number).columns)

In [24]:
# Numeric features: fill missing values with the column mean, then rescale
# with MinMaxScaler.
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then replace each category with a numeric code via OrdinalEncoder.
# NOTE(review): ordinal codes put an order on columns such as job or month
# that looks arbitrary — confirm this is acceptable for the distance-based
# clustering that follows.
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder()),
])

# Send each column group through its own pipeline; the transformer names
# become the prefixes of the output feature names.
col_transformer = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ],
    remainder='drop',  # discard any column not listed in either group
    n_jobs=-1,         # run the transformers in parallel on all processors
)

In [30]:
# Fit the preprocessing pipelines and transform the data in one pass
# (fit_transform) instead of a separate fit() followed by transform(), which
# processes df twice.
# The result is wrapped in a DataFrame whose columns carry the
# "<transformer>__<column>" names reported by the fitted transformer, and
# df's index is reused so every row stays aligned with the raw data.
data = pd.DataFrame(
    col_transformer.fit_transform(df),
    columns=col_transformer.get_feature_names_out(),
    index=df.index,
)
data.head()

Unnamed: 0,num_pipeline__age,num_pipeline__balance,num_pipeline__day,num_pipeline__duration,num_pipeline__campaign,num_pipeline__pdays,num_pipeline__previous,cat_pipeline__job,cat_pipeline__marital,cat_pipeline__education,cat_pipeline__default,cat_pipeline__housing,cat_pipeline__loan,cat_pipeline__contact,cat_pipeline__month,cat_pipeline__poutcome,cat_pipeline__y
0,0.519481,0.092259,0.133333,0.05307,0.0,0.0,0.0,4.0,1.0,2.0,0.0,1.0,0.0,2.0,8.0,3.0,0.0
1,0.337662,0.073067,0.133333,0.030704,0.0,0.0,0.0,9.0,2.0,1.0,0.0,1.0,0.0,2.0,8.0,3.0,0.0
2,0.194805,0.072822,0.133333,0.015453,0.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,1.0,2.0,8.0,3.0,0.0
3,0.376623,0.086476,0.133333,0.018707,0.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,0.0,2.0,8.0,3.0,0.0
4,0.194805,0.072812,0.133333,0.04026,0.0,0.0,0.0,11.0,2.0,3.0,0.0,0.0,0.0,2.0,8.0,3.0,0.0


## K-means