<font size="+3"><strong>Customer Segmentation</strong></font>

An automobile company has plans to enter new markets with their existing products (P1, P2, P3, P4 and P5). After intensive market research, they’ve deduced that the behavior of new market is similar to their existing market.

In their existing market, the sales team has classified all customers into 4 segments (A, B, C, D ). Then, they performed segmented outreach and communication for different segment of customers. This strategy has work exceptionally well for them. They plan to use the same strategy on new markets and have identified 2627 new potential customers.

You are required to help the manager to predict the right group of the new customers.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

/kaggle/input/customer/sample_submission.csv
/kaggle/input/customer/Train.csv
/kaggle/input/customer/Test.csv


# Prepare Data
As always, we'll start by bringing our data into the project using a wrangle function.

## Import

In [2]:
def wrangle(filepath , cols=None):

    """Read SCF data file into ``DataFrame``.
    
    Returns only df.

    Parameters
    ----------
    filepath : str
        Location of CSV file.
    columns :  list
         Delete the columns
    """
    # read data from path
    df = pd.read_csv(filepath)
    
    # drops nulls values
    df.dropna(axis=0 , inplace=True)
    
    #drops columns 
    if cols is not  None :
         df.drop(columns=cols , axis=1 , inplace=True )
        
    # drop duplicated data 
    df = df.drop_duplicates()
    
    
    return df

In [3]:
df_train = wrangle('/kaggle/input/customer/Train.csv' , cols=['ID' ,'Segmentation' ])
df_test = wrangle('/kaggle/input/customer/Test.csv' ,cols='ID' )
df = pd.concat([df_train , df_test] )
df

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4
2,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6
3,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6
5,Male,Yes,56,No,Artist,0.0,Average,2.0,Cat_6
6,Male,No,32,Yes,Healthcare,1.0,Low,3.0,Cat_6
...,...,...,...,...,...,...,...,...,...
2621,Female,No,35,Yes,Entertainment,1.0,Low,2.0,Cat_6
2622,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6
2623,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6
2625,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4


In [4]:
df.drop_duplicates(inplace=True)

# Data Info

In [5]:
def data_info(df):

    """Read SCF data file into ``DataFrame``.
    
    Returns only df.

    Parameters
    ----------
    df : DataFrame
        Location of CSV file.

    """
    # print info a bout data
    print(df.info())
    
    # print the describe of data 
    print("Data describe: ")
    print(df.describe())
    
    # unique values 
    print(df.nunique())
    
    

In [6]:
data_info(df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7633 entries, 0 to 2626
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           7633 non-null   object 
 1   Ever_Married     7633 non-null   object 
 2   Age              7633 non-null   int64  
 3   Graduated        7633 non-null   object 
 4   Profession       7633 non-null   object 
 5   Work_Experience  7633 non-null   float64
 6   Spending_Score   7633 non-null   object 
 7   Family_Size      7633 non-null   float64
 8   Var_1            7633 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 596.3+ KB
None
Data describe: 
               Age  Work_Experience  Family_Size
count  7633.000000      7633.000000  7633.000000
mean     43.244334         2.794576     2.881174
std      16.099644         3.484314     1.576542
min      18.000000         0.000000     1.000000
25%      31.000000         0.000000     2.000000
50%      

In [7]:
import pandas as pd

def encode_columns(df, trimmed=True, return_feat_names=True):
    """Returns the five highest-variance features of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing the features.

    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    return_feat_names : bool, default=False
        If ``True``, returns feature names as a ``list``. If ``False``,
        returns ``Series``, where index is feature names and values are
        variances.
    """
    encoded_df = df.copy()
    
    # Define the encoding maps
    map_Var_1 = {}
    for i, item in enumerate(set(encoded_df['Var_1'])):
        map_Var_1[item] = i
    map_Spending_Score = {"Low": 0, "Average": 1, "High": 2}
    map_Profession = {}
    for i, item in enumerate(set(encoded_df['Profession'])):
        map_Profession[item] = i
    map_gender = {"Male": 1, "Female": 0}
    map_YES_NO = {"Yes": 1, "No": 0} 
    
    # Apply the encoding to the respective columns
    encoded_df['Var_1'] = encoded_df['Var_1'].map(map_Var_1)
    encoded_df['Spending_Score'] = encoded_df['Spending_Score'].map(map_Spending_Score)
    encoded_df['Profession'] = encoded_df['Profession'].map(map_Profession)
    encoded_df['Gender'] = encoded_df['Gender'].map(map_gender)
    encoded_df['Ever_Married'] = encoded_df['Ever_Married'].map(map_YES_NO)
    encoded_df['Graduated'] = encoded_df['Graduated'].map(map_YES_NO)
    return encoded_df

In [8]:
df = encode_columns(df)
df

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,1,0,22,0,4,1.0,0,4.0,6
2,0,1,67,1,1,1.0,0,1.0,2
3,1,1,67,1,5,0.0,2,2.0,2
5,1,1,56,0,0,0.0,1,2.0,2
6,1,0,32,1,4,1.0,0,3.0,2
...,...,...,...,...,...,...,...,...,...
2618,0,1,42,1,0,0.0,0,2.0,2
2621,0,0,35,1,2,1.0,0,2.0,2
2622,1,0,29,0,4,9.0,0,4.0,2
2623,0,0,35,1,6,1.0,0,1.0,2


## Variance Bar Chart

Create a get_high_var_features function that returns the five highest-variance features in a DataFrame. Use the docstring for guidance.

In [9]:
def get_high_var_features( trimmed=True, return_feat_names=True):
    """
    Returns the five highest-variance features of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the features.

    trimmed : bool, default=True
        If `True`, calculates trimmed variance, removing bottom and top 10%
        of observations.

    return_feat_names : bool, default=True
        If `True`, returns feature names as a list. If `False`,
        returns Series, where index is feature names and values are
        variances.
    """
    numeric_columns = df.select_dtypes(include=[np.number])
    if trimmed:
        top_five_features = numeric_columns.apply(trimmed_var).sort_values()
    else:
        top_five_features = numeric_columns.var().sort_values()

    if return_feat_names:
        return top_five_features.index.tolist()
    else:
        return top_five_features


In [10]:
high_var_features = get_high_var_features(return_feat_names=False)
print(high_var_features)

Graduated            0.225109
Ever_Married         0.235926
Gender               0.246515
Spending_Score       0.353835
Family_Size          1.174333
Var_1                1.745979
Profession           4.885401
Work_Experience      6.770380
Age                120.832966
dtype: float64


Create a serve_bar_chart function that returns a plotly express bar chart of the five highest-variance features. You should use get_high_var_features as a helper function. Follow the docstring for guidance.

In [11]:
import plotly.express as px

def serve_bar_chart(trimmed=True):
    """Returns a horizontal bar chart of five highest-variance features.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.
    """
    top_five_features = get_high_var_features(trimmed=trimmed, return_feat_names=False)

    fig = px.bar(x=top_five_features, y=top_five_features.index, orientation="h")
    fig.update_layout(xaxis_title="Variance", yaxis_title="Features")
    return fig


In [12]:
serve_bar_chart(trimmed=True)

## K-means Slider and Metrics

Okay, so now our app has a radio button, but that's only one thing for a viewer to interact with. Buttons are fun, but what if we made a slider to help people see what it means for the number of clusters to change. Let's do it!

Again, start by adding some objects to the layout.

In [13]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import ipywidgets as widgets
from ipywidgets import interact
import pandas as pd
import plotly.express as px

# Generate sample data
np.random.seed(0)
n_samples = 200
n_features = 4
df = df.copy()

@interact(num_clusters=widgets.IntSlider(min=2, max=10, step=1, value=3))
def kmeans_clustering(num_clusters):
    # Perform k-means clustering with the selected number of clusters
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(data)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_
    
    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=2)
    data_pca = pca.fit_transform(data)
    
    # Plot the clusters based on the reduced dimensions
    features = get_high_var_features(trimmed=True, return_feat_names=True)
    X = df[features]
    
    transformer = PCA(n_components=2, random_state=42)
    X_t = transformer.fit_transform(X)
    
    X_pca = pd.DataFrame(X_t, columns=["PCA1", "PCA2"])
    
    model = get_model_metrics(trimmed=True, k=num_clusters, return_metrics=False)
    X_pca['labels'] = model.named_steps["kmeans"].labels_.astype(str)
    X_pca.sort_values("labels", inplace=True)
    
    fig = px.scatter(
        data_frame=X_pca,
        x="PCA1",
        y="PCA2",
        color="labels",
        title="PCA Representation of Clusters"
    )
    
    fig.update_layout(xaxis_title="PCA1", yaxis_title="PCA2")
    fig.show()


interactive(children=(IntSlider(value=3, description='num_clusters', max=10, min=2), Output()), _dom_classes=(…

In [14]:
def get_model_metrics(trimmed=True , k=2 , return_metrics=False):

    """Build ``KMeans`` model based on five highest-variance features in ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False`` returns ``KMeans`` model. If ``True`` returns ``dict``
        with inertia and silhouette score.

    """
    features = get_high_var_features(trimmed=trimmed , return_feat_names=True )
    X = df[features]
    model = make_pipeline( StandardScaler() ,  KMeans(n_clusters=k , random_state=42 ))
    model.fit(X)
    if return_metrics:
        i = model.named_steps["kmeans"].inertia_
        ss = silhouette_score(X , model.named_steps["kmeans"].labels_)
        metrics = {
            "inertia" : round(i) , 
            "silhouette" : round(ss , 3)
        }
        return metrics
    return model

In [15]:
model = get_model_metrics(trimmed=True , k=2, return_metrics=True)
model

{'inertia': 55425, 'silhouette': 0.22}

## PCA Scatter Plot

We just made a slider that can change the inertia and silhouette scores, but not everyone will be able to understand what those changing numbers mean. Let's make a scatter plot to help them along.

In [16]:
def get_pca_labels(trimmed=True , k=2):

    """
    ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    X = df[features]
    
    transformer = PCA(n_components=2 , random_state=42) 
    
    X_t = transformer.fit_transform(X)
    
    X_pca = pd.DataFrame(X_t , columns=["PCA1" , "PCA2"])
    
    model = get_model_metrics(trimmed=trimmed , k=k , return_metrics=False)
    X_pca['labels'] = model.named_steps["kmeans"].labels_.astype(str)
    X_pca.sort_values("labels" , inplace=True )
    return X_pca

In [17]:
get_pca_labels().head()

Unnamed: 0,PCA1,PCA2,labels
2690,25.836006,-0.59499,0
3457,-2.415063,5.507039,0
6163,-2.389481,4.520283,0
6165,7.852507,-2.356072,0
3453,-0.086337,-2.312588,0


In [18]:
def serve_scatter_plot(trimmed=True , k=2):

    """Build 2D scatter plot of ``df`` with ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    fig = px.scatter(
          data_frame=get_pca_labels(trimmed=trimmed , k=k ),
        x="PCA1" , 
        y="PCA2" , 
        color="labels" ,
        title="PCA Representation of Clusters"
    
    
    )
    fig.update_layout(xaxis_title="PCA1" , yaxis_title="PCA2" )
    return fig

In [19]:
serve_scatter_plot(trimmed=True , k=2)

In [20]:
features = get_high_var_features(trimmed=True , return_feat_names=True )
X = df[features]
model = make_pipeline( StandardScaler() ,  KMeans(n_clusters=3, random_state=42 ))
model.fit(X)
    
labels = model.named_steps['kmeans'].labels_
print(labels[:10])

[1 0 2 2 0 0 2 2 0 1]
