In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter

In [2]:
# Load the processed user data; the cells below group it by its 'Cluster' column.
df = pd.read_csv('../data/processed/clustered_data.csv')

- To identify the commonalities and differences within each cluster, we can compute the mean and mode values of features for each cluster.

In [3]:
# Behavioural metrics, summarised per cluster by their mean.
numeric_features = [
    'Time Spent Online (hrs/weekday)',
    'Time Spent Online (hrs/weekend)',
    'Likes and Reactions',
    'Click-Through Rates (CTR)',
]

# Demographic attributes, summarised per cluster by their mode.
categorical_features = ['Age', 'Gender', 'Income Level']

# Every profiled column: demographics first, then the behavioural metrics.
features = categorical_features + numeric_features

In [4]:
# Group once and reuse the grouping for both summaries
clusters = df.groupby('Cluster')

# Most frequent value of every categorical feature, one Series per feature
mode_columns = {
    name: clusters[name].agg(lambda values: values.mode()[0])
    for name in categorical_features
}

# Mean of every numerical feature, with the categorical modes appended as
# extra columns (so the table holds modes as well as means)
cluster_means = clusters[numeric_features].mean().assign(**mode_columns)

cluster_means

Unnamed: 0_level_0,Time Spent Online (hrs/weekday),Time Spent Online (hrs/weekend),Likes and Reactions,Click-Through Rates (CTR),Age,Gender,Income Level
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3.911111,5.212963,2409.62037,0.149588,25-34,Female,80k-100k
1,1.559394,6.002424,5005.121212,0.179836,35-44,Male,80k-100k
2,3.019737,2.584211,6861.587719,0.170614,25-34,Male,20k-40k
3,3.080882,5.77451,7457.602941,0.067971,25-34,Female,100k+
4,1.809626,3.839572,3021.219251,0.056594,45-54,Female,0-20k


**These are the average behavior and characteristics of users within each cluster:**

- **Cluster 0 - Social Savants:** Users in this cluster, predominantly females aged 25-34 with an income of 80k-100k, spend approximately 3.91 hours on weekdays and 5.21 hours on weekends online, the highest weekday usage of any cluster. They have the lowest level of likes and reactions but a relatively high click-through rate (CTR).

- **Cluster 1 - Digital Mavericks:** Mostly males aged 35-44 with an income of 80k-100k, users here spend the least time online on weekdays but the most on weekends. They show moderate levels of likes and reactions and the highest CTR of all clusters.

- **Cluster 2 - Zen Navigators:** With a majority of males aged 25-34 and an income of 20k-40k, users in this cluster spend a moderate amount of time online on weekdays and the least on weekends. They have the second-highest levels of likes and reactions and a relatively high CTR.

- **Cluster 3 - Exploratory Enthusiasts:** Predominantly females aged 25-34 with a higher income of 100k+, users in this cluster spend a similar amount of time online on weekdays as Cluster 2 but much more on weekends. They have the highest levels of likes and reactions of all clusters. However, they have a low CTR.

- **Cluster 4 - Frugal Explorers:** Users here, mostly females aged 45-54 with an income of 0-20k, spend relatively little time online on both weekdays and weekends. They have lower levels of likes and reactions and the lowest CTR among all clusters.

##### Create a radar chart that reflects the segments and their distinctive characteristics.

In [5]:
# Preparing data for radar chart
features_to_plot = ['Time Spent Online (hrs/weekday)', 'Time Spent Online (hrs/weekend)', 'Likes and Reactions', 'Click-Through Rates (CTR)']
labels = np.array(features_to_plot)

# Creating a dataframe for the radar chart (one row per cluster)
radar_df = cluster_means[features_to_plot].reset_index()

# Min-max normalizing each feature across clusters so all axes share the [0, 1] range
radar_df_normalized = radar_df.copy()
for feature in features_to_plot:
    radar_df_normalized[feature] = (radar_df[feature] - radar_df[feature].min()) / (radar_df[feature].max() - radar_df[feature].min())

# No extra "closing" row is appended to the dataframe: each polygon is closed
# per trace below by repeating its first point. Concatenating a row Series onto
# the frame would add it as a new column rather than as a row.

# Assigning names to segments (matched to the clusters by row position)
segment_names = ['Social Savants', 'Digital Mavericks', 'Zen Navigators', 'Exploratory Enthusiasts', 'Frugal Explorers']

# Repeat the first label at the end so every polygon closes on its starting axis
closed_labels = labels.tolist() + [labels[0]]

# Create a radar chart
fig = go.Figure()

# Loop through each segment to add to the radar chart
for i, segment in enumerate(segment_names):
    # Plain list of this cluster's normalized values, with the first value
    # repeated at the end to close the polygon
    segment_values = radar_df_normalized[features_to_plot].iloc[i].tolist()
    closed_values = segment_values + segment_values[:1]

    fig.add_trace(go.Scatterpolar(
        r=closed_values,
        theta=closed_labels,
        fill='toself',
        name=segment,
        hoverinfo='text',
        # Hover text for each plotted point; iterate the values themselves, not the `.iloc` indexer
        text=[f"{label}: {value:.2f}" for label, value in zip(closed_labels, closed_values)]
    ))

# Update the layout to finalize the radar chart
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1]
        )
    ),
    showlegend=True,
    title='User Segments Profile'
)

fig.show()


In [6]:
# Preview the first 10 rows of the clustered data.
df.head(10)

Unnamed: 0,User ID,Age,Gender,Location,Language,Education Level,Likes and Reactions,Followed Accounts,Device Usage,Time Spent Online (hrs/weekday),Time Spent Online (hrs/weekend),Click-Through Rates (CTR),Conversion Rates,Ad Interaction Time (sec),Income Level,Top Interests,Cluster
0,1,25-34,Female,Suburban,Hindi,Technical,5640,190,Mobile Only,4.5,1.7,0.193,0.067,25,20k-40k,Digital Marketing,2
1,2,65+,Male,Urban,Hindi,PhD,9501,375,Tablet,0.5,7.7,0.114,0.044,68,0-20k,Data Science,1
2,3,45-54,Female,Suburban,Spanish,Technical,4775,187,Mobile Only,4.5,5.6,0.153,0.095,80,60k-80k,Fitness and Wellness,0
3,4,35-44,Female,Rural,Spanish,PhD,9182,152,Desktop Only,3.1,4.2,0.093,0.061,65,100k+,"Gaming, DIY Crafts",3
4,5,25-34,Female,Urban,English,Technical,6848,371,Mobile Only,2.0,3.8,0.175,0.022,99,20k-40k,"Fitness and Wellness, Investing and Finance, G...",2
5,6,25-34,Female,Suburban,Hindi,Master,6203,257,Mobile + Desktop,3.8,3.3,0.209,0.048,44,100k+,"Gourmet Cooking, Software Engineering, Eco-Fri...",2
6,7,18-24,Female,Suburban,Hindi,Bachelor,1573,136,Mobile + Desktop,2.8,7.9,0.172,0.068,8,100k+,"Gardening, Digital Marketing, Music Production",0
7,8,55-64,Male,Suburban,Hindi,PhD,3343,272,Desktop Only,2.7,1.9,0.128,0.032,35,40k-60k,"Music Production, Photography, Gaming, Travel ...",2
8,9,45-54,Female,Urban,Spanish,High School,2281,49,Mobile + Desktop,1.0,3.2,0.115,0.008,159,20k-40k,"Eco-Friendly Living, Gardening",4
9,10,45-54,Male,Rural,Hindi,Bachelor,9741,421,Mobile + Desktop,2.3,4.4,0.161,0.058,157,100k+,"Digital Marketing, Travel and Adventure",2


##### Analyze interests within clusters

In [7]:
def calculate_frequency_of_interests(data):
    """
    Calculate frequency of interests within each cluster.

    Each 'Top Interests' value is treated as a ', '-separated list of interest
    names. Rows whose 'Top Interests' value is missing are skipped, so a
    cluster with no usable values maps to an empty Counter.

    Parameters:
        data (DataFrame): DataFrame containing user data with 'Top Interests' and 'Cluster' columns.

    Returns:
        dict: A dictionary where keys are cluster labels and values are Counter objects containing interest frequencies.
    """
    interest_frequencies = {}
    for cluster_label, cluster_data in data.groupby('Cluster'):
        # Missing values split to NaN rather than to a list; drop them so they
        # cannot break the counting.
        interests_per_user = cluster_data['Top Interests'].str.split(', ').dropna()

        # Feed the Counter one user at a time: linear in the number of
        # interests, unlike summing the lists into one big list first.
        cluster_counts = Counter()
        for user_interests in interests_per_user:
            cluster_counts.update(user_interests)
        interest_frequencies[cluster_label] = cluster_counts
    return interest_frequencies

# Count interests per cluster for the full dataset and display the result.
interest_frequencies = calculate_frequency_of_interests(df)
interest_frequencies


{0: Counter({'Gaming': 47,
          'Reading and Literature': 42,
          'Music Production': 40,
          'Investing and Finance': 38,
          'Pet Care': 37,
          'Digital Marketing': 36,
          'DIY Crafts': 35,
          'Travel and Adventure': 34,
          'Gardening': 33,
          'Eco-Friendly Living': 33,
          'Data Science': 33,
          'Gourmet Cooking': 33,
          'Software Engineering': 29,
          'Fitness and Wellness': 28,
          'Fashion Modelling': 28,
          'Photography': 25}),
 1: Counter({'Fitness and Wellness': 36,
          'Digital Marketing': 34,
          'Software Engineering': 30,
          'Fashion Modelling': 30,
          'Gardening': 29,
          'Eco-Friendly Living': 28,
          'Investing and Finance': 28,
          'DIY Crafts': 27,
          'Gourmet Cooking': 27,
          'Pet Care': 26,
          'Photography': 25,
          'Reading and Literature': 24,
          'Music Production': 21,
          'Data Scienc

In [8]:
def plot_top_interests_by_cluster_plotly(data, n=10):
    """
    Plot the top N interests for each cluster using Plotly Express.

    One bar chart is shown per cluster. The per-cluster counting is delegated
    to calculate_frequency_of_interests so both cells share a single
    definition of how 'Top Interests' is parsed. Clusters with nothing to plot
    are skipped.

    Parameters:
        data (DataFrame): DataFrame containing user data with 'Top Interests' and 'Cluster' columns.
        n (int): Number of top interests to plot (default is 10).
    """
    for cluster_label, cluster_counts in calculate_frequency_of_interests(data).items():
        top_interests = cluster_counts.most_common(n)
        if not top_interests:
            # zip(*[]) yields nothing to unpack into (interests, frequencies)
            continue
        interests, frequencies = zip(*top_interests)
        fig = px.bar(x=interests, y=frequencies, labels={'x': 'Interest', 'y': 'Frequency'},
                     title=f'Top {n} Interests for Cluster {cluster_label}')
        fig.update_xaxes(tickangle=45)
        fig.show()

# Show the 15 most frequent interests per cluster (overrides the default n=10).
plot_top_interests_by_cluster_plotly(df, n=15)
