In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

In [2]:
# Load the raw user-profile export; the path is relative to the notebook's folder.
raw_csv_path = '../data/raw/user_profiles_for_ads.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Work on an independent (deep) copy so that columns added later,
# such as cluster labels, never touch the frame as it was loaded.
data = df.copy(deep=True)

## User Profiling and Segmentation for Targeted Ad Campaigns

- By leveraging the insights gained from user demographics, online behavior, interaction patterns, and potential interests, we can create user profiles and segment our user base into distinct groups. This approach allows for more targeted ad campaigns, ultimately leading to increased user engagement and conversion rates.

### Segmentation Criteria
User segmentation can be based on various criteria, including:

- **Demographics:** Age, gender, income level, education level.
- **Behavioral:** Time spent online, likes and reactions, click-through rates (CTR), conversion rates.
- **Interests:** Analyzing user behavior to identify potential interests and preferences.

In [4]:
# Columns that feed the clustering model (demographics + online behaviour).
features = [
    'Age',
    'Gender',
    'Income Level',
    'Time Spent Online (hrs/weekday)',
    'Time Spent Online (hrs/weekend)',
    'Likes and Reactions',
    'Click-Through Rates (CTR)',
]

# Feature matrix: only the selected columns of the working frame.
X = data[features]

# Categorical group -> one-hot encoding.
# 'Age' is stored as a band label (e.g. '25-34') in this dataset, so it is
# encoded as a category rather than scaled as a number.
categorical_features = ['Age', 'Gender', 'Income Level']
categorical_transformer = OneHotEncoder()

# Numeric group -> standardised to zero mean / unit variance so that
# large-magnitude columns do not dominate the K-Means distance.
numeric_features = [
    'Time Spent Online (hrs/weekday)',
    'Time Spent Online (hrs/weekend)',
    'Likes and Reactions',
    'Click-Through Rates (CTR)',
]
numeric_transformer = StandardScaler()

# Route each column group to its transformer ('num' first, then 'cat').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Chain preprocessing with K-Means: 5 clusters, 10 initialisations,
# and a fixed seed so the cluster assignment is repeatable.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('cluster', KMeans(n_clusters=5, random_state=42, n_init=10)),
    ]
)

# Fit the preprocessing and the clustering in a single pass over X.
pipeline.fit(X)


In [5]:
# Grab the fitted K-Means step and read the label it assigned to each row of X.
kmeans_step = pipeline.named_steps['cluster']
cluster_labels = kmeans_step.labels_

# X was built from `data` row-for-row, so the labels line up with `data`'s rows.
data['Cluster'] = cluster_labels

In [7]:
# Preview the first ten users together with their assigned cluster.
data.head(n=10)

Unnamed: 0,User ID,Age,Gender,Location,Language,Education Level,Likes and Reactions,Followed Accounts,Device Usage,Time Spent Online (hrs/weekday),Time Spent Online (hrs/weekend),Click-Through Rates (CTR),Conversion Rates,Ad Interaction Time (sec),Income Level,Top Interests,Cluster
0,1,25-34,Female,Suburban,Hindi,Technical,5640,190,Mobile Only,4.5,1.7,0.193,0.067,25,20k-40k,Digital Marketing,2
1,2,65+,Male,Urban,Hindi,PhD,9501,375,Tablet,0.5,7.7,0.114,0.044,68,0-20k,Data Science,1
2,3,45-54,Female,Suburban,Spanish,Technical,4775,187,Mobile Only,4.5,5.6,0.153,0.095,80,60k-80k,Fitness and Wellness,0
3,4,35-44,Female,Rural,Spanish,PhD,9182,152,Desktop Only,3.1,4.2,0.093,0.061,65,100k+,"Gaming, DIY Crafts",3
4,5,25-34,Female,Urban,English,Technical,6848,371,Mobile Only,2.0,3.8,0.175,0.022,99,20k-40k,"Fitness and Wellness, Investing and Finance, G...",2
5,6,25-34,Female,Suburban,Hindi,Master,6203,257,Mobile + Desktop,3.8,3.3,0.209,0.048,44,100k+,"Gourmet Cooking, Software Engineering, Eco-Fri...",2
6,7,18-24,Female,Suburban,Hindi,Bachelor,1573,136,Mobile + Desktop,2.8,7.9,0.172,0.068,8,100k+,"Gardening, Digital Marketing, Music Production",0
7,8,55-64,Male,Suburban,Hindi,PhD,3343,272,Desktop Only,2.7,1.9,0.128,0.032,35,40k-60k,"Music Production, Photography, Gaming, Travel ...",2
8,9,45-54,Female,Urban,Spanish,High School,2281,49,Mobile + Desktop,1.0,3.2,0.115,0.008,159,20k-40k,"Eco-Friendly Living, Gardening",4
9,10,45-54,Male,Rural,Hindi,Bachelor,9741,421,Mobile + Desktop,2.3,4.4,0.161,0.058,157,100k+,"Digital Marketing, Travel and Adventure",2


- The K-Means clustering algorithm segmented our user base into five clusters (labeled 0-4). Each cluster exhibits unique characteristics based on the chosen features: age, gender, income level, online behavior (time spent online on weekdays and weekends, likes and reactions), and click-through rate (CTR). Analyzing these characteristics will provide valuable insights into the preferences and behaviors of each user segment. These insights will serve as the foundation for developing targeted advertising campaigns tailored to resonate effectively with each segment.

In [15]:
# Inspect a random sample of the users assigned to cluster 0.
cluster_0_users = data.loc[data['Cluster'] == 0]

# - random_state pins the draw so the displayed rows are the same on every
#   Restart & Run All (42 matches the seed used for K-Means).
# - min(...) caps the sample size so a cluster with fewer than 20 users is
#   shown in full instead of raising ValueError.
cluster_0_users.sample(n=min(20, len(cluster_0_users)), random_state=42)

Unnamed: 0,User ID,Age,Gender,Location,Language,Education Level,Likes and Reactions,Followed Accounts,Device Usage,Time Spent Online (hrs/weekday),Time Spent Online (hrs/weekend),Click-Through Rates (CTR),Conversion Rates,Ad Interaction Time (sec),Income Level,Top Interests,Cluster
972,973,45-54,Male,Rural,Mandarin,Master,645,445,Tablet,4.5,3.1,0.199,0.003,141,40k-60k,"Fashion Modelling, Reading and Literature, Gar...",0
832,833,18-24,Male,Urban,English,PhD,634,457,Tablet,4.3,6.7,0.073,0.025,131,100k+,"Data Science, Eco-Friendly Living, Pet Care",0
134,135,55-64,Male,Urban,Mandarin,High School,1412,85,Mobile + Desktop,3.2,3.8,0.141,0.067,18,60k-80k,"Fashion Modelling, Photography",0
429,430,18-24,Female,Suburban,Mandarin,Bachelor,2007,297,Desktop Only,2.8,7.4,0.175,0.056,155,100k+,Investing and Finance,0
513,514,55-64,Female,Urban,Spanish,Master,2218,144,Desktop Only,3.6,5.3,0.143,0.03,57,0-20k,DIY Crafts,0
294,295,18-24,Female,Urban,Spanish,PhD,4186,150,Desktop Only,4.8,5.9,0.244,0.058,140,80k-100k,"Gaming, Pet Care",0
303,304,45-54,Female,Rural,Hindi,Technical,3184,117,Mobile Only,2.9,4.8,0.21,0.099,9,80k-100k,"Reading and Literature, Gaming, Photography, T...",0
268,269,25-34,Female,Urban,English,Master,3776,202,Tablet,3.5,6.3,0.208,0.093,92,60k-80k,"Fashion Modelling, Investing and Finance, Eco-...",0
69,70,65+,Female,Rural,Mandarin,Master,1331,115,Mobile Only,2.8,5.9,0.077,0.088,151,60k-80k,Investing and Finance,0
156,157,45-54,Female,Rural,Hindi,High School,4280,303,Mobile Only,4.9,4.9,0.198,0.068,146,100k+,"Pet Care, Gardening, Gaming, DIY Crafts",0


In [13]:
# Optional export (currently disabled): uncomment the line below to write
# `data`, including its `Cluster` column, to the processed-data folder as CSV
# without the index column.
#data.to_csv('../data/processed/clustered_data.csv', index=False)