User profiling and segmentation  Phase 2

In [55]:
# import all necessary libraries
import os
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import numpy as np


In [56]:
# Load the dataset
df = pd.read_csv("user_profiles_for_ads.csv")


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   User ID                          1000 non-null   int64  
 1   Age                              1000 non-null   object 
 2   Gender                           1000 non-null   object 
 3   Location                         1000 non-null   object 
 4   Language                         1000 non-null   object 
 5   Education Level                  1000 non-null   object 
 6   Likes and Reactions              1000 non-null   int64  
 7   Followed Accounts                1000 non-null   int64  
 8   Device Usage                     1000 non-null   object 
 9   Time Spent Online (hrs/weekday)  1000 non-null   float64
 10  Time Spent Online (hrs/weekend)  1000 non-null   float64
 11  Click-Through Rates (CTR)        1000 non-null   float64
 12  Conversion Rates     

In [58]:
df.describe()

Unnamed: 0,User ID,Likes and Reactions,Followed Accounts,Time Spent Online (hrs/weekday),Time Spent Online (hrs/weekend),Click-Through Rates (CTR),Conversion Rates,Ad Interaction Time (sec)
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,4997.084,251.438,2.7575,4.6016,0.125333,0.049805,91.425
std,288.819436,2838.494365,141.941557,1.279735,2.026234,0.071187,0.02867,51.497965
min,1.0,101.0,10.0,0.5,1.0,0.0,0.0,5.0
25%,250.75,2661.25,126.0,1.7,2.9,0.065,0.026,45.75
50%,500.5,5002.5,245.5,2.8,4.7,0.128,0.049,90.0
75%,750.25,7348.75,377.0,3.8,6.4,0.186,0.073,137.25
max,1000.0,9973.0,498.0,5.0,8.0,0.25,0.1,179.0


In [59]:
df.head()

Unnamed: 0,User ID,Age,Gender,Location,Language,Education Level,Likes and Reactions,Followed Accounts,Device Usage,Time Spent Online (hrs/weekday),Time Spent Online (hrs/weekend),Click-Through Rates (CTR),Conversion Rates,Ad Interaction Time (sec),Income Level,Top Interests
0,1,25-34,Female,Suburban,Hindi,Technical,5640,190,Mobile Only,4.5,1.7,0.193,0.067,25,20k-40k,Digital Marketing
1,2,65+,Male,Urban,Hindi,PhD,9501,375,Tablet,0.5,7.7,0.114,0.044,68,0-20k,Data Science
2,3,45-54,Female,Suburban,Spanish,Technical,4775,187,Mobile Only,4.5,5.6,0.153,0.095,80,60k-80k,Fitness and Wellness
3,4,35-44,Female,Rural,Spanish,PhD,9182,152,Desktop Only,3.1,4.2,0.093,0.061,65,100k+,"Gaming, DIY Crafts"
4,5,25-34,Female,Urban,English,Technical,6848,371,Mobile Only,2.0,3.8,0.175,0.022,99,20k-40k,"Fitness and Wellness, Investing and Finance, G..."


In [60]:
# Create folders to save models and pipelines
os.makedirs("models", exist_ok=True)
os.makedirs("pipelines", exist_ok=True)

In [61]:
# Columns selected for categorical & numerical
categorical_cols = ["Gender", "Location", "Language", "Education Level", "Device Usage", "Income Level"]

numerical_cols = [
    "Likes and Reactions", "Followed Accounts",
    "Time Spent Online (hrs/weekday)", "Time Spent Online (hrs/weekend)",
    "Click-Through Rates (CTR)", "Conversion Rates", "Ad Interaction Time (sec)"
]

In [62]:
# Preprocessing for categorical and numerical columns
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

In [63]:
# Combine preprocessors
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

In [64]:
preprocessor

In [65]:
# Split into features and interests
X_base = df.drop(columns=["User ID", "Top Interests"])

interests_raw = df["Top Interests"]

In [66]:
X_base.head()

Unnamed: 0,Age,Gender,Location,Language,Education Level,Likes and Reactions,Followed Accounts,Device Usage,Time Spent Online (hrs/weekday),Time Spent Online (hrs/weekend),Click-Through Rates (CTR),Conversion Rates,Ad Interaction Time (sec),Income Level
0,25-34,Female,Suburban,Hindi,Technical,5640,190,Mobile Only,4.5,1.7,0.193,0.067,25,20k-40k
1,65+,Male,Urban,Hindi,PhD,9501,375,Tablet,0.5,7.7,0.114,0.044,68,0-20k
2,45-54,Female,Suburban,Spanish,Technical,4775,187,Mobile Only,4.5,5.6,0.153,0.095,80,60k-80k
3,35-44,Female,Rural,Spanish,PhD,9182,152,Desktop Only,3.1,4.2,0.093,0.061,65,100k+
4,25-34,Female,Urban,English,Technical,6848,371,Mobile Only,2.0,3.8,0.175,0.022,99,20k-40k


In [67]:
interests_raw.head()

Unnamed: 0,Top Interests
0,Digital Marketing
1,Data Science
2,Fitness and Wellness
3,"Gaming, DIY Crafts"
4,"Fitness and Wellness, Investing and Finance, G..."


In [68]:
# Transform base features
X_base_transformed = preprocessor.fit_transform(X_base)

In [69]:
X_base_transformed

array([[ 0.22661229, -0.4330567 ,  1.36229142, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.58752108,  0.87094879, -1.76491987, ...,  0.        ,
         0.        ,  0.        ],
       [-0.07827922, -0.45420273,  1.36229142, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.26044996, -0.23569371, -0.51403536, ...,  0.        ,
         0.        ,  0.        ],
       [-0.70674345,  0.06035078,  1.04957029, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.1377884 ,  1.00487368, -0.51403536, ...,  1.        ,
         0.        ,  0.        ]])

In [70]:
# Save preprocessing pipeline
with open("pipelines/preprocessing_pipeline.pkl", "wb") as f:
    pickle.dump(preprocessor, f)

In [71]:
def custom_tokenizer(text):
    """Tokenizes text by splitting on commas and spaces."""
    return text.split(", ")

# Process 'Top Interests'
vectorizer = CountVectorizer(tokenizer=custom_tokenizer)  # Use the named function
interests_vectorized = vectorizer.fit_transform(interests_raw).toarray()

# Save interest vectorizer
with open("pipelines/interests_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)



In [72]:
# Combine features
X_combined = np.hstack([X_base_transformed, interests_vectorized])


In [73]:
X_combined

array([[ 0.22661229, -0.4330567 ,  1.36229142, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.58752108,  0.87094879, -1.76491987, ...,  0.        ,
         0.        ,  0.        ],
       [-0.07827922, -0.45420273,  1.36229142, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.26044996, -0.23569371, -0.51403536, ...,  0.        ,
         0.        ,  0.        ],
       [-0.70674345,  0.06035078,  1.04957029, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.1377884 ,  1.00487368, -0.51403536, ...,  0.        ,
         0.        ,  0.        ]])

In [74]:
# Reduce dimensionality for clustering
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_combined)

# Save PCA transformer
with open("pipelines/pca_transformer.pkl", "wb") as f:
    pickle.dump(pca, f)

In [75]:
# Train and save clustering models
models = {
    "kmeans": KMeans(n_clusters=5, random_state=42),
    "agglomerative": AgglomerativeClustering(n_clusters=5),
    "dbscan": DBSCAN(eps=3, min_samples=5)
}

In [76]:
for name, model in models.items():
    model.fit(X_pca)
    with open(f"models/{name}_model.pkl", "wb") as f:
        pickle.dump(model, f)

print("✅ All preprocessing, feature engineering, and models saved successfully.")

✅ All preprocessing, feature engineering, and models saved successfully.


new user input:

In [77]:
# using function

import pickle
import numpy as np
import pandas as pd

# Load components once (you can move this to a separate module or init block)
with open("pipelines/preprocessing_pipeline.pkl", "rb") as f:
    preprocessor = pickle.load(f)

with open("pipelines/interests_vectorizer.pkl", "rb") as f:
    interest_vectorizer = pickle.load(f)

with open("pipelines/pca_transformer.pkl", "rb") as f:
    pca = pickle.load(f)

with open("models/kmeans_model.pkl", "rb") as f:
    model = pickle.load(f)

def predict_user_segment(user_input: dict) -> int:
    """
    Predicts the user segment for a given user profile input.

    Parameters:
    - user_input (dict): Dictionary containing user profile fields.

    Returns:
    - int: Segment number assigned to the user.
    """
    # Convert input to DataFrame
    user_df = pd.DataFrame([user_input])

    # Extract interests and base features
    interests_input = user_df["Top Interests"]
    base_features = user_df.drop(columns=["Top Interests"])

    # Transform each part
    base_transformed = preprocessor.transform(base_features)
    interests_transformed = interest_vectorizer.transform(interests_input).toarray()

    # Combine and apply PCA
    combined_features = np.hstack([base_transformed, interests_transformed])
    features_pca = pca.transform(combined_features)

    # Predict segment
    segment = model.predict(features_pca)

    return int(segment[0])


In [78]:
user_data = {
    "Gender": "Male",
    "Location": "Suburban",
    "Language": "English",
    "Education Level": "Postgraduate",
    "Device Usage": "Desktop",
    "Income Level": "High",
    "Likes and Reactions": 120,
    "Followed Accounts": 30,
    "Time Spent Online (hrs/weekday)": 3.2,
    "Time Spent Online (hrs/weekend)": 4.1,
    "Click-Through Rates (CTR)": 0.09,
    "Conversion Rates": 0.015,
    "Ad Interaction Time (sec)": 32.0,
    "Top Interests": "Business, Finance, Travel"
}

segment_id = predict_user_segment(user_data)
print(f"🧠 User assigned to Segment: {segment_id}")


🧠 User assigned to Segment: 2
