# Libraries

In [2]:
from datetime import datetime, timedelta
import os
import altair as alt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
import umap

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

In [4]:
from umap import UMAP

# Load and prepare data

In [5]:
user_df = pd.read_csv("https://storage.googleapis.com/superlinked-notebook-user-acquisiton-analytics/user_acquisiton_data.csv")

In [6]:
user_df.shape
user_df.head()

Unnamed: 0,id,signup_date,ad_creative,activity
0,0,1693044869,Join a dynamic gaming universe where every cha...,0.295882
1,1,1694581577,Join a thriving gaming community and challenge...,1.138306
2,2,1693557423,Experience the rush of competitive gaming! Joi...,0.005266
3,3,1693313203,Experience the rush of competitive gaming! Joi...,0.297846
4,4,1692863645,Join a dynamic gaming universe where every cha...,0.118947


Signup date is a timestamp. I thought about changeing it to date but for clustering this form should be better (we basically calculate distances for which a numeric variable is better). 

## Vectorize text

I haven't done this before, if it is utter bullshit I am deeply sorry. I will use the sbert package with the recommended "all-mpnet-base-v2" model. It should be the best for general purposes, however it might be very slow. If it is too slow indeed, will use "all-MiniLM-L6-v2" instead.
As i see there are 12 versions of ad_creative. The easiest would be to treat this as a simple categorical variable and use some encoding from the previous class. But vectorization sounds more fun.

In [7]:
user_df["ad_creative"].value_counts()

ad_creative
Unleash your gaming potential! Upgrade to premium for 2 months free and dominate the competition with XYZCr$$d!                                                  1785
Ready to level up? Join XYZCr$$d now for intense gaming battles and exclusive rewards!                                                                           1312
Embark on your gaming journey with XYZCr$$d! Join now and experience the thrill of competing against the finest players worldwide!                                715
Join a dynamic gaming universe where every challenge pushes your skills to the limit! Connect with like-minded gamers and conquer the virtual realm together!     671
Step into the arena of endless possibilities! Join now to engage in exhilarating gaming competitions and forge friendships with fellow gamers along the way!      541
Calling all gamers! Don't miss out on your chance to excel. Join XYZCr$$d today for unparalleled gaming excitement and endless challenges!                    

### Dummy example from sbert homepage:

In [8]:

model = SentenceTransformer("all-MiniLM-L6-v2")

# Our sentences to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173459e-02 -4.28515188e-02 -1.56286024e-02  1.40537797e-02
  3.95537801e-02  1.21796258e-01  2.94333864e-02 -3.17523889e-02
  3.54959629e-02 -7.93139860e-02  1.75878592e-02 -4.04370055e-02
  4.97259274e-02  2.54912041e-02 -7.18700588e-02  8.14968869e-02
  1.47070049e-03  4.79627140e-02 -4.50336300e-02 -9.92174819e-02
 -2.81769671e-02  6.45046532e-02  4.44670469e-02 -4.76217121e-02
 -3.52952629e-02  4.38671932e-02 -5.28565831e-02  4.33033478e-04
  1.01921469e-01  1.64072085e-02  3.26996446e-02 -3.45986821e-02
  1.21339206e-02  7.94871151e-02  4.58346587e-03  1.57778263e-02
 -9.68205091e-03  2.87625752e-02 -5.05806208e-02 -1.55793773e-02
 -2.87906546e-02 -9.62282810e-03  3.15556638e-02  2.27348655e-02
  8.71449336e-02 -3.85027416e-02 -8.84718597e-02 -8.75498541e-03
 -2.12342888e-02  2.08923370e-02 -9.02077407e-02 -5.25732227e-02
 -1.05638960e-02  2.88310722e-02 -1.61454901e-02  6.17840327e-03
 -1.23234

In [9]:
len(embeddings[2])
# This specific model creates a 384 dimensional representation from these sentences

384

### Use own data

In [10]:
model = SentenceTransformer("all-mpnet-base-v2")
# Tensorization was recommended by chatgpt
embeddings = model.encode(user_df["ad_creative"],convert_to_tensor=True)

In [11]:
len(embeddings[6])
# This creates a 768 dimensional space

768

In [12]:
# As I understand all these would become features for the clustering. The superlinked version did not work for me, so i try to do it without it.
embeddings[0]

tensor([-6.7157e-03, -4.5748e-03, -3.6820e-02,  1.2844e-02, -8.9428e-03,
        -4.6321e-03, -6.9624e-02,  2.3077e-03, -1.5313e-02,  5.3179e-02,
        -2.8946e-03, -6.0466e-02,  9.7875e-03,  9.7021e-02,  5.2312e-02,
        -4.5470e-02,  5.9949e-03, -3.1639e-02, -4.5347e-02, -5.1283e-03,
        -1.3029e-03, -9.9728e-03, -5.1288e-02,  3.6355e-02,  4.5777e-02,
        -3.0231e-02,  2.5194e-02,  4.4343e-02,  4.2983e-02, -5.3917e-02,
         1.0463e-02, -1.1307e-01,  2.4688e-02,  2.9605e-02,  1.7974e-06,
        -1.2655e-02, -2.5933e-02, -2.2989e-02, -3.0663e-02, -2.0052e-02,
        -1.3364e-03,  7.3335e-02, -3.3265e-02,  7.3368e-03, -3.1518e-02,
        -5.6642e-02,  4.8134e-02, -8.9021e-03,  4.8401e-02, -1.8054e-03,
         4.4015e-04,  1.3599e-02, -4.5749e-02, -3.2194e-02, -1.4481e-02,
        -2.5192e-02, -4.3038e-03,  4.7047e-02,  7.8961e-02, -5.6039e-02,
         2.0307e-02, -1.9138e-02, -1.9204e-02, -2.2772e-02,  2.6886e-02,
         4.2982e-02,  8.7360e-03,  1.8750e-02, -2.2

In [13]:
user_df.columns

Index(['id', 'signup_date', 'ad_creative', 'activity'], dtype='object')

In [14]:
# This was also recommended by chatgpt (and we also talked about it in class), 
# that the features should have similar similar scaling. My worry is that each sentence from the original data 
# now has more than 700 features, that makes them more "important" relative to the other 2 features in the clustering
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings)
scaled_features = scaler.fit_transform(user_df[["signup_date","activity"]].values)


In [15]:
#Whatever :D
scaled_features

array([[-1.77405203, -0.05029302],
       [-1.44592907,  3.10656403],
       [-1.66460981, -1.13933192],
       ...,
       [ 0.49684875,  0.0422679 ],
       [ 0.84226176, -0.38274762],
       [ 0.40053266, -0.55139086]])

In [16]:
check_inverse = scaler.inverse_transform(scaled_features)
check_inverse[0]

array([1.69304487e+09, 2.95881631e-01])

In [17]:
user_df.iloc[0]
#Scaling looks ok for the date and activity features (i still don't care that date is a timestamp, maybe I should)

id                                                             0
signup_date                                           1693044869
ad_creative    Join a dynamic gaming universe where every cha...
activity                                                0.295882
Name: 0, dtype: object

In [18]:
# Data for clustering:
scaled_data = pd.concat([pd.DataFrame(scaled_features),pd.DataFrame(scaled_embeddings)],axis=1)
scaled_data.shape

(8000, 770)

# Clustering

## DBSCAN:

In [19]:
dbscan = DBSCAN(eps=0.5, min_samples=4)
clusters = dbscan.fit_predict(scaled_data)

In [20]:
# With baseline parameters dbscan found 25 clusters - a bit too many (we had 12 sentences).
# Probably too many central nodes with min_samples = 4. 
clusters.max()

23

In [21]:
# With arbitrary 400 as min_samples we get 5 clusters which seems better intuitively
dbscan = DBSCAN(eps=0.5, min_samples=400)
db_clusters = dbscan.fit_predict(scaled_data)
db_clusters.max()


3

In [22]:
scaled_data["db_cluster"] = db_clusters
scaled_data["db_cluster"].value_counts()

db_cluster
-1    4484
 0    1439
 2    1070
 1     592
 3     415
Name: count, dtype: int64

## HDBSCAN:

# Dimension Reduction and ploting

In [30]:
umap_fit = UMAP(random_state=0, transform_seed=0, n_jobs=1, n_neighbors=10,min_dist= 0.1)
umap_data = umap_fit.fit_transform(scaled_data)
umap_data = pd.DataFrame(umap_data,columns= ["dim1","dim2"])



array([[ 12.397387  ,  -8.472926  ],
       [  0.72780305, -12.2400055 ],
       [  6.692195  ,  -7.031122  ],
       ...,
       [ -1.6576385 ,  -6.6819143 ],
       [ -0.9640971 ,  -2.3229218 ],
       [ 15.645117  ,   5.78951   ]], dtype=float32)