In [None]:
!pip install relevanceai
!pip install vectorhub
!pip install transformers
!pip install sentence-transformers

## 🎰 set parameters

To create your clustering application, I will first import my data, preprocess it and encode a part of it.
After having uploaded it to relevanceai, I will apply a clustering algorithm to create a sharable application.
The following parameters will be used for the entire project.

In [1]:
# --- credentials (fill in before running) ---
PROJECT_ID = '<project-name>'
API_KEY = '<api-key>'

# --- dataset ---
DATASET_NAME = 'steam_reviews'  # name of the local dataset file, kept for reference only
DATASET_ID = 'steam-reviews'    # dataset_id used on relevanceai

# --- encoding ---
MODEL = 'all-MiniLM-L6-v2'                        # name handed to the sentence-transformers encoder
VECTOR_SUFFIX = '_sentence_transformers_vector_'  # appended to each encoded field to get its vector field
ENCODING_FIELDS = ['text']                        # fields to encode; must be a list

# --- clustering application ---
CLUSTERS = 240     # value of k passed to k-means; also part of the cluster alias
FIELDS = ['text']  # fields shown in the centroid preview of the shared app
AGG = {}           # extra preview component for the shared app (empty by default)

## 🏅 relevanceai login

To log in to relevanceai you will need a project ID and an API key (the `PROJECT_ID` and `API_KEY` parameters set above).
Once you initiate a client, you will be able to send and edit data on our servers.

In [None]:
# relevanceai client: built from the PROJECT_ID and API_KEY defined in the
# parameters cell. The resulting `client` object is what the later cells use
# to upload documents and to run clustering.
import relevanceai

client = relevanceai.Client(PROJECT_ID, API_KEY)

## 🧶 preparing the data

Most of the preprocessing work is done on a pandas DataFrame. 
However, because this format cannot be directly uploaded to relevanceai, we will need to convert our DataFrame into a list of dictionaries and then upload it.

In [None]:
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from vectorhub.encoders.text.sentence_transformers import SentenceTransformer2Vec

# Download the NLTK resources used below: 'punkt' for word tokenization and
# 'stopwords' for the English stop-word list.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
stopwords = nltk.corpus.stopwords.words('english')

# Text encoder shared by the preprocessing cells.
model = SentenceTransformer2Vec(MODEL)

def zeroshot(df_text, df_text_vectors, model, top_common, top_sample):
    """Tag each text with the frequent corpus words closest to it in vector space.

    The vocabulary is built from the `top_common` most frequent words of
    `df_text` (lower-cased, alphanumeric, longer than one character, not an
    English stop word). Every vocabulary word is encoded with `model`, and each
    text vector receives the `top_sample` words with the smallest Euclidean
    distance to it.

    Parameters
    ----------
    df_text : iterable of str
        The raw texts (used only to build the vocabulary).
    df_text_vectors : iterable of vectors
        The vectorized texts, one vector per text.
    model : object
        Encoder exposing `encode(word)`, which returns the vector of a word.
    top_common : int
        Maximum number of frequent words kept as candidate labels.
    top_sample : int
        Number of labels attached to each text.

    Returns
    -------
    list of list of str
        One list of labels per entry of `df_text_vectors`, ordered from the
        closest label to the farthest. The lists are empty when there is no
        candidate label or no vector.
    """
    # Set lookup instead of scanning the stop-word list for every token.
    stop_words = set(stopwords)

    # Lower-case BEFORE filtering: the stop-word list is lower-case, so testing
    # the original token would let capitalised stop words ("The", "And") through.
    tokens = (
        token.lower()
        for text in df_text
        for token in nltk.tokenize.word_tokenize(text)
    )
    word_freq = nltk.FreqDist(
        token for token in tokens
        if token not in stop_words and token.isalnum() and len(token) != 1
    )

    # candidate labels: the most frequent remaining words
    labels = [word for word, _ in word_freq.most_common(top_common)]
    text_vectors = list(df_text_vectors)

    # Nothing to compare: the distance computation would fail on empty input.
    if not labels or not text_vectors:
        return [[] for _ in text_vectors]

    label_vectors = np.array([model.encode(label) for label in labels])

    # For every text, indices of the labels sorted by increasing distance,
    # truncated to the top_sample closest ones.
    closest_topn_index = np.argsort(
        euclidean_distances(text_vectors, label_vectors), axis=1
    )[:, :top_sample]

    # we obtain a list of lists, as long as the sample itself
    return [[labels[ind] for ind in row] for row in closest_topn_index]

In [None]:
import pandas as pd

# Load the raw reviews.
# dataset source: https://www.kaggle.com/luthfim/steam-reviews-dataset
df = (
    pd.read_csv('steam_reviews.csv')
    .drop(columns='Unnamed: 0', errors='ignore')  # leftover index column, if present
    .dropna()
    .reset_index()
)
# reset_index() put the previous index in the first column: expose it as '_id'
df.columns = ['_id', *df.columns[1:]]
df = df.rename(columns={'review': 'text'})
df

### 📝 preprocessing

In [None]:
# keep only the columns used downstream, without missing values
kept_columns = ['funny', 'helpful', 'hour_played', 'text', 'title', 'recommendation']
df = df[kept_columns]
df = df.dropna()

# small reproducible sample
df = df.sample(10, random_state=35)
df['title'] = df['title'].apply(lambda title: title.lower())

# encode the review text
df['text_sentence_transformers_vector_'] = df['text'].apply(lambda text: model.encode(text))

# tag each review with its 10 closest words among the 5000 most frequent ones
df['zeroshot_list'] = zeroshot(df['text'], df['text_sentence_transformers_vector_'], model, 5000, 10)

# one indicator column per recommendation value
recommendation_dummies = pd.get_dummies(df['recommendation'])
df = pd.concat([df, recommendation_dummies], axis=1)
df

### 🧩 create list of dictionaries

The only format we can upload to relevanceai is a list of dictionaries.
We need to convert our pandas dataframe into this format to proceed with our project. 
Notice how we are converting our data into this format by using small batches. 

In [None]:
#convert to df_ready in batches
rows_ = list()
for rows in range(0, len(df), 20):
    rows_.append(rows)
rows_.append(len(df))

df_ready = list()
for r in range(len(rows_)-1):
    #print(rows_[r], rows_[r+1])
    df_ready += df[rows_[r]:rows_[r+1]].to_dict(orient='records')
df_ready

## 🧬 encoding

Because encoding is usually a long process, with a speed that is highly dependent on the encoder we choose, we might want to use a progress bar. 
Once our data has been encoded, a new field with the suffix "_sentence_transformers_vector_" will be created for each sample.
After encoding, we will upload batches of our data using bulk_insert.

In [10]:
def batch_splitting(len_df, range_len):
    """Split `len_df` items into consecutive `[start, end]` index ranges.

    Parameters
    ----------
    len_df : int
        Total number of items to split.
    range_len : int
        Maximum number of items per batch; must be positive.

    Returns
    -------
    list of [int, int]
        Half-open ranges (`items[start:end]`) covering `0..len_df` in order.
        When everything fits in one batch (including `len_df == 0`) the result
        is the single range `[[0, len_df]]`.

    Raises
    ------
    ValueError
        If `range_len` is not positive.
    """
    if range_len <= 0:
        raise ValueError(f"range_len must be a positive integer, got {range_len!r}")

    # everything fits in a single batch
    if range_len >= len_df:
        return [[0, len_df]]

    # Stepping by range_len and clamping the end to len_df yields a shorter
    # last batch when needed, and no empty trailing batch when len_df is an
    # exact multiple of range_len.
    return [
        [start, min(start + range_len, len_df)]
        for start in range(0, len_df, range_len)
    ]

In [None]:
from vectorhub.encoders.text.sentence_transformers import SentenceTransformer2Vec
import progressbar
import relevanceai

# clean dataset, otherwise repeated clustering throws error
#client.datasets.delete(dataset_id=DATASET_ID) #in case we want a fresh start
batches = batch_splitting(len_df=len(df_ready), range_len=5000)

# one progress-bar step per batch
bar = progressbar.ProgressBar(
    maxval=len(batches),
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()],
)
model = SentenceTransformer2Vec(MODEL)

# encoding: vectorize ENCODING_FIELDS batch by batch
df_ready_encoded = []
bar.start()
for step, (start, stop) in enumerate(batches):
    bar.update(step)
    df_ready_encoded.extend(
        model.encode_documents(documents=df_ready[start:stop], fields=ENCODING_FIELDS)
    )
bar.finish()

# from here on df_ready refers to the encoded documents
df_ready = df_ready_encoded

# upload: insert the encoded documents using the same batch boundaries
bar.start()
for step, (start, stop) in enumerate(batches):
    bar.update(step)
    client.datasets.bulk_insert(dataset_id=DATASET_ID, documents=df_ready[start:stop])
bar.finish()

## 🪄 clustering

To perform clustering on our data, we first need to calculate the centroids. The data will be uploaded to relevanceai.
Then, we will need to call another method that performs the clustering, with the alias containing the number of clusters we just used to find the centroids.
We can call this function with a different number of clusters as many times as we wish.

In [None]:
import relevanceai

# Vector field based on which clustering is done - (Currently only one vector is supported)
# NOTE: this name is not referenced by the calls below, which derive their
# vector fields from ENCODING_FIELDS + VECTOR_SUFFIX.
vector_field = 'descriptiontextmulti_vector_'

# vector fields to cluster on, e.g. 'text' -> 'text_sentence_transformers_vector_'
# (potential bug when our dataset does not have a text field)
cluster_vector_fields = [field + VECTOR_SUFFIX for field in ENCODING_FIELDS]

# calculate centroids: creates the clusters but only returns the centroids;
# the clustering result is uploaded to the database
centroids = client.vector_tools.cluster.kmeans_cluster(
    dataset_id=DATASET_ID,
    vector_fields=cluster_vector_fields,
    k=CLUSTERS,
)

client.datasets.schema(DATASET_ID)

# the alias must match the number of clusters used above (e.g. 'kmeans_10');
# pass cluster_ids=[] to request all of the clusters
client.services.cluster.centroids.list_closest_to_center(
    dataset_id=DATASET_ID,
    vector_fields=cluster_vector_fields,
    page_size=40,
    alias=f"kmeans_{CLUSTERS}",
)

## 🥡 sharing

It's time to share our app! By running the code below, you will be given a public link that you can share with your friends.
Know that by using the clustering dashboard once you login to relevanceai, you can manually edit the parameters of your clustering application.

In [None]:
import os

import relevanceai
import requests

# Build the auth header from the configured credentials instead of embedding a
# literal secret in the notebook. The value previously hard-coded here had the
# same "<project>:<key>" shape (TODO confirm the format against the API docs);
# any key that was committed in clear text should be revoked.
AUTH_HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"{PROJECT_ID}:{API_KEY}",
}
GATEWAY_URL = "https://gateway-api-aueast.relevance.ai"

# 1. create the deployable (the clustering application)
response = requests.post(
    f"{GATEWAY_URL}/latest/deployables/create",
    json={
        "dataset_id": f"{DATASET_ID}",
        "configuration": {
            "collection_name": f"{DATASET_ID}",
            "type": "cluster",
            "deployable_name": f"{DATASET_ID}",
            "project_id": f"{PROJECT_ID}",
            "cluster": {
                "alias": f"kmeans_{CLUSTERS}",
                # only the first encoded field is used as the cluster vector field
                "vector_field": [x + VECTOR_SUFFIX for x in ENCODING_FIELDS][0],
            },
            "clusters-card-builder": {
                "previewComponents": [
                    {
                        "previewType": "centroids",
                        "previewFields": FIELDS,  # any way to select all fields?
                        "tabName": "",
                        "displayType": "default",
                    },
                    AGG,
                    {},
                ],
                "expandedComponents": [],
            },
            "deployable_logo": None,
        },
        "upsert": False,
    },
    headers=AUTH_HEADERS,
)
# fail with the HTTP error rather than a KeyError on an error payload
response.raise_for_status()
DEPLOYABLE_ID = response.json()['deployable_id']

# 2. make the deployable sharable
response = requests.post(
    f"{GATEWAY_URL}/latest/deployables/{DEPLOYABLE_ID}/share",
    headers=AUTH_HEADERS,
)
response.raise_for_status()

# 3. read the deployable back to obtain its sharable API key
response = requests.get(
    f"{GATEWAY_URL}/v2/deployables/{DEPLOYABLE_ID}/get",
    headers=AUTH_HEADERS,
)
response.raise_for_status()
SHARABLE_API = response.json()['api_key']

app_url = f'https://cloud.relevance.ai/dataset/{DATASET_ID}/deploy/cluster/{PROJECT_ID}/{SHARABLE_API}/{DEPLOYABLE_ID}'

# per-dataset file: overwritten on every run, so it holds only the latest link
os.makedirs('datasets', exist_ok=True)
with open(f'datasets/app_{DATASET_ID}.txt', 'w') as file:
    file.write(app_url)

# running log of every generated link, one per line
with open('app_url.txt', 'a') as file:
    file.write(f'{app_url}\n')

print('done')