# **Training Data**

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

In [2]:
# Initialise the sentence-embedding model once for the whole notebook.
# No explicit `device` is passed: sentence-transformers then uses a GPU when
# one is available and falls back to CPU otherwise, whereas hard-coding
# device='cuda' makes this cell fail on machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# NOTE(review): this cell re-creates the same encoder as the model
# initialisation cell above; consider deleting one of the two.
# The device is left to sentence-transformers' auto-selection (GPU when
# available, otherwise CPU) so the cell does not fail on CPU-only machines.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
# Small hand-written set of review sentences, each paired with its
# sentiment label ('positive' / 'negative').
labelled_reviews = (
    ('I love the ambiance of this place!', 'positive'),
    ('The service was terrible and slow.', 'negative'),
    ('The food was absolutely delicious!', 'positive'),
    ('I wouldn’t recommend this restaurant to anyone.', 'negative'),
    ('The staff were very friendly and helpful.', 'positive'),
    ('The product quality is not worth the price.', 'negative'),
    ('I’m extremely satisfied with my purchase!', 'positive'),
    ('The packaging was damaged when it arrived.', 'negative'),
    ('This is the best experience I’ve ever had!', 'positive'),
    ('The software keeps crashing and is very frustrating.', 'negative'),
)

# `data` is kept as a list of [text, category] lists, one per review.
data = [[text, category] for text, category in labelled_reviews]

# Labelled frame: one row per review, columns 'text' and 'category'.
df_label = pd.DataFrame(data=data, columns=['text', 'category'])

In [4]:
# Unlabelled view of the same reviews, used as clustering input.
# The sentences are taken from `df_label` (defined in the previous cell)
# instead of being typed out a second time, so the labelled and unlabelled
# frames cannot drift apart when a sentence is edited.
# `data` is still rebound to a list of single-element [text] lists.
data = [[text] for text in df_label['text']]

df = pd.DataFrame(data, columns=['text'])

In [5]:
# Display the unlabelled text frame built in the previous cell.
df

Unnamed: 0,text
0,I love the ambiance of this place!
1,The service was terrible and slow.
2,The food was absolutely delicious!
3,I wouldn’t recommend this restaurant to anyone.
4,The staff were very friendly and helpful.
5,The product quality is not worth the price.
6,I’m extremely satisfied with my purchase!
7,The packaging was damaged when it arrived.
8,This is the best experience I’ve ever had!
9,The software keeps crashing and is very frustr...


In [6]:
def generate_embeddings(query, encoder=None):
    """Encode text into embedding vector(s) using a sentence-embedding model.

    Args:
        query: The input to embed (e.g. a single sentence); it is passed
            unchanged to the encoder's ``encode`` method.
        encoder: Object exposing ``encode(query)``. When omitted, the
            notebook-level ``model`` is used, which must already be defined.

    Returns:
        Whatever ``encoder.encode(query)`` returns.
    """
    # The model is NOT created here: it is looked up at call time so the
    # expensive model load happens once, outside this function.
    if encoder is None:
        encoder = model
    return encoder.encode(query)

In [173]:

# Encode every review text and store the resulting vector in a new
# 'Text_Embedding' column (one encoder call per row).
df['Text_Embedding'] = df['text'].map(generate_embeddings)

In [174]:
# Preview the first rows to check that the Text_Embedding column was added.
df.head()

Unnamed: 0,text,Text_Embedding
0,I love the ambiance of this place!,"[0.03190428, 0.035732128, 0.07044879, 0.020851..."
1,The service was terrible and slow.,"[0.040091757, 0.06595178, 0.030180786, -0.0040..."
2,The food was absolutely delicious!,"[-0.055612817, 0.0837042, 0.014392061, 0.04984..."
3,I wouldn’t recommend this restaurant to anyone.,"[-0.0009020116, 0.017153729, 0.032585476, 0.04..."
4,The staff were very friendly and helpful.,"[-0.026263287, 0.027818386, 0.010420601, 0.004..."


In [175]:
import faiss                   # make faiss available

In [176]:
# K-means configuration.
ncentroids = 2   # number of clusters to fit
niter = 30       # k-means iterations
verbose = True   # print faiss' training progress

# faiss works on a 2-D float32 matrix (one row per vector), so stack the
# per-row embeddings explicitly instead of handing it a Python list.
embedding_matrix = np.asarray(df['Text_Embedding'].to_list(), dtype=np.float32)

# Take the dimensionality from the data rather than hard-coding 384, so the
# cell keeps working if a different embedding model is used.
d = embedding_matrix.shape[1]

kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
# Last expression of the cell: train() returns the final objective value,
# which the notebook displays.
kmeans.train(embedding_matrix)

Clustering 10 points in 384D to 2 clusters, redo 1 times, 30 iterations
  Preprocessing in 0.00 s
  Iteration 29 (0.00 s, search 0.00 s): objective=6.07015 imbalance=1.040 nsplit=0       




6.070149898529053

In [177]:
# Sanity check of the fitted centroids: print the shape of the centroid
# matrix, then the first centroid vector on the following line(s).
print(kmeans.centroids.shape, kmeans.centroids[0], sep='\n')

(2, 384)
[-4.84316936e-03  1.24052558e-02  2.90985983e-02 -2.03665532e-02
 -3.58103551e-02 -3.23633738e-02 -3.91662605e-02  9.72917769e-03
 -1.15436278e-02 -2.88394652e-03  1.01477588e-02  4.91911247e-02
  9.52031836e-03  1.55684175e-02 -2.66564991e-02 -2.51307264e-02
  8.28880966e-02 -4.25429009e-02  1.44767892e-02  1.99354580e-03
 -2.62457002e-02 -1.11181671e-02 -1.56391896e-02  3.27581689e-02
 -2.50892453e-02 -1.33982860e-02 -2.60737110e-02  3.44846398e-04
 -1.48956878e-02 -3.67165878e-02 -1.46786897e-02  2.20413581e-02
 -4.14154120e-03 -1.26406727e-02  2.41413657e-02 -9.80910566e-03
  3.33294943e-02 -2.36753877e-02 -2.62889341e-02  2.56552044e-02
  1.45993941e-03  2.33527534e-02 -1.37303043e-02 -1.69169903e-02
  1.95293855e-02 -5.37581183e-02 -3.02132219e-03 -2.71758586e-02
  8.97781700e-02 -3.76449432e-04 -1.66797694e-02 -3.44264619e-02
  2.38980018e-02 -4.92204241e-02  1.22692212e-02  6.24720659e-03
 -3.97338457e-02  3.73761132e-02  1.76548362e-02  7.05070496e-02
  1.65554993e-02

In [178]:
# Stack the per-row embeddings into one 2-D array and look up the single
# nearest centroid (k = 1) for every row.
# D holds the distance to that centroid and I its index, per faiss' search
# API — NOTE(review): with the default flat L2 index these should be squared
# L2 distances; confirm if the index type is changed.
query_vectors = np.array(df['Text_Embedding'].tolist())
D, I = kmeans.index.search(query_vectors, 1)

In [179]:
# Show the raw search results: D and I as returned by kmeans.index.search,
# one entry per row of df.
D,I

(array([[0.54259324],
        [0.53530073],
        [0.46643206],
        [0.58682436],
        [0.71821046],
        [0.54453206],
        [0.59245396],
        [0.73199034],
        [0.66975343],
        [0.6820593 ]], dtype=float32),
 array([[1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [0]]))

In [180]:
# Display the labelled frame (text + category).
# NOTE(review): presumably shown so the cluster ids in I can be compared by
# eye against the known sentiment labels — confirm intent.
df_label

Unnamed: 0,text,category
0,I love the ambiance of this place!,positive
1,The service was terrible and slow.,negative
2,The food was absolutely delicious!,positive
3,I wouldn’t recommend this restaurant to anyone.,negative
4,The staff were very friendly and helpful.,positive
5,The product quality is not worth the price.,negative
6,I’m extremely satisfied with my purchase!,positive
7,The packaging was damaged when it arrived.,negative
8,This is the best experience I’ve ever had!,positive
9,The software keeps crashing and is very frustr...,negative
