In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Input data: ten short English review sentences
sentences = (
    'I love the ambiance of this place!',
    'The service was terrible and slow.',
    'The food was absolutely delicious!',
    'I wouldn’t recommend this restaurant to anyone.',
    'The staff were very friendly and helpful.',
    'The product quality is not worth the price.',
    'I’m extremely satisfied with my purchase!',
    'The packaging was damaged when it arrived.',
    'This is the best experience I’ve ever had!',
    'The software keeps crashing and is very frustrating.',
)

# One single-item row per sentence, matching the single 'text' column below
data = [[sentence] for sentence in sentences]

# Build the DataFrame that the rest of the notebook works on
df = pd.DataFrame(data, columns=['text'])



In [2]:
# Sentence-embedding model shared by the cells below.
# Alternatives left over from earlier experiments:
#   'all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'tgsc/sentence-transformer-ult5-pt-small'
#
# No explicit `device` is passed: with the default, sentence-transformers picks a GPU
# when one is usable and otherwise falls back to the CPU, so the notebook still runs
# on machines without CUDA (a hard-coded device='cuda' fails there).
#
# NOTE(review): the recorded output of the embedding cell reports 512-dimensional
# vectors; confirm it was produced with this model and re-run the notebook if not.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

def generate_embeddings(query):
    """Encode ``query`` with the notebook-level ``model``.

    Args:
        query: Input forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``query``.
    """
    return model.encode(query)

In [3]:
# Encode all texts in one batched call instead of one encoder call per row
# (the previous `.apply` invoked the model once per sentence and then re-stacked).
texts = df['text'].tolist()
embeddings = np.asarray(generate_embeddings(texts))

# Fail early if the encoder did not return one vector per text
# (assumes a list input yields a 2D array-like of shape (n_texts, dim)).
assert embeddings.ndim == 2 and embeddings.shape[0] == len(df), embeddings.shape

# Keep one vector per row in the DataFrame; later cells read this column
df['embeddings'] = list(embeddings)

# Report the shape of the resulting 2D array
print("[Debug] Shape dos embeddings:", embeddings.shape)

[Debug] Shape dos embeddings: (10, 512)


# Redução de dimensionalidade

In [4]:
# Reduce the embeddings to two principal components
pca = PCA(n_components=2, svd_solver='full')
X_transformed = pca.fit_transform(embeddings)

# Attach each component to the DataFrame as its own column
pca_columns = ['pca_1', 'pca_2']
for component_index, column_name in enumerate(pca_columns):
    df[column_name] = X_transformed[:, component_index]

# Write only the two PCA columns: space-separated, no header row, no index
projection = df[pca_columns]
projection.to_csv("pca_columns.txt", sep=" ", index=False, header=False)

# Show the DataFrame including the new PCA columns
print(df)

                                                text  \
0                 I love the ambiance of this place!   
1                 The service was terrible and slow.   
2                 The food was absolutely delicious!   
3    I wouldn’t recommend this restaurant to anyone.   
4          The staff were very friendly and helpful.   
5        The product quality is not worth the price.   
6          I’m extremely satisfied with my purchase!   
7         The packaging was damaged when it arrived.   
8         This is the best experience I’ve ever had!   
9  The software keeps crashing and is very frustr...   

                                          embeddings     pca_1     pca_2  
0  [-0.114987895, 0.054891624, -0.120584846, -0.0... -0.668121 -0.589272  
1  [-0.022050558, 0.058607098, -0.0022807487, 0.0...  0.363624  0.546539  
2  [-0.050669707, 0.035486836, 0.13426976, 0.0078...  0.903365 -0.588423  
3  [-0.046081286, 0.008064925, -0.019613393, 0.00... -0.194045  0.106042  
4  [-0.0

In [5]:
# Expand the per-row embedding vectors into a wide frame:
# one row per text, one column per embedding dimension.
embeddings_df = pd.DataFrame(df['embeddings'].tolist())

# np.savetxt does not create missing directories, so make sure the output
# folder exists; otherwise a fresh checkout fails here on Restart & Run All.
dataset_path = Path("data/IN/dataset.txt")
dataset_path.parent.mkdir(parents=True, exist_ok=True)

# Save the embedding matrix as space-delimited text with 8 decimal places
np.savetxt(dataset_path, embeddings_df, delimiter=" ", fmt="%.8f")
embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.114988,0.054892,-0.120585,-0.038141,0.089333,2.5767989999999995e-38,0.0,0.035315,-0.031261,-0.002928,...,0.003928,0.004961,-0.125372,0.036127,-0.019403,0.122741,5.642782e-08,-0.142997,0.018872,-0.199172
1,-0.022051,0.058607,-0.002281,0.01323,-0.023395,2.591603e-38,0.0,0.025354,-0.038747,0.044132,...,0.086991,-0.129323,-0.036984,-0.044692,-0.010516,0.128196,7.262082e-08,0.117528,0.030939,0.002116
2,-0.05067,0.035487,0.13427,0.007806,-0.006268,2.783926e-38,0.0,-0.035701,-0.012368,0.051586,...,0.043263,0.039972,-0.106636,-0.084593,0.016588,0.167881,6.516004e-08,0.013047,0.080556,-0.001138
3,-0.046081,0.008065,-0.019613,0.000263,0.002652,2.426801e-38,0.0,0.039669,-0.040398,-0.005899,...,0.01532,0.085754,-0.065161,-0.001537,0.015685,-0.038363,7.449154e-08,0.025306,0.0412,0.027596
4,-0.023653,0.020638,0.088438,-0.051336,0.080819,3.4869209999999997e-38,0.0,-0.030209,-0.030276,0.009914,...,-0.001439,0.094043,0.082544,0.032242,-0.007666,0.074183,7.185998e-08,-0.009116,0.020187,-0.084984
5,-0.022421,0.027058,-0.159451,-0.039147,0.063447,1.9015839999999997e-38,0.0,0.026492,-0.029174,0.021886,...,-0.000895,-0.039403,-0.098098,-0.028655,0.015817,0.122156,6.597845e-08,0.003931,0.082309,-0.005566
6,-0.043928,0.022936,-0.106378,-0.018616,0.058714,3.864475e-38,0.0,0.038168,-0.054883,-0.04097,...,0.040955,-0.023781,-0.123494,0.036703,0.002049,0.064416,7.019757e-08,0.063994,0.014842,-0.032273
7,-0.036745,0.025482,0.061017,-0.030252,0.034858,1.613828e-38,0.0,-0.037675,-0.01794,-0.018163,...,0.064017,0.044021,-0.100573,-0.008209,0.034025,0.028289,7.388584e-08,-0.005637,0.050059,-0.045505
8,-0.027847,0.010018,-0.037201,-0.057642,-0.036082,2.5461019999999996e-38,0.0,0.039977,-0.083168,-0.02399,...,0.069678,0.001013,-0.078929,0.028546,-0.036268,0.029302,6.700546e-08,0.039691,0.035775,-0.043491
9,-0.003905,0.042979,-0.060527,-0.002128,-0.006415,1.781443e-38,0.0,-0.03754,0.020116,0.012881,...,-0.012932,-0.004943,-0.007981,-0.020711,-0.033827,0.06881,6.917067e-08,-0.045292,0.057981,-0.056361
