In [2]:
import pandas as pd
# Load the question/solution dataset; the file is decoded as latin-1.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# usually means the CSV was saved together with its index. Consider
# index_col=0 after confirming nothing downstream reads that column.
df = pd.read_csv('SampleDataset.csv', encoding='latin-1')

In [3]:
# Preview the first rows to check how the columns were parsed.
df.head()

Unnamed: 0,Sno,TECHNOLOGY,ORIGINAL/SYNONYM,QUESTION,SOLUTION
0,1,PowerBI,Original,Issue with Power bi date slicer,If you are experiencing an issue with the date...
1,2,PowerBI,Synonym,What are some common issues that can arise wit...,same as above
2,3,PowerBI,Synonym,How do I resolve issues with the date slicer i...,Same as above
3,4,PowerBI,Synonym,I'm having trouble getting the date slicer in ...,Same as above
4,5,PowerBI,Synonym,What could be causing the date slicer in Power...,Same as above


In [6]:
# Build the text that gets embedded: the technology name and the question,
# joined by " - ", so the technology is part of every sentence.
df['Sentence'] = df['TECHNOLOGY'].str.cat(df['QUESTION'], sep=" - ")
df.head()


Unnamed: 0,Sno,TECHNOLOGY,ORIGINAL/SYNONYM,QUESTION,SOLUTION,Sentence
0,1,PowerBI,Original,Issue with Power bi date slicer,If you are experiencing an issue with the date...,PowerBI - Issue with Power bi date slicer
1,2,PowerBI,Synonym,What are some common issues that can arise wit...,same as above,PowerBI - What are some common issues that can...
2,3,PowerBI,Synonym,How do I resolve issues with the date slicer i...,Same as above,PowerBI - How do I resolve issues with the dat...
3,4,PowerBI,Synonym,I'm having trouble getting the date slicer in ...,Same as above,PowerBI - I'm having trouble getting the date ...
4,5,PowerBI,Synonym,What could be causing the date slicer in Power...,Same as above,PowerBI - What could be causing the date slice...


In [9]:
# Materialise the combined sentences as a plain Python list, in row order.
sentences = list(df['Sentence'])
# Show a handful of entries (rows 1 through 7) as a sanity check.
sentences[1:8]

['PowerBI - What are some common issues that can arise with the date slicer in Power BI, and how can I troubleshoot them?',
 'PowerBI - How do I resolve issues with the date slicer in Power BI when it is not \nfiltering data correctly or not displaying all available dates?',
 "PowerBI - I'm having trouble getting the date slicer in Power BI to work properly. Are \nthere any specific settings or configurations I should check to resolve the issue?",
 'PowerBI - What could be causing the date slicer in Power BI to malfunction, and what \nsteps can I take to fix it?',
 'PowerBI - Can you provide any guidance on troubleshooting problems with the date \nslicer in Power BI, particularly when it comes to issues with filtering or interaction with other visuals and filters?',
 'PowerBI - The date slicer is not filtering data correctly',
 'PowerBI - What should I do if my Power BI date slicer is not filtering data correctly?']

In [11]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 checkpoint through sentence-transformers and
# encode every entry of `sentences` in a single call.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
# Dump the result as a quick visual check (the printed repr is abbreviated).
print(embeddings)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

[[-0.05455767  0.07401679  0.00639784 ... -0.04954587 -0.05768153
  -0.02505915]
 [-0.04970559  0.03084412  0.01144503 ... -0.0610906  -0.07576085
  -0.01194992]
 [-0.0043903   0.06759154  0.003647   ... -0.05403585 -0.07599874
   0.00057957]
 ...
 [-0.01956022 -0.0618501  -0.05478379 ... -0.03913625  0.02141253
  -0.06274448]
 [-0.00899356 -0.08685962 -0.05075131 ... -0.04794435  0.02407048
  -0.0740986 ]
 [-0.03058353 -0.07275827 -0.0599315  ... -0.06807294  0.05030227
  -0.06599381]]


In [12]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, weighted by the attention mask.

    Positions where the mask is 0 contribute nothing to the average.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings.
        attention_mask: Mask with one entry per token; it is expanded over
            the last (embedding) dimension before being applied.

    Returns:
        The mask-weighted sum over dim 1 divided by the mask total for each
        row, with the divisor clamped to at least 1e-9.
    """
    per_token = model_output[0]
    # Stretch the mask across the embedding axis so it multiplies element-wise.
    weights = attention_mask.unsqueeze(-1).expand_as(per_token).float()
    masked_total = (per_token * weights).sum(dim=1)
    # Clamp the divisor so a fully masked row cannot divide by zero.
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_count

# Load model from HuggingFace Hub
# Fetch the tokenizer and the bare transformer for the same checkpoint from
# the HuggingFace Hub.
# NOTE: this rebinds `model`, replacing the SentenceTransformer instance that
# was created earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize all sentences as one padded, truncated batch of PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Forward pass without gradient tracking; only the outputs are needed.
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling, then L2-normalise each row (p=2 along dim 1).
sentence_embeddings = F.normalize(
    mean_pooling(model_output, encoded_input['attention_mask']),
    p=2,
    dim=1,
)

print("Sentence embeddings:", sentence_embeddings, sep="\n")


Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Sentence embeddings:
tensor([[-0.0546,  0.0740,  0.0064,  ..., -0.0495, -0.0577, -0.0251],
        [-0.0497,  0.0308,  0.0114,  ..., -0.0611, -0.0758, -0.0119],
        [-0.0044,  0.0676,  0.0036,  ..., -0.0540, -0.0760,  0.0006],
        ...,
        [-0.0196, -0.0619, -0.0548,  ..., -0.0391,  0.0214, -0.0627],
        [-0.0090, -0.0869, -0.0508,  ..., -0.0479,  0.0241, -0.0741],
        [-0.0306, -0.0728, -0.0599,  ..., -0.0681,  0.0503, -0.0660]])


In [13]:
# Report the size of the embedding matrix: row count first, then row width.
print(sentence_embeddings.shape[0])
print(sentence_embeddings.shape[1])

360
384


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
# Spot-check the embeddings: compare sentence 0 with sentence 1 (listed as a
# synonym of it in the dataset preview) and with sentence 102.
# NOTE(review): row 102 is presumably an unrelated question; confirm.
query_vec = embeddings[0].reshape(1, -1)
for other_row in (1, 102):
    print(cosine_similarity(query_vec, embeddings[other_row].reshape(1, -1)))

[[0.91687405]]
[[0.1974166]]
