In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/starencoder")
model = AutoModel.from_pretrained("bigcode/starencoder")

Downloading (…)lve/main/config.json: 100%|██████████| 667/667 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 499M/499M [02:12<00:00, 3.76MB/s] 


In [9]:
def get_similarity(text1, text2):
    # Tokenize the texts
    encoded_text1 = tokenizer.encode(text1, add_special_tokens=True)
    encoded_text2 = tokenizer.encode(text2, add_special_tokens=True)

    # Define the maximum length for padding
    max_length = max(len(encoded_text1), len(encoded_text2))

    # Pad the encoded texts manually
    padded_text1 = encoded_text1 + [0] * (max_length - len(encoded_text1))
    padded_text2 = encoded_text2 + [0] * (max_length - len(encoded_text2))

    # Convert the padded texts to tensors
    padded_text1_tensor = torch.tensor(padded_text1).unsqueeze(0)
    padded_text2_tensor = torch.tensor(padded_text2).unsqueeze(0)

    # Generate the embeddings for the padded texts using the model
    with torch.no_grad():
        embeddings_text1 = model(padded_text1_tensor).last_hidden_state.mean(dim=1)
        embeddings_text2 = model(padded_text2_tensor).last_hidden_state.mean(dim=1)

    # Calculate the cosine similarity between the embeddings
    similarity_score = cosine_similarity(embeddings_text1, embeddings_text2)[0][0]
    return similarity_score

In [10]:
# Define your texts
text1 = "I like Wombats for a reason."
text2 = "A very different one speaking about animals having square poop"

similarity_score = get_similarity(text1=text1, text2=text2)

# Print the similarity score
print("Similarity score:", similarity_score)


Similarity score: 0.92662966


In [15]:
crawler1_path = "testing_embeding/crawler1.py"
crawler2_path = "testing_embeding/crawler2.py"
snake_path = "testing_embeding/snake.py"

code_examples = []

with open(crawler1_path, 'r') as f:
    code_examples.append(('crawler1', f.read()))
with open(crawler2_path, 'r') as f:
    code_examples.append(('crawler2', f.read()))
with open(snake_path, 'r') as f:
    code_examples.append(('snake', f.read()))

In [16]:
from itertools import permutations

# Get all permutations of pairs
pair_permutations = list(permutations(code_examples, 2))

# Print the pair permutations
for pair in pair_permutations:
    print(f'similarity between {pair[0][0]} and {pair[1][0]}: {get_similarity(text1=pair[0][1], text2=pair[1][1])}')


similarity between crawler1 and crawler2: 0.9679924249649048
similarity between crawler1 and snake: 0.5613067746162415
similarity between crawler2 and crawler1: 0.9679924249649048
similarity between crawler2 and snake: 0.6073487401008606
similarity between snake and crawler1: 0.5613067746162415
similarity between snake and crawler2: 0.6073487401008606
