In [9]:
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

In [10]:
def get_sentence_embedding(sentence, model):
    """Average the word vectors of a tokenized sentence into one embedding.

    Args:
        sentence: Iterable of tokens. Tokens that are not present in
            ``model.wv`` are skipped.
        model: Object exposing ``wv`` (must support ``token in model.wv`` and
            ``model.wv[token]``) and ``vector_size``, e.g. a trained gensim
            ``Word2Vec`` model.

    Returns:
        A 1-D ``torch.Tensor`` holding the element-wise mean of the vectors of
        all known tokens, or a zero vector of length ``model.vector_size``
        when none of the tokens is known (including an empty sentence).
    """
    keyed_vectors = model.wv
    # Convert each vector on its own and stack the results: passing a Python
    # list of numpy arrays straight to torch.tensor() goes through a slow
    # element-by-element conversion that PyTorch explicitly warns about.
    vectors = [
        torch.tensor(keyed_vectors[word])
        for word in sentence
        if word in keyed_vectors
    ]
    if not vectors:
        # No known token: fall back to an all-zero embedding of matching size.
        return torch.zeros(model.vector_size)
    return torch.stack(vectors).mean(dim=0)

def generate_embeddings(file_path, *, vector_size=100, window=5, min_count=1,
                        workers=4, seed=1):
    """Train a Word2Vec model on a text file and embed each of its lines.

    Every line of the file is stripped and split on whitespace to form one
    tokenized sentence. A Word2Vec model is trained on those sentences, and
    each sentence is then reduced to a single vector with
    ``get_sentence_embedding`` (the mean of its word vectors).

    Args:
        file_path: Path of the UTF-8 encoded text file to read.
        vector_size: Dimensionality of the word vectors (passed to Word2Vec).
        window: Context window size (passed to Word2Vec).
        min_count: Minimum token frequency to keep (passed to Word2Vec).
        workers: Number of training threads (passed to Word2Vec).
        seed: Random seed (passed to Word2Vec).

    Returns:
        A 2-D ``torch.Tensor`` with one row per line of the file, in file
        order. Blank lines yield empty sentences, which
        ``get_sentence_embedding`` maps to all-zero rows.

    Note:
        The defaults reproduce the previously hard-coded configuration.
        Per the gensim documentation, training with ``workers > 1`` is not
        bit-for-bit reproducible even with a fixed ``seed``; pass
        ``workers=1`` when identical embeddings across runs are required.
    """
    # Iterate the file handle directly instead of materialising readlines().
    with open(file_path, 'r', encoding='utf-8') as file:
        processed_text = [line.strip().split() for line in file]

    # Train a word2vec model on the tokenized lines.
    word2vec_model = Word2Vec(
        sentences=processed_text,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )

    # One averaged embedding per line, stacked into a single 2-D tensor.
    sentence_embeddings = [
        get_sentence_embedding(tokens, word2vec_model)
        for tokens in processed_text
    ]

    return torch.stack(sentence_embeddings)

# KNP

In [11]:
# Train Word2Vec on strace_KNP.txt and embed each of its lines (one row per line).
knp_embeddings = generate_embeddings('../results/X86_64/strace_KNP.txt')
# Bare trailing expression so the notebook displays the resulting tensor shape.
knp_embeddings.shape

torch.Size([9731, 100])

# N Queens

In [12]:
# Train Word2Vec on strace_N_Queens.txt and embed each of its lines (one row per line).
n_queens_embeddings = generate_embeddings('../results/X86_64/strace_N_Queens.txt')
# Bare trailing expression so the notebook displays the resulting tensor shape.
n_queens_embeddings.shape

torch.Size([106, 100])

# TSP

In [13]:
# Train Word2Vec on strace_TSP.txt and embed each of its lines (one row per line).
tsp_embeddings = generate_embeddings('../results/X86_64/strace_TSP.txt')
# Bare trailing expression so the notebook displays the resulting tensor shape.
tsp_embeddings.shape

torch.Size([222, 100])

# MATRIX MULT

In [14]:
# Train Word2Vec on strace_MATRIX_MULT.txt and embed each of its lines (one row per line).
# NOTE(review): this variable lacks the `_embeddings` suffix used by the other
# cells; left unchanged here because later cells may reference `matrix_mult`.
matrix_mult = generate_embeddings('../results/X86_64/strace_MATRIX_MULT.txt')
# Bare trailing expression so the notebook displays the resulting tensor shape.
matrix_mult.shape

torch.Size([15193, 100])