In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Inputs: the raw submission plus the train/test solution tables from data/with_pyright.
submission = pd.read_csv("submissions/yandex_gpt_raw.csv")
train_solutions = pd.read_csv("data/with_pyright/train_solutions_with_pyright.csv")
pyright_solutions = pd.read_csv("data/with_pyright/test_solutions_with_pyright.csv")

# Left-join the `message` column of the test table onto the submission
# (solution_id == id); the join key copied in from the right side is dropped again.
pyright_messages = pyright_solutions[["id", "message"]]
submission = submission.merge(
    pyright_messages,
    left_on="solution_id",
    right_on="id",
    how="left",
).drop(columns="id")


def mean_pooling(model_output, attention_mask):
    """Mean-pool token embeddings along dim 1, weighting each position by the attention mask.

    Args:
        model_output: indexable object whose element 0 is the token-embedding tensor.
        attention_mask: tensor that, once a trailing axis is added, expands to the
            shape of the token embeddings; positions with 0 do not contribute.

    Returns:
        Tensor of masked sums over dim 1 divided by the (clamped) mask sums.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

# Tokenizer and encoder are both loaded from the same Hugging Face checkpoint.
SBERT_CHECKPOINT = "ai-forever/sbert_large_nlu_ru"
tokenizer = AutoTokenizer.from_pretrained(SBERT_CHECKPOINT)
model = AutoModel.from_pretrained(SBERT_CHECKPOINT)

def compute_embeddings(text, max_length=24):
    """Embed `text` with the module-level tokenizer/model using masked mean pooling.

    Args:
        text: input handed directly to the tokenizer (this notebook passes one
            comment string per call via ``Series.apply``).
        max_length: maximum number of tokens kept by the tokenizer; longer inputs
            are truncated. Defaults to 24, the value that used to be hard-coded.

    Returns:
        The mean-pooled embedding as a numpy array with size-1 axes squeezed out.
    """
    # Tokenize; padding/truncation keep the batch rectangular and bounded by max_length.
    encoded_input = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    # Forward pass without gradient tracking: the output is only converted to numpy.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Average token embeddings, ignoring positions the attention mask marks as 0.
    pooled = mean_pooling(model_output, encoded_input['attention_mask'])
    return pooled.numpy().squeeze()

# Embed every train comment (one model call per row) and store the vectors
# in a new `good_embedding` column.
train_comments = train_solutions['author_comment']
train_solutions['good_embedding'] = train_comments.apply(compute_embeddings)


In [None]:
# Same embedding step for the submission comments, so both frames share a
# comparable `good_embedding` column.
submission_comments = submission["author_comment"]
submission["good_embedding"] = submission_comments.apply(compute_embeddings)

In [None]:
# Display the submission frame, which now carries the merged `message` column
# and the freshly computed `good_embedding` column.
submission

Unnamed: 0,solution_id,author_comment,author_comment_embedding,message,good_embedding
0,0,"Обратите внимание на то, что в вашем решении н...",-0.6617977023124695 -1.63694167137146 -0.18260...,,"[1.2556282, 0.016494494, 0.5806775, 0.6323035,..."
1,1,Обратите внимание на закрывающую скобку в стро...,-0.35723087191581726 -0.8950250148773193 -0.01...,"""("" was not closed","[0.8313722, 0.0013448062, 0.25305587, 0.513377..."
2,2,"Обратите внимание на то, что в вашем решении н...",-0.7075666189193726 -1.4730929136276245 -0.093...,"""("" was not closed","[0.9395901, 0.15751688, 0.728238, 0.8257442, 0..."
3,3,"Обратите внимание, что в вашем решении перепут...",-0.7298892736434937 -1.2177058458328247 0.2146...,,"[1.1250482, -0.16553128, 0.574012, 1.0059338, ..."
4,4,"Обратите внимание на то, как вычисляется стоим...",-0.5466601252555847 -1.130811095237732 0.69227...,,"[0.776521, -0.116654016, 0.40440896, 0.4520562..."
...,...,...,...,...,...
320,725,В вашем решении функция success всегда возвращ...,0.007248252630233765 -1.2673029899597168 0.402...,,"[1.0665845, 0.015698072, 0.27115342, 0.1053895..."
321,726,В вашем решении функция success должна возвращ...,-0.4889887273311615 -1.7024019956588745 0.2202...,,"[0.88942987, 0.0808178, 0.03722991, 0.01825778..."
322,727,В вашем решении не было обнаружено синтаксичес...,-0.32445967197418213 -0.6892445683479309 -0.14...,,"[1.2841376, -0.17404707, 0.7482147, 0.71791357..."
323,728,В вашем решении функция success всегда будет в...,-0.3303411900997162 -1.1162859201431274 0.3479...,,"[1.0292956, -0.100415654, 0.2885362, 0.2195555..."


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _unit_rows(matrix):
    """Scale every row of a 2-D array to unit L2 norm; all-zero rows stay zero."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    return matrix / norms


def replace_text_based_on_similarity(train, submission):
    """Replace flagged submission comments with the most similar flagged train comment.

    A row is "flagged" when its ``message`` value is not null. For every flagged
    submission row, the flagged train row whose ``good_embedding`` has the highest
    cosine similarity is looked up and its ``author_comment`` is copied over.

    Args:
        train: DataFrame with ``message``, ``author_comment`` and ``good_embedding``
            columns (one equal-length 1-D vector per row in ``good_embedding``).
        submission: DataFrame with the same three columns.

    Returns:
        A copy of ``submission`` with ``author_comment`` replaced on flagged rows.
        Neither input frame is modified. If either frame has no flagged rows the
        copy is returned unchanged.
    """
    submission = submission.copy()

    # Candidate pool: only train rows that have a message.
    candidates = train[train['message'].notna()]
    flagged = submission['message'].notna().to_numpy()
    if candidates.empty or not flagged.any():
        return submission

    candidate_vectors = np.asarray(candidates['good_embedding'].tolist(), dtype=float)
    target_vectors = np.asarray(
        submission.loc[flagged, 'good_embedding'].tolist(), dtype=float
    )

    # Cosine similarity of every flagged submission row against every candidate,
    # computed once as a single matrix product of row-normalized vectors.
    similarities = _unit_rows(target_vectors) @ _unit_rows(candidate_vectors).T

    # argmax yields POSITIONS within `candidates`, so the comments must be looked
    # up positionally. Using these numbers as index labels on the unfiltered
    # train frame would pick unrelated rows.
    best_positions = similarities.argmax(axis=1)
    candidate_comments = candidates['author_comment'].to_numpy()
    submission.loc[flagged, 'author_comment'] = candidate_comments[best_positions]

    return submission

# Build the post-processed submission: rows of `submission` whose `message` is
# not null get their author_comment replaced by a comment taken from `train_solutions`.
df2_updated = replace_text_based_on_similarity(train_solutions, submission)

# df2_updated is a modified copy; `submission` itself is left unchanged.


In [None]:
from app.utils.submit import embedding2string, get_sentence_embedding

Loading models...OK


In [None]:
def _comment_to_embedding_string(comment):
    """Embed one comment with get_sentence_embedding and serialize it via embedding2string."""
    return embedding2string(get_sentence_embedding(comment))


# Recompute the serialized embedding column so it matches the (possibly replaced) comments.
df2_updated["author_comment_embedding"] = df2_updated["author_comment"].apply(
    _comment_to_embedding_string
)

In [None]:
# Drop the helper columns that were only needed for matching, then write the result.
export_frame = df2_updated.drop(columns=["message", "good_embedding"])
export_frame.to_csv("submissions/replaced_stuff.csv", index=False)