In [None]:
# Colab setup: install (or upgrade to) the latest sentence-transformers release.
# NOTE(review): the version is unpinned, so a re-run may pull a release whose
# API differs from the one this notebook was written against - consider pinning.
!pip install -U sentence-transformers

In [None]:
# Global training configuration shared by the cells below.
SEED = 42  # seed for the RNGs and for every DataFrame.sample call
BATCH_SIZE = 16  # batch size of the training DataLoader and of the evaluator
EPOCHS = 10  # number of epochs passed to model.fit
WARMUP_STEPS = 750  # warm-up steps passed to model.fit

In [None]:
# Mount Google Drive at /content/drive so the dataset archive and the
# checkpoints stored under ROOT_PATH are reachable (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import os
import random
import re
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

# cuBLAS workspace setting. seed_everything() enables
# torch.use_deterministic_algorithms(True); per the PyTorch reproducibility
# notes, deterministic CUDA matmuls require this variable to be set.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'

In [None]:
# ROOT_PATH: competition folder on the mounted Drive (archive, checkpoints, outputs).
# DATA_PATH: local directory the archive is extracted into.
ROOT_PATH = Path("/content/drive/MyDrive/dacon/240326_CodeSimilarityJudgment")
DATA_PATH = Path("./data")

# Quietly (-qq) extract the competition archive into DATA_PATH.
!unzip -qq {ROOT_PATH / 'open.zip'} -d {DATA_PATH}

In [None]:
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, InputExample, losses, util
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from sklearn.metrics.pairwise import paired_cosine_distances

In [None]:
def seed_everything(seed=None):
  """Seed every RNG used by the notebook and enable deterministic PyTorch ops.

  Args:
    seed: Integer seed. When omitted (``None``) the notebook-level ``SEED``
      constant is used, which reproduces the original zero-argument behaviour.
  """
  if seed is None:
    seed = SEED
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  # Turn off the cuDNN autotuner and request deterministic implementations.
  torch.backends.cudnn.benchmark = False
  torch.use_deterministic_algorithms(True)
  # Only affects processes started after this point: the hash seed of the
  # interpreter that is already running cannot be changed at runtime.
  os.environ["PYTHONHASHSEED"] = str(seed)
  os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# Seed once, before any shuffling or model initialisation in the cells below.
seed_everything()

In [None]:
# Matches C/C++ comments as well as string/char literals. Literals are matched
# only so that a "//" or "/*" inside them is not mistaken for a comment.
# Known limitation: C++11 raw string literals are not treated specially.
_CPP_COMMENT_OR_LITERAL = re.compile(
    r'//[^\n]*|/\*.*?\*/|"(?:\\.|[^"\\])*"|\'(?:\\.|[^\'\\])*\'',
    re.DOTALL,
)


def _strip_cpp_comments(code):
  """Return `code` with every `//` and `/* */` comment replaced by one space."""
  def _keep_literals(match):
    token = match.group(0)
    # Comments start with "/"; literals start with a quote and are kept as-is.
    return " " if token.startswith("/") else token
  return _CPP_COMMENT_OR_LITERAL.sub(_keep_literals, code)


def _read_code(path):
  """Read one source file and strip its comments."""
  with open(path, "r") as f:
    return _strip_cpp_comments(f.read())


def make_train_dict(data_path=None, num_problems=500, files_per_problem=500, progress=None):
  """Build the labelled code-pair table used for fine-tuning.

  Positive pairs (label 1.0): for every problem, file ``d`` is paired with
  file ``d + files_per_problem // 4`` of the same problem.
  Negative pairs (label 0.0): for every problem in the first half, each file
  of the upper half of that problem is paired with the same-numbered file of
  problem ``p + num_problems // 2``.

  Comments are stripped from *both* sides of *both* pair kinds. Previously
  only positive pairs were stripped (through a `remove_comments` helper that
  is not defined in this notebook), so the presence of comments leaked the
  label.

  Args:
    data_path: Directory containing ``train_code/``; defaults to ``DATA_PATH``.
    num_problems: Number of ``problemNNN`` folders (default 500).
    files_per_problem: Number of solutions per problem (default 500).
    progress: Callable wrapping the outer iterables, called as
      ``progress(iterable, desc=...)``; defaults to ``tqdm``.

  Returns:
    ``defaultdict(list)`` with the keys ``code1_path``, ``code2_path``,
    ``code1``, ``code2`` and ``similar``; all lists have equal length.
  """
  root = Path(DATA_PATH if data_path is None else data_path)
  progress = tqdm if progress is None else progress

  positive_offset = files_per_problem // 4
  first_negative_file = files_per_problem // 2 + 1
  problem_offset = num_problems // 2

  train_dict = defaultdict(list)

  def problem_file(num_problem, num_detail):
    return root / f"train_code/problem{num_problem:0>3}/problem{num_problem:0>3}_{num_detail}.cpp"

  def add_pair(code1_path, code2_path, label):
    train_dict["code1_path"].append(code1_path)
    train_dict["code2_path"].append(code2_path)
    train_dict["code1"].append(_read_code(code1_path))
    train_dict["code2"].append(_read_code(code2_path))
    train_dict["similar"].append(label)

  for num_problem in progress(range(1, num_problems + 1), desc="Similarity 1"):
    for num_detail in range(1, positive_offset + 1):
      add_pair(
          problem_file(num_problem, num_detail),
          problem_file(num_problem, num_detail + positive_offset),
          1.0,
      )

  for num_problem in progress(range(1, problem_offset + 1), desc="Similarity 0"):
    for num_detail in range(first_negative_file, files_per_problem + 1):
      add_pair(
          problem_file(num_problem, num_detail),
          problem_file(num_problem + problem_offset, num_detail),
          0.0,
      )
  return train_dict

In [None]:
class CustomEvaluator(BinaryClassificationEvaluator):
    """Binary-classification evaluator whose main score is accuracy.

    Calls the inherited ``compute_metrices`` and returns the best accuracy
    over all similarity functions it reports. When ``output_path`` is given
    and ``write_csv`` is set, one row per call is appended to ``csv_file``.
    """

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        scores = self.compute_metrices(model)
        # The parent class uses Average Precision as the main score;
        # here it is switched to accuracy.
        main_score = max(metrics["accuracy"] for metrics in scores.values())

        # CSV row: epoch, steps, then one value per "<similarity>_<metric>" header.
        row = [epoch, steps]
        for header in self.csv_headers:
            if "_" not in header:
                continue
            sim_name, metric_name = header.split("_", maxsplit=1)
            row.append(scores[sim_name][metric_name])

        if output_path is None or not self.write_csv:
            return main_score

        csv_path = os.path.join(output_path, self.csv_file)
        is_new_file = not os.path.isfile(csv_path)
        # A new file gets the header line first; an existing one is appended to.
        with open(csv_path, newline="", mode="w" if is_new_file else "a", encoding="utf-8") as f:
            writer = csv.writer(f)
            if is_new_file:
                writer.writerow(self.csv_headers)
            writer.writerow(row)
        return main_score

In [None]:
def model_train(train_csv, output_path=None, n_train=120000):
  """Fine-tune `microsoft/codereviewer` as a sentence encoder on code pairs.

  Args:
    train_csv: DataFrame with the columns ``code1``, ``code2`` and ``similar``.
    output_path: Sub-directory name under ``ROOT_PATH / "checkpoints"`` that is
      handed to ``model.fit``. With ``None`` nothing is written to disk
      (the previous code raised a ``TypeError`` while building the path).
    n_train: Number of rows sampled for training; the remaining rows form the
      validation set. Defaults to the previously hard-coded 120000.

  Raises:
    ValueError: If ``n_train`` does not leave at least one training row and
      one validation row.
  """
  if not 0 < n_train < len(train_csv):
    raise ValueError(
        f"n_train must be in (0, {len(train_csv)}) to keep a non-empty validation split, got {n_train}"
    )

  if output_path is None:
    save_dir = None
  else:
    os.makedirs(ROOT_PATH / "checkpoints", exist_ok=True)
    save_dir = str(ROOT_PATH / "checkpoints" / output_path)

  train_df = train_csv.sample(n=n_train, random_state=SEED)
  valid_df = train_csv.drop(train_df.index)

  # Iterate the three columns directly instead of building a Series per row.
  train_examples = [
      InputExample(texts=[code1, code2], label=float(label))
      for code1, code2, label in zip(train_df["code1"], train_df["code2"], train_df["similar"])
  ]

  train_dataloader = DataLoader(train_examples, batch_size=BATCH_SIZE, shuffle=True)

  valid_evaluator = CustomEvaluator(
      sentences1=valid_df['code1'].values.tolist(),
      sentences2=valid_df['code2'].values.tolist(),
      labels=valid_df['similar'].values.tolist(),
      batch_size=BATCH_SIZE,
      show_progress_bar=True,
      write_csv=True,
  )

  # Transformer encoder followed by a pooling layer with default settings.
  word_embedding_model = models.Transformer('microsoft/codereviewer')
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
  model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

  train_loss = losses.CosineSimilarityLoss(model)

  model.fit(
      train_objectives=[(train_dataloader, train_loss)],
      evaluator=valid_evaluator,
      epochs=EPOCHS,
      warmup_steps=WARMUP_STEPS,
      output_path=save_dir,
      use_amp=True,
      show_progress_bar=True,
  )
  torch.cuda.empty_cache()

In [None]:
# Build the labelled pair table, then shuffle all rows reproducibly and
# renumber the index. Despite its name, train_csv is an in-memory DataFrame.
train_dict = make_train_dict()
train_csv = pd.DataFrame(train_dict).sample(frac=1, random_state=SEED).reset_index(drop=True)

Similarity 1: 100%|██████████| 500/500 [00:06<00:00, 81.28it/s]
Similarity 0: 100%|██████████| 250/250 [00:04<00:00, 61.52it/s]


In [None]:
# Fine-tune; model.fit writes its output under
# ROOT_PATH / "checkpoints" / "CodeReviewer_remove_comments".
model_train(train_csv, output_path="CodeReviewer_remove_comments")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [None]:
# Load a fine-tuned checkpoint from Drive for inference.
# NOTE(review): this loads 'CodeReviewer', whereas the training cell saves to
# 'CodeReviewer_remove_comments' - confirm which checkpoint is intended; on a
# fresh run the 'CodeReviewer' directory is not produced by this notebook.
model_path = ROOT_PATH / "checkpoints" / 'CodeReviewer'
model = SentenceTransformer(str(model_path))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# One row per evaluation run, written by the evaluator during model.fit.
eval_df = pd.read_csv(model_path / "eval" / "binary_classification_evaluation_results.csv")

# model.fit only overwrites the saved checkpoint when the evaluator's main score
# improves (save_best_model defaults to True), so the checkpoint on disk belongs
# to the best-scoring row, not necessarily the last one. CustomEvaluator's main
# score is the best accuracy over all similarity functions, so the same quantity
# is used here. NOTE(review): assumes the checkpoint was trained with CustomEvaluator.
accuracy_cols = [col for col in eval_df.columns if col.endswith("_accuracy")]
best_row = eval_df[accuracy_cols].max(axis=1).idxmax()  # first row reaching the maximum
cossim_threshold = eval_df.loc[best_row, "cossim_accuracy_threshold"]

In [None]:
# Test pairs: one row per (code1, code2) pair to be classified.
test_df = pd.read_csv(DATA_PATH / "test.csv")

sentences1 = test_df["code1"].values.tolist()
sentences2 = test_df["code2"].values.tolist()
# Unique snippets in first-seen order. A set() would give an order that changes
# between interpreter sessions, which makes the saved embeddings.npy impossible
# to line up with its sentences later.
sentences = list(dict.fromkeys(sentences1 + sentences2))

In [None]:
# Encode every unique snippet once on the first GPU.
embeddings = model.encode(
    sentences,
    device=torch.device("cuda:0"),
    batch_size=128,
    show_progress_bar=True,
)
# np.save does not create missing directories, so make sure 'preds' exists
# before the encoding result is written.
preds_dir = ROOT_PATH / 'preds'
os.makedirs(preds_dir, exist_ok=True)
np.save(preds_dir / "embeddings.npy", embeddings)

# Map each snippet to its embedding, then expand back to one embedding per
# side of every test pair.
emb_dict = dict(zip(sentences, embeddings))
embeddings1 = [emb_dict[sent] for sent in sentences1]
embeddings2 = [emb_dict[sent] for sent in sentences2]

Batches:   0%|          | 0/1784 [00:00<?, ?it/s]

In [None]:
# paired_cosine_distances returns 1 - cosine similarity, while cossim_threshold
# is the evaluator's threshold on cosine *similarity*. Convert to similarity
# first, then label a pair as similar (1) when it scores above the threshold.
score = 1 - paired_cosine_distances(embeddings1, embeddings2)
pred = np.where(score > cossim_threshold, 1, 0)

In [None]:
# Fill the sample submission's "similar" column with the predictions and
# write the result next to the other competition files on Drive.
submission = pd.read_csv(DATA_PATH / "sample_submission.csv").assign(similar=pred)
submission.to_csv(ROOT_PATH / "submission.csv", index=False)