In [None]:
%%capture
!pip install catboost sentence_transformers

In [None]:
import pandas as pd
import numpy as np
import torch
from IPython.display import display, HTML
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer


# Base path prefix; it is not referenced by any of the cells visible in this part of the notebook.
PATH = ''
# Seed for the stochastic steps (passed to train_test_split when the data is split).
RANDOM_STATE = 42

In [None]:
# Load the dataset from CSV and preview its first rows.
df = pd.read_csv('data/dataset.csv')
df.head()

Unnamed: 0,code,code_no_comments,by_human
0,void DebugInfoFinder::processInstruction(const...,void DebugInfoFinder::processInstruction(const...,False
1,std::chrono::milliseconds getDefaultDebuginfod...,std::chrono::milliseconds getDefaultDebuginfod...,False
2,unsigned newRegUnit(unsigned Weight) {\n //...,unsigned newRegUnit(unsigned Weight) {\n \n...,False
3,VPValue *vputils::getOrCreateVPValueForSCEVExp...,VPValue *vputils::getOrCreateVPValueForSCEVExp...,False
4,template <class Tr>\ntypename RegionBase<Tr>::...,template <class Tr>\ntypename RegionBase<Tr>::...,False


In [None]:
# Transformer-like API intended exclusively for Salesforce/codet5p-110m-embedding
class SentenceTransformerAPI:
    """Small wrapper that exposes an ``encode`` method over a Hugging Face checkpoint.

    The tokenizer and the model are loaded with ``trust_remote_code=True``; the
    model is moved to CUDA when it is available and stays on the CPU otherwise.
    """

    def __init__(self, checkpoint):
        """Load the tokenizer and the model identified by ``checkpoint``."""
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
        loaded_model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)
        self.model = loaded_model.to(self.device)

    def _embed_one(self, text):
        """Tokenize a single string, run the model and return the result as NumPy."""
        token_ids = self.tokenizer.encode(
            text, return_tensors="pt", padding=True, truncation=True
        ).to(self.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            model_output = self.model(token_ids)

        # Only the first element of the model output is kept.
        return model_output[0].cpu().detach().numpy()

    def encode(self, sentences, convert_to_tensor=False):
        """Embed one string or an iterable of strings, one item at a time.

        Args:
            sentences: a single string or an iterable of strings.
            convert_to_tensor: accepted so the call signature resembles
                ``SentenceTransformer.encode``; this implementation does not use it.

        Returns:
            A list with one NumPy array per input string.
        """
        batch = [sentences] if isinstance(sentences, str) else sentences
        return [self._embed_one(text) for text in batch]

In [None]:
# Split the data into train and test parts (30% held out for testing),
# stratified by the target so both parts keep the same class balance.
# The dataset is loaded into `df`; the previous reference to `data` was an
# undefined name on a fresh kernel.
X, y = df['code'].values, df['by_human'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=RANDOM_STATE,
                                                    stratify=y)

In [None]:
# Embedders
# Pick the device once and fall back to the CPU when CUDA is unavailable,
# the same way SentenceTransformerAPI does, instead of hard-coding 'cuda'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

embedders = {
    "BoW": CountVectorizer(max_features=4096),
    "Salesforce/codet5p-110m-embedding": SentenceTransformerAPI('Salesforce/codet5p-110m-embedding'),
    "intfloat/multilingual-e5-large": SentenceTransformer('intfloat/multilingual-e5-large', trust_remote_code=True),
    "jinaai/jina-embeddings-v2-base-code": AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code',
                                                                     trust_remote_code=True).to(DEVICE),
    # These do not natively support SentenceTransformer, but they seem to get
    # wrapped into that interface automatically.
    "microsoft/codebert-base": SentenceTransformer('microsoft/codebert-base', trust_remote_code=True),
    "microsoft/graphcodebert-base": SentenceTransformer('microsoft/graphcodebert-base', trust_remote_code=True),
}

# Classifiers
classifiers = {
    "LogReg": LogisticRegression(),
    "SVC": SVC(),
    # Seeded so that the random forest results are reproducible between runs.
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "BayesClassifier": MultinomialNB(),  # BoW only
    # Train on the GPU only when one is actually available; otherwise use the CPU.
    "CatBoost": CatBoostClassifier(task_type='GPU' if torch.cuda.is_available() else 'CPU',
                                   verbose=False)
}

# Prepare the bag-of-words features: the vectorizer is fitted on the training
# split and then applied unchanged to the test split.
bow_vectorizer = embedders["BoW"]
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)



In [None]:
import time
import gc

def _positive_class_scores(estimator, features):
    """Return continuous scores for the positive class, falling back to hard labels."""
    if hasattr(estimator, "predict_proba"):
        # Column 1 corresponds to the second entry of the estimator's classes.
        return estimator.predict_proba(features)[:, 1]
    if hasattr(estimator, "decision_function"):
        return estimator.decision_function(features)
    return estimator.predict(features)


def evaluate_classifier(embedding_train, y_train,
                        embedding_test, y_test,
                        classifier, param_grid):
    """Tune ``classifier`` with a grid search and score the best model on the test set.

    Args:
        embedding_train: training features.
        y_train: training labels.
        embedding_test: test features.
        y_test: test labels.
        classifier: estimator to tune.
        param_grid: hyper-parameter grid passed to ``GridSearchCV``.

    Returns:
        A tuple ``(best_params, f1_test, roc_auc_test)``.
    """
    grid_search = GridSearchCV(classifier, param_grid, scoring='f1',
                               cv=3, n_jobs=-1, verbose=4)
    grid_search.fit(embedding_train, y_train)
    best_clf = grid_search.best_estimator_

    # Evaluation on the test set
    y_test_pred = best_clf.predict(embedding_test)
    f1_test = f1_score(y_test, y_test_pred)

    # ROC-AUC is a ranking metric: it must be computed from continuous scores.
    # Feeding it hard 0/1 predictions evaluates a single threshold only.
    y_test_score = _positive_class_scores(best_clf, embedding_test)
    roc_auc_test = roc_auc_score(y_test, y_test_score)

    return grid_search.best_params_, f1_test, roc_auc_test


# Hyper-parameter grids per classifier; defined once instead of being rebuilt
# on every iteration of the inner loop.
PARAM_GRIDS = {
    "LogReg": {
        'C': [0.01, 0.1, 1],
    },
    "SVC": {
        'C': [0.01, 0.1, 1],
        'kernel': ['rbf', 'sigmoid'],
    },
    "RandomForest": {
        'n_estimators': [5, 10, 20],
        'max_depth': [5, 10],
        'bootstrap': [True, False]
    },
    "BayesClassifier": {
        'alpha': [0.01, 0.5, 2.0]
    },
    "CatBoost": {
        'depth': [2, 4, 6],
    },
}

# Evaluate every combination of embedder and classifier
results = []
for emb_name, embedder in tqdm(embedders.items()):
    print(f'Working with embedder {emb_name}...')
    if emb_name == "BoW":
        # The bag-of-words features were computed ahead of time.
        X_train_embedding, X_test_embedding = X_train_bow, X_test_bow
    elif emb_name == "Salesforce/codet5p-110m-embedding":
        # The custom wrapper already returns NumPy arrays.
        X_train_embedding = embedder.encode(X_train, convert_to_tensor=True)
        X_test_embedding = embedder.encode(X_test, convert_to_tensor=True)
    else:
        # The remaining embedders return tensors; move them to the CPU as NumPy arrays.
        X_train_embedding = embedder.encode(X_train, max_length=1024, convert_to_tensor=True).cpu().detach().numpy()
        X_test_embedding = embedder.encode(X_test, max_length=1024, convert_to_tensor=True).cpu().detach().numpy()

    for clf_name, classifier in tqdm(classifiers.items()):
        # The Bayes classifier is used with BoW only (presumably because dense
        # embeddings may contain negative values - verify). Skip before logging
        # so that the output lists only combinations that are actually evaluated.
        if clf_name == "BayesClassifier" and emb_name != "BoW":
            continue
        print(f'Working with classifier {clf_name}...')

        best_params, f1_test, roc_auc_test = evaluate_classifier(
            X_train_embedding, y_train, X_test_embedding, y_test, classifier, PARAM_GRIDS[clf_name])

        result_entry = {
            "embedder": emb_name,
            "classifier": clf_name,
            "f1_test": f1_test,
            "roc_auc_test": roc_auc_test,
            "best_params": best_params
        }

        # Show each result as soon as it is ready.
        html_content = f"""
                          <div style='font-size: 20px; font-weight: bold;'>
                                  "embedder": "{result_entry['embedder']}",
                                  "classifier": "{result_entry['classifier']}",
                                  "f1_test": {result_entry['f1_test']},
                                  "roc_auc_test": {result_entry['roc_auc_test']},
                                  "best_params": {result_entry['best_params']}
                          </div>
                        """
        display(HTML(html_content))

        results.append(result_entry)

In [None]:
# Logged results from a previous run, kept just in case.
# They are stored under their own name and used only as a fallback, so that
# running this cell does not overwrite freshly computed `results`.
LOGGED_RESULTS = [
{'embedder': 'BoW',  'classifier': 'LogReg',
  'f1_test': 0.9079627714581179,
  'roc_auc_test': 0.9103471636269229,
  'best_params': {'C': 1}},
 {'embedder': 'BoW',
  'classifier': 'SVC',
  'f1_test': 0.8505303760848603,
  'roc_auc_test': 0.843952829882521,
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'embedder': 'BoW',
  'classifier': 'RandomForest',
  'f1_test': 0.8107606679035251,
  'roc_auc_test': 0.7946489420393328,
  'best_params': {'bootstrap': False, 'max_depth': 10, 'n_estimators': 20}},
 {'embedder': 'BoW',
  'classifier': 'BayesClassifier',
  'f1_test': 0.895361380798274,
  'roc_auc_test': 0.9022501947166872,
  'best_params': {'alpha': 0.01}},
 {'embedder': 'BoW',
  'classifier': 'CatBoost',
  'f1_test': 0.911854103343465,
  'roc_auc_test': 0.9123815473486078,
  'best_params': {'depth': 6, 'l2_leaf_reg': 1}},
 {'embedder': 'Salesforce/codet5p-110m-embedding',
  'classifier': 'LogReg',
  'f1_test': 0.8384458077709611,
  'roc_auc_test': 0.8408718439670279,
  'best_params': {'C': 1}},
 {'embedder': 'Salesforce/codet5p-110m-embedding',
  'classifier': 'SVC',
  'f1_test': 0.9067796610169493,
  'roc_auc_test': 0.9113308885571493,
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'embedder': 'Salesforce/codet5p-110m-embedding',
  'classifier': 'RandomForest',
  'f1_test': 0.8953367875647669,
  'roc_auc_test': 0.8982605309275006,
  'best_params': {'bootstrap': False, 'max_depth': 10, 'n_estimators': 20}},
 {'embedder': 'Salesforce/codet5p-110m-embedding',
  'classifier': 'CatBoost',
  'f1_test': 0.9151138716356108,
  'roc_auc_test': 0.9173955020445251,
  'best_params': {'depth': 6, 'l2_leaf_reg': 5}},
 {'embedder': 'intfloat/multilingual-e5-large',
  'classifier': 'LogReg',
  'f1_test': 0.8667366211962224,
  'roc_auc_test': 0.872065051599922,
  'best_params': {'C': 1}},
 {'embedder': 'intfloat/multilingual-e5-large',
  'classifier': 'SVC',
  'f1_test': 0.8816503800217155,
  'roc_auc_test': 0.8901595054196145,
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'embedder': 'intfloat/multilingual-e5-large',
  'classifier': 'RandomForest',
  'f1_test': 0.8976545842217483,
  'roc_auc_test': 0.9032684007269423,
  'best_params': {'bootstrap': False, 'max_depth': 10, 'n_estimators': 20}},
 {'embedder': 'intfloat/multilingual-e5-large',
  'classifier': 'CatBoost',
  'f1_test': 0.9327731092436974,
  'roc_auc_test': 0.9355082105536443,
  'best_params': {'depth': 6, 'l2_leaf_reg': 1}},
 {'embedder': 'microsoft/codebert-base',  'classifier': 'LogReg',
  'f1_test': 0.9336057201225741,
  'roc_auc_test': 0.9345285422210684,
  'best_params': {'C': 1}},
 {'embedder': 'microsoft/codebert-base',
  'classifier': 'SVC',
  'f1_test': 0.821656050955414,
  'roc_auc_test': 0.83076483092101,
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'embedder': 'microsoft/codebert-base',
  'classifier': 'RandomForest',
  'f1_test': 0.8981288981288982,
  'roc_auc_test': 0.901278639579412,
  'best_params': {'bootstrap': False, 'max_depth': 10, 'n_estimators': 20}},
 {'embedder': 'microsoft/codebert-base',
  'classifier': 'CatBoost',
  'f1_test': 0.9272349272349273,
  'roc_auc_test': 0.9294760498474718,
  'best_params': {'depth': 6, 'l2_leaf_reg': 1}},
 {'embedder': 'microsoft/graphcodebert-base',
  'classifier': 'LogReg',
  'f1_test': 0.9498464687819856,
  'roc_auc_test': 0.9506393197897061,
  'best_params': {'C': 1}},
 {'embedder': 'microsoft/graphcodebert-base',
  'classifier': 'SVC',
  'f1_test': 0.9363449691991786,
  'roc_auc_test': 0.9375446225741546,
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'embedder': 'microsoft/graphcodebert-base',
  'classifier': 'RandomForest',
  'f1_test': 0.9179646936656283,
  'roc_auc_test': 0.9204136106964367,
  'best_params': {'bootstrap': False, 'max_depth': 10, 'n_estimators': 20}},
 {'embedder': 'microsoft/graphcodebert-base',
  'classifier': 'CatBoost',
  'f1_test': 0.9507186858316221,
  'roc_auc_test': 0.9516433277081847,
  'best_params': {'depth': 6, 'l2_leaf_reg': 1}},
{'embedder': 'jinaai/jina-embeddings-v2-base-code',
  'classifier': 'LogReg',
  'f1_test': 0.9195402298850575,
  'roc_auc_test': 0.922421626533394,
  'best_params': {'C': 1}},
 {'embedder': 'jinaai/jina-embeddings-v2-base-code',
  'classifier': 'SVC',
  'f1_test': 0.913232104121475,
  'roc_auc_test': 0.9193649802038035,
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'embedder': 'jinaai/jina-embeddings-v2-base-code',
  'classifier': 'RandomForest',
  'f1_test': 0.8955223880597015,
  'roc_auc_test': 0.9012542999935095,
  'best_params': {'bootstrap': False, 'max_depth': 10, 'n_estimators': 20}},
 {'embedder': 'jinaai/jina-embeddings-v2-base-code',
  'classifier': 'CatBoost',
  'f1_test': 0.9007470651013874,
  'roc_auc_test': 0.9062885376776789,
  'best_params': {'depth': 6}}
]

# Use the snapshot only when no results were computed in this session
# (`results` is undefined or empty); otherwise keep the fresh ones.
if not globals().get('results'):
    results = list(LOGGED_RESULTS)

In [None]:
def plot_table(table, table_cols, table_index, title):
    """Draw ``table`` as an annotated green heatmap and show the figure.

    Args:
        table: 2-D table of values; the color scale is fixed to the range 0..1.
        table_cols: labels for the x axis.
        table_index: labels for the y axis.
        title: figure title.
    """
    # Cell annotations: the values rounded to two decimals, as strings.
    cell_labels = np.round(table, 2).astype(str)

    heatmap = go.Heatmap(
        z=table,
        x=table_cols,
        y=table_index,
        colorscale='Greens',
        zmin=0,
        zmax=1,
        text=cell_labels,
        hovertemplate='%{y}, %{x}: %{text:.2f}<extra></extra>',
        texttemplate="%{text:.2f}",
        textfont=dict(size=12),
    )

    figure = go.Figure(data=heatmap)
    figure.update_layout(title=title, xaxis_nticks=36)
    figure.show()

In [None]:
# Build the tables: one row per embedder, one column per classifier
results_df = pd.DataFrame(results)
pivot_axes = dict(index='embedder', columns='classifier')
f1_table = results_df.pivot(values='f1_test', **pivot_axes)
roc_auc_table = results_df.pivot(values='roc_auc_test', **pivot_axes)

# Show the tables
for metric_table, heading in ((f1_table, 'F1 Score Table'),
                              (roc_auc_table, 'ROC-AUC Table')):
    plot_table(metric_table, metric_table.columns, metric_table.index, heading)