# Упражнение 6

In [1]:
import json
import pandas as pd
import numpy as np

import random
from random import choices

import torch
from transformers import AutoTokenizer, AutoModel

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

Используемый датасет для генерации сниппетов кода: [POJ-104](https://arxiv.org/pdf/1409.5718)\
[(ссылка на репозиторий с датасетом и скриптом для генерации)](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Clone-detection-POJ-104)

In [2]:
with open('train.jsonl') as f:
    dataset = [json.loads(line) for line in f.readlines()]

Экземпляры сгенерированного датасета имеют следующий вид:
- index: идентификатор
- label: тип задачи
- code: сниппет кода

In [3]:
df = pd.DataFrame(dataset).set_index('index')
df

Unnamed: 0_level_0,label,code
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,"\nint f(int a,int x)\n{\n\tint count=1,i;\n\tf..."
1,1,"int sum=0;\nvoid f(int n,int i)\n{\n if(n==..."
2,1,"int t, a, flag, a1;\n/*bool prime(int k)\n{\n\..."
3,1,"void qut(int a,int b); ..."
4,1,"int reek(int i,int j)\n{\n\tint k,g=0;\n\tfor ..."
...,...,...
31995,64,struct point\n{\nint x;\nint y;\nint z;\n}; ...
31996,64,struct distance\n{\n\tint point1;\n\tint point...
31997,64,struct point\n{\n\tint x[2];\n\tint y[2];\n\ti...
31998,64,"int main()\n{\n\tint n,a[10],b[10],c[10],i,j,k..."


Исследуемая модель — CodeBERT [(предобученная реализация)](https://github.com/microsoft/CodeBERT)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
model = AutoModel.from_pretrained('microsoft/codebert-base')

Поскольку возможная длина последовательности, обрабатываемой этой моделью, ограничена сверху 512 токенами, нужно удалить слишком большие сниппеты и перебалансировать классы задач. Чтобы не удалить слишком много классов и при этом оставить достаточно сниппетов для работы модели, ограничимся 50 сниппетами на класс.

In [5]:
df = df[df.apply(lambda item: len(tokenizer.tokenize(item["code"])) <= 512, axis = 1)]
df["samples"] = df.code.apply(lambda item: [item])
df = df.groupby("label").agg({"samples": "sum"})
df = df[df.apply(lambda item: len(item["samples"]) >= 50, axis = 1)]
df["samples"] = df.samples.apply(lambda item: item[:50])
df

Token indices sequence length is longer than the specified maximum sequence length for this model (763 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0_level_0,samples
label,Unnamed: 1_level_1
1,"[\nint f(int a,int x)\n{\n\tint count=1,i;\n\t..."
10,"[int main()\n{\nint n;\nscanf(""%d"",&n);\nint a..."
11,"[int main()\n{\n\tint y,m,d,t=0;\n\tcin>>y>>m>..."
12,"[\n\nint main()\n{\n int a[20],i,j,n,court;..."
13,"[int main ()\n{\n\t int a,b,j,i,leap=0,f=1;\..."
...,...
62,[/*\n * longwords.cpp\n *\n * Created on: 201...
63,"[int main()\n{\nint k,c[101][101],i,j,a[101][1..."
7,"[int main()\n{\n char a[256],b[256],c[256];..."
8,"[void f0(int a[],int x)\n{\n\tint i=0;\n fo..."


Получим эмбеддинги и агрегируем их функцией среднего для уменьшения размерности.

In [6]:
def create_embeddings(snippets):
    for snippet in snippets:
        tokenized_code = tokenizer.tokenize(snippet)
        if len(tokenized_code) < 512:
            tokenized_code += ["<pad>"] * (512 - len(tokenized_code))
        tokens = [tokenizer.cls_token] + tokenized_code + [tokenizer.sep_token]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        
        try:
            embedding = model(torch.tensor(ids)[None,:])[0]
            yield embedding.data.numpy()
        except IndexError as err:
            pass

dfs = []
for i in range(len(df)):
    label = int(df.iloc[i].name)
    embeddings = create_embeddings(df.iloc[i].samples)
    vectors = [embedding.mean(axis=1).flatten().tolist() for embedding in embeddings]
    new_df = pd.DataFrame(data=vectors)
    new_df["label"] = label
    dfs.append(new_df)
df = pd.concat(dfs).sample(frac = 1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
16,-0.046608,-0.101427,0.165237,-0.097304,-0.841153,-0.339591,-0.017945,-0.050901,0.370469,0.267817,...,-0.018993,-0.060022,0.562284,-0.289058,0.176323,0.712105,-0.810440,-0.305035,0.363829,8
27,-0.224591,0.029329,0.246461,0.115313,-0.556875,-0.184173,-0.049726,0.130996,0.261115,0.259249,...,-0.081576,-0.294464,0.419384,-0.302097,0.252418,0.577733,-0.882185,-0.352438,0.463739,63
28,-0.039265,-0.102487,0.237857,0.005223,-0.772260,-0.378563,0.065659,-0.038790,0.292994,0.350067,...,-0.015127,0.076834,0.703605,-0.164754,0.024786,0.784288,-0.736391,-0.178759,0.327727,47
18,-0.323927,-0.033853,0.265369,0.308005,-0.466328,-0.250715,-0.036021,0.184990,0.317297,0.218505,...,-0.118461,-0.387041,0.430966,-0.255063,0.225331,0.552029,-0.935489,-0.367940,0.521883,15
13,0.039580,-0.044730,0.257957,-0.013514,-0.910213,-0.319187,0.025502,-0.065720,0.296454,0.362432,...,0.024862,0.130787,0.616944,-0.189569,0.015012,0.717648,-0.792348,-0.135635,0.340567,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,-0.148676,-0.115005,0.221281,0.030652,-0.846053,-0.411768,-0.020923,-0.063999,0.295364,0.401259,...,-0.001972,0.083097,0.865308,-0.119964,0.011703,0.621355,-0.744609,-0.216432,0.333408,61
36,-0.115154,-0.026686,0.205577,-0.013512,-0.920153,-0.364002,-0.006557,-0.034339,0.380583,0.301870,...,-0.067945,-0.139569,0.576302,-0.267897,0.135453,0.878846,-0.746965,-0.307765,0.397136,36
6,-0.247766,-0.074462,0.233341,0.346934,-0.476585,-0.249513,0.070066,0.177445,0.294835,0.198040,...,-0.065032,-0.274442,0.419498,-0.258119,0.245928,0.697203,-0.789247,-0.361545,0.511446,9
0,-0.381327,0.073453,0.249348,0.187019,-0.315114,-0.175771,-0.065317,0.199142,0.363379,0.269300,...,-0.166479,-0.458557,0.519246,-0.260608,0.244586,0.460396,-0.948124,-0.424811,0.658145,59


Обучим классификатор на полученных данных и проверим его работу на них.

In [7]:
df = pd.concat(dfs).sample(frac = 1)
X = df.drop(["label"], axis = 1).values
y = df["label"].values

clf = KNeighborsClassifier(n_neighbors=5, metric="cosine")
clf = clf.fit(X, y)
y_pred = clf.predict(X)

accuracy_score(y, y_pred)

0.4995147201552895

Уменьшим число классов и проверим, как это повлияет на работу классификатора.

In [8]:
third = len(df.label.unique()) // 3
df_third = df[df.label.isin(df.label.unique()[:third])]
X = df_third.drop(["label"], axis = 1).values
y = df_third["label"].values

clf = clf.fit(X, y)
y_pred = clf.predict(X)

accuracy_score(y, y_pred)

0.6341708542713568

Ограничим количество классов небольшой константой, например, 3:

In [9]:
df_three = df[df.label.isin(df.label.unique()[:3])]
X = df_three.drop(["label"], axis = 1).values
y = df_three["label"].values

clf = clf.fit(X, y)
y_pred = clf.predict(X)

accuracy_score(y, y_pred)

0.918918918918919

Проверим множество разных комбинаций троек классов на классификаторе:

In [10]:
scores = []
for i in range(1000):
    df_three = df[df.label.isin(random.choices(df.label.unique(), k=3))]
    X = df_three.drop(["label"], axis = 1).values
    y = df_three["label"].values

    clf = clf.fit(X, y)
    y_pred = clf.predict(X)

    scores.append(accuracy_score(y, y_pred))

pd.Series(scores).describe()

count    1000.000000
mean        0.840286
std         0.067278
min         0.646667
25%         0.793333
50%         0.846667
75%         0.891892
max         1.000000
dtype: float64

Таким образом, модель CodeBERT может использоваться для задачи clone detection при малом количестве возможных классов.