In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import pandas as pd
import json
import numpy as np
from sklearn import preprocessing
from torch import nn
from tqdm.notebook import tqdm

import ast

In [3]:
BATCH_SIZE = 2
TRAIN_TEST_SPLIT = 0.9
DS_PATH = "/home/deadman445/PycharmProjects/CuArgPred/data/_all_data2.csv"
EPOCHS = 3
FREQ_LIMIT = 600
FREQ_CUT_SYMBOL = "<UNK>"
NaN_symbol = ''

In [4]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
bert = AutoModel.from_pretrained("microsoft/codebert-base", torchscript=True)

In [5]:
def la(data_batch_i):
  r = []
  for i in data_batch_i:
        if not ((i == NaN_enc[0] or i==FREQ_CUT_ENC[0]) and len(data_batch_i)==1):
            r.append(i)
  if len(r) == 0:
    return pd.NA
  return r


data = pd.read_csv(DS_PATH)
data['arg_types'] = data['arg_types'].apply(eval)
data = data[data.arg_types.astype(bool)]
df_labels = pd.DataFrame(data['arg_types'].values.tolist())

df_labels[pd.isnull(df_labels)]  = NaN_symbol
df_labels = df_labels.apply(lambda x: x.mask(x.map(x.value_counts())<FREQ_LIMIT, FREQ_CUT_SYMBOL))
enc = preprocessing.LabelEncoder()
all_types = df_labels.apply(pd.Series).stack().values
enc.fit(all_types)
np.save('classes.npy', enc.classes_)
FREQ_CUT_ENC = enc.transform([FREQ_CUT_SYMBOL])
NaN_enc = enc.transform([NaN_symbol])
print(enc.inverse_transform(NaN_enc), enc.inverse_transform(FREQ_CUT_ENC))
print(f'Enc for "NaN" {NaN_enc}, Enc for FREQ_CUT_SYMBOL {FREQ_CUT_ENC}')
df3 = df_labels.apply(enc.transform)
data['labels'] = df3.values.tolist()

data['labels'] = data['labels'].apply(la)
data = data.dropna(subset=['labels'], axis=0)



def train_test_by_repo(data, split=0.75):
    train_l = []
    test_l = []
    c = 0
    train_len = split * len(data)
    for name, i in data.groupby(['repo']).count().sample(frac=1).iterrows():
        if train_len > c:
            train_l.append(name)
            c += i['author']
        else:
            test_l.append(name)
    return data.loc[data['repo'].isin(train_l)], data.loc[data['repo'].isin(test_l)]



train_ds, test_ds = train_test_by_repo(data, TRAIN_TEST_SPLIT)


len(enc.classes_)

[''] ['<UNK>']
Enc for "NaN" [0], Enc for FREQ_CUT_SYMBOL [1]


21

In [5]:
with open("types.txt", 'w') as f:
    for i in enc.classes_:
        f.write(i)
        f.write('\n')

In [30]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
# device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: GeForce RTX 2060 SUPER


In [31]:
bert.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [9]:
def process_elem(data_batch_i):
    sentence_line =  tokenizer(data_batch_i['body'], return_tensors='pt', padding='max_length', truncation=True)
    sentence_line1 = tokenizer(data_batch_i['body'], padding='max_length', truncation=True,  return_offsets_mapping=True, return_length=True)
    args = get_names(data_batch_i['body'])
    labels = dict(zip([i[0] for i in args], data_batch_i['labels']))
    args = offset2ind(args, sentence_line1)
    ids = torch.zeros_like(sentence_line['input_ids'])
    for i in args:
        ids[0][i[1]]=labels.get(i[0], NaN_enc[0])
    return sentence_line, ids

def offset2ind(args, tokens):
    def find(tok, lis):
        r = []
        for i in lis:
            if i[0]>=tok[1][0] and i[1]<=tok[1][1]:
                r.append(i)
                break
        b = [lis.index(i) for i in r]
        return b
    return [(i[0], find(i,tokens['offset_mapping'])) for i in args]


def get_names(src):
    ret = []
    line_lengths = [len(i) for i in src.split('\n')]
    line_lengths = [0] + line_lengths
    for i in range(1,len(line_lengths)):
        line_lengths[i] += line_lengths[i-1]+1
    
    try:
        for node in ast.walk(ast.parse(src)):
            if isinstance(node, ast.arg):
                ret.append((node.arg,(line_lengths[node.lineno-1]+node.col_offset, line_lengths[node.lineno-1]+node.end_col_offset)))
        return ret
    except:
        print("Could Not process the code")
        return ret
    
process_elem(data.iloc[75437])

({'input_ids': tensor([[    0,  9232, 17344,  1215, 12247,  1215, 49247,  1640, 43968,     6,
           3438,  1215, 48211,     6,  3438,  1215, 48317,     6, 50118,  1437,
           1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,
           1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,
           1437,  1437,  1437,  1437, 49503,     6,   449,   605, 48204,     6,
          26437,     6, 50118,  1437,  1437,  1437,  1437,  1437,  1437,  1437,
           1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437,
           1437,  1437,  1437,  1437,  1437,  1437,  1437,  1437, 14084,     6,
          21021,     6,  1579,  3256, 50118,  1437,  1437,  1437,   849, 14943,
           1579,  5448,    15, 13954, 50118,  1437,  1437,  1437,   849,   407,
            960,    71,    14,    40,    28,  4758,  3804, 50118,  1437,  1437,
           1437,   849,   178,    67,  2928, 50118,  1437,  1437,  1437, 26225,
           1215, 42441,  5

In [33]:
class JITDataDataset(Dataset):

    def __init__(self, df):
        self.data = df

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        data_batch = self.data.iloc[idx, :]
        full_sentence, ids = process_elem(data_batch)
        return (full_sentence['input_ids'].squeeze().to(device),
                full_sentence['attention_mask'].squeeze().to(device),
                (ids > 0).squeeze().to(device),
                ids.squeeze().to(device))

In [42]:
train = DataLoader(JITDataDataset(train_ds), batch_size=64,
                        shuffle=True, num_workers=0)

In [22]:
# class JITModel(torch.nn.Module):
#     def __init__(self, bert, out_dim):
#         super().__init__()
#         self.out_dim = out_dim
#         self.bert = bert
#         self.dense = nn.Linear(768, out_dim)

#     def forward(self, a,b):
        
#         emb = self.bert(a, attention_mask=b)[0]
#         out = self.dense(emb)
#         return out


# model = JITModel(bert, len(enc.classes_))
# model.to(device)
# print()

# a = next(iter(train))
# traced_model = torch.jit.trace(model, [a[0],a[1]])
# torch.jit.save(traced_model, "eeee.pt")

#   ^
#   |
#   |
# Код сверху - пидор




In [44]:
class JITModel(torch.nn.Module):
    def __init__(self, bert, out_dim, a):
        super().__init__()
        self.out_dim = out_dim
        self.bert = bert
        self.bert = torch.jit.trace(self.bert, [a[0], a[1]])
        self.dense = nn.Linear(768, out_dim)

    def forward(self, a,b,c,d):
        
        emb = self.bert(a, attention_mask=b)[0]
        out = self.dense(emb)
        mask = c.unsqueeze(-1).expand(out.size())
        masked = torch.masked_select(out, mask).reshape(len(torch.masked_select(d, c)),self.out_dim)
        return masked

a = next(iter(train))

model = JITModel(bert, len(enc.classes_),a)
model.to(device)
print()


# traced_model = torch.jit.script(model)
# torch.jit.save(traced_model, "eeee.pt")




In [40]:
next(model.parameters()).device

device(type='cuda', index=0)

In [38]:
for param in model.bert.parameters():
    param.requires_grad = False

In [45]:
opti = torch.optim.Adam(model.parameters(), lr = 2e-5)
pbar = tqdm(total=len(train))
losses = []
accuracy = []
for i,a in enumerate(train):
    out = model.forward(a[0], a[1], a[2], a[3])
    
#     mask = a[2].unsqueeze(-1).expand(out.size())
#     masked = torch.masked_select(out, mask).reshape(len(torch.masked_select(a[3], a[2])),len(enc.classes_))
    
    
    labels = torch.masked_select(a[3], a[2])
    loss = F.cross_entropy(out, labels)
    loss.backward()

    if torch.isnan(loss):
#         print(a)
        pass
    else:
        accuracy.append(sum(torch.argmax(F.softmax(out), dim=1) == labels).detach()/len(labels))
        losses.append(loss.detach())
    opti.step()
    if i % 1 ==0:
        pbar.set_description(f"Loss : { sum(losses)/len(losses)}, acc: {sum(accuracy)/len(accuracy)}")
    pbar.update(1)
pbar.close()

  0%|          | 0/1616 [00:00<?, ?it/s]

  accuracy.append(sum(torch.argmax(F.softmax(out), dim=1) == labels).detach()/len(labels))


KeyboardInterrupt: 

In [None]:
pr_av = lambda x : sum(x)/len(x)

In [None]:
test = DataLoader(JITDataDataset(test_ds), batch_size=1, num_workers=0)

In [None]:
pbar = tqdm(total=len(train))
test_top_5s = []
test_accuracy = []
test_losses = []

for i,a in enumerate(train):
    out = model.forward(a[0], a[1], a[2], a[3])
    labels = torch.masked_select(a[3], a[2])
#     loss = F.cross_entropy(out, labels)

    test_accuracy.append(sum(torch.argmax(F.softmax(out), dim=1) == labels).detach()/len(labels))
    test_losses.append(loss.detach())
    top5s = torch.topk(out, 5).indices
    correct_top5 = 0
    for i in range(len(labels)):
        if labels[i] in top5s[i]:
            correct_top5 += 1
    test_top_5s.append(correct_top5/len(labels))
    
    if i % 20 ==0:
        pbar.set_description(f"Loss : { pr_av(test_losses)}, acc: {pr_av(test_accuracy)}, top5s: {pr_av(test_top_5s)}")
    pbar.update(1)
pbar.close()