In [1]:
import pandas as pd
import os
# Restrict CUDA to the device with index 1, with devices enumerated in
# PCI bus order. Set here, ahead of the torch import in a later cell.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [6]:
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                              RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForCausalLM, AutoTokenizer, RobertaForMaskedLM)
import torch

In [2]:
# Load the solutions dump and reduce it to two columns:
#   request - `code_with_problem` cut at its first blank line ('\n\n')
#   code    - `code_only`, unchanged
df = (
    pd.read_json('/mnt/opt/alexw/Experements/SolveITnow/leetcode-solutions.json')
    .assign(
        code_with_problem=lambda frame: frame['code_with_problem'].apply(
            lambda text: text.split('\n\n')[0]
        )
    )
    .loc[:, ['code_with_problem', 'code_only']]
    .rename(columns={'code_with_problem': 'request', 'code_only': 'code'})
)

In [3]:
# Remove every literal '#' from the request text. `regex=False` is spelled
# out because the default of `Series.str.replace` differs between pandas
# versions; being explicit keeps this a plain substring replacement everywhere.
df['request'] = df['request'].str.replace('#', '', regex=False)

In [4]:
import torch
import torch.nn as nn
class Model(nn.Module):
    """Sentence-style embedder built on top of a transformer encoder.

    The encoder's first output (per-token hidden states) is averaged over
    the non-padding positions and the result is L2-normalised, giving one
    unit-length vector per input row.

    Args:
        encoder: module called as ``encoder(input_ids, attention_mask=...)``
            whose first returned element is a ``(batch, seq, dim)`` tensor.
        pad_token_id: token id treated as padding when building the
            attention mask and the pooling mask. Defaults to 1, the value
            that was previously hard-coded.
    """

    def __init__(self, encoder, pad_token_id=1):
        super(Model, self).__init__()
        self.encoder = encoder
        self.pad_token_id = pad_token_id

    def forward(self, code_inputs=None, nl_inputs=None):
        """Embed a batch of token-id rows.

        Args:
            code_inputs: ``(batch, seq)`` integer tensor of code token ids.
                Takes priority when both arguments are given.
            nl_inputs: ``(batch, seq)`` integer tensor of query token ids,
                used only when ``code_inputs`` is None.

        Returns:
            ``(batch, dim)`` float tensor of L2-normalised embeddings.

        Raises:
            ValueError: if both ``code_inputs`` and ``nl_inputs`` are None.
        """
        # Both modalities go through exactly the same pipeline.
        inputs = code_inputs if code_inputs is not None else nl_inputs
        if inputs is None:
            raise ValueError("either code_inputs or nl_inputs must be provided")

        mask = inputs.ne(self.pad_token_id)
        hidden = self.encoder(inputs, attention_mask=mask)[0]

        # Masked mean: zero out padding positions, then divide by the
        # number of real tokens in each row.
        summed = (hidden * mask[:, :, None]).sum(1)
        # clamp keeps a row made only of padding from producing 0/0 = NaN;
        # it is a no-op whenever at least one real token is present.
        counts = mask.sum(-1)[:, None].clamp(min=1)
        return torch.nn.functional.normalize(summed / counts, p=2, dim=1)

In [7]:
# Fine-tuned weights (local file) and the base checkpoint they were
# trained from.
ft_model = "unixcoder_final.bin"
model_name_or_path = "microsoft/unixcoder-base"
print("tokenizer")
tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path)
print("config")
config = RobertaConfig.from_pretrained(model_name_or_path)
print("model")
model = RobertaModel.from_pretrained(model_name_or_path)

# Wrap the bare encoder in the pooling head, then overwrite its weights
# with the fine-tuned state dict.
model = Model(model)
# Load onto the CPU first: the model is still on the CPU at this point, so
# loading straight to the GPU would only create a second, temporary copy.
# NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
checkpoint = torch.load(ft_model, map_location='cpu')
model.load_state_dict(checkpoint)
# The notebook only runs inference, so switch to eval mode explicitly
# (disables dropout) instead of relying on how the encoder was created.
model = model.to("cuda").eval()

tokenizer
config
model


In [8]:
def convert_examples_to_features(nl, code, code_length=256, nl_length = 128):
    """Turn a (query, code) pair into two fixed-length lists of token ids.

    Each text is tokenized with the notebook-level `tokenizer`, truncated
    so that four special tokens still fit, wrapped as
    ``cls, "<encoder-only>", sep, ..., sep`` and right-padded with the
    pad token id up to the requested length.

    Args:
        nl: natural-language query text.
        code: source-code text.
        code_length: length of the returned code id list.
        nl_length: length of the returned query id list.

    Returns:
        Tuple ``(code_ids, nl_ids)``.
    """
    def _encode(text, max_length):
        # Leave room for the 3 leading and 1 trailing special tokens.
        pieces = tokenizer.tokenize(text)[:max_length - 4]
        wrapped = [
            tokenizer.cls_token,
            "<encoder-only>",
            tokenizer.sep_token,
            *pieces,
            tokenizer.sep_token,
        ]
        ids = tokenizer.convert_tokens_to_ids(wrapped)
        ids.extend([tokenizer.pad_token_id] * (max_length - len(ids)))
        return ids

    code_ids = _encode(code, code_length)
    nl_ids = _encode(nl, nl_length)
    return code_ids, nl_ids

In [9]:
def convert_code_to_features(code, code_length=256):
    """Encode a code snippet as a list of `code_length` token ids.

    The snippet is tokenized with the notebook-level `tokenizer`, cut to
    ``code_length - 4`` tokens, framed as
    ``cls, "<encoder-only>", sep, ..., sep`` and right-padded with the
    pad token id.
    """
    body = tokenizer.tokenize(code)[:code_length - 4]
    framed = [
        tokenizer.cls_token,
        "<encoder-only>",
        tokenizer.sep_token,
        *body,
        tokenizer.sep_token,
    ]
    ids = tokenizer.convert_tokens_to_ids(framed)
    missing = code_length - len(ids)
    ids.extend([tokenizer.pad_token_id] * missing)
    return ids

def convert_nl_to_features(nl, nl_length = 128):
    """Encode a natural-language query as a list of `nl_length` token ids.

    Same layout as the code encoder: at most ``nl_length - 4`` content
    tokens between a ``cls, "<encoder-only>", sep`` header and a closing
    ``sep``, followed by pad token ids up to `nl_length`.
    """
    content = tokenizer.tokenize(nl)[:nl_length - 4]
    header = [tokenizer.cls_token, "<encoder-only>", tokenizer.sep_token]
    footer = [tokenizer.sep_token]
    nl_ids = tokenizer.convert_tokens_to_ids(header + content + footer)
    pad_count = nl_length - len(nl_ids)
    return nl_ids + [tokenizer.pad_token_id] * pad_count

In [10]:
def generate_and_tokenize_prompt(data_point):
    """Build the token-id features for one record.

    Args:
        data_point: mapping with a "code" entry and a "request" entry.

    Returns:
        Dict with keys "code_ids" and "nl_ids", as produced by
        `convert_examples_to_features`.
    """
    snippet, query = data_point["code"], data_point["request"]
    features = convert_examples_to_features(query, snippet)
    return dict(zip(("code_ids", "nl_ids"), features))

In [11]:
def code_ids_to_emmb(code_ids, device=None):
    """Embed one tokenized code snippet with the notebook-level `model`.

    Args:
        code_ids: list of token ids for a single snippet.
        device: device on which to build the input tensor. When None (the
            default) the device of the model's parameters is used, so the
            function does not depend on a global `device` variable being
            defined in some other cell.

    Returns:
        NumPy array with a single row holding the snippet's embedding.
    """
    if device is None:
        device = next(model.parameters()).device
    # Wrap in a list to add the batch dimension the model expects.
    code_inputs = torch.tensor([code_ids]).to(device)
    with torch.no_grad():
        code_vec = model(code_inputs=code_inputs)
    return code_vec.cpu().numpy()

In [13]:
# Tokenize every snippet once and keep the padded id lists next to the source.
df['code_ids'] = df["code"].map(convert_code_to_features)

In [14]:
# Device used when building input tensors for the model.
device = 'cuda'
# One forward pass per row; each cell of `code_emmb` holds that row's
# embedding array.
df['code_emmb'] = df["code_ids"].map(code_ids_to_emmb)

In [17]:
# Example query used to exercise the retrieval below.
nl = (
    "Given an array of integers `nums` and an integer `target`, "
    "return _indices of the two numbers such that they add up to `target`_."
)

In [19]:
def get_predict(nl, code_vecs, cont_return=7, device='cuda'):
    """Rank stored code embeddings against a natural-language query.

    Args:
        nl: query text; tokenized with `convert_nl_to_features` and
            embedded with the notebook-level `model`.
        code_vecs: sequence of 2-D arrays that are stacked along axis 0
            into the matrix of candidate embeddings.
        cont_return: number of top-ranked positions to return.
        device: device on which the query tensor is placed.

    Returns:
        1-D integer array with the positions (rows of the stacked
        matrix) of the `cont_return` highest dot-product scores, best first.
    """
    query_ids = convert_nl_to_features(nl)
    query_batch = torch.tensor([query_ids]).to(device)
    with torch.no_grad():
        query_matrix = model(nl_inputs=query_batch).cpu().numpy()

    code_matrix = np.concatenate(code_vecs, 0)
    similarity = np.matmul(query_matrix, code_matrix.T)
    # argsort is ascending; reverse each row to get best-first order.
    ranking = np.argsort(similarity, axis=-1, kind='quicksort', order=None)[:, ::-1]
    return ranking[0][:cont_return]

In [20]:
import numpy as np
# Positions of the best-matching snippets for the example query.
sort_ids = get_predict(nl, df['code_emmb'].tolist())

In [21]:
# `sort_ids` holds row positions (argsort over the stacked embeddings), not
# index labels, so select positionally with .iloc. This gives the same
# result on a default 0..n-1 index and stays correct on any other index.
print(df['code'].iloc[sort_ids[0]])

```python
def twoSum(nums, target):
    map = {}
    for i, num in enumerate(nums):
        complement = target - num
        if complement in map:
            return [map[complement], i]
        map[num] = i
    return []
```


