In [None]:
import numpy as np
import os
import pandas as pd
import random
import re
import torch
import warnings
from tqdm.auto import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Silence all warnings for the rest of the notebook.
# NOTE(review): a blanket 'ignore' also hides deprecation notices from the
# libraries used below; consider a narrower filter.
warnings.filterwarnings('ignore')

# Settings shared by the cells below.
BATCH_SIZE = 16  # batch size handed to the test DataLoader
MAX_LEN = 512  # max_length handed to the tokenizer (padding/truncation target)
SEED = 42  # seed handed to seed_everything

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only takes effect in interpreters started after this point (e.g. worker
    # processes); the running kernel's hash seed was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one; silently ignored when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Benchmark mode autotunes convolution kernels and may select a different
    # algorithm on each run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False  # type: ignore
# Seed all RNGs once, before the model and data cells below run.
seed_everything(SEED)

In [None]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Read the test pairs and discard the identifier column in one step; the
# remaining columns are what the dataset class unpacks row by row.
test_df = pd.read_csv('./data/test.csv').drop(columns=['pair_id'])

In [None]:
def remove_annotation(x):
    """Strip comments and disabled code from a C/C++ source string.

    Removes, in order: lines ending in a backslash continuation, ``//`` line
    comments and ``/* ... */`` block comments, and ``#if 0 ... #endif``
    regions. A trailing newline is always appended to the result.

    NOTE(review): block comments and ``#if 0`` regions that span several lines
    are now removed (previously only single-line ones were, and two block
    comments on one line swallowed the code between them). If the checkpoint
    was trained with the old cleaning, keep the training-side copy in sync.

    Args:
        x: Source code as a string.

    Returns:
        The source with comments and disabled regions removed.
    """
    # Guarantee the final line is newline-terminated for the patterns below.
    x = x + '\n'
    # Drop lines that end in a backslash continuation.
    x = re.sub(r'\n.*\\\n', '\n', x)
    # One left-to-right pass over both comment styles, so a `//` inside a
    # block comment (or a `/*` inside a line comment) cannot corrupt the other.
    # Non-greedy + DOTALL: each block comment ends at its own first `*/`,
    # even when it spans several lines.
    x = re.sub(r'//[^\n]*|/\*.*?\*/', '', x, flags=re.DOTALL)
    # Disabled regions are almost always multi-line, hence DOTALL here as well.
    x = re.sub(r'#if 0.*?#endif', '', x, flags=re.DOTALL)
    return x

In [None]:
def remove_links(x):
    """Replace every http/https URL in ``x`` with a single space.

    NOTE(review): the previous patterns (``https*\\S+`` / ``http*\\S+``) applied
    the quantifier to a single letter and therefore wiped any token containing
    ``htt...`` (for example an identifier such as ``httpclient``). Only real
    URLs are removed now; keep the training-side cleaning in sync.

    Args:
        x: Input text.

    Returns:
        The text with URLs replaced by a space.
    """
    # `s?` makes the scheme's "s" optional; `://` anchors the match to an
    # actual URL instead of any word that merely starts with "http".
    return re.sub(r'https?://\S+', ' ', x)

In [None]:
def standardize_sign(x):
    """Remove the single spaces that surround operators.

    Runs of spaces are collapsed first; then, for each operator in turn, an
    optional space on either side is dropped. ``!`` only loses a space in
    front of it.

    NOTE(review): the ``<`` rule used to substitute ``=`` (so ``a < b`` became
    ``a=b``); it now keeps ``<``. If the checkpoint was trained with the old
    behaviour, keep the training-side cleaning in sync.

    Args:
        x: Source text.

    Returns:
        The text with operator spacing normalised.
    """
    x = re.sub(r' +', ' ', x)
    # Same order as before; ` ?op ?` is equivalent to ` op | op|op `.
    for op in ('+', '-', '*', '/', '%', '=', '>', '<'):
        x = re.sub(' ?' + re.escape(op) + ' ?', op, x)
    # Unary not: only the space before it is removed.
    x = re.sub(r' !', '!', x)
    for op in ('&', '|', ':', '?'):
        x = re.sub(' ?' + re.escape(op) + ' ?', op, x)
    return x

In [None]:
def remove_std(x):
    """Drop ``std::`` qualifiers, then any ``using namespace std;`` statement.

    Args:
        x: Source text.

    Returns:
        The text with both patterns deleted.
    """
    for pattern in (r'std::', r'using namespace std;'):
        x = re.sub(pattern, '', x)
    return x

In [None]:
def remove_include(x):
    """Delete ``#include`` directives up to the last ``>`` on the same line.

    Args:
        x: Source text.

    Returns:
        The text with the matched directives removed; includes without a
        closing ``>`` on their line are left untouched.
    """
    include_directive = re.compile(r'#include.*>')
    return include_directive.sub('', x)

In [None]:
def text_clean(x):
    """Normalise one source snippet before tokenization.

    Lowercases the text, drops non-ASCII characters, applies the cleaning
    helpers in a fixed order, and finally flattens newlines/tabs and runs of
    spaces into single spaces.

    Args:
        x: Raw source code string.

    Returns:
        The cleaned, single-line string.
    """
    # Lowercase, then discard anything that cannot be encoded as ASCII.
    cleaned = x.lower().encode('ascii', 'ignore').decode()

    # Order matters: comment removal needs the newlines that are flattened
    # further down, and link removal runs before `//` is treated as a comment.
    pipeline = (remove_std, remove_links, remove_include, remove_annotation, standardize_sign)
    for step in pipeline:
        cleaned = step(cleaned)

    for whitespace in (r'\n', r'\t'):
        cleaned = re.sub(whitespace, ' ', cleaned)
    return re.sub(r' +', ' ', cleaned)

In [None]:
# Load the GraphCodeBERT tokenizer and a 2-label sequence-classification model
# built on the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
# Truncate from the left, i.e. over-long inputs lose tokens at their start.
tokenizer.truncation_side = 'left'
model = AutoModelForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels=2)
# Move the model to the selected device; as the cell's last expression this
# also displays the module.
model.to(device)

In [None]:
class Datasets(Dataset):
    """Map-style dataset that serves cleaned code pairs from a frame.

    Each row of ``df`` must unpack into exactly two values (the two snippets
    of a pair); both are passed through ``text_clean`` on access.
    """

    def __init__(self, df):
        # Keep a reference only; cleaning happens lazily per item.
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        first, second = self.df.iloc[idx]
        return text_clean(first), text_clean(second)

In [None]:
# Wrap the test frame and iterate it in its original order: predictions are
# later written back to the submission file by position, so no shuffling.
test_dataset = Datasets(test_df)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

In [None]:
def inference(model, model_name):
    """Predict on ``test_loader`` and write the submission and probability CSVs.

    Relies on the notebook-level ``test_loader``, ``tokenizer``, ``device`` and
    ``MAX_LEN``. Two files are written (their directories are created if
    missing):

    * ``./submission/{model_name}.csv`` - the sample submission with the
      ``similar`` column filled with integer class predictions.
    * ``./probs/{model_name}.csv`` - the same frame plus ``probs_0`` and
      ``probs_1`` (softmax probabilities per class).

    Args:
        model: Sequence-classification model; inputs are moved to ``device``,
            the model itself is not.
        model_name: Stem used for both output file names.
    """
    # Make sure dropout is disabled even if the caller left the model in
    # training mode.
    model.eval()

    batch_probs = []
    for code1, code2 in tqdm(test_loader):
        # Tokenize the whole batch of pairs in one call.
        encoded = tokenizer(
            list(code1),
            list(code2),
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids'].to(device)
        input_mask = encoded['attention_mask'].to(device)

        with torch.inference_mode():
            outputs = model(input_ids, attention_mask=input_mask)
            logits = outputs['logits'].cpu()
            # Explicit class axis; the implicit-dim form is deprecated.
            prob = F.softmax(logits, dim=1)
        batch_probs.append(prob.numpy())

    # Concatenate once at the end instead of re-copying the array every batch.
    probs = np.concatenate(batch_probs, axis=0) if batch_probs else np.empty((0, 2))
    # Integer labels, so the CSV contains 0/1 rather than 0.0/1.0.
    preds = probs.argmax(axis=1).astype(int)

    submission = pd.read_csv('./data/sample_submission.csv')
    submission['similar'] = preds
    os.makedirs('./submission', exist_ok=True)
    submission.to_csv('./submission/{model_name}.csv'.format(model_name=model_name), index=False)

    submission['probs_0'] = probs[:, 0]
    submission['probs_1'] = probs[:, 1]
    os.makedirs('./probs', exist_ok=True)
    submission.to_csv('./probs/{model_name}.csv'.format(model_name=model_name), index=False)

In [None]:
# Checkpoint stem: used for './savepoint/{model_name}.pt' and for the names of
# the output CSVs.
# NOTE(review): left empty here - fill it in before running the cell below.
model_name = ''

In [None]:
# Restore the fine-tuned weights, then run inference.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU still loads when this notebook runs on CPU.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
check_point = torch.load(f'./savepoint/{model_name}.pt', map_location=device)
model.load_state_dict(check_point['State_dict'])
inference(model, model_name)