In [2]:
import numpy as np
import pandas as pd
import re
from transformers import AutoTokenizer

# Tokenizer for the "neulab/codebert-cpp" checkpoint; used further down to
# count how many tokens each cleaned code snippet produces.
tokenizer = AutoTokenizer.from_pretrained("neulab/codebert-cpp")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the training pairs. Later cells read and overwrite the 'code1' and
# 'code2' columns of this frame.
# NOTE(review): the file name suggests a split made with seed 42 -- confirm.
train_df = pd.read_csv('./data/train_seed42.csv')

In [None]:
def remove_links(x):
    """Replace every http/https URL in ``x`` with a single space.

    A URL is ``http://`` or ``https://`` followed by a run of non-whitespace
    characters. The ``://`` separator is required on purpose: matching on the
    bare ``htt``/``http`` prefix also deletes ordinary identifiers such as
    ``http_client`` or ``httx``, which corrupts the source code being cleaned.

    Args:
        x: Text (typically source code) that may contain URLs.

    Returns:
        ``x`` with each URL replaced by one space character.
    """
    return re.sub(r'https?://\S+', ' ', x)

In [None]:
def remove_annotation(x):
    """Strip comments, backslash-continued lines and ``#if 0`` blocks.

    Steps, in order:

    1. A newline is appended so that every line, including the last one, is
       newline-terminated.
    2. Every line whose last character is a backslash is dropped. Anchoring
       with ``^`` (MULTILINE) instead of consuming the preceding newline lets
       consecutive continued lines, and a continued first line, all match.
    3. ``//`` line comments and ``/* ... */`` block comments are removed in a
       single left-to-right pass, so whichever comment opener appears first
       wins (``/* a // b */`` is one block comment). Block comments may span
       several lines and are matched non-greedily so code between two
       comments is kept.
    4. ``#if 0 ... #endif`` regions are removed, also across lines and
       non-greedily.

    Comment markers inside string literals are not recognised and are treated
    as comments.

    Args:
        x: Source code text.

    Returns:
        The text with the constructs above removed; it always ends with the
        newline appended in step 1 (unless that line itself was removed).
    """
    x = x + '\n'
    # Drop whole lines that end in a line-continuation backslash.
    x = re.sub(r'^.*\\\n', '', x, flags=re.MULTILINE)
    # `[^\n]*` rather than `.*` because DOTALL makes `.` cross line breaks.
    x = re.sub(r'//[^\n]*|/\*.*?\*/', '', x, flags=re.DOTALL)
    x = re.sub(r'#if 0.*?#endif', '', x, flags=re.DOTALL)
    return x

In [None]:
def standardize_sign(x):
    """Remove the single spaces that surround operators in ``x``.

    Runs of spaces are first collapsed to one space. Then, for each operator
    in turn, an occurrence with a space on either or both sides is replaced by
    the bare operator (``a + b`` -> ``a+b``). ``!`` is special-cased: only the
    space *before* it is removed. The operators are processed one after
    another in a fixed order, so multi-character operators such as ``!=``,
    ``<=`` or ``&&`` end up tight as well.

    Every operator is replaced by itself; in particular ``<`` stays ``<``
    (rewriting it to ``=`` would turn comparisons into assignments).

    Args:
        x: Source code text.

    Returns:
        The text with spaces around the handled operators removed.
    """
    def tighten(text, op):
        # Same three alternatives for every operator: " op ", " op", "op ".
        e = re.escape(op)
        return re.sub(f' {e} | {e}|{e} ', op, text)

    x = re.sub(r' +', ' ', x)
    for op in ('+', '-', '*', '/', '%', '=', '>', '<'):
        x = tighten(x, op)
    # Only the leading space is stripped for '!'.
    x = re.sub(r' !', '!', x)
    for op in ('&', '|', ':', '?'):
        x = tighten(x, op)
    return x

In [None]:
def remove_std(x):
    """Delete ``std::`` qualifiers and ``using namespace std;`` from ``x``.

    Both fragments are plain literals, removed one after the other (the
    qualifier first, then the using-directive).

    Args:
        x: Source code text.

    Returns:
        The text with every occurrence of both fragments removed.
    """
    for fragment in ('std::', 'using namespace std;'):
        x = x.replace(fragment, '')
    return x

In [None]:
def remove_include(x):
    """Remove ``#include`` directives from ``x``.

    Both forms are handled: angle-bracket includes (``#include <vector>``)
    and quoted includes (``#include "util.h"``). Only the directive itself is
    removed -- the match stops at the first closing ``>`` or ``"`` and never
    crosses a line break, so code following the directive on the same line
    (for example a later ``a > b``) is preserved. Optional spaces or tabs
    after ``#`` and after ``include`` are accepted.

    Args:
        x: Source code text.

    Returns:
        The text with every include directive removed.
    """
    return re.sub(r'#[ \t]*include[ \t]*(?:<[^>\n]*>|"[^"\n]*")', '', x)

In [None]:
def text_clean(x):
    """Normalize one source-code snippet into a single cleaned line.

    Pipeline:

    1. Lowercase the text and drop every non-ASCII character.
    2. Replace URLs (``remove_links``) and strip comments
       (``remove_annotation``). This must happen while line breaks are still
       present, because the comment patterns are line-based.
    3. Turn newlines and tabs into spaces.
    4. Tighten spacing around operators (``standardize_sign``). This runs
       *after* step 3 so that an operator sitting next to a line break or a
       tab is normalized exactly like one surrounded by spaces; doing it the
       other way round leaves ``a =\\n\\tb`` as ``a= b`` instead of ``a=b``.
    5. Collapse any remaining runs of spaces.

    Args:
        x: Raw source code as a ``str``.

    Returns:
        The cleaned, single-line text.
    """
    x = x.lower()  # lowercase everything
    x = x.encode('ascii', 'ignore').decode()  # remove non-ASCII characters
    x = remove_links(x)
    x = remove_annotation(x)
    # Flatten line structure before operator normalization (see step 4).
    x = re.sub(r'[\n\t]', ' ', x)
    x = standardize_sign(x)
    # Cheap safety net; standardize_sign already collapses space runs.
    x = re.sub(r' +', ' ', x)
    return x

In [None]:
# Run the cleaning pipeline over both code columns, overwriting them in place.
for column in ('code1', 'code2'):
    train_df[column] = [text_clean(snippet) for snippet in train_df[column]]

In [None]:
# Character length of each cleaned snippet.
for column in ('code1', 'code2'):
    train_df[f'len_{column}'] = list(map(len, train_df[column]))

# Token count of each cleaned snippet, as produced by `tokenizer.encode`.
# Kept as a second loop so the new columns appear in the same order as before.
for column in ('code1', 'code2'):
    train_df[f'len_token_{column}'] = [
        len(tokenizer.encode(snippet)) for snippet in train_df[column]
    ]

In [None]:
# Display summary statistics for the frame, which now includes the
# character-length and token-length columns computed above.
train_df.describe()