In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import utils

from tqdm import tqdm
tqdm.pandas(desc='Applying')
import re

from pandarallel import pandarallel

In [3]:
main_df = utils.read_csv("data/full_unlabeled.csv")

In [4]:
main_df.columns

Index(['directory', 'repo_id', 'file_name', 'extension', 'no_lines',
       'max_line_len', 'generation_keywords', 'license_whitelist_keywords',
       'license_blacklist_keywords', 'icarus_module_spans', 'icarus_exception',
       'verilator_xml_output_path', 'verilator_exception', 'code'],
      dtype='object')

## Mining snippets

In [5]:
module_re = re.compile(r'module (.*?)endmodule',re.DOTALL | re.MULTILINE)
function_re = re.compile(r'function (.*?)endfunction',re.DOTALL | re.MULTILINE)

In [6]:
def mine_labeled_pairs(row):
    abstract_code, replaced = utils.abstract_strings(row['code'])
    no_comments_abstract_code = utils.remove_all_comments(abstract_code)
    # Find modules
    modules = []
    for match in re.finditer(module_re,no_comments_abstract_code):
        modules.append(match.group(0))
    modules = [utils.recreate_string(m,replaced) for m in modules]
    # Find functions
    functions = []
    for match in re.finditer(function_re,no_comments_abstract_code):
        functions.append(match.group(0))
    functions = [utils.recreate_string(f,replaced) for f in functions]
    return modules, functions

In [7]:
columns_to_keep = [c for c in main_df.columns if not c in ['code']]
snippet_df = pd.DataFrame(columns=columns_to_keep + ['file_index','snippet_type','snippet'])

In [None]:
for file_index,row in main_df.iterrows():
    print(file_index)
    modules,functions = mine_labeled_pairs(row)
    for module in modules:
        snippet_df.loc[len(snippet_df)] = [row[ctk] for ctk in columns_to_keep] + [file_index,'module',module]
    for function in functions:
        snippet_df.loc[len(snippet_df)] = [row[ctk] for ctk in columns_to_keep] + [file_index,'function',function]

In [19]:
snippet_df['snippet_type'].value_counts()

module      130862
function     11752
Name: snippet_type, dtype: int64

In [21]:
print(len(snippet_df))
print(len(snippet_df.drop_duplicates()))

142614
142297


In [22]:
snippet_df.drop_duplicates().to_csv("data/full_snippets.csv")

## Mining pairs
Splitting snippets into definition-body pairs

In [26]:
snippet_df = utils.read_csv("data/full_snippets.csv")

In [27]:
module_def_re = re.compile(r'module (.*?);',re.DOTALL | re.MULTILINE)
function_def_re = re.compile(r'function (.*?);',re.DOTALL | re.MULTILINE)

In [None]:
with open('test.v','r') as f:
    s = f.read()

m = re.search(module_def_re,s)
print(m.group(0))
print("="*10)
print(s[m.span()[1]:])

In [37]:
def split_snippet(snippet,def_re):
    match = re.search(def_re,snippet)
    if not match:
        return "ERROR", "ERROR"
    s_def = match.group(0)
    s_body = snippet[match.span()[1]:]
    return s_def, s_body

def get_def_body_pairs(row):
    abstract_snippet, replaced = utils.abstract_strings(row['snippet'])
    def_re = module_def_re if row['snippet_type'] =='module' else function_def_re
    s_def, s_body = split_snippet(abstract_snippet,def_re)
    return pd.Series([utils.recreate_string(s_def,replaced), utils.recreate_string(s_body,replaced)])


In [38]:
snippet_df[['snippet_def','snippet_body']] = snippet_df.progress_apply(get_def_body_pairs,axis=1)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Applying: 100%|██████████| 142297/142297 [00:23<00:00, 6015.28it/s]


In [None]:
snippet_df[snippet_df['snippet_def'].isin(["ERROR"])]

In [44]:
print(len(snippet_df))
print(len(snippet_df[~snippet_df['snippet_def'].isin(["ERROR"])]))

142297
142283


In [45]:
snippet_df[~snippet_df['snippet_def'].isin(["ERROR"])].to_csv("data/full_snippets.csv")