In [1]:
import json
import re
import ast
import sys
import nltk
import traceback
import astor
import token as tk
from tokenize import generate_tokens
from io import StringIO
import itertools 
from gensim.models import FastText
from gensim.models import KeyedVectors
from time import time
import numpy as np
import pickle

In [2]:
# Input dataset (CoNaLa test split) and the location of its cleaned copy.
test_path = "data/conala-corpus/conala-test.json"
test_clean_output_path = "data/conala-corpus/.test.seq2seq"

# Pickled "most relevant document" results, one file per retrieval model.
fasttext_path = 'test_list_most_relevant_doc_fasttext.pkl'
supervised_path = 'test_list_most_relevant_doc_super.pkl'
unsupervised_path = 'test_list_most_relevant_doc_unsuper.pkl'


In [3]:
# Matches a span delimited by a matching pair of backticks, single quotes or
# double quotes. Group "quote" captures the opening delimiter, group "string"
# captures the (non-greedy) text in between, and the closing delimiter must be
# the same character as the opening one.
QUOTED_STRING_RE = re.compile(r"(?P<quote>[`'\"])(?P<string>.*?)(?P=quote)")


def canonicalize_intent(intent):
    """Return ``intent`` unchanged together with an empty slot map.

    This is an identity stub: no quoted string or variable in the intent is
    replaced by a placeholder, so the returned slot map never has entries.
    The previous implementation ran a regex scan whose result was discarded;
    that dead work has been removed.

    Args:
        intent: The natural-language intent text.

    Returns:
        A ``(intent, slot_map)`` tuple where ``intent`` is the input object
        and ``slot_map`` is a new, empty ``dict`` on every call.
    """
    return intent, dict()


def replace_strings_in_ast(py_ast, string2slot):
    """Replace, in place, string-valued node attributes found in ``string2slot``.

    Every node reachable from ``py_ast`` is visited. For each instance
    attribute whose value is a ``str`` and is a key of ``string2slot``, the
    attribute is overwritten with the mapped value. The attributes
    ``lineno``, ``col_offset`` and ``ctx`` are never touched.

    The former ``else`` branch repeated the same membership test that had just
    failed, so it could never execute; it has been removed without changing
    behavior.

    Args:
        py_ast: Root ``ast.AST`` node; modified in place.
        string2slot: Mapping from original string to its replacement.

    Returns:
        None.
    """
    skipped_attrs = ('lineno', 'col_offset', 'ctx')

    for node in ast.walk(py_ast):
        # Snapshot the attributes: setattr() below mutates the node's __dict__.
        for attr_name, attr_value in list(vars(node).items()):
            if attr_name in skipped_attrs:
                continue
            if isinstance(attr_value, str) and attr_value in string2slot:
                setattr(node, attr_name, string2slot[attr_value])


def canonicalize_code(code, slot_map):
    """Rewrite ``code`` so that slot values are replaced by their slot names.

    Args:
        code: Python source text; it is parsed with ``ast.parse``.
        slot_map: Mapping ``slot_name -> info`` where ``info['value']`` is the
            original string the slot stands for.

    Returns:
        The source text regenerated by ``astor.to_source`` after the
        replacement has been applied to the parsed tree.
    """
    # Invert the slot map: original value -> slot name (later entries win).
    value_to_slot = {}
    for slot_name, slot_info in slot_map.items():
        value_to_slot[slot_info['value']] = slot_name

    tree = ast.parse(code)
    replace_strings_in_ast(tree, value_to_slot)
    return astor.to_source(tree)


def decanonicalize_code(code, slot_map):
    """Best-effort inverse of canonicalization: put slot values back into ``code``.

    Args:
        code: Python source text in which slot names may appear.
        slot_map: Mapping ``slot_name -> info`` where ``info['value']`` is the
            string to restore for that slot.

    Returns:
        The regenerated, stripped source with slot names replaced by their
        values. If anything goes wrong (for example ``code`` does not parse or
        a slot entry has no ``'value'``), ``code`` is returned unchanged.
    """
    try:
        slot2string = {slot_name: slot_info['value']
                       for slot_name, slot_info in slot_map.items()}
        py_ast = ast.parse(code)
        replace_strings_in_ast(py_ast, slot2string)
        return astor.to_source(py_ast).strip()
    except Exception:
        # Deliberate best-effort fallback. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt/SystemExit propagate.
        return code

def detokenize_code(code_tokens):
    """Join code tokens back into text.

    Tokens equal to ``'\\n'`` act as line separators; the tokens of each line
    are joined with single spaces, the lines are joined with newlines, and the
    result is stripped of leading/trailing whitespace.

    Args:
        code_tokens: Sequence of string tokens.

    Returns:
        The reconstructed code string.
    """
    finished_lines = []
    current_line = []

    for piece in code_tokens:
        if piece == '\n':
            finished_lines.append(' '.join(current_line))
            current_line = []
        else:
            current_line.append(piece)

    # Whatever follows the last newline token forms the final line.
    finished_lines.append(' '.join(current_line))

    return '\n'.join(finished_lines).strip()


def encode_tokenized_code(code_tokens):
    """Encode whitespace tokens of a tokenized snippet as visible markers.

    A ``'\\t'`` token becomes ``'_TAB_'`` and a ``'\\n'`` token becomes
    ``'_NEWLINE_'``; every other token is kept as is.

    The previous implementation dropped all ordinary tokens and never
    returned the list it built (so callers always received ``None``).

    Args:
        code_tokens: Iterable of string tokens.

    Returns:
        A new list with the encoded tokens, in the original order.
    """
    markers = {'\t': '_TAB_', '\n': '_NEWLINE_'}
    tokens = []
    for token in code_tokens:
        if token == '\t' or token == '\n':
            tokens.append(markers[token])
        else:
            tokens.append(token)
    return tokens


def get_encoded_code_tokens(code):
    """Tokenize Python ``code`` into a flat list with explicit layout markers.

    The stripped source is run through ``generate_tokens`` and mapped as
    follows:

    * a NEWLINE token becomes ``'#NEWLINE#'``;
    * INDENT / DEDENT tokens only adjust the current indentation depth;
    * a STRING token is emitted with spaces, tabs and line breaks inside it
      replaced by ``'#SPACE#'``, ``'#TAB#'`` and ``'#NEWLINE#'``;
    * any other token is emitted with ``'\\n'`` replaced by ``'#NEWLINE#'``;
      if it is the first such token after a NEWLINE it is preceded by one
      ``'#INDENT#'`` per indentation level.

    Note that STRING tokens neither emit indentation markers nor clear the
    "line just started" flag.

    Args:
        code: Python source text.

    Returns:
        List of encoded token strings; a trailing empty token is removed.
    """
    encoded = []
    depth = 0
    line_just_started = False

    for tok_type, tok_text, _start, _end, _line in generate_tokens(StringIO(code.strip()).readline):
        if tok_type == tk.NEWLINE:
            encoded.append('#NEWLINE#')
            line_just_started = True
            continue

        if tok_type == tk.INDENT:
            depth += 1
            continue

        if tok_type == tk.DEDENT:
            depth -= 1
            continue

        if tok_type == tk.STRING:
            escaped = tok_text.replace(' ', '#SPACE#')
            escaped = escaped.replace('\t', '#TAB#')
            escaped = escaped.replace('\r\n', '#NEWLINE#')
            escaped = escaped.replace('\n', '#NEWLINE#')
            encoded.append(escaped)
            continue

        # Ordinary token: emit pending indentation for a freshly started line.
        if line_just_started:
            encoded.extend(['#INDENT#'] * depth)
            line_just_started = False
        encoded.append(tok_text.replace('\n', '#NEWLINE#'))

    # Drop the empty token text left at the end of the stream.
    if len(encoded[-1]) == 0:
        encoded = encoded[:-1]

    return encoded


def tokenize(code):
    """Return the token strings of Python ``code`` up to the end marker.

    Args:
        code: Python source text.

    Returns:
        List with the text of every token produced by ``generate_tokens``,
        stopping before the ENDMARKER token.
    """
    collected = []
    for tok in generate_tokens(StringIO(code).readline):
        # tok[0] is the token type, tok[1] the token text.
        if tok[0] == tk.ENDMARKER:
            return collected
        collected.append(tok[1])
    return collected


def compare_ast(node1, node2):
    """Structurally compare two AST fragments, ignoring source positions.

    Fixes over the previous version:

    * lists of different length are no longer considered equal (``zip``
      silently truncated to the shorter one, so ``f(1, 2)`` matched ``f(1)``);
    * ``end_lineno`` / ``end_col_offset`` are ignored like ``lineno`` /
      ``col_offset``, so code that differs only in whitespace compares equal;
    * an attribute missing on ``node2`` yields ``False`` instead of raising
      ``AttributeError``.

    Args:
        node1: An ``ast.AST`` node, a list of nodes, or a leaf value.
        node2: The value to compare against.

    Returns:
        ``True`` if both fragments have the same structure and leaf values
        (position and ``ctx`` attributes excluded), ``False`` otherwise.
    """
    ignored_attrs = ('lineno', 'col_offset', 'end_lineno', 'end_col_offset', 'ctx')

    # Strings are compared by value only; everything else must share a type.
    if not isinstance(node1, str) and type(node1) is not type(node2):
        return False

    if isinstance(node1, ast.AST):
        missing = object()
        for attr_name, attr_value in list(vars(node1).items()):
            if attr_name in ignored_attrs:
                continue
            other_value = getattr(node2, attr_name, missing)
            if other_value is missing or not compare_ast(attr_value, other_value):
                return False
        return True

    if isinstance(node1, list):
        # node2 is a list too (the type check above passed).
        if len(node1) != len(node2):
            return False
        return all(itertools.starmap(compare_ast, zip(node1, node2)))

    return node1 == node2


def encoded_code_tokens_to_code(encoded_tokens, indent=' '):
    """Turn a list of encoded tokens back into source text.

    ``'#TAB#'`` and ``'#SPACE#'`` markers are first expanded inside every
    token. A token that is exactly ``'#INDENT#'`` yields ``indent``; a token
    that is exactly ``'#NEWLINE#'`` yields a line break; any other token has
    its embedded ``'#NEWLINE#'`` markers expanded and is followed by one
    space. The concatenation is stripped before being returned.

    Args:
        encoded_tokens: Sequence of encoded token strings.
        indent: Text emitted for each ``'#INDENT#'`` token.

    Returns:
        The decoded code string.
    """
    pieces = []

    for raw_token in encoded_tokens:
        text = raw_token.replace('#TAB#', '\t').replace('#SPACE#', ' ')

        if text == '#INDENT#':
            pieces.append(indent)
        elif text == '#NEWLINE#':
            pieces.append('\n')
        else:
            pieces.append(text.replace('#NEWLINE#', '\n'))
            pieces.append(' ')

    return ''.join(pieces).strip()


def find_sub_sequence(sequence, query_seq):
    """Locate the first occurrence of ``query_seq`` as a slice of ``sequence``.

    Args:
        sequence: Sequence to search in.
        query_seq: Sequence to look for; compared with ``==`` against slices
            of ``sequence``.

    Returns:
        ``(start, end)`` such that ``sequence[start:end] == query_seq``.

    Raises:
        IndexError: If no slice of ``sequence`` equals ``query_seq``.
    """
    width = len(query_seq)
    start = 0
    while start < len(sequence):
        end = start + width
        if sequence[start:end] == query_seq:
            return start, end
        start += 1

    raise IndexError


def replace_sequence(sequence, old_seq, new_seq):
    """Replace, in place, every occurrence of ``old_seq`` in ``sequence``.

    The scan moves left to right and resumes *after* each inserted
    ``new_seq``, so inserted text is never re-matched and matches located
    beyond the original length (after the list grew) are still found. The
    previous version iterated over ``range(len(sequence))`` computed before
    any mutation, which missed such matches and could replace the same spot
    twice.

    Args:
        sequence: Mutable sequence (e.g. a list) to edit in place.
        old_seq: Sub-sequence to look for. An empty ``old_seq`` matches
            nothing.
        new_seq: Replacement items.

    Returns:
        ``True`` if at least one replacement was made, ``False`` otherwise.
    """
    width = len(old_seq)
    if width == 0:
        # An empty pattern would match at every position; treat as no match.
        return False

    replacement = list(new_seq)
    matched = False
    pos = 0
    while pos + width <= len(sequence):
        if sequence[pos:pos + width] == old_seq:
            sequence[pos:pos + width] = replacement
            matched = True
            pos += len(replacement)  # skip over what was just inserted
        else:
            pos += 1
    return matched

In [4]:
# read and clean data
def read_clean_dataset(dataset_path, output_path):
    """Read a JSON dataset, add token fields to every example and write it back out.

    Each example must provide ``'intent'``, ``'rewritten_intent'`` and
    ``'snippet'``. The function adds:

    * ``'intent_tokens'``: ``nltk.word_tokenize`` of the canonical rewritten
      intent, or of the raw intent when no tokens were produced from it;
    * ``'snippet_tokens'``: ``get_encoded_code_tokens`` of the canonical
      snippet, or of the raw snippet when there is no rewritten intent or
      canonicalization failed;
    * ``'slot_map'``: only for examples with a non-empty rewritten intent.

    Fixes over the previous version: input/output files are closed
    deterministically; ``slot_map`` and ``canonical_snippet`` are reset for
    every example, so a failure (or an empty rewritten intent) can no longer
    reuse values from a previous example or hit an unbound name; the handler
    catches ``Exception`` rather than everything.

    Args:
        dataset_path: Path of the JSON file to read (a list of examples).
        output_path: Path the augmented list is written to (``indent=2``).

    Returns:
        None.
    """
    with open(dataset_path) as dataset_file:
        train = json.load(dataset_file)

    for i, example in enumerate(train):
        # `train` is updated in place
        intent = example['intent']
        rewritten_intent = example['rewritten_intent']
        snippet = example['snippet']

        intent_tokens = []
        # Per-example defaults used when canonicalization is skipped or fails.
        canonical_snippet = snippet
        slot_map = {}

        if rewritten_intent:
            try:
                canonical_intent, slot_map = canonicalize_intent(rewritten_intent)
                canonical_snippet = canonicalize_code(snippet, slot_map)

                intent_tokens = nltk.word_tokenize(canonical_intent)

                decanonical_snippet = decanonicalize_code(canonical_snippet, slot_map)

                snippet_reconstr = astor.to_source(ast.parse(snippet)).strip()

                decanonical_snippet_reconstr = astor.to_source(ast.parse(decanonical_snippet)).strip()
                encoded_reconstr_code = get_encoded_code_tokens(decanonical_snippet_reconstr)
                decoded_reconstr_code = encoded_code_tokens_to_code(encoded_reconstr_code)

                # Report examples whose encode/decode round trip changes the AST.
                if not compare_ast(ast.parse(decoded_reconstr_code), ast.parse(snippet)):
                    print(i)
                    print('Original Snippet: %s' % snippet_reconstr)
                    print('Tokenized Snippet: %s' % ' '.join(encoded_reconstr_code))
                    print('decoded_reconstr_code: %s' % decoded_reconstr_code)

            except Exception:
                # Best-effort: log the offending example and carry on.
                print('*' * 20, file=sys.stderr)
                print(i, file=sys.stderr)
                print(intent, file=sys.stderr)
                print(snippet, file=sys.stderr)
                traceback.print_exc()
            finally:
                example['slot_map'] = slot_map

        # `canonical_snippet` is the raw snippet unless canonicalization succeeded.
        encoded_reconstr_code = get_encoded_code_tokens(canonical_snippet.strip())

        if not intent_tokens:
            intent_tokens = nltk.word_tokenize(intent)

        example['intent_tokens'] = intent_tokens
        example['snippet_tokens'] = encoded_reconstr_code

    with open(output_path, 'w') as output_file:
        json.dump(train, output_file, indent=2)
    

## Print Case

In [5]:
def print_result(list_most_relevant_doc, ques_list):
    """Find the queries whose own question appears among their retrieved candidates.

    Despite its name, this function prints nothing.

    Args:
        list_most_relevant_doc: Sequence of dicts with a ``'question_id'`` and
            a ``'similar'`` list of indices into ``ques_list``.
        ques_list: Sequence of dicts, each with a ``'question_id'``.

    Returns:
        ``(index_list, ranking_list)``: ``index_list`` holds the positions in
        ``list_most_relevant_doc`` for which some candidate has the same
        ``'question_id'`` as the query; ``ranking_list`` holds, for each of
        them, the 0-based position of the first such candidate in
        ``'similar'``.
    """
    hit_positions = []
    hit_ranks = []

    for query_pos, query in enumerate(list_most_relevant_doc):
        # Rank of the first candidate sharing the query's question id, if any.
        first_hit = next(
            (rank
             for rank, candidate in enumerate(query['similar'])
             if ques_list[candidate]['question_id'] == query['question_id']),
            None,
        )
        if first_hit is not None:
            hit_positions.append(query_pos)
            hit_ranks.append(first_hit)

    return hit_positions, hit_ranks
       

In [6]:
# Clean the test split, then load the cleaned copy back (closing the file
# handle deterministically instead of leaving it to the garbage collector).
read_clean_dataset(test_path, test_clean_output_path)
with open(test_clean_output_path) as clean_file:
    test_clean = json.load(clean_file)

test_size = len(test_clean)

# [{"question_id": int, "intent_tokens": [...]}, ...]
test_ques_list = [
    {"question_id": example["question_id"], "intent_tokens": example["intent_tokens"]}
    for example in test_clean
]

In [7]:
# Load the three pickled retrieval results (FastText, supervised and
# unsupervised), one file per model.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
with open(fasttext_path, 'rb') as f:
    fasttext_doc = pickle.load(f)
    
with open(supervised_path, 'rb') as f:
    supervised_doc = pickle.load(f)
    
with open(unsupervised_path, 'rb') as f:
    unsupervised_doc = pickle.load(f)

In [8]:
# For each model, collect the positions of the test questions whose own
# question_id occurs among their retrieved candidates, together with the rank
# of that candidate, and print how many such questions there are.
f_idx_list, f_ranking_list= print_result(fasttext_doc, test_ques_list)
print(len(f_idx_list))

s_idx_list, s_ranking_list= print_result(supervised_doc, test_ques_list)
print(len(s_idx_list))

u_idx_list, u_ranking_list= print_result(unsupervised_doc, test_ques_list)
print(len(u_idx_list))


81
332
237


In [9]:
# save the id that only supervised learning correctly match
final_idx = []
for i, j in enumerate(s_idx_list):
    if j not in f_idx_list and j not in u_idx_list:
        final_idx.append((i,j))

In [10]:
# Number of questions matched only by the supervised model.
len(final_idx)

111

In [11]:
# For every question matched only by the supervised model, show the question,
# its reference snippet, the rank of the supervised hit, and the candidate
# snippets retrieved by each of the three models.
for (pid, idx) in final_idx:
    question = test_clean[idx]
    print(idx, question['question_id'])
    print(question['intent'])
    print(question['snippet'])
    print(s_ranking_list[pid])
    print("")

    model_sections = (
        ("##FastText###", fasttext_doc),
        ("##SuperVised Starspace###", supervised_doc),
        ("##UnsuperVised Starspace###", unsupervised_doc),
    )
    for section_title, model_doc in model_sections:
        print(section_title)
        # Candidates are numbered from 1 in the printed listing.
        for position, candidate in enumerate(model_doc[idx]['similar'], start=1):
            print(position, test_clean[candidate]['snippet'])

    print("##Done###")
    print("")

4 7555335
How to convert a string from CP-1251 to UTF-8?
d.decode('cp1251').encode('utf8')
0

##FastText###
1 l = sorted(l, key=lambda a: a['time'], reverse=True)
2 df.to_csv('c:\\data\\pandas.txt', header=None, index=None, sep=' ', mode='a')
3 generator = iter_iprange('192.168.1.1', '192.168.255.255', step=1)
4 r = requests.post(url, files=files, headers=headers, data=data)
5 b = models.CharField(max_length=7, default='0000000', editable=False)
6 result = ([a for (a, b) in original], [b for (a, b) in original])
7 requests.post(url, data=DATA, headers=HEADERS_DICT, auth=(username, password))
8 df['group'].plot(kind='bar', color=['r', 'g', 'b', 'r', 'g', 'b', 'r'])
9 parser.add_argument('--version', action='version', version='%(prog)s 2.0')
10 df.xs('sat', level='day', drop_level=False)
##SuperVised Starspace###
1 d.decode('cp1251').encode('utf8')
2 Counter(' '.join(df['text']).split()).most_common(100)
3 [m.group(0) for m in re.finditer('(\\d)\\1*', s)]
4 plt.scatter(np.random.randn(10

5 (n for n in [1, 2, 3, 5])
6 [element for element in lst if not isinstance(element, str)]
7 l = sorted(l, key=itemgetter('time'), reverse=True)
8 sorted(list_of_strings, key=lambda s: s.split(',')[1])
9 l = sorted(l, key=lambda a: a['time'], reverse=True)
10 from functools import reduce
reduce(lambda a, b: a + b, (('aa',), ('bb',), ('cc',)))
##SuperVised Starspace###
1 int()
2 int('1')
3 struct.unpack('!f', '470FC614'.decode('hex'))[0]
4 struct.unpack('H', struct.pack('h', number))
5 newFile.write(struct.pack('5B', *newFileBytes))
6 map(int, eval(input('Enter the unfriendly numbers: ')))
7 'ME' + str(i)
8 user_list = [int(number) for number in user_input.split(',')]
9 answer = str(round(answer, 2))
10 int(round(2.51 * 100))
##UnsuperVised Starspace###
1 map(int, eval(input('Enter the unfriendly numbers: ')))
2 T2 = [map(int, x) for x in T1]
3 np.array([zip(x, y) for x, y in zip(a, b)])
4 pd.concat([df[0].apply(pd.Series), df[1]], axis=1)
5 print(dict(zip(LD[0], zip(*[list(d.values()) 

7 dict((k, [d[k] for d in dicts]) for k in dicts[0])
8 b.sort(key=lambda x: x[1][2])
9 c2.sort(key=lambda row: (row[2], row[1]))
10 l.sort(key=lambda x: x['title'])
##UnsuperVised Starspace###
1 heapq.nlargest(10, range(len(l1)), key=lambda i: abs(l1[i] - l2[i]))
2 dict(((x, l.count(x)) for x in set(l)))
3 my_dict.update((x, y * 2) for x, y in list(my_dict.items()))
4 sorted(d.items())
5 [x for x in mylist if len(x) == 3]
6 [[sum(item) for item in zip(*items)] for items in zip(*data)]
7 plt.plot(x, y, label='H\u2082O')
8 plt.plot(x, y, label='$H_2O$')
9 max(k for k, v in x.items() if v != 0)
10 max(k for k, v in x.items() if v != 0)
##Done###

458 72899
How do I sort a list of dictionaries by values of the dictionary in Python?
newlist = sorted(l, key=itemgetter('name'), reverse=True)
0

##FastText###
1 any(key.startswith('EMP$$') for key in dict1)
2 sorted(d)
3 list(t)
4 len(s)
5 len(my_list)
6 [image for menuitem in list_of_menuitems for image in menuitem]
7 print(sum(row[column] for