In [1]:
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import nltk
import re
import numpy as np

In [2]:
# Read the de-duplicated selected-code table and key it by PostId.
post_with_code = (
    pd.read_csv("../processed_data/remove_duplicates_selected_codes.csv")
    .set_index('PostId')
)

In [3]:
# Read the top-300 code table and key it by PostId; a PostId may appear on
# several rows, which is why later lookups use a list-style .loc selection.
selected_code_blks = (
    pd.read_csv("../processed_data/code_top300.csv")
    .set_index('PostId')
)

In [4]:
# Identifier: an ASCII letter or underscore, then any run of word characters
# (the \d and _ inside the class are already covered by \w).
identifier = r'[a-zA-Z_][\d\w_]*'

In [5]:
# Go reserved words.
# The pattern is assembled from adjacent raw strings so that the source layout
# does not leak line breaks or indentation into the regex (it is compiled
# without re.VERBOSE). The \b anchors make sure only whole words match, so
# identifiers such as 'format' or 'goroutine' are not cut at 'for' / 'go',
# and 'goto' is not shadowed by 'go'. The group is non-capturing because the
# pattern is later used with findall-style tokenization.
keywords = (r'\b(?:break|default|func|interface|select|case|defer'
            r'|go|map|struct|chan|else|goto|package|switch'
            r'|const|fallthrough|if|range|type|continue|for|import|return|var)\b')

In [6]:
# operators and punctuation
# A token is a maximal run of operator/punctuation characters. '%' and '/' are
# included so remainder and division are not silently dropped by the tokenizer.
# A '/' only counts as an operator when it is NOT followed by another '/',
# which leaves the '//' line-comment marker for the comment pattern.
operators = r'(?:[\+\-\*%\,;\$><!:\.\|&\^=\(\)\[\]\{\}]|/(?!/))+'

In [7]:
# Numeric and string literal patterns. All of them are written on a single
# line because they are compiled without re.VERBOSE, where any whitespace in
# the pattern would have to be matched literally.

# Run of digits with an optional imaginary suffix.
decimal_literal = r'\d+i?'
# Leading zero followed by at least one octal digit (0-7); a bare '0' is left
# to decimal_literal.
octal_literal = r'0[0-7]+'
# 0x / 0X followed by hex digits (decimal digits included).
hex_literal = r'0[xX][0-9a-fA-F]+'
# Three shapes: digits '.' [digits] [exponent], digits exponent,
# '.' digits [exponent]; the exponent sign is optional, and every shape may
# carry an imaginary suffix.
floating_literal = (r'\d+\.\d*(?:[eE][+-]?\d+)?i?'
                    r'|\d+[eE][+-]?\d+i?'
                    r'|\.\d+(?:[eE][+-]?\d+)?i?')
# Double-quoted string with backslash escapes, or a back-quoted raw string.
# The body stops at the first unescaped closing quote, so several strings on
# one line are separate tokens.
string_literal = r'"(?:[^"\\]|\\.)*"|`[^`]*`'

In [8]:
# Line comment: '//' followed by whatever '.' matches after it.
comments = r'//.*'

In [9]:
# Combined tokenizer pattern.
# In a regex alternation the first alternative that matches at a position
# wins, so order matters:
#   * hex_literal and floating_literal come before decimal_literal, which
#     would otherwise consume only their leading digits ('0' of '0xFF',
#     '1' of '1.5');
#   * floating_literal comes before operators so a leading '.' followed by
#     digits is not taken as punctuation;
#   * decimal_literal stays ahead of octal_literal: it matches any run of
#     digits, so the token boundaries are the same either way.
patterns = '|'.join([
    keywords,
    identifier,
    hex_literal,
    floating_literal,
    decimal_literal,
    octal_literal,
    operators,
    string_literal,
    comments,
])

In [10]:
# Whitespace-delimited tokenizer: every maximal run of non-space characters
# is one token.
natural_lang_tokenizer = RegexpTokenizer(r'\S+')
# Tokenizer driven by the combined `patterns` alternation.
tokenizer = RegexpTokenizer(patterns)

In [11]:
# Read the answer and question tables, each keyed by PostId.
answers = pd.read_csv('../processed_data/answers.csv').set_index('PostId')
qns = pd.read_csv('../processed_data/questions.csv').set_index('PostId')

In [20]:
def _classify_code_token(t):
    """Return the lexical class label of one non-comment code token.

    The token has to match a pattern as a whole; a mere prefix match (e.g.
    'for' at the start of 'format') is not enough. Patterns are matched in
    verbose mode so that patterns laid out over several lines still work.
    """
    def whole(pattern):
        return re.fullmatch(pattern, t, re.VERBOSE) is not None

    if whole(keywords):
        return 'KEYWORD'
    if whole(identifier):
        return 'IDENTIFIER'
    if whole(operators):
        return 'OPERATOR'
    if whole(hex_literal):
        return 'HEX_LITERAL'
    if whole(floating_literal):
        return 'FLOATING_LITERAL'
    # A lone '0' is a decimal literal; only longer leading-zero numbers are
    # octal. Octal must be tested before decimal because the decimal pattern
    # accepts any run of digits.
    if t != '0' and whole(octal_literal):
        return 'OCTAL_LITERAL'
    if whole(decimal_literal):
        return 'DECIMAL_LITERAL'
    if whole(string_literal):
        return 'STRING_LITERAL'
    return 'UNDEFINED'


def annotatet_code(code, dic, post):
    """Tokenize and label a code block line by line.

    Every non-empty line of `code` is tokenized with `tokenizer`. Code tokens
    are labelled by lexical class. A token matching `comments` is expanded to
    the marker '//' (labelled 'COMMENT') followed by the whitespace-separated
    words of the comment text, each labelled with its `nltk.pos_tag` tag.
    Tokens and labels are kept strictly parallel, both per line and in `dic`.

    Args:
        code: text of one code block.
        dic: dict holding the lists 'tokens' and 'anns'; both are extended
            in place.
        post: list that receives one [line, tokens, annotations] entry per
            non-empty line.

    Returns:
        The same `dic` and `post` objects, as a tuple.
    """
    for line in filter(None, code.split('\n')):
        line_tokens = []
        line_anns = []
        for t in tokenizer.tokenize(line):
            if re.match(comments, t):
                # Drop only the leading marker; a '//' inside the comment
                # text (e.g. in a URL) is part of the text.
                words = natural_lang_tokenizer.tokenize(t[2:])
                line_tokens.append('//')
                line_anns.append('COMMENT')
                # annotate comments as natural language
                line_tokens += words
                line_anns += [tag for _, tag in nltk.pos_tag(words)]
            else:
                line_tokens.append(t)
                line_anns.append(_classify_code_token(t))
        dic['tokens'] += line_tokens
        dic['anns'] += line_anns
        # store line info then append to 'post' array
        post.append([line, line_tokens, line_anns])
    return dic, post

In [21]:
# Only the first CANDIDATE_POSTS posts are scanned. More posts are scanned than
# are needed because a post is skipped when one of its code blocks cannot be
# found verbatim in its body (the 'no-match' problem, caused by different
# formatting/parsing methods from xml to csv).
CANDIDATE_POSTS = 150
# Stop as soon as this many posts have been annotated.
TARGET_POSTS = 100
# Only code blocks strictly longer than this many characters are annotated as
# code; shorter ones stay inside the surrounding natural-language text.
MIN_CODE_LEN = 20


def _split_code_and_text(body, code_blks, min_code_len=MIN_CODE_LEN):
    """Split a post body into ordered (segment, is_code) pairs.

    Every occurrence of every code block longer than `min_code_len` becomes a
    code segment; the text between code segments becomes a natural-language
    segment. Whitespace-only text segments are dropped. When code spans
    overlap, the one that starts first (the longer one on a tie) wins.
    Concatenating the segments in order reproduces `body` up to the dropped
    whitespace.
    """
    spans = []
    for code in set(code_blks):
        if not code or len(code) <= min_code_len:
            continue
        start = body.find(code)
        while start != -1:
            spans.append((start, start + len(code)))
            start = body.find(code, start + len(code))
    # earliest span first; at the same start prefer the longest block
    spans.sort(key=lambda span: (span[0], -span[1]))

    segments = []
    cursor = 0
    for start, end in spans:
        if start < cursor:
            # overlaps a code span that has already been emitted
            continue
        if body[cursor:start].strip():
            segments.append((body[cursor:start], False))
        segments.append((body[start:end], True))
        cursor = end
    if body[cursor:].strip():
        segments.append((body[cursor:], False))
    return segments


# annotations is used to store all the annotated posts
# each element in this array should be one particular post
annotations = []
post_number = 0
for p_id in post_with_code.index[:CANDIDATE_POSTS]:
    # look the post body up among the answers first, then the questions
    if p_id in answers.index:
        body = answers.loc[p_id, 'Body']
    elif p_id in qns.index:
        body = qns.loc[p_id, 'Body']
    else:
        print(str(p_id) + " not found in answers or questions")
        continue
    if not isinstance(body, str):
        # missing body (NaN) or a duplicated PostId: nothing sensible to split
        print(str(p_id) + " has no usable body")
        continue
    if p_id not in selected_code_blks.index:
        print(str(p_id) + " has no selected code blocks")
        continue

    all_code_blks = selected_code_blks.loc[[p_id], 'Code'].dropna().tolist()
    # skip the whole post if any of its code blocks is not in the body
    if any(code_blk not in body for code_blk in all_code_blks):
        print(str(p_id) + " code not match body")
        continue

    # post is used to store line by line info
    post = []
    # dic is used to store the overall info for one post
    dic = {'text' : body, 'tokens' : [], 'anns' : []}
    for segment, is_code in _split_code_and_text(body, all_code_blks):
        if is_code:
            dic, post = annotatet_code(segment, dic, post)
        else:
            token = natural_lang_tokenizer.tokenize(segment)
            ann = [tag for _, tag in nltk.pos_tag(token)]
            post.append([segment, token, ann])
            dic['tokens'] += token
            dic['anns'] += ann

    # df for each post, do line by line annotation
    post_df = pd.DataFrame(post, columns = ['text', 'token', 'annotation'])
    post_df.to_csv('../processed_data/annotations/'+str(post_number)+'.csv')
    annotations.append([p_id, dic['text'], dic['tokens'], dic['anns']])
    post_number += 1

    # only annotate TARGET_POSTS posts
    if post_number == TARGET_POSTS:
        break
# df for all posts
results = pd.DataFrame(annotations, columns = ['post_id', 'text', 'tokens', 'annotations'])

In [17]:
# Write the combined per-post `results` table to a single CSV file.
results.to_csv('../processed_data/ann_100_results.csv')