In [1]:
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import nltk
import re
import numpy as np

In [2]:
# Load the CSV of code snippets; later cells read its 'Code' and 'PostId' columns.
codes = pd.read_csv("../processed_data/code_top300.csv")
# Empty frame with the three result column names.
# NOTE(review): none of the visible cells reads `df` -- confirm whether it is still needed.
df = pd.DataFrame(columns = ['text', 'tokens', 'annotations'])

In [3]:
# Go identifier: a letter or underscore followed by letters, digits or
# underscores. `[^\W\d]` means "a word character that is not a digit", so
# Unicode letters are accepted as the first character as well (Go allows them),
# and `\w` already covers digits and the underscore.
identifier = r'[^\W\d]\w*'

In [4]:
# The 25 reserved words of Go.
# The pattern is assembled from adjacent string literals so that no newline or
# indentation ends up inside an alternative (the regex is never compiled with
# re.VERBOSE, so such whitespace would have to match literally).
# The surrounding \b anchors make the pattern match whole words only: `format`
# is not read as the keyword `for`, and `goto` is not cut short to `go`.
keywords = (r'\b(?:break|default|func|interface|select|case|defer'
            r'|go|map|struct|chan|else|goto|package|switch'
            r'|const|fallthrough|if|range|type|continue|for|import|return|var)\b')

In [5]:
# Operators and punctuation. A token is a maximal run of these characters, so
# adjacent symbols such as `:=`, `()` or `})` come out as a single token.
# `%` is part of the character class and `/` is accepted through the second
# alternative, so modulo and division are not dropped from the token stream.
# The slash only matches when it is neither preceded nor followed by another
# slash, which leaves `//` to the line-comment pattern.
operators = r'(?:[\+\-\*\,;\$><!:\.\|&\^=\(\)\[\]\{\}%]|(?<!/)/(?!/))+'

In [6]:
# Integer literals; a trailing `i` marks an imaginary literal.
decimal_literal = r'\d+i?'
# Legacy octal: a leading 0 followed by at least one octal digit (0-7).
# A lone `0` is deliberately left to decimal_literal.
octal_literal = r'0[0-7]+'
# Hexadecimal: digits 0-9 are valid hex digits too, not only a-f.
hex_literal = r'0[xX][0-9a-fA-F]+'
# Floating point, three shapes: `1.5` / `1.` with optional exponent,
# `1e10` (exponent required when there is no dot), and `.5` with optional
# exponent. The exponent sign is optional. The pattern is written without any
# layout whitespace because it is never compiled with re.VERBOSE.
floating_literal = (r'\d+\.\d*(?:[eE][+-]?\d+)?i?'
                    r'|\d+[eE][+-]?\d+i?'
                    r'|\.\d+(?:[eE][+-]?\d+)?i?')
# Interpreted string: stops at the first unescaped double quote, so two strings
# on one line stay two tokens; backslash escapes (including \") are skipped
# over. The second alternative covers a back-quoted raw string on one line.
string_literal = r'"(?:[^"\\\n]|\\.)*"|`[^`]*`'

In [7]:
# Line comment: `//` followed by whatever comes after it.
# Block comments (`/* ... */`) are not covered by this pattern.
comments = r'//.*'

In [8]:
# One big alternation for the tokenizer. The regex engine takes the FIRST
# alternative that matches at a position, not the longest, so order matters:
#   * keywords come before identifiers;
#   * numeric literals go from most to least specific (floating, hex, then
#     decimal) so that `1.5` or `0xFF` is not cut after its leading digits;
#   * decimal precedes octal because it consumes the whole digit run, keeping
#     `007` in one piece; the octal/decimal distinction is a labelling matter;
#   * numbers precede operators so that a leading-dot float such as `.5` is
#     not started as the operator `.`.
patterns = '|'.join([
    keywords,
    identifier,
    floating_literal,
    hex_literal,
    decimal_literal,
    octal_literal,
    operators,
    string_literal,
    comments,
])

In [9]:
# Tokenizer driven by the combined pattern. With RegexpTokenizer's default
# settings the pattern describes the tokens themselves (not the gaps between
# them), so text matched by no alternative does not appear in the output.
tokenizer = RegexpTokenizer(patterns)

In [13]:
# (label, pattern) pairs tried in this order against the WHOLE token.
# Order only matters where two patterns can both cover a full token:
# keyword before identifier, and octal before decimal.
_TOKEN_CLASSES = [
    ('KEYWORD', keywords),
    ('IDENTIFIER', identifier),
    ('STRING_LITERAL', string_literal),
    ('HEX_LITERAL', hex_literal),
    ('FLOATING_LITERAL', floating_literal),
    ('OCTAL_LITERAL', octal_literal),
    ('DECIMAL_LITERAL', decimal_literal),
    ('OPERATOR', operators),
]


def _annotate_token(token):
    """Return the list of annotation labels for a single token.

    A comment token yields 'COMMENT' followed by one NLTK part-of-speech tag
    per word of the comment text; every other token yields exactly one label
    ('UNDEFINED' when no pattern covers it).
    """
    if re.match(comments, token):
        # Drop only the leading slashes; a `//` inside the comment text
        # (for example in a URL) is part of the natural-language content.
        words = nltk.word_tokenize(token.lstrip('/'))
        return ['COMMENT'] + [tag for _, tag in nltk.pos_tag(words)]
    for label, pattern in _TOKEN_CLASSES:
        if label == 'OCTAL_LITERAL' and token == '0':
            # A lone 0 is a decimal literal in Go, even if the octal
            # pattern would accept it.
            continue
        # fullmatch, not match: a prefix test would label `1.5` or `0xFF`
        # as decimal because they merely start with digits.
        if re.fullmatch(pattern, token):
            return [label]
    return ['UNDEFINED']


# One record per post (first 100 posts), collected for the summary frame.
annotations = []
for index, code in enumerate(codes['Code'][:100]):
    # Per-line rows for this post: [line text, tokens, labels].
    post = []
    post_tokens = []
    post_labels = []

    # Blank lines carry no tokens and are skipped.
    for line in filter(None, code.split('\n')):
        tokens = tokenizer.tokenize(line)
        # NOTE: a comment token contributes several labels, so `labels` can be
        # longer than `tokens` on lines that contain a comment.
        labels = [label for t in tokens for label in _annotate_token(t)]
        post.append([line, tokens, labels])
        post_tokens += tokens
        post_labels += labels

    # Line-by-line annotation file for this post.
    post_df = pd.DataFrame(post, columns = ['text', 'token', 'annotation'])
    post_df.to_csv('../processed_data/annotations/'+str(index)+'.csv')

    # Keyed by the final column names, so nothing depends on dict ordering.
    annotations.append({'text': code, 'tokens': post_tokens, 'annotations': post_labels})

# Summary frame: one row per post.
results = pd.DataFrame(annotations, columns = ['text', 'tokens', 'annotations'])

In [14]:
# Add the post ids as a trailing column (pandas aligns them on the row index),
# then write the per-post summary to disk.
results = results.assign(post_id=codes['PostId'])
results.to_csv('../processed_data/ann_100_results.csv')