In [1]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import pickle
import os
import re
import pandas as pd

In [2]:
# Porter stemmer instance shared by the tokenisation helpers below.
stemmer = nltk.stem.porter.PorterStemmer()
# Translation table for str.translate: every character of string.punctuation
# maps to None, i.e. it is deleted from the text.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def stem_tokens(tokens):
    """Return a list holding the stem of every token, in the original order.

    Stemming is delegated to the module-level ``stemmer``.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed


def normalize(text):
    """Remove punctuation, lowercase, stem.

    The text is lowercased, characters listed in ``remove_punctuation_map``
    are deleted, the result is tokenised with ``nltk.word_tokenize`` and
    every token is passed through ``stem_tokens``.

    Returns the list of stemmed tokens.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    return stem_tokens(nltk.word_tokenize(without_punctuation))

# TF-IDF vectoriser shared by cosine_sim(): tokens come from normalize() and the
# built-in 'english' stop-word list is requested.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    """Return the TF-IDF similarity score between two texts.

    Both texts are cleaned with ``removeSpecialChars`` and the shared
    ``vectorizer`` is re-fitted on just this pair. The score is the
    off-diagonal entry of the product of the TF-IDF matrix with its
    transpose. With TfidfVectorizer's documented default L2 row
    normalisation that entry is the cosine similarity of the two rows.
    """
    cleaned_pair = [removeSpecialChars(text1), removeSpecialChars(text2)]
    tfidf = vectorizer.fit_transform(cleaned_pair)
    # `@` is a matrix product for both scipy sparse matrices and sparse arrays
    # (`*` is only a matrix product for the legacy matrix type), and
    # `.toarray()` is the explicit spelling of the `.A` shorthand.
    pairwise = (tfidf @ tfidf.T).toarray()
    return pairwise[0, 1]

def removeSpecialChars(text):
    """Replace every character that is not an ASCII letter or digit with a space."""
    non_alphanumeric = re.compile("[^a-zA-Z0-9]")
    return non_alphanumeric.sub(" ", text)

In [3]:
from collections import deque

# Visitor that extracts attribute accesses, including method calls. Every token after a '.'
# is treated as an attribute, whether it is a function call or anything else.

class AttributeVisitor(ast.NodeVisitor):
    """Build the dotted name and line number of an attribute expression.

    Name parts are prepended while the expression is traversed, so the
    attribute of the node visited first always ends up as the last part of
    ``name``. When the receiver of an attribute is not a bare name, all of
    its children are visited; every ``ast.Name`` reached that way (including
    names inside call arguments or subscripts) is prepended too, and the
    recorded line number is that of the node visited last.
    """

    def __init__(self):
        # Line number of the most recently visited Name/Attribute node; -1 before any visit.
        self._pos = -1
        # Parts of the dotted name, leftmost part first.
        self._name = deque()

    @property
    def lineno(self):
        """Line number recorded by the most recent visit."""
        return self._pos

    @property
    def name(self):
        """The collected parts joined with dots."""
        return ".".join(self._name)

    @name.deleter
    def name(self):
        self._name.clear()

    def visit_Name(self, node):
        self._pos = node.lineno
        self._name.appendleft(node.id)

    def visit_Attribute(self, node):
        try:
            self._pos = node.lineno
            self._name.appendleft(node.attr)
            receiver = node.value.id
        except AttributeError:
            # The receiver is not a bare name: keep descending into the children.
            self.generic_visit(node)
        else:
            self._name.appendleft(receiver)
            
def get_all_calls(tree):
    """Return ``(dotted_name, lineno)`` for every ``ast.Attribute`` node in ``tree``.

    Nodes are taken in ``ast.walk`` order and each one is resolved with a
    fresh ``AttributeVisitor``.
    """
    def describe(attribute_node):
        visitor = AttributeVisitor()
        visitor.visit(attribute_node)
        return (visitor.name, visitor.lineno)

    return [
        describe(node)
        for node in ast.walk(tree)
        if isinstance(node, ast.Attribute)
    ]

# Visiting method calls only
class FunctionCallVisitor(ast.NodeVisitor):
    """Resolve the dotted name and line number of a call target.

    Parts are pushed onto the left of a deque as nodes are visited. If the
    receiver of an attribute is not a bare name, its children are visited
    as well and any ``ast.Name`` found there is also pushed; the line number
    kept is the one of the last node visited.
    """

    def __init__(self):
        self._name = deque()  # dotted-name parts, leftmost first
        self._pos = -1        # -1 until a Name/Attribute node has been visited

    @property
    def name(self):
        """Dot-joined name collected so far."""
        parts = self._name
        return '.'.join(parts)

    @name.deleter
    def name(self):
        self._name.clear()

    @property
    def lineno(self):
        """Line number of the most recently visited node."""
        return self._pos

    def visit_Attribute(self, node):
        parts = self._name
        try:
            self._pos = node.lineno
            parts.appendleft(node.attr)
            parts.appendleft(node.value.id)
        except AttributeError:
            # node.value has no ``id`` (it is not a plain name): walk its children.
            self.generic_visit(node)

    def visit_Name(self, node):
        self._pos = node.lineno
        self._name.appendleft(node.id)
            
def get_func_calls(tree):
    """Return ``(dotted_name, lineno)`` for the target of every ``ast.Call`` in ``tree``.

    Calls are taken in ``ast.walk`` order; only the ``func`` part of each call
    is handed to a fresh ``FunctionCallVisitor``.
    """
    func_calls = []
    call_nodes = (node for node in ast.walk(tree) if isinstance(node, ast.Call))
    for call_node in call_nodes:
        visitor = FunctionCallVisitor()
        visitor.visit(call_node.func)
        func_calls.append((visitor.name, visitor.lineno))
    return func_calls

In [4]:
# Input files (relative paths).
api_doc_file = "../../data-import/build_api_doc_base/api_doc.csv"
dataset = "../../../data/stack-overflow/Dataset - Pandas.csv"
so_dump_processed_file = "../../../data/stack-overflow/pandas-preprocessedcode-dataset-part3"

# Column names read from each row of the processed Stack Overflow frame.
id_col = "Id"
code_snippet_col = "PreprocessedCode3"

# Cosine-similarity threshold, currently commented out:
#cosine_sim_th = 0.0

In [5]:
def _read_csv_tolerant(path):
    """Read an ISO-8859-1 encoded CSV without failing on malformed lines.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0.
    ``on_bad_lines='warn'`` is its replacement and keeps the old default of
    warning about each skipped line. Older pandas versions that do not know
    ``on_bad_lines`` raise TypeError, in which case the original keyword is
    used.
    """
    try:
        return pd.read_csv(path, encoding='ISO-8859-1', on_bad_lines='warn')
    except TypeError:
        return pd.read_csv(path, encoding='ISO-8859-1', error_bad_lines=False)


api_df = _read_csv_tolerant(api_doc_file)
dataset_df = _read_csv_tolerant(dataset)

# NOTE(security): unpickling can execute arbitrary code; only load pickle files
# from a trusted source.
processed_stackoverflow_df = pd.read_pickle(so_dump_processed_file)


## Get API description with fully qualified name for a method from API doc and build the context
def buildAPIDictionary(api_df):
    """Map every ``MethodName`` to a context string built from its row.

    The context is the row's ``Description`` (converted with ``str``)
    followed by each dot-separated token of ``FullyQualifiedName``, all
    joined with single spaces. When a method name occurs more than once,
    the last row wins.

    A row that cannot be processed (for example one whose
    ``FullyQualifiedName`` is missing) is reported with ``print`` and
    skipped, so a single bad row no longer discards all the rows after it.
    The function never raises: any other failure is printed and whatever
    was collected so far is returned.
    """
    api_dict = dict()
    try:
        for index, row in api_df.iterrows():
            try:
                description = row['Description']
                tokens = row['FullyQualifiedName'].split('.')
                api_dict[row['MethodName']] = ' '.join([str(description)] + tokens)
            except Exception as e:
                # Report the bad row and continue with the next one.
                print(e)
    except Exception as e:
        # Failure outside a single row (e.g. the argument cannot be iterated).
        print(e)
    return api_dict

def buildAPIDictionaryH2(api_df):
    """Map every ``MethodName`` to the ``SubCategory`` value of its row.

    When a method name occurs more than once, the last row wins. Any
    exception is printed (tagged with this function's name) and the entries
    collected so far are returned.
    """
    api_dict_h2 = dict()
    try:
        for index, row in api_df.iterrows():
            api_dict_h2[row['MethodName']] = row['SubCategory']
    except Exception as e:
        # The message previously named buildAPIDictionary, which pointed
        # readers at the wrong function.
        print('Error in method buildAPIDictionaryH2', e)
    return api_dict_h2
        
## Get AnswerId and Question Text combo from dataset to build the context
def buildAnswerIdQuestionTextDict(dataset_df):
    """Map each non-zero ``AnswerId`` to the ``QuestionText`` of its row.

    Rows whose ``AnswerId`` equals 0 are left out. Any exception stops the
    scan, is printed, and the entries gathered so far are returned.
    """
    question_text_by_answer = {}
    try:
        for _, record in dataset_df.iterrows():
            answer_id = record['AnswerId']
            if answer_id != 0:
                question_text_by_answer[answer_id] = record['QuestionText']
    except Exception as e:
        print(e)

    return question_text_by_answer

# Module-level lookup tables used by the helper functions in the following cells:
#   api_dict                    - method name -> description + qualified-name tokens
#   api_dict_h2                 - method name -> SubCategory
#   dataset_answerId_QText_Dict - answer id   -> question text
api_dict = buildAPIDictionary(api_df)
api_dict_h2 = buildAPIDictionaryH2(api_df)
dataset_answerId_QText_Dict = buildAnswerIdQuestionTextDict(dataset_df)

In [6]:
def lookUpAPIDocForContext(method_name):
    """Return the ``api_dict`` context for ``method_name``, or "" when it has no entry."""
    return api_dict.get(method_name, "")

def getSOContext(answerId):
    """Return the question text stored for ``answerId``.

    The id is converted with ``int`` before the lookup in
    ``dataset_answerId_QText_Dict``; an unknown id raises KeyError.
    """
    lookup_key = int(answerId)
    return dataset_answerId_QText_Dict[lookup_key]

def lookUpAPIDocForContextH1H2(method_name):
    """Tell whether ``method_name`` is a known API method that is not a constructor.

    Returns True when ``method_name`` is a key of ``api_dict_h2`` and its
    stored sub-category is not "Constructor"; returns False otherwise.
    If the lookup itself fails, the error is printed and False is returned
    (previously the function fell through and returned None).
    """
    try:
        if method_name not in api_dict_h2:
            return False
        return not (api_dict_h2[method_name] == "Constructor")
    except Exception as e:
        # The message previously named lookUpAPIDocForContext, a different function.
        print('Error in method lookUpAPIDocForContextH1H2', e)
        return False

In [10]:
def M1(df):
    """Attach a ``Solution`` entry to one row of the processed Stack Overflow frame.

    ``df`` is a single row (the function is applied with ``axis=1``).

    * Question rows (``PostTypeId == 1``) get ``Solution = 'NA'``.
    * For any other row the code snippet is parsed and every attribute
      access is scored: the last token of its dotted name is looked up in
      the API documentation, and the API context is compared with the
      question text via ``cosine_sim``. Attributes that are not in the API
      documentation score -1 and can never be selected. Each time a score
      beats the running maximum, the line it occurs on is recorded.
      ``Solution`` becomes the list of recorded lines in source order, and
      the line with the highest score is printed as ``<Id>,<line>``.

    A KeyError (for example an answer id with no question text) is swallowed;
    the row is then returned without a ``Solution`` entry being set.

    Returns the same row object.
    """
    try:
        # Do not process questions
        if df.PostTypeId == 1:
            df['Solution'] = 'NA'

        else:
            # Parse code and inspect the function
            code_snippet = df[code_snippet_col]
            Id = df[id_col]
            tree = ast.parse(code_snippet)
            all_calls = get_all_calls(tree)

            # ast line numbers count "\n", "\r\n" and "\r" as line breaks on
            # every platform. Splitting on os.linesep only matches that on
            # systems whose separator happens to equal the one used in the
            # snippet, and otherwise makes the index lookups below fail or
            # pick the wrong line.
            snippet_per_line = re.split(r'\r\n|\r|\n', code_snippet)

            solution_lines = set()
            max_score = -1
            max_line = ''
            for call, lineno in all_calls:
                # The method name is the last token of the dotted name.
                method_name = call.rsplit('.', 1)[-1]

                API_Context = lookUpAPIDocForContext(method_name)

                if API_Context == "":
                    # Not in the API documentation: reject the method with a
                    # score that can never exceed the initial maximum.
                    cos_score = -1
                else:
                    # Compare the API description with the question text.
                    SO_Context = getSOContext(Id)
                    cos_score = cosine_sim(API_Context, SO_Context)

                if cos_score > max_score:
                    max_score = cos_score
                    max_line = snippet_per_line[lineno - 1]
                    # Use a set to not add the same line twice
                    solution_lines.add(lineno - 1)

            if max_line:
                print(str(Id)+','+max_line)

            df['Solution'] = [snippet_per_line[i] for i in sorted(solution_lines)]
    except KeyError:
        # Best effort: rows with a missing column or unknown answer id are
        # returned unchanged.
        pass
    return df


# Run M1 on every row (axis=1). For each row where M1 finds a best-matching line it
# prints "<Id>,<line>", which produces the listing shown below this cell.
solution_df = processed_stackoverflow_df.apply(M1, axis=1)

10202789,df = pandas.DataFrame(np.random.randn(5,3),columns=['A','B','C'])
10374456,DataFrame({'count' : df1.groupby( [ "Name", "City"] ).size()}).reset_index()
10458386,In : data2 = data.set_index('a')
11067072,df.reindex_axis(sorted(df.columns), axis=1)
11138275,df_ora = pd.read_sql('select * from user_objects', con=ora_conn)    
11287278,df1 = df.ix[0,0:2].copy() 
11346337,df = pd.DataFrame({'$a':[1,2], '$b': [10,20]})
11354850,df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})
11362056,paramdata.index
11531402,df[df['A'].str.contains("hello")]
11711637,pd.set_option('display.height', 1000)
12098586,df[df['A'].isin([3, 6])]
12525836,df_norm.max() - df_norm.min()
12555510,df1 = df1.assign(e=p.Series(np.random.randn(sLength)).values)
12681217,                    for _, row in a.iterrows()]).reset_index()
13148611,cols = df.columns.tolist()
13295801,df.fillna(0)
13434501,df = pd.DataFrame(np.random.randn(10,3))
13682381,data['result'] = data['result'].map(lambda x