In [1]:
import ast
import pickle
import os
import pandas as pd

#### Utility to extract method name

In [2]:
from collections import deque

class FunctionCallVisitor(ast.NodeVisitor):
    def __init__(self):
        self._name = deque()
    
    @property
    def name(self):
        return '.'.join(self._name)
    
    @name.deleter
    def name(self):
        self._name.clear()
    
    def visit_Name(self, node):
        self._name.appendleft(node.id)
    
    def visit_Attribute(self, node):
        try:
            self._name.appendleft(node.attr)
            self._name.appendleft(node.value.id)
        except AttributeError:
            self.generic_visit(node)
            
def get_func_calls(tree):
    func_calls = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Call):
            callvisitor = FunctionCallVisitor()
            callvisitor.visit(node.func)
            func_calls.append(callvisitor.name)
    return func_calls

#### Load APIDoc and Dataset

In [20]:
#API_DOC_FILE_PATH = 'api_doc.pkl'
#DATASET_FILE_PATH = 'dataset_df.pkl'
#api_df = pickle.load( open( API_DOC_FILE_PATH, "rb" ))
#dataset_df = pickle.load( open( DATASET_FILE_PATH, "rb" ))

API_DOC_OBJ_FILE_PATH = '../../data-import/build_api_doc_base/api_doc.csv'
TAGGED_DATASET_FILE_PATH = '../../../data/stack-overflow/Dataset - Pandas.csv'
STACK_OVERFLOW_PROCESSED_DUMP_FILE = '../../../data/stack-overflow/pandas-preprocessedcode-dataset-part3.csv'

api_df = pd.read_csv(API_DOC_OBJ_FILE_PATH, encoding='ISO-8859-1', error_bad_lines=False)
tagged_dataset_df = pd.read_csv(TAGGED_DATASET_FILE_PATH, encoding='ISO-8859-1', error_bad_lines=False)
processed_stackoverflow_df = pd.read_csv(STACK_OVERFLOW_PROCESSED_DUMP_FILE, encoding='ISO-8859-1', error_bad_lines=False)



def buildMethodNameSet(api_df):
    method_set = set()
    for index, row in api_df.iterrows():
        method_set.add(row['MethodName'])
    return method_set
        

def buildTaggedDatasetDSForEvaluation(tagged_dataset_df):
    dataset_dict = dict()
    total_solutions = 0
    try:    
        for idx, row in tagged_dataset_df.iterrows():
            answerId = row['AnswerId']
            if answerId != 0:
                total_solutions = total_solutions +1
                tup = (int(row['SolutionId']), row['Solution'])
                if answerId in dataset_dict:
                    ls = dataset_dict[answerId]
                    ls.append(tup)
                    dataset_dict[answerId] = ls
                else:
                    ls = list()
                    ls.append(tup)
                    dataset_dict[answerId] = ls
    except Exception as e:
        print(e)
        
    return dataset_dict, total_solutions

def buildStackOverflowDumpDict(processed_stackoverflow_df):
    stackoverflow_dict = dict()
    try:
        for idx, row in processed_stackoverflow_df.iterrows():
            postTypeId = row['PostTypeId']
            if postTypeId == 2:
                answerId = row['Id']
                stackoverflow_dict[answerId] = row['PreprocessedCode']
    except Exception as e:
        print(e)
    return stackoverflow_dict
    
method_set = buildMethodNameSet(api_df)
tagged_dataset_dict, total_solutions = buildTaggedDatasetDSForEvaluation(tagged_dataset_df)
stackoverflow_dict = buildStackOverflowDumpDict(processed_stackoverflow_df)

cannot convert float NaN to integer


In [17]:
def lookUpAPIDoc(method_set, method_name):
    if method_name in method_set:
        return True
    else:
        return False

In [19]:
PROCESSED_CODE_OUTPUT_FOLDER = '../../data-preprocess/neelesh/processed_answer_codes'


TP = 0
TN = 0
FP = 0
FN = 0

'''for fil in os.listdir(PROCESSED_CODE_OUTPUT_FOLDER):
    try:
        solutionList = dataset_dict[int(fil)]
        with open(os.path.join(PROCESSED_CODE_OUTPUT_FOLDER, fil), 'r') as f:
            lines = f.readlines()
            #print f
            for line in lines:
                actualSolution = False
                predictedSolution = False
                for tup in solutionList:
                    if tup[1].strip() == line.strip():
                        actualSolution = True
                tree = ast.parse(line)
                func_calls = get_func_calls(tree)
                for func_call in func_calls:
                    tokens = func_call.split('.')
                    method_name = tokens[len(tokens)-1]
                    if lookUpAPIDoc(method_set, method_name):
                        predictedSolution = True
                        break
                if actualSolution and predictedSolution:
                    TP = TP + 1
                if (not actualSolution) and predictedSolution:
                    FP = FP + 1
                if (not actualSolution) and (not predictedSolution):
                    TN = TN + 1
                if actualSolution and (not predictedSolution):
                    FN = FN + 1
    except Exception as e:
        print e'''


for key in tagged_dataset_dict.keys():
    try:
        solutionList = tagged_dataset_dict[key]
        content = str(stackoverflow_dict[key])
        lines = content.split(os.linesep)
        for line in lines:
            actualSolution = False
            predictedSolution = False
            for tup in solutionList:
                    if tup[1].strip() == line.strip():
                        actualSolution = True
            tree = ast.parse(line)
            func_calls = get_func_calls(tree)
            for func_call in func_calls:
                    tokens = func_call.split('.')
                    method_name = tokens[len(tokens)-1]
                    if lookUpAPIDoc(method_set, method_name):
                        predictedSolution = True
                        break
            if actualSolution and predictedSolution:
                TP = TP + 1
            if (not actualSolution) and predictedSolution:
                FP = FP + 1
            if (not actualSolution) and (not predictedSolution):
                TN = TN + 1
            if actualSolution and (not predictedSolution):
                FN = FN + 1
    except Exception as e:
        print(e)
        
acc = (TP+TN)*100/(TP+TN+FP+FN)
Precision = TP*100/(TP + FP)
Recall = TP*100/(TP+FN)
F1 = (2*Precision*Recall)/(Precision+Recall)
F1 = float(F1)/100
print('Accuracy = '+ str(acc) + '%')
print('Precision = '+ str(Precision) + '%')
print('Recall = '+ str(Recall) + '%')
print('F1 = '+ str(F1))


41386927
39923012
37787724
46526249
Accuracy = 90.5940594059406%
Precision = 63.855421686746986%
Recall = 86.88524590163935%
F1 = 0.7361111111111112
