In [1]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import pickle
import os
import re
import pandas as pd

#### Luigi migration. Task yet to be created

In [27]:
import os
import sys
module_path = os.path.abspath(os.path.join("../.."))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from jupyter_notebook import load_parameters 


pars = load_parameters()

api_doc_file = pars.get('api_doc_file')
tagged_dataset_file = pars.get('tagged_dataset_file')
so_dump_processed_file = pars.get('so_dump_processed_file')
output_file = pars.get('output_file')
cosine_sim_th = pars.get('cosine_sim_th')

debug = pars.get('debug')

ImportError: No module named 'jupyter_notebook'

#### Utility to get similarity between two texts

In [2]:
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]


'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    text1 = removeSpecialChars(text1)
    text2 = removeSpecialChars(text2)
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]

def removeSpecialChars(text):
    return re.sub("[^a-zA-Z0-9]", " ", text)

#### Utility to extract method name

In [3]:
from collections import deque

class FunctionCallVisitor(ast.NodeVisitor):
    def __init__(self):
        self._name = deque()
    
    @property
    def name(self):
        return '.'.join(self._name)
    
    @name.deleter
    def name(self):
        self._name.clear()
    
    def visit_Name(self, node):
        self._name.appendleft(node.id)
    
    def visit_Attribute(self, node):
        try:
            self._name.appendleft(node.attr)
            self._name.appendleft(node.value.id)
        except AttributeError:
            self.generic_visit(node)
            
def get_func_calls(tree):
    func_calls = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Call):
            callvisitor = FunctionCallVisitor()
            callvisitor.visit(node.func)
            func_calls.append(callvisitor.name)
    return func_calls

#### Load APIDoc and Dataset

In [4]:
api_doc_file = '../../data-import/build_api_doc_base/api_doc.csv'
tagged_dataset_file = '../../../data/stack-overflow/Dataset - Pandas.csv'
so_dump_processed_file = '../../../data/stack-overflow/pandas-preprocessedcode-dataset-part3'

api_df = pd.read_csv(api_doc_file, encoding='ISO-8859-1', error_bad_lines=False)
tagged_dataset_df = pd.read_csv(tagged_dataset_file, encoding='ISO-8859-1', error_bad_lines=False)
processed_stackoverflow_df = pd.read_pickle(so_dump_processed_file)

def buildAPIDictionary(api_df):
    api_dict = dict()
    try:
        
        for index, row in api_df.iterrows():
            methodContext = row['Description']
            tokens = row['FullyQualifiedName'].split('.')
        
            for token in tokens:
                methodContext = str(methodContext)+' '+token
            api_dict[row['MethodName']] = methodContext
    except Exception as e:
        print(e)
    return api_dict
        

def buildTaggedDatasetDSForEvaluation(tagged_dataset_df):
    dataset_dict = dict()
    total_solutions = 0
    try:    
        for idx, row in tagged_dataset_df.iterrows():
            answerId = row['AnswerId']
            if answerId != 0:
                total_solutions = total_solutions +1
                tup = (int(row['SolutionId']), row['Solution'])
                if answerId in dataset_dict:
                    ls = dataset_dict[answerId]
                    ls.append(tup)
                    dataset_dict[answerId] = ls
                else:
                    ls = list()
                    ls.append(tup)
                    dataset_dict[answerId] = ls
    except Exception as e:
        e
        
    return dataset_dict, total_solutions

def buildAnswerIdQuestionTextDict(tagged_dataset_df):
    dataset_answerId_QText_Dict = dict()
    try:
        for idx, row in tagged_dataset_df.iterrows():
            answerId = row['AnswerId']
            if answerId != 0:
                dataset_answerId_QText_Dict[answerId] = row['QuestionText']
    except Exception as e:
        print(e)
        
    return dataset_answerId_QText_Dict

def buildStackOverflowDumpDict(processed_stackoverflow_df):
    stackoverflow_dict = dict()
    try:
        for idx, row in processed_stackoverflow_df.iterrows():
            postTypeId = row['PostTypeId']
            if postTypeId == 2:
                answerId = row['Id']
                stackoverflow_dict[answerId] = row['PreprocessedCode']
    except Exception as e:
        e
    return stackoverflow_dict
    
api_dict = buildAPIDictionary(api_df)
tagged_dataset_dict, total_solutions = buildTaggedDatasetDSForEvaluation(tagged_dataset_df)
dataset_answerId_QText_Dict = buildAnswerIdQuestionTextDict(tagged_dataset_df)
stackoverflow_dict = buildStackOverflowDumpDict(processed_stackoverflow_df)

In [5]:
def lookUpAPIDocForContext(method_name):
    if method_name in api_dict.keys():
        return api_dict[method_name]
    else:
        return ""

def getSOContext(answerId):
    return dataset_answerId_QText_Dict[int(answerId)]

all the lines --> tp +tn + fp+fn
tp-> identified as solution and the solution
fp-> identified as solution but not the solution
tn-> not identified as solution and also not the solution
fn-> not identified as solution but the solution

Accuracy: (tp+tn)/(tp+tn+fp+fn)
Precision: tp/(tp+fp)
Recall: tp/(tp+fn)
F1: (2*P*R)/(P+R)

TP-> Lines identified as solution are the solution
TN -> Lines not identified as solution are not the solution

In [6]:
def applyM1(cosine_sim_thresould):
    df_columns = ['ansId', 'actual','line','predicted']
    result_df = pd.DataFrame(columns=df_columns)
    df_row_id = 0
    for key in tagged_dataset_dict.keys():
        try:
            solutionList = tagged_dataset_dict[key]
            content = str(stackoverflow_dict[key])
            lines = content.split(os.linesep)
            for line in lines:
                actualSolution = False
                predictedSolution = False
                for tup in solutionList:
                    if tup[1].strip() == line.strip():
                        actualSolution = True
                
                tree = ast.parse(line)
                func_calls = get_func_calls(tree)
                for func_call in func_calls:
                    tokens = func_call.split('.')
                    method_name = tokens[len(tokens)-1]
                    API_Context = lookUpAPIDocForContext(method_name)
                    if API_Context == "":
                        cos_score = -1 # Reject the method
                    else:
                        SO_Context = getSOContext(key)
                        cos_score = cosine_sim(API_Context, SO_Context)
                    if cos_score > cosine_sim_thresould:
                        predictedSolution = True
                        break
                
                result_df.loc[df_row_id] = [key, line, actualSolution, predictedSolution]
                df_row_id = df_row_id + 1
        except Exception as e:
            e
    return result_df

In [7]:
cosine_sim_th = 0.0
result_df = applyM1(cosine_sim_th)

In [8]:
output_file = '../../../data/results/result_df_m1'
debug = True

result_df.to_pickle(output_file)

if debug:
    result_df.to_csv(output_file + ".csv", encoding='ISO-8859-1', sep=",", doublequote=True, index=False)
