In [1]:
import os
import pickle

import pandas as pd

## 1 Load Data 

1. Source code function : function and docstring training data for word embedding (py100k_2.pkl)
2. Stackoverflow QA pair and result : SO QA pair and similarity result 
(download link : https://drive.google.com/file/d/1_6aSg-BzrZirhML1HWBlZ2FPDmLep-5f/view?usp=sharing)
3. Source python code : the path list to find the original .py file (python100k_train.txt) 
(download link : https://www.sri.inf.ethz.ch/py150)


In [2]:
# Input file locations, written relative to the notebook's working directory.
# Pickled function records, loaded below by load_words_from_ast().
function_path = './data/py100k_2.pkl'
# Pickled Stack Overflow QA results, stored in two parts that are concatenated below.
SO_QA_path = './data/SO_similarity_0_200.pkl'
SO_QA_path_2 = './data/SO_similarity_200_400.pkl'
# Text file that is read line by line into source_list below.
python_code_path = './data/python100k_train.txt'

### Source code function

In [3]:
# load in processed file. A set of keywords for each document (source code function)
def load_words_from_ast(file_path):
    """Read the pickled function records at ``file_path`` into a DataFrame.

    The unpickled object is handed to ``pd.DataFrame`` with the columns
    ``data_id``, ``function_name``, ``docstring`` and ``func_call``. The
    number of rows is printed before the frame is returned.
    """
    column_names = ['data_id', 'function_name', 'docstring', 'func_call']

    with open(file_path, 'rb') as pickle_file:
        records = pickle.load(pickle_file)

    frame = pd.DataFrame(records, columns=column_names)
    print("Total Number of Functions in \"{}\": {}".format(file_path, len(frame)))
    return frame

In [4]:
# Load the function table; the loader also prints how many rows it read.
function_df=load_words_from_ast(function_path)
# function_df.head()

Total Number of Functions in "./data/py100k_2.pkl": 742490


### Stackoverflow QA pair and result

In [5]:
# The Stack Overflow results are stored as two pickles; unpickle them in
# order and stack the two parts with pd.concat.
so_parts = []
for part_path in (SO_QA_path, SO_QA_path_2):
    with open(part_path, 'rb') as f:
        so_parts.append(pickle.load(f))

_0_200, _200_400 = so_parts
so_result = pd.concat(so_parts)

In [8]:
# question with python tag only
# so_result[(so_result.Tags == '<python>')|(so_result.Tags == '<python-3.x>')]

In [7]:
# Keep every line of the listing exactly as read (trailing newline included);
# the lines are later looked up by position via a function's data_id.
with open(python_code_path) as data_file:
    source_list = list(data_file)

## 2 Search Result

### Print match function

In [9]:
def print_question_function(post_link_id, so_result=None, function_df=None):
    """Print a question's title followed by every function matched to it.

    Parameters
    ----------
    post_link_id
        Value looked up in the ``'Post Link'`` column of ``so_result``.
    so_result : pandas.DataFrame, optional
        Frame with ``'Post Link'``, ``'Question Title'`` and ``'func_id'``
        columns. When omitted, the notebook-level ``so_result`` is used.
    function_df : pandas.DataFrame, optional
        Frame whose rows are addressed positionally by the entries of
        ``func_id``. When omitted, the notebook-level ``function_df`` is used.

    Raises
    ------
    IndexError
        If no row of ``so_result`` has the requested ``'Post Link'``.
    """
    # Resolve the defaults at call time rather than at definition time, so
    # re-running the data-loading cells is picked up without redefining this
    # function, and the function can be defined before the data is loaded.
    if so_result is None:
        so_result = globals()['so_result']
    if function_df is None:
        function_df = globals()['function_df']

    # Filter once and reuse the match for both the title and the func ids.
    matches = so_result[so_result['Post Link'] == post_link_id]
    if matches.empty:
        raise IndexError(
            "No question with Post Link {!r} in so_result".format(post_link_id)
        )
    question = matches.iloc[0]

    print("Question: ")
    print(question['Question Title'])
    print("")

    # func_id holds positional row numbers into function_df, in result order.
    for rank, func_idx in enumerate(question['func_id']):
        print("Result:", rank)
        print(function_df.iloc[func_idx])
        print("")
    

### Print source link

In [10]:
def print_question_source_link(post_link_id, so_result=None, function_df=None, source_list=None):
    """Return the source-file entries for the functions matched to a question.

    Despite the name, nothing is printed; the entries are returned.

    Parameters
    ----------
    post_link_id
        Value looked up in the ``'Post Link'`` column of ``so_result``.
    so_result : pandas.DataFrame, optional
        Frame with ``'Post Link'`` and ``'func_id'`` columns. When omitted,
        the notebook-level ``so_result`` is used.
    function_df : pandas.DataFrame, optional
        Frame with a ``'data_id'`` column, addressed positionally by the
        entries of ``func_id``. When omitted, the notebook-level
        ``function_df`` is used.
    source_list : sequence, optional
        Sequence indexed by ``data_id``. When omitted, the notebook-level
        ``source_list`` is used.

    Returns
    -------
    list
        One entry of ``source_list`` per matched function, in result order
        and unmodified (any trailing newline is kept).

    Raises
    ------
    IndexError
        If no row of ``so_result`` has the requested ``'Post Link'``.
    """
    # Resolve the defaults at call time rather than at definition time, so
    # re-running the data-loading cells is picked up without redefining this
    # function.
    if so_result is None:
        so_result = globals()['so_result']
    if function_df is None:
        function_df = globals()['function_df']
    if source_list is None:
        source_list = globals()['source_list']

    matches = so_result[so_result['Post Link'] == post_link_id]
    if matches.empty:
        raise IndexError(
            "No question with Post Link {!r} in so_result".format(post_link_id)
        )
    func_ids = matches['func_id'].iloc[0]

    # Select the data_id column first, then take all matched rows in one
    # positional lookup instead of building a full row Series per result.
    data_ids = function_df['data_id'].iloc[list(func_ids)]
    return [source_list[int(data_id)] for data_id in data_ids]
            

In [11]:
# Example: collect the source_list entries for the functions matched to one question.
source_link = print_question_source_link(48935250)

### Print .py 

Download 'py150_files/data.tar.gz' and extract it to get all the .py files; `print_full_py` looks for them under `./data/py150_files/`.
https://www.sri.inf.ethz.ch/py150

In [12]:
def print_full_py(source_link, idx, base_dir='./data/py150_files/'):
    """Print the contents of one source file from a list of relative paths.

    Parameters
    ----------
    source_link : sequence of str
        Relative file paths, e.g. the list returned by
        ``print_question_source_link``. A trailing line break on the selected
        entry is ignored.
    idx : int
        Position in ``source_link`` of the file to print.
    base_dir : str, optional
        Directory the relative path is resolved against.

    Raises
    ------
    IndexError
        If ``idx`` is outside ``source_link``.
    OSError
        If the file cannot be opened.
    """
    relative_path = source_link[idx].rstrip('\r\n')
    path = os.path.join(base_dir, relative_path)

    # errors='replace' keeps a single undecodable byte from aborting the
    # listing; such bytes show up as replacement characters instead.
    with open(path, errors='replace') as data_file:
        # Lines read from the file already end with a newline, so suppress
        # print's own; otherwise the listing comes out double-spaced.
        for line in data_file:
            print(line, end='')

In [21]:
#print_full_py(source_link, 0)

### StackOverflow Best Answer

In [23]:
# so_result[so_result['Post Link'] == post_link_id]['Answer'].split('\n')

## **Evaluation Here**

Evaluation Sheet: https://docs.google.com/spreadsheets/d/1uPUaQCESrOscvA99ofU6Dl283nl7k-UEfQ71L0C7OVo/edit#gid=0

### 1. StackOverflow Q&A pairs

In [13]:
# Keep only the questions whose Tags value is exactly one of the two Python tags.
so_result[so_result.Tags.isin(['<python>', '<python-3.x>'])]

Unnamed: 0,Post Link,Question Score,ViewCount,Question Title,Question Content,Answer,Tags,func_id
1,48211001,0,56,Python: How to update multiple address lines,<p>I'm currently working on a banking system f...,<p>Your issue is arguably in your <code>update...,<python>,"[13683, 646659, 475808, 207093, 66185, 254360,..."
13,48313280,0,91,"NameError, python3 get_step not defined","<p>I'm following the book ""Python crash Course...",<p>You know how when you refer to <code>x_valu...,<python-3.x>,"[560504, 560510, 560494, 372372, 560503, 56049..."
23,48426834,0,43,How to use global variables in imported librar...,<p>I've a <code>main.py</code> file with a blo...,<p>You should pass the <code>urtc.DS3231</code...,<python>,"[701854, 310520, 293856, 665271, 122594, 45914..."
60,48220664,-2,47,Unable to understand code -newbie,<p>I am new to python ( started 1 week ago) an...,"<p>Firstly, <code>chain</code> is not a Functi...",<python>,"[325904, 665355, 145290, 424311, 625039, 29223..."
67,48318835,-3,57,List data changed when no methods are called t...,<p>This is my code for copying data from the o...,<p>Dictionaries in Python are references. This...,<python>,"[477962, 393437, 741944, 722228, 477980, 69434..."
89,48220098,-1,64,run the code multiple times in python 3.6,"<p>I tried to make it only ask ""do you want to...",<p>The comment from darvark is correct. If you...,<python>,"[510201, 510202, 510210, 510211, 510206, 51020..."
101,48318102,1,51,How to break from `__main__` without exit(),<p>I'm doing python coding with Emacs.</p>\n\n...,<p>Why not wrap the main code in a function an...,<python>,"[104212, 21090, 448419, 355768, 355790, 355786..."
103,48319126,-5,55,Python - How can I use a variable defined in o...,<p>I am stuck on one concept in my automation ...,"<p>You obviously not only ""just started in Pyt...",<python>,"[496234, 352762, 127311, 703696, 455673, 52893..."
105,48320030,0,55,index out of range in Numbers list,"<p>I am learning the basics of python, and try...",<p>That happens because in this inner <code>wh...,<python>,"[68621, 170401, 652354, 50407, 253532, 23789, ..."
115,48430098,3,96,Python append to list from return tuple,<p>I have a list and a function <code>f</code>...,<p>What you want to do is not possible with a ...,<python>,"[573114, 242386, 579171, 544288, 282807, 24789..."


### 2. Enter Post Link ID  

In [40]:
# 'Post Link' value of the question to evaluate.
POST_LINK_ID = 48426834
# Prints the question title followed by each matched function record.
print_question_function(POST_LINK_ID)

Question: 
How to use global variables in imported libraries?

Result: 0
data_id                                                 95330
function_name                                           parse
docstring        Passes input to each QueryLineHandler in use
func_call                                              handle
Name: 701854, dtype: object

Result: 1
data_id                                          42243
function_name                      install_npm_modules
docstring        Uses npm to dependencies in node.json
func_call                                             
Name: 310520, dtype: object

Result: 2
data_id                                                      40358
function_name                                               gotcha
docstring        Use to progressively zero in on return value i...
func_call                                                   pprint
Name: 293856, dtype: object

Result: 3
data_id                                                      89390
functio

Download 'py150_files/data.tar.gz' and extract it to get all the .py files; `print_full_py` looks for them under `./data/py150_files/`.
https://www.sri.inf.ethz.ch/py150

In [44]:
# Collect the source_list entry of every function matched to the selected question.
source_link = print_question_source_link(POST_LINK_ID)
# Bare expression so the notebook displays the list.
source_link

['data/mongolab/dex/dex/parsers.py\n',
 'data/rehandalal/buchner/buchner/project-template/manage.py\n',
 'data/miklevin/pipulate/common.py\n',
 'data/ProgVal/Limnoria/plugins/Reply/plugin.py\n',
 'data/home-assistant/home-assistant/homeassistant/components/sensor/steam_online.py\n',
 'data/mwilliamson/spur.py/tests/ssh_tests.py\n',
 'data/deanhiller/databus/webapp/play1.3.x/python/Lib/wsgiref/handlers.py\n',
 'data/CollabQ/CollabQ/vendor/wsgiref/handlers.py\n',
 'data/dropbox/pyston/from_cpython/Lib/wsgiref/handlers.py\n',
 'data/Opentaste/bombolone/bombolone/core/languages.py\n']

In [42]:
# Print the whole file behind one search result; the second argument selects an entry of source_link.
print_full_py(source_link, 0) # from 0-9

__author__ = 'eric'



import re

from utils import pretty_json, small_json, yamlfy

from time import strptime, mktime

from datetime import datetime

import traceback



try:

    from collections import OrderedDict

except ImportError:

    from ordereddict import OrderedDict





################################################################################

# Query masking and scrubbing functions

################################################################################



def scrub(e):

    if isinstance(e, dict):

        return scrub_doc(e)

    elif isinstance(e, list):

        return scrub_list(e)

    else:

        return None





def scrub_doc(d):

    for k in d:

        if k in ['$in', '$nin', '$all']:

            d[k] = ["<val>"]

        else:

            d[k] = scrub(d[k])

        if d[k] is None:

            d[k] = "<val>"

    return d





def scrub_list(a):

    v = []

    for e in a:

        e = scrub(e)

        if e is not None:

            v.

### 4. Stack Overflow Best Answer (For Reference)

In [45]:
# Take the stored answer of the selected question and split it into lines.
(
    so_result
    .loc[so_result['Post Link'] == POST_LINK_ID, 'Answer']
    .iloc[0]
    .split('\n')
)

['<p>You should pass the <code>urtc.DS3231</code> instance to the <code>current_time</code> function like so:</p>',
 '',
 '<pre><code>def current_time(rtc):',
 '    return urtc.tuple2seconds(rtc.datetime())',
 '</code></pre>',
 '',
 '<p>But you still need to <code>import urtc</code> in <code>func.py</code> so that <code>urtc.tuple2seconds</code> is available.</p>',
 '']