In [3]:
import numpy as np
import pandas as pd
import pickle

## 1 Load Data 

1. Source code functions: functions and docstrings used as training data for the word embedding
2. Stack Overflow QA pairs and results: SO question–answer pairs and their similarity results
3. Source Python code: the list of paths used to find the original .py files


### Source code function

In [4]:
# load in processed file. A set of keywords for each document (source code function)
def load_words_from_ast(file_path):
    """
    Unpickle a list of function records and return it as a DataFrame.

    The pickled object is passed to ``pd.DataFrame`` with the columns
    'data_id', 'function_name', 'docstring' and 'func_call'. The number
    of rows is printed before the frame is returned.

    file_path : path of the pickle file to read (opened in binary mode).
    """
    # NOTE: pickle.load can run arbitrary code; only read trusted files.
    with open(file_path, 'rb') as handle:
        records = pickle.load(handle)

    column_names = ['data_id', 'function_name', 'docstring', 'func_call']
    frame = pd.DataFrame(records, columns=column_names)

    print("Total Number of Functions in \"{}\": {}".format(file_path, len(frame)))
    return frame

In [5]:
# Load the pickled function records into a DataFrame and preview the first rows.
function_df=load_words_from_ast("./data/py100k_2.pkl")
function_df.head()

Total Number of Functions in "./data/py100k_2.pkl": 742490


Unnamed: 0,data_id,function_name,docstring,func_call
0,1,_send,,"send,send,send,send"
1,1,__call__,,"Timeout,_send,_send"
2,1,_recv,,"append,recv,recv"
3,1,__call__,,"Timeout,_recv,_recv"
4,1,__init__,,"Channel,spawn,_sender"


### Stackoverflow QA pair and result

In [6]:
# Load the pickled Stack Overflow similarity results.
# NOTE(review): pickle.load can execute arbitrary code; only use a trusted file here.
with open('./data/SO_similarity_0_200.pkl', 'rb') as f:
    so_result = pickle.load(f)

In [7]:
# Keep only questions whose Tags value is exactly one of the two Python tags.
python_only_tags = ['<python-3.x>', '<python>']
so_result[so_result['Tags'].isin(python_only_tags)].head(5)

Unnamed: 0,Post Link,Question Score,ViewCount,Question Title,Question Content,Answer,Tags,func_id
1,48211001,0,56,Python: How to update multiple address lines,<p>I'm currently working on a banking system f...,<p>Your issue is arguably in your <code>update...,<python>,"[13683, 646659, 475808, 207093, 66185, 254360,..."
13,48313280,0,91,"NameError, python3 get_step not defined","<p>I'm following the book ""Python crash Course...",<p>You know how when you refer to <code>x_valu...,<python-3.x>,"[560504, 560510, 560494, 372372, 560503, 56049..."
23,48426834,0,43,How to use global variables in imported librar...,<p>I've a <code>main.py</code> file with a blo...,<p>You should pass the <code>urtc.DS3231</code...,<python>,"[701854, 310520, 293856, 665271, 122594, 45914..."
60,48220664,-2,47,Unable to understand code -newbie,<p>I am new to python ( started 1 week ago) an...,"<p>Firstly, <code>chain</code> is not a Functi...",<python>,"[325904, 665355, 145290, 424311, 625039, 29223..."
67,48318835,-3,57,List data changed when no methods are called t...,<p>This is my code for copying data from the o...,<p>Dictionaries in Python are references. This...,<python>,"[477962, 393437, 741944, 722228, 477980, 69434..."


### Source python code 

In [8]:
# Read the list of source-file paths, one per line.
# readlines() keeps each line's trailing '\n' (visible in the preview below).
with open('./data/python100k_train.txt') as data_file:
    source_list = data_file.readlines()
source_list[:2]

['data/00/wikihouse/urls.py\n', 'data/0rpc/zerorpc-python/zerorpc/events.py\n']

## 2 Search Result

In [90]:
def print_question_function(idx, so_result = so_result, function_df = function_df, source_list = source_list):
    """
    Print a question title followed by every function matched to it.

    idx is a positional row index into so_result (it is used with .iloc).
    For each entry of that row's 'func_id', the entry itself is printed,
    then the function_df row at that position, then a blank line.
    source_list is accepted but not used by this function.
    """
    print("Question: ")
    question_row = so_result.iloc[idx]
    print(question_row['Question Title'])
    print("")

    for matched_position in question_row['func_id']:
        print(matched_position)
        print(function_df.iloc[matched_position])
        print("")
    

In [91]:
# Show the matched functions for the so_result row at position 115.
print_question_function(115)

Question: 
Python append to list from return tuple

573114
data_id                    77065
function_name    test_from_tuple
docstring                       
func_call                to_list
Name: 573114, dtype: object

242386
data_id                                  32998
function_name                        serialize
docstring         Propagate to list elements. 
func_call           serialize,append,serialize
Name: 242386, dtype: object

579171
data_id                                                      77812
function_name                                             __repr__
docstring        \n        Return repr using sequence's repr fu...
func_call                                                  to_list
Name: 579171, dtype: object

544288
data_id                                                      73777
function_name                                   get_returned_value
docstring        \n        Return value representation to local...
func_call                                   

In [92]:
def print_question_full_answer(idx, so_result = so_result, function_df = function_df, source_list = source_list):
    """
    Print a question title and the full .py file behind each matched function.

    Parameters
    ----------
    idx : int
        Positional row index into ``so_result`` (used with ``.iloc``).
    so_result : pandas.DataFrame
        Its 'Question Title' and 'func_id' columns are read.
    function_df : pandas.DataFrame
        The 'data_id' of the row at each matched position is read.
    source_list : list of str
        Source paths indexed by ``data_id``; any newline characters are
        removed and the result is joined onto './data/'.

    Raises
    ------
    OSError
        Propagated from ``open`` when a source file cannot be opened.
    """
    print("Question: ")
    question_row = so_result.iloc[idx]
    print(question_row['Question Title'])
    print("")

    func_id = question_row['func_id']
    print(func_id)
    print("")

    for rank, value in enumerate(func_id, start=1):

        # find data id
        print("Search Result: ", rank)
        data_id = function_df.iloc[value]['data_id']

        # open the python source code
        path = './data/' + source_list[data_id].replace('\n','')
        ends_with_newline = True
        with open(path) as data_file:
            for line in data_file:
                # Each line already carries its own '\n'; a plain print(line)
                # would add a second one and double-space the whole file.
                print(line, end='')
                ends_with_newline = line.endswith('\n')
        # Keep the next "Search Result" header on a line of its own.
        if not ends_with_newline:
            print()
            

You should download 'py150_files/data.tar.gz' and extract (decompress) it to get all the .py files.

In [93]:
# Print the full source files for the so_result row at position 115.
print_question_full_answer(115)

Question: 
Python append to list from return tuple

[573114 242386 579171 544288 282807 247890 103229 544668 479491 239193]

Search Result:  1
import copy

import sys



from sqlalchemy import util, sql, exc, testing

from sqlalchemy.testing import assert_raises, assert_raises_message, fixtures


from sqlalchemy.testing.util import picklers, gc_collect

from sqlalchemy.util import classproperty, WeakSequence, get_callable_argspec

from sqlalchemy.sql import column

from sqlalchemy.util import langhelpers, compat

import inspect





class _KeyedTupleTest(object):



    def _fixture(self, values, labels):

        raise NotImplementedError()



    def test_empty(self):

        keyed_tuple = self._fixture([], [])

        eq_(str(keyed_tuple), '()')

        eq_(len(keyed_tuple), 0)



        eq_(list(keyed_tuple.keys()), [])

        eq_(keyed_tuple._fields, ())

        eq_(keyed_tuple._asdict(), {})



    def test_values_but_no_labels(self):

        keyed_tuple = self._fixture([

        eq_(

            util.generic_repr(

                Bar(a='a', b='b', c='c'),

                to_inspect=[Bar, Foo]

            ),

            "Bar(b='b', c='c', a='a')"

        )





    def test_discard_vargs(self):

        class Foo(object):

            def __init__(self, a, b, *args):

                self.a = a

                self.b = b

                self.c, self.d = args[0:2]

        eq_(

            util.generic_repr(Foo(1, 2, 3, 4)),

            "Foo(1, 2)"

        )



    def test_discard_vargs_kwargs(self):

        class Foo(object):

            def __init__(self, a, b, *args, **kw):

                self.a = a

                self.b = b

                self.c, self.d = args[0:2]

        eq_(

            util.generic_repr(Foo(1, 2, 3, 4, x=7, y=4)),

            "Foo(1, 2)"

        )



    def test_significant_vargs(self):

        class Foo(object):

            def __init__(self, a, b, *args):

                self.a = a

                s

        >>> seq([1, 2, 3]).last()

        3



        Raises IndexError when the sequence is empty.



        >>> seq([]).last()

        Traceback (most recent call last):

         ...

        IndexError: list index out of range



        :return: last element of sequence

        """

        return _wrap(self.sequence[-1])



    def last_option(self):

        """

        Returns the last element of the sequence or None, if the sequence is empty.



        >>> seq([1, 2, 3]).last_option()

        3



        >>> seq([]).last_option()

        None



        :return: last element of sequence or None if sequence is empty

        """

        if not self.sequence:

            return None

        return self.last()



    def init(self):

        """

        Returns the sequence, without its last element.



        >>> seq([1, 2, 3]).init()

        [1, 2]



        :return: sequence without last element

        """

        return self._transform(transformations.init


            loader = RegistryLoader(registry)

            self._registries[registry] = loader

        self.insert_func_defn(loader.new_registrations('functions'))

        self._insert_getattr_defn(loader.new_registrations('getattrs'))

        self._insert_setattr_defn(loader.new_registrations('setattrs'))

        self._insert_cast_defn(loader.new_registrations('casts'))

        self._insert_get_constant_defn(loader.new_registrations('constants'))



    def insert_func_defn(self, defns):

        for impl, func, sig in defns:

            self._defns[func].append(impl, sig)



    def _insert_getattr_defn(self, defns):

        for impl, attr, sig in defns:

            self._getattrs[attr].append(impl, sig)



    def _insert_setattr_defn(self, defns):

        for impl, attr, sig in defns:

            self._setattrs[attr].append(impl, sig)



    def _insert_cast_defn(self, defns):

        for impl, sig in defns:

            self._casts.append(impl, sig)



    def _insert_

  def _GetSize(self):

    """Returns number of rows in table."""



    if not self._table:

      return 0

    return len(self._table) - 1



  def _GetTable(self):

    """Returns table, with column headers and separators.



    Returns:

      The whole table including headers as a string. Each row is

      joined by a newline and each entry by self.separator.

    """

    result = []

    # Avoid the global lookup cost on each iteration.

    lstr = str

    for row in self._table:

      result.append(

          '%s\n' %

          self.separator.join(lstr(v) for v in row))



    return ''.join(result)



  def _SetTable(self, table):

    """Sets table, with column headers and separators."""

    if not isinstance(table, TextTable):

      raise TypeError('Not an instance of TextTable.')

    self.Reset()

    self._table = copy.deepcopy(table._table)   # pylint: disable=W0212

    # Point parent table of each row back ourselves.

    for row in self:

      row.table = se


# Copyright (c) 2009 Roy Keyes (roy.coding)

# Copyright (c) 2011 Aditya Panchal

# This file is part of dicompyler, relased under a BSD license.

#    See the file license.txt included with this distribution, also

#    available at http://code.google.com/p/dicompyler/

#

# Start - 20 Nov. 2009

# It is assumed that the bin width of the cDVH is fixed at 1 cGy.



def get_dvh_min(dvh):

    '''Return minimum dose to ROI derived from cDVH.'''



    # ROI volume (always receives at least 0% dose)

    v1 = dvh[0]



    j = 1

    jmax = len(dvh) - 1

    mindose = 0

    while j < jmax:

        if dvh[j] < v1:

            mindose = (2*j - 1)/2.0

            break

        else:

            j += 1



    return mindose



def get_dvh_max(dvh):

    '''Return maximum dose to ROI derived from cDVH.'''



    # Calulate dDVH

    ddvh = get_ddvh(dvh)



    maxdose = 0

    j = len(ddvh) - 1

    while j >= 0:

        if ddvh[j] > 0.0:

            maxdose = j+1

            break



### StackOverflow Best Answer

In [76]:
# Split the stored 'Answer' text of row 115 on newlines so it displays line by line.
so_result.iloc[115]['Answer'].split('\n')

['<p>What you want to do is not possible with a "one liner" (without using semi-colon to fit two statements on a single line) <em>unless</em> you change your function definition.</p>',
 '',
 '<p>Specifically, define <code>f()</code> in the following way:</p>',
 '',
 '<pre><code>def f(list_to_append_to):',
 '    a = ...',
 '    b = ...',
 '    c = ...',
 '    list_to_append_to.append(c)',
 '    return a, b',
 '</code></pre>',
 '',
 '<p>Then:</p>',
 '',
 '<pre><code>a, b = f(mylist)',
 '</code></pre>',
 '']