Installing the wget package.

In [0]:
# %pip installs into the environment of the running kernel and does not
# depend on IPython's automagic being enabled.
%pip install wget

Importing useful libraries.

In [0]:
import os
import pickle
import re

import pandas as pd
import wget

# silencing pandas' chained-assignment warnings for the whole session.
pd.set_option('mode.chained_assignment', None)

Downloading the dataset.

In [0]:
url = 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip'
# skipping the download when the archive is already on disk, so that
# re-running this cell does not fetch it a second time.
if not os.path.exists('/content/dataset.zip'):
  wget.download(url, '/content/dataset.zip')

Unzipping the dataset.

In [0]:
# -n: never overwrite files that were already extracted, so re-running the
# cell does not stop at unzip's interactive "replace?" prompt.
!unzip -n '/content/dataset.zip'

Unpickling the dataset and creating a DataFrame with its data.

In [0]:
# NOTE: unpickling can execute arbitrary code; only load archives that come
# from a trusted source.
# the context manager closes the file handle once the pickle is loaded.
with open('java_dedupe_definitions_v2.pkl', 'rb') as pickle_file:
  data = pickle.load(pickle_file)
data = pd.DataFrame(data)
# selecting only the first 50,000 rows for faster testing.
data = data[:50000]

Defining all preprocessing functions.

*   *remove_after_dot*: removes the first standalone dot token in the docstring and every token after it.
*   *remove_non_ascii*: removes every token in the docstring that starts with a non-ASCII character.
*   *remove_special*: removes all special characters in the docstring.
*   *fill_empty*: fills empty docstrings with words from each function's identifier in order to perform data augmentation.
*   *lowercase*: lowercases all strings in the docstring to avoid case sensitivity.



In [0]:
def remove_after_dot(docstring):
  """Truncates every row at its first standalone '.' token.

  Args:
    docstring: pandas Series whose values are sequences of tokens
      (lists of strings in this notebook).

  Returns:
    The same Series object, modified in place. Rows that contain a '.'
    token keep only the tokens before the first one (the dot itself is
    dropped); all other rows are left untouched.
  """
  # Series.iteritems() was removed in pandas 2.0; items() is the equivalent
  # that works on old and new releases alike.
  for index, row in docstring.items():
    if '.' in row:
      docstring[index] = row[:row.index('.')]

  return docstring

def remove_non_ascii(docstring):
  """Drops every token that starts with a non-ASCII character.

  Args:
    docstring: pandas Series whose values are sequences of string tokens.

  Returns:
    The same Series object, modified in place: each row is replaced by a
    new list holding only the tokens that were kept.
  """
  # compiled once instead of once per row; matches one character outside
  # the 7-bit ASCII range.
  non_ascii = re.compile(r'[^\x00-\x7f]')
  # Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
  for index, row in docstring.items():
    # NOTE(review): match() is anchored at the start of the token, so only
    # the first character is inspected; a token with a non-ASCII character
    # further in is kept. Left unchanged on purpose - confirm the intent.
    docstring[index] = [token for token in row if not non_ascii.match(token)]

  return docstring

def remove_special(docstring):
  """Strips special characters from every token and drops emptied tokens.

  Every character outside A-Z, a-z and 0-9 is removed from each token;
  tokens that become empty strings are removed from their row.

  Args:
    docstring: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; the row lists are updated in place.
  """
  special = re.compile('[^A-Za-z0-9]+')
  for row in docstring:
    stripped = [special.sub('', token) for token in row]
    # the row is rebuilt in one pass and written back with a slice
    # assignment: popping items while looping over the same list skips the
    # token that follows each removed one.
    row[:] = [token for token in stripped if token]

  return docstring

def _identifier_words(identifier):
  """Splits a dotted camelCase identifier into its capitalised words."""
  words = []
  # splitting identifiers on the dots.
  for segment in identifier.split('.'):
    # upper-casing only the first letter: str.capitalize() would also
    # lower-case the rest of the segment and erase the camelCase boundaries
    # that the split below relies on.
    segment = segment[:1].upper() + segment[1:]
    # separating the words of the segment at each capital letter.
    words.extend(re.findall('[A-Z][^A-Z]*', segment))
  return words

def fill_empty(identifier, docstring):
  """Fills empty rows with the words of the matching identifier.

  Args:
    identifier: iterable of dotted identifier strings (e.g.
      'StringUtils.isEmpty'), aligned by position with docstring.
    docstring: pandas Series whose values are lists of tokens.

  Returns:
    The same Series object, modified in place: every falsy (empty) row is
    replaced by the identifier's words, e.g. ['String', 'Utils', 'Is',
    'Empty']. Every dot-separated segment contributes its words, and an
    identifier without a dot yields the words of its single segment.
  """
  # Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
  for (index, row), identifier_row in zip(docstring.items(), identifier):
    if not row:
      docstring[index] = _identifier_words(identifier_row)

  return docstring

def lowercase(docstring):
  """Lower-cases every token of every row.

  Args:
    docstring: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; the row lists are updated in place.
  """
  for row in docstring:
    # one linear pass per row; looking each token up again with
    # row.index() made the work quadratic in the row length.
    row[:] = [token.lower() for token in row]

  return docstring

Applying preprocessing on all docstring tokens.

In [0]:
identifier = data['identifier']

# each step receives the Series returned by the previous one, in this order:
# dot truncation, non-ascii filter, special-character stripping,
# identifier-based filling, lower-casing.
docstring_tokens = (
  data['docstring_tokens']
  .pipe(remove_after_dot)
  .pipe(remove_non_ascii)
  .pipe(remove_special)
  .pipe(lambda tokens: fill_empty(identifier, tokens))
  .pipe(lowercase)
)

Creating a DataFrame that consists of docstring tokens, functions and function tokens. Also, creating a CSV for network usage.

In [0]:
# placing the processed docstring tokens next to each function's source and
# tokens, one column per Series.
dataset = pd.concat(
  [docstring_tokens, data['function'], data['function_tokens']],
  axis='columns',
)
# writing the result as a tab-separated file.
dataset.to_csv('/content/drive/My Drive/dataset.csv', sep='\t')