Installing the wget package.

In [0]:
pip install wget

Importing useful libraries.

In [0]:
import re
import wget
import pickle
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds

Downloading the dataset.

In [0]:
# address of the zipped dataset.
url = 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip'
# saving the archive as /content/dataset.zip.
# NOTE(review): re-running this cell downloads the archive again; presumably wget
# picks a different file name when the target already exists - confirm.
wget.download(url, '/content/dataset.zip')

Unzipping the dataset.

In [0]:
!unzip '/content/dataset.zip'

Unpickling the dataset and creating a DataFrame with its data.

In [0]:
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# run this on a pickle whose origin is trusted.
# opening the file in a with-block so the handle is closed once it has been read.
with open('java_dedupe_definitions_v2.pkl', 'rb') as dataset_file:
    data = pickle.load(dataset_file)
data = pd.DataFrame(data)

Removing data without a docstring.

In [0]:
# keeping only the rows that have at least one docstring token and
# renumbering the remaining rows from zero.
data = data[data['docstring_tokens'].map(len) > 0].reset_index(drop=True)

# uncomment to keep only the first 50,000 rows for faster testing.
#data = data[:50000]

Defining the preprocessing functions.

*   *remove_after_dot*: removes the first dot in the docstring and all strings after it.
*   *remove_non_ascii*: replaces all non-ASCII characters with an empty string.
*   *remove_special*: replaces all special characters in the docstring with an empty string.
*   *seperate_strings*: separates strings that have at least one uppercase and one lowercase letter into their words (e.g. *getName* becomes *Get* and *Name*).
*   *remove_empty*: removes all empty strings.
*   *fill_empty*: empties docstrings with fewer than three or more than 30 words and fills them with words from the function's identifier to perform data augmentation.
*   *lowercase*: lowercases all strings in the docstring to avoid case sensitivity.
*   *remove_unnecessary*: removes all string values and comments in the function.
*   *trim*: keeps a maximum of 100 function tokens for each function.

In [0]:
def remove_after_dot(data):
  """Cut every token list at its first '.' token.

  Args:
    data: pandas Series whose values are lists of string tokens. The Series
      is updated in place.

  Returns:
    The same Series. A row that contains a '.' token keeps only the tokens
    before the first one (the dot itself is dropped); other rows are left
    untouched.
  """
  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for index, row in data.items():
    if '.' in row:
      data[index] = row[:row.index('.')]

  return data

def remove_non_ascii(data):
  """Delete every non-ASCII character from every token.

  Args:
    data: pandas Series whose values are lists of string tokens. The Series
      is updated in place.

  Returns:
    The same Series. Each row is replaced by a new list of cleaned tokens, so
    the list objects originally stored in the Series are not modified. A
    token made only of non-ASCII characters becomes an empty string.
  """
  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for index, row in data.items():
    # one pass per row; no row.index() lookup, so repeated tokens cost nothing extra.
    data[index] = [re.sub(r'[^\x00-\x7f]', '', token) for token in row]

  return data

def remove_special(data, function=False):
  """Delete every character that is not an ASCII letter or digit.

  Args:
    data: pandas Series whose values are lists of string tokens. The Series
      is updated in place.
    function: unused; kept so existing calls that pass it keep working.

  Returns:
    The same Series. Each row is replaced by a new list of cleaned tokens, so
    the list objects originally stored in the Series are not modified. A
    token made only of special characters becomes an empty string.
  """
  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for index, row in data.items():
    # one pass per row; no row.index() lookup, so repeated tokens cost nothing extra.
    data[index] = [re.sub(r'[^A-Za-z0-9]+', '', token) for token in row]

  return data

def seperate_strings(data):
  """Split tokens that contain an uppercase letter followed by a lowercase one.

  A token is split when it contains a match of '[A-Z][a-z][^A-Z]*'. Its first
  letter is capitalized, every match of the pattern becomes a separate token,
  and an empty string is left right after the parts where the token used to
  be (remove_empty deletes it). Tokens without a match are kept as they are.

  NOTE(review): characters outside any match are dropped, e.g. 'HTTPServer'
  yields only 'Server'. This is how the pattern has always behaved here;
  confirm it is intended.

  Args:
    data: pandas Series whose values are lists of string tokens. The Series
      is updated in place.

  Returns:
    The same Series. Each row is replaced by a new list, so the list objects
    originally stored in the Series are not modified.
  """
  camel_case = re.compile(r'[A-Z][a-z][^A-Z]*')

  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for index, row in data.items():
    # building the new row in one pass instead of re-slicing the row and
    # searching it with row.index() for every token that is split.
    separated = []
    for token in row:
      if camel_case.search(token):
        # capitalizing the first letter so the leading word matches the pattern too.
        capitalized = token[0].capitalize() + token[1:]
        separated.extend(camel_case.findall(capitalized))
        # placeholder for the original token, preserving the previous output format.
        separated.append('')
      else:
        separated.append(token)
    data[index] = separated

  return data

def remove_empty(data):
  """Drop empty strings from every token list.

  Args:
    data: pandas Series whose values are lists of string tokens. The Series
      is updated in place.

  Returns:
    The same Series. A row that holds at least one empty token is replaced by
    a new list without the empty tokens; other rows are left untouched.
  """
  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for index, row in data.items():
    # filtering once per row rather than once per empty token.
    if not all(row):
      data[index] = [token for token in row if token]

  return data

def fill_empty(identifier, data):
  """Replace too-short or too-long docstrings with words from the identifier.

  A row with fewer than three or more than 30 tokens is replaced by the words
  found in the first two dot-separated segments of its identifier. A word is
  an uppercase letter followed by any non-uppercase characters; the first
  letter of the second segment is capitalized so its leading word is found.

  NOTE(review): the first segment is not capitalized, so any characters in it
  before the first uppercase letter are dropped, as before.

  Args:
    identifier: iterable of identifier strings, paired with the rows of
      `data` by position.
    data: pandas Series whose values are lists of string tokens. The Series
      is updated in place.

  Returns:
    The same Series.
  """
  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for (index, row), identifier_row in zip(data.items(), identifier):
    if 3 <= len(row) <= 30:
      continue

    # splitting identifiers on the dots.
    segments = identifier_row.split('.')
    first = segments[0]
    # an identifier without a dot, or with nothing after it, has no second
    # segment; using an empty string avoids the IndexError raised previously.
    second = segments[1] if len(segments) > 1 else ''
    if second:
      # capitalizing the first letter of the second half of the identifier.
      second = second[0].capitalize() + second[1:]

    # separating all identifier words using their first capital letter.
    data[index] = (re.findall(r'[A-Z][^A-Z]*', first)
                   + re.findall(r'[A-Z][^A-Z]*', second))

  return data

def lowercase(data):
  """Lowercase every token.

  Args:
    data: pandas Series whose values are lists of string tokens. The Series
      is updated in place.

  Returns:
    The same Series. Each row is replaced by a new list of lowercased tokens,
    so the list objects originally stored in the Series are not modified.
  """
  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for index, row in data.items():
    # one pass per row; no row.index() lookup, so repeated tokens cost nothing extra.
    data[index] = [token.lower() for token in row]

  return data

def remove_unnecessary(data):
  """Blank out tokens that contain a space or a double quote, or open a comment.

  A token is replaced by an empty string when it contains a space, contains a
  double quote, or starts with '//' or '/*' (which also covers '/**').

  Args:
    data: pandas Series whose values are lists of string tokens. The Series
      is updated in place.

  Returns:
    The same Series. Each row is replaced by a new list, so the list objects
    originally stored in the Series are not modified.
  """
  # one pattern for all the cases that were tested separately before.
  unwanted = re.compile(r'[ "]|^//|^/\*')

  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for index, row in data.items():
    # replacing matching tokens with an empty string, keeping their position.
    data[index] = ['' if unwanted.search(token) else token for token in row]

  return data

def trim(data, max_tokens=100):
  """Keep at most `max_tokens` tokens in every row.

  Args:
    data: pandas Series whose values are lists of tokens. The Series is
      updated in place.
    max_tokens: largest number of tokens a row may keep. Defaults to 100, the
      limit that used to be hard-coded.

  Returns:
    The same Series. Longer rows are replaced by their first `max_tokens`
    tokens; other rows are left untouched.

  Raises:
    ValueError: if `max_tokens` is negative.
  """
  if max_tokens < 0:
    # a negative bound would make the slice drop tokens from the end instead.
    raise ValueError('max_tokens must be non-negative')

  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for index, row in data.items():
    if len(row) > max_tokens:
      data[index] = row[:max_tokens]

  return data

Applying preprocessing to the docstring tokens.

In [0]:
# copying identifier column.
identifier = data['identifier'].copy(deep=True)

# copying the docstring_tokens column and passing the copy through the
# preprocessing functions, in this order.
docstring_tokens = (
    data['docstring_tokens'].copy(deep=True)
    .pipe(remove_after_dot)
    .pipe(remove_non_ascii)
    .pipe(remove_special)
    .pipe(seperate_strings)
    .pipe(remove_empty)
    .pipe(lambda tokens: fill_empty(identifier, tokens))
    .pipe(lowercase)
)

Applying preprocessing to the function tokens.

In [0]:
# copying the function_tokens column and passing the copy through the
# preprocessing functions, in this order.
function_tokens = (
    data['function_tokens'].copy(deep=True)
    .pipe(remove_non_ascii)
    .pipe(seperate_strings)
    .pipe(remove_unnecessary)
    .pipe(remove_special)
    .pipe(remove_empty)
    .pipe(trim)
    .pipe(lowercase)
)

Creating a DataFrame that consists of the docstring tokens and the function tokens, and exporting it in pickle format.

In [0]:
# placing the processed docstring tokens and function tokens side by side as columns.
dataset = pd.concat([docstring_tokens, function_tokens], axis=1)
# NOTE(review): the target lies under /content/drive and no mount step is visible
# in this notebook; presumably Google Drive has to be mounted first - confirm.
dataset.to_pickle('/content/drive/My Drive/dataset.pkl')

Creating the docstring and function vocabularies, and exporting them in pickle format.

In [0]:
# collecting the unique tokens of each column. Sorting gives the vocabularies a
# stable order; the iteration order of a set of strings can differ between runs,
# which would change the position of every token in the exported lists.
docstring_vocab = sorted({token for row in docstring_tokens for token in row})
function_vocab = sorted({token for row in function_tokens for token in row})

# NOTE(review): both targets lie under /content/drive; presumably Google Drive
# has to be mounted before this cell runs - confirm.
with open('/content/drive/My Drive/docstring_vocab.pkl', 'wb') as docstring_vocab_pkl:
    pickle.dump(docstring_vocab, docstring_vocab_pkl, protocol=pickle.HIGHEST_PROTOCOL)

with open('/content/drive/My Drive/function_vocab.pkl', 'wb') as function_vocab_pkl:
    pickle.dump(function_vocab, function_vocab_pkl, protocol=pickle.HIGHEST_PROTOCOL)