In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Importing libraries and APIs.

In [None]:
import itertools
import pickle
import re

import numpy as np
import pandas as pd

Extracting the dataset from the RAR archive.

In [None]:
# Extract the raw-code archive stored on Drive.
# NOTE(review): the loading cell reads /content/use.rawcode.txt, so this presumably
# runs with /content as the working directory — confirm.
!unrar x '/content/drive/MyDrive/use.rawcode.rar'

Defining the preprocessing functions.

*   *remove_non_ascii*: replaces all non-ASCII characters with an empty string.
*   *remove_special*: replaces all special characters in the docstring with an empty string.
*   *separate_strings*: separates camelCase strings into their individual words.
*   *remove_empty*: removes all empty strings.
*   *lowercase*: lowercases all strings in the docstring to avoid case sensitivity.
*   *remove_unnecessary*: removes all string values and comments in the code.
*   *replace_symbols*: replaces specific programming symbols with their names.
*   *trim*: keeps a maximum of 100 code tokens for each code.

In [None]:
def remove_non_ascii(data):
  """Strip every non-ASCII character from each token.

  Args:
    data: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; each token list is rewritten in place.
  """
  # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
  for _, row in data.items():
    # Slice assignment updates the list object held by the Series, so each
    # position is processed exactly once without any list.index() lookups.
    row[:] = [re.sub(r'[^\x00-\x7f]', '', token) for token in row]

  return data

def remove_special(data):
  """Strip every character that is not an ASCII letter or digit from each token.

  Tokens made up only of special characters become empty strings.

  Args:
    data: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; each token list is rewritten in place.
  """
  # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
  for _, row in data.items():
    # Slice assignment updates the list object held by the Series, so each
    # position is processed exactly once without any list.index() lookups.
    row[:] = [re.sub(r'[^A-Za-z0-9]+', '', token) for token in row]

  return data

def separate_strings(data):
  """Split camelCase / PascalCase tokens into their component words.

  Every token that contains a lowercase letter has its first character
  capitalized and is then replaced, at the same position, by the words the
  splitting pattern finds in it (e.g. 'getName' -> 'Get', 'Name'). Tokens
  without a lowercase letter are kept unchanged.

  Args:
    data: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; each token list is rewritten in place.
  """
  # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
  for _, row in data.items():
    separated = []
    for token in row:
      # The pattern can only match when the token has a lowercase letter.
      if re.search(r'[a-z][^A-Z]*|[A-Z][a-z][^A-Z]*', token):
        capitalized = token[0].capitalize() + token[1:]
        # A word is either an uppercase letter followed by lowercase text, or
        # a run of uppercase letters that is not followed by a lowercase one.
        # NOTE(review): characters matched by neither alternative (e.g. a
        # leading '_' or lowercase text right after an all-caps run) are
        # dropped by findall — confirm this is intended.
        words = re.findall(r'[A-Z][a-z][^A-Z]*|[A-Z]*(?![a-z])', capitalized)
        # The second alternative also yields empty matches; skip them so no
        # empty placeholders are inserted into the row.
        separated.extend(word for word in words if word)
      else:
        separated.append(token)
    # Build the new row in a single pass and swap it in, instead of
    # concatenating lists and searching with list.index() for every token.
    row[:] = separated

  return data

def remove_empty(data):
  """Drop empty-string tokens from every row.

  Args:
    data: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; each token list is filtered in place.
  """
  # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
  for _, row in data.items():
    # One filtering pass per row, rather than re-filtering for every empty token.
    row[:] = [token for token in row if token]

  return data

def lowercase(data):
  """Convert every token to lowercase.

  Args:
    data: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; each token list is rewritten in place.
  """
  # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
  for _, row in data.items():
    # Slice assignment updates the list object held by the Series, so each
    # position is processed exactly once without any list.index() lookups.
    row[:] = [token.lower() for token in row]

  return data

def remove_unnecessary(data):
  """Blank out tokens that look like string literals or comments.

  A token is replaced with an empty string when it contains a space or a
  double quote, or when it starts with '//' or '/*' (which also covers '/**').
  The empty strings keep the row length unchanged.

  Args:
    data: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; each token list is rewritten in place.
  """
  # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
  for _, row in data.items():
    row[:] = [
        '' if (' ' in token or '"' in token or token.startswith(('//', '/*'))) else token
        for token in row
    ]

  return data

def replace_symbols(data):
  """Replace tokens that are programming symbols with a spelled-out name.

  Only tokens that are exactly equal to one of the known symbols are
  replaced; every other token (including longer tokens that merely start
  with a symbol) is kept unchanged.

  Args:
    data: pandas Series whose values are lists of string tokens.

  Returns:
    The same Series object; each token list is rewritten in place.
  """
  symbol_names = {'(': 'openingparenthesis',
                  ')': 'closingparenthesis',
                  '[': 'openingbracket',
                  ']': 'closingbracket',
                  '{': 'openingbrace',
                  '}': 'closingbrace',
                  '+': 'addoperator',
                  '-': 'subtractoperator',
                  '*': 'multiplyoperator',
                  '/': 'divideoperator',
                  '^': 'poweroperator',
                  '%': 'modulooperator',
                  '=': 'assignoperator',
                  '==': 'equaloperator',
                  '!=': 'notequaloperator',
                  '>': 'greateroperator',
                  '<': 'lessoperator',
                  '>=': 'greaterequaloperator',
                  '<=': 'lessequaloperator',
                  '++': 'incrementoperator',
                  '--': 'decrementoperator',
                  '!': 'notoperator',
                  '@': 'atsign',
                  ';': 'semicolon'}

  # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
  for _, row in data.items():
    # A direct dictionary lookup is an exact-match test, so no regex
    # prefilter or scan over all symbols is needed.
    row[:] = [symbol_names.get(token, token) for token in row]

  return data

def trim(data, max_tokens=100):
  """Keep at most `max_tokens` tokens in every row.

  Args:
    data: pandas Series whose values are lists of tokens.
    max_tokens: maximum number of tokens kept per row (default 100).

  Returns:
    The same Series object; rows longer than `max_tokens` are truncated
    in place, shorter rows are left untouched.

  Raises:
    ValueError: if `max_tokens` is negative.
  """
  if max_tokens < 0:
    raise ValueError('max_tokens must be non-negative')

  # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
  for _, row in data.items():
    # Deleting the tail is a no-op for rows that are already short enough.
    del row[max_tokens:]

  return data

Applying preprocessing to the code tokens.

In [None]:
RAW_CODE_PATH = '/content/use.rawcode.txt'
TOKENS_CSV_PATH = '/content/drive/MyDrive/deepcs_tokens.csv'
CHUNK_SIZE = 1000000  # number of input lines processed per chunk

# The file is read line by line instead of through pd.read_csv(sep='\n'):
# recent pandas versions reject a newline separator, and CSV parsing applies
# quote / NA handling that is not wanted for raw source-code lines.
with open(RAW_CODE_PATH, encoding='latin') as raw_code_file:
  is_first_chunk = True

  while True:
    lines = list(itertools.islice(raw_code_file, CHUNK_SIZE))
    if not lines:
      break

    # One row per non-blank line (read_csv skips blank lines by default);
    # split() with no argument tokenizes on any whitespace.
    deepcs_tokens = pd.Series([line.split() for line in lines if line.strip()], dtype=object)

    # The order of the steps matters: symbols must be named before
    # remove_special strips them, and empties are removed before trimming.
    deepcs_tokens = remove_non_ascii(deepcs_tokens)
    deepcs_tokens = separate_strings(deepcs_tokens)
    deepcs_tokens = remove_unnecessary(deepcs_tokens)
    deepcs_tokens = replace_symbols(deepcs_tokens)
    deepcs_tokens = remove_special(deepcs_tokens)
    deepcs_tokens = remove_empty(deepcs_tokens)
    deepcs_tokens = trim(deepcs_tokens)
    deepcs_tokens = lowercase(deepcs_tokens)

    # Overwrite on the first chunk and append afterwards, so re-running the
    # cell does not duplicate rows in the output file.
    deepcs_tokens.to_csv(TOKENS_CSV_PATH, mode='w' if is_first_chunk else 'a', header=False, index=False)
    is_first_chunk = False