In [4]:
# mount drive https://datascience.stackexchange.com/questions/29480/uploading-images-folder-from-my-system-into-google-colab
# login with your google account and type authorization code to mount on your googlbie drive.
from google.colab import drive
drive.mount('/gdrive')
root = '/gdrive/My Drive/CS492I/project'

Mounted at /gdrive


## 1. Collect CodeSearchNet Repositories

In [1]:
import json

import pandas as pd
from pathlib import Path
pd.set_option('max_colwidth',300)
from pprint import pprint

In [2]:
!wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
!mkdir CodeSearchNet
!unzip python.zip -d CodeSearchNet

--2021-12-03 05:33:04--  https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.145.45
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.145.45|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 940909997 (897M) [application/zip]
Saving to: ‘python.zip’


2021-12-03 05:33:25 (43.4 MB/s) - ‘python.zip’ saved [940909997/940909997]

Archive:  python.zip
   creating: CodeSearchNet/python/
   creating: CodeSearchNet/python/final/
   creating: CodeSearchNet/python/final/jsonl/
   creating: CodeSearchNet/python/final/jsonl/train/
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/pytho

In [5]:
python_files = sorted(Path('CodeSearchNet/python').glob('**/*.gz'))
print(python_files)

[PosixPath('CodeSearchNet/python/final/jsonl/test/python_test_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_1.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_11.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_13.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_2.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_3.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_4.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_5.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_6.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_7.jsonl.gz

In [6]:
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

In [8]:
pydf = jsonl_list_to_dataframe(python_files, columns=['repo'])
pydf = pydf.drop_duplicates().reset_index(drop=True)
pydf.to_pickle(f"{root}/repos.pkl")

## 2. Collect diff and commits

In [1]:
!pip install pydriller
!pip install pandas
!pip install spacy

Collecting pydriller
  Downloading PyDriller-2.0-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 1.7 MB/s 
[?25hCollecting gitpython
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 11.7 MB/s 
[?25hCollecting lizard
  Downloading lizard-1.17.9-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 1.9 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 599 kB/s 
[?25hCollecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, lizard, gitpython, pydriller
Successfully installed gitdb-4.0.9 gitpython-3.1.24 lizard-1.17.9 pydriller-2.0 smmap-5.0.0


In [8]:
from pydriller import *
import pandas as pd
import nltk
import spacy
import re
from itertools import chain
import json

In [5]:
repodf = pd.read_pickle(f"{root}/repos.pkl")
print(repodf.shape)
spacy_tokenizer = spacy.load("en_core_web_sm")
print(repodf)

(13590, 1)
                                 repo
0                     soimort/you-get
1                      apache/airflow
2                      pytorch/vision
3                      asciimoo/searx
4              tensorflow/probability
...                               ...
13585         praekelt/python-ambient
13586                 zenreach/py-era
13587  TakesxiSximada/custom_settings
13588            openpermissions/bass
13589               xnuinside/clifier

[13590 rows x 1 columns]


In [9]:
def basic_filter(message):
    return message.split("\n", 1)[0].strip()

# Remove [label] in front of commit if exists
def label_filter(message):
    if (message.startswith('[')):
        end_bracket_index = message.find(']')
        if (end_bracket_index == -1):
            return None
        return message[:end_bracket_index+1]
    return message

def camel_case_split(str):
    return re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', str)

def case_splitter(token):
    return list(map(lambda x: x.lower(), camel_case_split(token[0].upper() + token[1:])))


def split_by_quote(diff):
    text_list = [(0,0)]
    quote_state = 0 #["out","single","double","Single","Double","quote"]
    i=0
    while i<len(diff):
      if diff[i] == "\\":i+=1
      elif (diff[i] == "\n" and quote_state == 5) or (diff[i] == '#' and quote_state == 0):
        text_list.append((quote_state,i))
        quote_state = abs(5-quote_state)
      elif diff[i] == '"':
        if i+2<len(diff) and diff[i:i+3]=='"""' and (quote_state == 0 or quote_state ==4):
          text_list.append((quote_state,i+max(quote_state-1,0)))
          quote_state = abs(4-quote_state)
          i+=2
        elif quote_state == 0 or quote_state == 2:
          text_list.append((quote_state,i+max(quote_state-1,0)))
          quote_state = abs(2-quote_state)
      elif diff[i] == "'":
        if i+2<len(diff) and diff[i:i+3] == "'''" and (quote_state == 0 or quote_state == 3):
          text_list.append((quote_state,i+quote_state))
          quote_state = abs(3-quote_state)
          i+=2
        elif quote_state == 0 or quote_state == 1:
          text_list.append((quote_state,i+quote_state))
          quote_state = abs(1-quote_state)
      i+=1
    text_list.append((quote_state,i))
    return [(text_list[i][0]==0,diff[text_list[i-1][1]:text_list[i][1]]) for i in range(1,len(text_list))]
  

def diff_tokenizer(diff_text):
    diff = '\n'.join(map(lambda x: x[1], filter(lambda y: y[0] % 2 == 0, enumerate(diff_text.split("@@")))))
    diff = diff.replace('\n+', '\n<add>').replace('\n-', '\n<del>')
    diff = re.sub(r"(?:\n[ \t\r\f\v]*)+","\n",diff) #Join continuous row change
    quote_split = split_by_quote(diff)
    token_regex = r"""(?x)
      <(?:add|del)>   #Filtered eariler
    |(?:[-+*/^&~|=%!<>@?$][\s]*)+     #Sequence of symbols
    |[\n]                    #Change row
    |[a-zA-Z]+               #General text
    |[0-9]+                  #Number
    """
    string_regex = r"""(?x)
    <(?:add|del)>
    |[\n]
    |[a-zA-Z]+
    """
    #'"`,.;:()[]{}_ not included
    token_initial = chain.from_iterable(map(lambda a: nltk.tokenize.regexp_tokenize(a[1],token_regex) if a[0] else nltk.tokenize.regexp_tokenize(a[1],string_regex), quote_split))
    token_camel_case_split = chain.from_iterable(map(lambda a: case_splitter(a) if 97<=ord(a[0].lower())<122 else [a], token_initial))
    return token_camel_case_split


def parse_repo_commits(repo_name, commit_limit=50):
    data = []
    commit_count = 0
    for commit in Repository(
        f"https://github.com/{repo_name}",
        only_modifications_with_file_types=[".py"],
        only_no_merge=True,
        order='reverse'
    ).traverse_commits():
        if (commit_count >= commit_limit): break
        line = basic_filter(commit.msg)
        line = label_filter(line)
        if (line is None):
            print(f"[DEBUG] Label filter return None for repo {repo} and hash {commit.hash}")
        
        line = line.replace('_', ' ').replace('.', ' ')

        # ignore mentions, non-English, github issue #
        if (len(re.findall(r"[^a-zA-Z0-9: ]", line)) != 0):
          continue
        
        tokens = spacy_tokenizer(line)
        
        # VERB filter
        if (tokens[0].pos_ != 'VERB'):
            continue
        
        commit_tokens = list(chain.from_iterable(map(lambda token: case_splitter(token.text), tokens)))
        
        if (len(commit_tokens) < 3 or len(commit_tokens) > 30):
            continue

        # Check if changed files are python
        file_failed = False
        count_diff_lines = 0

        for mf in commit.modified_files:
            if (not mf.filename.endswith(".py")):
                file_failed = True
                break
            count_diff_lines += mf.added_lines + mf.deleted_lines
        
        if (file_failed):
            continue
        
        if(count_diff_lines > 50):
            continue
        
        # Create diff tokens
        diff_tokens = []
        for f in commit.modified_files:
          diff_tokens.append (['<file>'])
          diff_tokens.append(diff_tokenizer(f.diff))

        diff_whole_tokens = list(chain.from_iterable(diff_tokens))

        data.append([repo_name, commit.hash, json.dumps(commit_tokens), json.dumps(diff_whole_tokens)])
        commit_count += 1

    return pd.DataFrame(data, columns=["repo", "hash", "commit_messsage", "diff"])

In [10]:
import time
start = time.time()
df = parse_repo_commits("soimort/you-get")
#df = parse_repo_commits("tensorflow/probability")
end = time.time()
print(end - start)
df.head(3)

19.178345680236816


Unnamed: 0,repo,hash,commit_messsage,diff
0,soimort/you-get,439354e730d8b864de9401536c93220467ccb355,"[""add"", ""hdr"", ""support"", ""for"", ""bilibili""]","[""<file>"", ""\n"", ""class"", ""bilibili"", ""video"",..."
1,soimort/you-get,4a9d2c1e13b8918deba39af515d315b60e545422,"[""add"", ""fake"", ""header""]","[""<file>"", ""\n"", ""def"", ""netease"", ""song"", ""do..."
2,soimort/you-get,205470ec116654608ddd97390bd885ba6df100b1,"[""add"", ""support"", ""for"", ""socks"", ""proxy"", ""u...","[""<file>"", ""\n"", ""def"", ""load"", ""cookies"", ""co..."


# 3. Create whole dataset via Multiprocessing

In [11]:
import multiprocessing

data = []

def f(repo):
    start = time.time()
    df = parse_repo_commits(repo)
    #df.to_pickle(f"./repos/{repo.replace('/', '+')}.pkl")
    end = time.time()
    print(f"{repo} took {end-start} seconds")
    return df

pool = multiprocessing.Pool()
outputs = pool.map(f, repodf['repo'][:3])
pd.concat(outputs).to_pickle(f"{root}/data.pkl")

soimort/you-get Done
pytorch/vision Done


Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "/usr/lib/python3.7/multiprocess

KeyboardInterrupt: ignored

  File "/usr/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/local/lib/python3.7/dist-packages/pydriller/utils/conf.py", line 282, in _has_modification_with_file_type
    for mod in commit.modified_files:
  File "/usr/local/lib/python3.7/dist-packages/pydriller/utils/conf.py", line 267, in is_commit_filtered
    if not self._has_modification_with_file_type(commit):
  File "<ipython-input-11-b9bbe713e615>", line 6, in f
    df = parse_repo_commits(repo)
  File "/usr/local/lib/python3.7/dist-packages/pydriller/domain/commit.py", line 684, in _get_modifications
    self._c_object, create_patch=True, **options
  File "/usr/local/lib/python3.7/dist-packages/pydriller/domain/commit.py", line 668, in modified_files
    self._modifications = self._get_modifications()
  File "/usr/local/lib/python3.7/dist-packages/git/diff.py", line 175, in diff
    index = diff_method(self.repo, proc)
  File "/usr/local/lib/python3.7/dist-packages/git

In [None]:
df = pd.read_pickle("data.pkl")
df.head(3)