## 1. Collect CodeSearchNet Repositories

In [1]:
import json

import pandas as pd
from pathlib import Path
pd.set_option('max_colwidth',300)
from pprint import pprint

In [2]:
!wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
!mkdir CodeSearchNet
!unzip python.zip -d CodeSearchNet

--2021-11-30 15:58:09--  https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.142.16
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.142.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 940909997 (897M) [application/zip]
Saving to: ‘python.zip’


2021-11-30 15:58:21 (79.4 MB/s) - ‘python.zip’ saved [940909997/940909997]

Archive:  python.zip
   creating: CodeSearchNet/python/
   creating: CodeSearchNet/python/final/
   creating: CodeSearchNet/python/final/jsonl/
   creating: CodeSearchNet/python/final/jsonl/train/
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/pytho

In [3]:
python_files = sorted(Path('CodeSearchNet/python').glob('**/*.gz'))

In [4]:
print(python_files)

[PosixPath('CodeSearchNet/python/final/jsonl/test/python_test_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_1.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_11.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_13.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_2.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_3.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_4.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_5.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_6.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_7.jsonl.gz

In [5]:
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

In [6]:
columns_repo = ['repo']

pydf = jsonl_list_to_dataframe(python_files, columns=columns_repo)

In [7]:
pydf = pydf.drop_duplicates().reset_index(drop=True)

In [8]:
print(pydf.shape)
pydf.head(13590)

(13590, 1)


Unnamed: 0,repo
0,soimort/you-get
1,apache/airflow
2,pytorch/vision
3,asciimoo/searx
4,tensorflow/probability
...,...
13585,praekelt/python-ambient
13586,zenreach/py-era
13587,TakesxiSximada/custom_settings
13588,openpermissions/bass


In [9]:
pydf.to_pickle("repos.pkl")

## 2. Collect diff and commits

In [10]:
!mkdir repos
!pip install pydriller
!pip install pandas
!pip install spacy

Collecting pydriller
  Downloading PyDriller-2.0-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 2.3 MB/s 
[?25hCollecting gitpython
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 10.9 MB/s 
Collecting lizard
  Downloading lizard-1.17.9-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.9 MB/s 
[?25hCollecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.5 MB/s 
Collecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, lizard, gitpython, pydriller
Successfully installed gitdb-4.0.9 gitpython-3.1.24 lizard-1.17.9 pydriller-2.0 smmap-5.0.0


In [11]:
from pydriller import *
import pandas as pd
import nltk
import spacy
import re
from functools import reduce
# spacy.cli.download("en_core_web_sm")

In [12]:
repodf = pd.read_pickle("repos.pkl")
print(repodf.shape)
spacy_tokenizer = spacy.load("en_core_web_sm")
diff_tokenizer = nltk.tokenize.WordPunctTokenizer()

(13590, 1)


In [106]:
import re
def basic_filter(message):
    return message.split("\n", 1)[0].strip()

# Remove [label] in front of commit if exists
def label_filter(message):
    if (message.startswith('[')):
        end_bracket_index = message.find(']')
        if (end_bracket_index == -1):
            return None
        return message[:end_bracket_index+1]
    return message

def camel_case_split(str):
    return re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', str)


def case_splitter(token):
    return list(map(lambda x: x.lower(), camel_case_split(token[0].upper() + token[1:])))
    

def parse_repo_commits(repo_name, commit_limit=50):
    data = []
    commit_count = 0
    for commit in Repository(
        f"https://github.com/{repo_name}",
        only_modifications_with_file_types=[".py"],
        only_no_merge=True,
        order='reverse'
    ).traverse_commits():
        if (commit_count >= commit_limit): break
        line = basic_filter(commit.msg)
        line = label_filter(line)
        
        if (line is None):
            print(f"[DEBUG] Label filter return None for repo {repo} and hash {commit.hash}")
        
        line = line.replace('_', ' ')
        
        # Only alphabet and blank characters
        if (not line.isascii() or not all([c.isalpha() or c.isspace() for c in line])):
            continue
        
        '''
        if (not line.isascii()):                     # Ignore non-English
            continue
        
        if ('@' in line and not 'decorat' in line):  # Ignore Github mentions
            continue
        
        if ('#' in line):                            # Ignore Github issue
            continue
        '''
        
        tokens = spacy_tokenizer(line)
        
        # VERB filter
        if (tokens[0].pos_ != 'VERB'):
            continue
        
        tokens = reduce(lambda a,b: a+b, map(case_splitter, [token.text for token in tokens]), [])
        
        if (len(tokens) < 3 or len(tokens) > 30):
            continue
        
        # Check if changed files are python
        file_failed = False
        
        for mf in commit.modified_files:
            if (not mf.filename.endswith(".py")):
                file_failed = True
                break
        
        if (file_failed):
            continue
        
        def diff_processing(mf):
            print(mf.diff)
            diff = '\n'.join(map(lambda x: x[1], filter(lambda y: y[0] % 2 == 0, enumerate(mf.diff.split("@@")))))
            diff = diff.replace('\n+', '\n<add>').replace('\n-', '\n<del>')
            #TODO
            #replace_symbol_in_string = ex)url
            replace_number = re.compile(r"""
            (?P<prefix>[^a-zA-Z_])  #prefix is not alphabet
            (?P<number>
              0x[0-9A-Fa-f]+        #hexadecimal number
              |[0-9]+               #decimal number
            )
            """,re.VERBOSE)
            diff_number_filtered = replace_number.sub("\g<prefix><number>",diff)
            print(diff_number_filtered)
            token_regex = r"""(?x)
             <(?:add|del|number)>   #Filtered eariler
            |[-+*/^&~|=%!]=?        #Symbols which can join with equal
            |[<>]{1,2}              #neq and bit shift symbols
            |#+                     #Comment symbol
            |[@?$]                  #Other symbols
            |[a-zA-Z0-9]+           #General text
            """
            test = r"""<(?:add|del|number)>|[-+*/^&~|=%!]=?|[<>]{1,2}|#+|[@?$]|[a-zA-Z0-9]+"""
            #'"`\,.;:()[]{}_ not included
            token = nltk.tokenize.regexp_tokenize(diff_number_filtered,test)
            print(token)
            input()
            return diff
        
        print(tokens)
        diff = ''.join(map(diff_processing, commit.modified_files))
        
        data.append([repo_name, commit.hash, ' '.join(tokens), diff])
        commit_count += 1
    
    return pd.DataFrame(data, columns=["repo", "hash", "commit_messsage", "diff"])

In [None]:
import time
start = time.time()
df = parse_repo_commits("soimort/you-get")
end = time.time()
print(end - start)
df.head(3)

['add', 'hdr', 'support', 'for', 'bilibili']
@@ -12,6 +12,8 @@ class Bilibili(VideoExtractor):
 
     # Bilibili media encoding options, in descending quality order.
     stream_types = [
+        {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280,
+         'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'},
         {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280,
          'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'},
         {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280,


 class Bilibili(VideoExtractor):
 
     # Bilibili media encoding options, in descending quality order.
     stream_types = [
<add>        {'id': 'hdflv2', 'quality': <number>, 'audio_quality': <number>,
<add>         'container': 'FLV', 'video_resolution': '<number>p', 'desc': '真彩 HDR'},
         {'id': 'hdflv2_4k', 'quality': <number>, 'audio_quality': <number>,
          'container': 'FLV', 'video_resolution': '<number>p', 'desc': '超清 <number>K'}

In [15]:
import multiprocessing

data = []

def f(repo):
    df = parse_repo_commits(repo)
    df.to_pickle(f"./repos/{repo.replace('/', '+')}.pkl")
    print(repo, "Done")
    return df

pool = multiprocessing.Pool()
outputs = pool.map(f, repodf['repo'][:3])
pd.concat(outputs).to_pickle("data.pkl")

['add', 'hdr', 'support', 'for', 'bilibili']


Exception ignored in: <finalize object at 0x7f0ad01eb3a0; dead>
Traceback (most recent call last):
  File "/usr/lib/python3.7/weakref.py", line 572, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
  File "/usr/lib/python3.7/tempfile.py", line 936, in _cleanup
    _rmtree(name)
  File "/usr/lib/python3.7/shutil.py", line 485, in rmtree
    onerror(os.lstat, path, sys.exc_info())
  File "/usr/lib/python3.7/shutil.py", line 483, in rmtree
    orig_st = os.lstat(path)
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmpsz94zojv'


['add', 'fake', 'header']
['add', 'format', 'selection', 'for', 'ac', 'fun']
['fixed', 'tiktok', 'extraction']
['fix', 'iqiyi', 'playlist', 'extrator']
['fix', 'acfun', 'download', 'fail']
['fix', 'resuming', 'when', 'downloading', 'in', 'chunked', 'mode']
['fix', 'wrong', 'range', 'usage']
['fix', 'bilibili', 'favlist', 'download']
['add', 'support', 'for', 'bvid', 'in', 'playlist', 'mode', 'of', 'bilibili']
['purge', 'dead', 'sites']
['add', 'support', 'for', 'bvid', 'of', 'bilibili']
['use', 'urllib', 'instead', 'of', 'requests']
['fix', 'issue', 'on', 'itag']
['modify', 'encoding', 'with', 'open', 'cookies', 'file']
['fix', 'wrong', 'video', 'title', 'for', 'ixigua']
['pick', 'best', 'video', 'quality', 'for', 'ixigua']
['replace', 'broken', 'api', 'to', 'get', 'correct', 'video', 'title']
['fix', 'toutiao', 'errors']


Exception ignored in: <finalize object at 0x7f0ad01eb3a0; dead>
Traceback (most recent call last):
  File "/usr/lib/python3.7/weakref.py", line 572, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
  File "/usr/lib/python3.7/tempfile.py", line 936, in _cleanup
    _rmtree(name)
  File "/usr/lib/python3.7/shutil.py", line 485, in rmtree
    onerror(os.lstat, path, sys.exc_info())
  File "/usr/lib/python3.7/shutil.py", line 483, in rmtree
    orig_st = os.lstat(path)
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmpsz94zojv'


['purge', 'dead', 'sites']
['check', 'if', 'the', 'player', 'exist', 'or', 'not']
['fix', 'ixigua', 'downloading', 'failure']
['fix', 'acfun', 'flv', 'support']
['remove', 'retry', 'for', 'testing', 'bilibili']
['reduce', 'logging', 'message']
['added', 'an', 'auto', 'rename', 'option', 'and', 'fixed', 'the', 'force', 'option']
['fix', 'download', 'url', 'ffmpeg', 'extension']
['update', 'the', 'test']
['fix', 'miaopai', 'download', 'failed']
['fix', 'bar', 'display', 'under', 'windows', 'terminal']
['fix', 'load', 'cookies', 'local', 'name', 'error']
['fix', 'wrong', 'local', 'name']
['comment', 'the', 'wip', 'code', 'to', 'silent', 'lint']
['use', 'argparse', 'instead', 'of', 'getopt']
['fix', 'parsing', 'irregular', 'episode', 'index']
['add', 'support', 'for', 'send', 'the', 'password', 'from', 'cli']
['fix', 'apikey', 'matching', 'error', 'in', 'gallery', 'case']


Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-15-010b44758e56>", line 6, in f
    df = parse_repo_commits(repo)
  File "<ipython-input-13-37e54b8452af>", line 28, in parse_repo_commits
    order='reverse'
  File "/usr/local/lib/python3.7/dist-packages/pydriller/repository.py", line 233, in traverse_commits
    for commit in job.result():
  File "/usr/lib/python3.7/multiproces

KeyboardInterrupt: ignored

In [None]:
df = pd.read_pickle("data.pkl")
df.head(3)