## 1. Collect CodeSearchNet Repositories

In [1]:
import json

import pandas as pd
from pathlib import Path
pd.set_option('max_colwidth',300)
from pprint import pprint

In [2]:
!wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
!mkdir CodeSearchNet
!unzip python.zip -d CodeSearchNet

--2021-12-02 08:00:26--  https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.12.142
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.12.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 940909997 (897M) [application/zip]
Saving to: ‘python.zip’


2021-12-02 08:00:38 (73.6 MB/s) - ‘python.zip’ saved [940909997/940909997]

Archive:  python.zip
   creating: CodeSearchNet/python/
   creating: CodeSearchNet/python/final/
   creating: CodeSearchNet/python/final/jsonl/
   creating: CodeSearchNet/python/final/jsonl/train/
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/pytho

In [3]:
python_files = sorted(Path('CodeSearchNet/python').glob('**/*.gz'))
print(python_files)

[PosixPath('CodeSearchNet/python/final/jsonl/test/python_test_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_1.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_11.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_13.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_2.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_3.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_4.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_5.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_6.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_7.jsonl.gz

In [4]:
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

In [5]:
pydf = jsonl_list_to_dataframe(python_files, columns=['repo'])
pydf = pydf.drop_duplicates().reset_index(drop=True)
pydf.to_pickle("repos.pkl")

## 2. Collect diff and commits

In [6]:
!mkdir repos
!pip install pydriller
!pip install pandas
!pip install spacy

Collecting pydriller
  Downloading PyDriller-2.0-py3-none-any.whl (65 kB)
[?25l[K     |█████                           | 10 kB 20.3 MB/s eta 0:00:01[K     |██████████                      | 20 kB 22.4 MB/s eta 0:00:01[K     |███████████████                 | 30 kB 27.9 MB/s eta 0:00:01[K     |████████████████████            | 40 kB 30.3 MB/s eta 0:00:01[K     |█████████████████████████       | 51 kB 33.1 MB/s eta 0:00:01[K     |██████████████████████████████  | 61 kB 36.1 MB/s eta 0:00:01[K     |████████████████████████████████| 65 kB 3.2 MB/s 
Collecting gitpython
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 58.6 MB/s 
[?25hCollecting lizard
  Downloading lizard-1.17.9-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.5 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.5 MB/s 
[?25hCollecting s

In [7]:
from pydriller import *
import pandas as pd
import nltk
import spacy
import re
from functools import reduce

In [8]:
repodf = pd.read_pickle("repos.pkl")
print(repodf.shape)
spacy_tokenizer = spacy.load("en_core_web_sm")
print(repodf)

(13590, 1)
                                 repo
0                     soimort/you-get
1                      apache/airflow
2                      pytorch/vision
3                      asciimoo/searx
4              tensorflow/probability
...                               ...
13585         praekelt/python-ambient
13586                 zenreach/py-era
13587  TakesxiSximada/custom_settings
13588            openpermissions/bass
13589               xnuinside/clifier

[13590 rows x 1 columns]


In [28]:
import re
def basic_filter(message):
    return message.split("\n", 1)[0].strip()

# Remove [label] in front of commit if exists
def label_filter(message):
    if (message.startswith('[')):
        end_bracket_index = message.find(']')
        if (end_bracket_index == -1):
            return None
        return message[:end_bracket_index+1]
    return message

def camel_case_split(str):
    return re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', str)

def case_splitter(token):
    return list(map(lambda x: x.lower(), camel_case_split(token[0].upper() + token[1:])))

def split_by_quote(diff):
    text_list = [(0,0)]
    quote_state = 0 #["out","single","double","Single","Double","quote"]
    i=0
    while i<len(diff):
      if diff[i] == "\\":i+=1
      elif (diff[i] == "\n" and quote_state == 5) or (diff[i] == '#' and quote_state == 0):
        text_list.append((quote_state,i))
        quote_state = abs(5-quote_state)
      elif diff[i] == '"':
        if i+2<len(diff) and diff[i:i+3]=='"""' and (quote_state == 0 or quote_state ==4):
          text_list.append((quote_state,i+max(quote_state-1,0)))
          quote_state = abs(4-quote_state)
          i+=2
        elif quote_state == 0 or quote_state == 2:
          text_list.append((quote_state,i+max(quote_state-1,0)))
          quote_state = abs(2-quote_state)
      elif diff[i] == "'":
        if i+2<len(diff) and diff[i:i+3] == "'''" and (quote_state == 0 or quote_state == 3):
          text_list.append((quote_state,i+quote_state))
          quote_state = abs(3-quote_state)
          i+=2
        elif quote_state == 0 or quote_state == 1:
          text_list.append((quote_state,i+quote_state))
          quote_state = abs(1-quote_state)
      i+=1
    return [(text_list[i][0]%5==0,diff[text_list[i-1][1]:text_list[i][1]]) for i in range(1,len(text_list))]
def diff_processing(mf):
    diff = '\n'.join(map(lambda x: x[1], filter(lambda y: y[0] % 2 == 0, enumerate(mf.diff.split("@@")))))
    diff = diff.replace('\n+', '\n<add>').replace('\n-', '\n<del>')
    # Replace_symbol_in_string = ex)url
    diff = re.sub(r"(?:\n[ \t\r\f\v]*)+","\n",diff) #Join continuous row change
    quote_split = split_by_quote(diff)
    token_regex = r"""(?x)
      <(?:add|del)>   #Filtered eariler
    |(?:[#][\s]*)+           #Quotation
    |(?:[-+*/^&~|=%!<>@?$][\s]*)+     #Sequence of symbols
    |[\n]                    #Change row
    |[a-zA-Z]+               #General text
    |[0-9]+                  #Number
    """
    string_regex = r"""(?x)
    <(?:add|del)>
    |[\n]
    |[a-zA-Z]+
    """
    #'"`,.;:()[]{}_ not included
    token_initial = reduce(lambda a,b:a+b,map(lambda a: nltk.tokenize.regexp_tokenize(a[1],token_regex) if a[0] else nltk.tokenize.regexp_tokenize(a[1],string_regex),quote_split),[])
    token_camel_case_split = reduce(lambda a,b:a+b,map(lambda a: case_splitter(a) if 97<=ord(a[0].lower())<122 else [a], token_initial),[])
    return token_camel_case_split


def parse_repo_commits(repo_name, commit_limit=50):
    data = []
    commit_count = 0
    for commit in Repository(
        f"https://github.com/{repo_name}",
        only_modifications_with_file_types=[".py"],
        only_no_merge=True,
        order='reverse'
    ).traverse_commits():
        if (commit_count >= commit_limit): break
        line = basic_filter(commit.msg)
        line = label_filter(line)
        
        if (line is None):
            print(f"[DEBUG] Label filter return None for repo {repo} and hash {commit.hash}")
        
        line = line.replace('_', ' ')

        # ignore mentions, non-English, github issue #
        if (len(re.findall(r"[^a-zA-Z0-9: ]", line)) != 0):
          continue
        
        tokens = spacy_tokenizer(line)
        
        # VERB filter
        if (tokens[0].pos_ != 'VERB'):
            continue
        
        tokens = reduce(lambda a,b: a+b, map(case_splitter, [token.text for token in tokens]), [])
        
        if (len(tokens) < 3 or len(tokens) > 30):
            continue
        
        # Check if changed files are python
        file_failed = False
        
        for mf in commit.modified_files:
            if (not mf.filename.endswith(".py")):
                file_failed = True
                break
        
        if (file_failed):
            continue
        
        diff_tokens = list(map(diff_processing, commit.modified_files))
        diff_tokens_size = sum(map(len, diff_tokens))
        if (diff_tokens_size > 100):
          continue
        
        for mf in commit.modified_files:
          print(mf.diff)
          print("")


        print(list(diff_tokens))
        input()
        
        
        data.append([repo_name, commit.hash, ' '.join(tokens), diff])
        commit_count += 1

    return pd.DataFrame(data, columns=["repo", "hash", "commit_messsage", "diff"])

In [None]:
import time
start = time.time()
df = parse_repo_commits("soimort/you-get")
#df = parse_repo_commits("tensorflow/probability")
end = time.time()
print(end - start)
df.head(3)

@@ -12,6 +12,8 @@ class Bilibili(VideoExtractor):
 
     # Bilibili media encoding options, in descending quality order.
     stream_types = [
+        {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280,
+         'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'},
         {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280,
          'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'},
         {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280,


[['\n', 'class', 'bilibili', 'video', 'extractor', '\n', '# ', 'bilibili', 'media', 'encoding', 'options', 'in', 'descending', 'quality', 'order', '\n', 'stream', 'types', '= ', '\n', '<add>', 'id', 'hdflv', 'quality', '125', 'audio', 'quality', '30280', '\n', '<add>', 'container', 'flv', 'video', 'resolution', 'p', 'desc', 'hdr', '\n', 'id', 'hdflv', 'k', 'quality', '120', 'audio', 'quality', '30280', '\n', 'container', 'flv', 'video', 'resolution', 'p', 'desc', 'k', '\n', 'id', 'flv', 'p', '

In [None]:
import multiprocessing

data = []

def f(repo):
    df = parse_repo_commits(repo)
    df.to_pickle(f"./repos/{repo.replace('/', '+')}.pkl")
    print(repo, "Done")
    return df

pool = multiprocessing.Pool()
outputs = pool.map(f, repodf['repo'][:3])
pd.concat(outputs).to_pickle("data.pkl")

In [None]:
df = pd.read_pickle("data.pkl")
df.head(3)

In [None]:
for repo in repodf["repo"]:
  for commit in Repository(
      f"https://github.com/{repo}",
      only_modifications_with_file_types=[".py"],
      only_no_merge=True,
      order='reverse').traverse_commits():
      
      line = commit.msg.split('\n', 1)[0]
      if (len(re.findall(r"[^a-zA-Z0-9: ]", line)) == 0):
        print(repo)
        print(commit.hash)
        print(line)
        tokens = spacy_tokenizer(line)
        print(list(tokens))
        input()