## 1. Collect CodeSearchNet Repositories

In [1]:
import json

import pandas as pd
from pathlib import Path
# Show up to 300 characters per cell so long code/docstring values stay readable.
pd.set_option('display.max_colwidth', 300)
from pprint import pprint

In [2]:
!wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
!mkdir CodeSearchNet
!unzip python.zip -d CodeSearchNet

--2021-11-30 16:44:53--  https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.22.45
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.22.45|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 940909997 (897M) [application/zip]
Saving to: ‘python.zip’


2021-11-30 16:45:53 (15.2 MB/s) - ‘python.zip’ saved [940909997/940909997]

Archive:  python.zip
   creating: CodeSearchNet/python/
   creating: CodeSearchNet/python/final/
   creating: CodeSearchNet/python/final/jsonl/
   creating: CodeSearchNet/python/final/jsonl/train/
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz  
  inflating: CodeSearchNet/python/final/jsonl/train/python_

In [3]:
# Recursively collect every gzip archive under the extracted dataset, in sorted order.
python_files = sorted(Path('CodeSearchNet/python').rglob('*.gz'))

In [4]:
# Sanity check: list the archives found by the glob above.
print(python_files)

[PosixPath('CodeSearchNet/python/final/jsonl/test/python_test_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_1.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_11.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_13.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_2.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_3.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_4.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_5.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_6.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_7.jsonl.gz

In [5]:
# Every field read from a CodeSearchNet record when no explicit column list is given.
columns_long_list = ['repo', 'path', 'url', 'code',
                     'code_tokens', 'docstring', 'docstring_tokens',
                     'language', 'partition']


def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a single pandas DataFrame.

    Parameters
    ----------
    file_list : iterable of str or path-like
        Gzip-compressed JSON-lines files to read. Any iterable is accepted.
    columns : list of str, optional
        Columns to keep from each file; defaults to ``columns_long_list``.

    Returns
    -------
    pandas.DataFrame
        The selected columns of all files stacked in input order. Each file
        keeps its own 0-based row index, so index labels repeat across files.

    Raises
    ------
    ValueError
        If ``file_list`` contains no files.
    """
    paths = list(file_list)
    if not paths:
        # pd.concat would raise "No objects to concatenate"; name the real cause.
        raise ValueError("file_list is empty: no .jsonl.gz files to load")

    frames = [
        pd.read_json(path, orient='records', compression='gzip', lines=True)[columns]
        for path in paths
    ]
    return pd.concat(frames, sort=False)

In [6]:
# Only the repository name is kept here; the other record fields are dropped on load.
columns_repo = ['repo']

pydf = jsonl_list_to_dataframe(python_files, columns=columns_repo)

In [7]:
# Keep one row per repository and renumber the rows 0..n-1.
pydf = pydf.drop_duplicates(ignore_index=True)

In [8]:
print(pydf.shape)
# Display the frame itself rather than head(<hard-coded row count>): pandas
# truncates the rendering either way, and this stays correct if the number of
# repositories changes.
pydf

(13590, 1)


Unnamed: 0,repo
0,soimort/you-get
1,apache/airflow
2,pytorch/vision
3,asciimoo/searx
4,tensorflow/probability
...,...
13585,praekelt/python-ambient
13586,zenreach/py-era
13587,TakesxiSximada/custom_settings
13588,openpermissions/bass


In [9]:
# Persist the de-duplicated repository list; section 2 reads it back from repos.pkl.
pydf.to_pickle("repos.pkl")

## 2. Collect diffs and commit messages

In [10]:
!pip install pydriller
!pip install pandas
!pip install spacy

Collecting pydriller
  Downloading PyDriller-2.0-py3-none-any.whl (65 kB)
[?25l[K     |█████                           | 10 kB 25.7 MB/s eta 0:00:01[K     |██████████                      | 20 kB 28.1 MB/s eta 0:00:01[K     |███████████████                 | 30 kB 12.3 MB/s eta 0:00:01[K     |████████████████████            | 40 kB 9.5 MB/s eta 0:00:01[K     |█████████████████████████       | 51 kB 4.3 MB/s eta 0:00:01[K     |██████████████████████████████  | 61 kB 4.6 MB/s eta 0:00:01[K     |████████████████████████████████| 65 kB 2.2 MB/s 
Collecting gitpython
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 8.2 MB/s 
[?25hCollecting lizard
  Downloading lizard-1.17.9-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 3.2 MB/s 
[?25hCollecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
Collecting smmap

In [12]:
from pydriller import *
import pandas as pd
import nltk
import spacy
import re
from functools import reduce
# spacy.cli.download("en_core_web_sm")

In [13]:
# Repository list written by section 1.
repodf = pd.read_pickle("repos.pkl")
# spaCy pipeline used for part-of-speech tagging of commit subjects; the
# "en_core_web_sm" model must already be installed (see the commented-out
# spacy.cli.download call in the import cell).
spacy_tokenizer = spacy.load("en_core_web_sm")
# NOTE(review): diff_tokenizer is not referenced by any cell shown here.
diff_tokenizer = nltk.tokenize.WordPunctTokenizer()

In [None]:
def basic_filter(message):
    """Return the first line of ``message`` with surrounding whitespace removed."""
    first_line, _, _ = message.partition("\n")
    return first_line.strip()

def label_filter(message):
    """Remove a leading ``[label]`` from a commit subject, if there is one.

    Parameters
    ----------
    message : str
        Commit subject line.

    Returns
    -------
    str or None
        ``message`` unchanged when it does not start with ``[``; the text after
        the first ``]`` (leading whitespace stripped) when it does; ``None``
        when the opening bracket is never closed.
    """
    if not message.startswith('['):
        return message

    end_bracket_index = message.find(']')
    if end_bracket_index == -1:
        return None

    # Keep what follows the label. Slicing up to the bracket would return the
    # label itself and throw the actual message away.
    return message[end_bracket_index + 1:].lstrip()

def camel_case_split(str):
    """Split ``str`` into its capitalised words and upper-case runs.

    A piece is either an upper-case letter followed by lower-case letters
    (``Server``) or a run of upper-case letters (``HTTP``). Characters that
    belong to neither kind of piece, such as a lower-case prefix, digits or
    punctuation, are dropped.
    """
    piece_pattern = re.compile(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))')
    return [match.group(0) for match in piece_pattern.finditer(str)]


def case_splitter(token):
    """Split a camelCase/PascalCase token into lower-cased words.

    The first character is upper-cased before splitting so that a leading
    lower-case word (``camel`` in ``camelCase``) is kept by
    ``camel_case_split``.

    Parameters
    ----------
    token : str
        Token text; an empty string yields an empty list.

    Returns
    -------
    list of str
        The lower-cased pieces found by ``camel_case_split``.
    """
    # token[:1] instead of token[0] keeps an empty token from raising IndexError.
    capitalised = token[:1].upper() + token[1:]
    return [piece.lower() for piece in camel_case_split(capitalised)]
    

# A unified-diff hunk header at the start of a line, e.g. "@@ -12,6 +12,8 @@".
_HUNK_HEADER_RE = re.compile(r'^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@', re.MULTILINE)


def _strip_hunk_headers(diff_text):
    """Drop hunk headers from a unified diff and tag added/removed lines.

    Each "@@ -a,b +c,d @@" marker is replaced by a newline; any text that
    follows the marker on the same line is kept. Lines starting with "+" or
    "-" get that character replaced by "<add>" or "<del>".
    """
    # Only real headers are matched, so an "@@" inside the changed code is left
    # alone (splitting on every "@@" would drop parts of such a diff).
    without_headers = _HUNK_HEADER_RE.sub('\n', diff_text)
    return without_headers.replace('\n+', '\n<add>').replace('\n-', '\n<del>')


def parse_repo_commits(repo_name, commit_limit=50, verbose=True):
    """Collect (commit message, diff) pairs from a GitHub repository.

    Commits are visited with ``order='reverse'``, skipping merges, and a commit
    is kept only when all of the following hold:

    * its subject line (first line, any leading ``[label]`` removed,
      underscores turned into spaces) is ASCII and consists solely of letters
      and whitespace;
    * the first token of the subject is tagged ``VERB`` by ``spacy_tokenizer``;
    * the subject has between 3 and 30 words after camel-case splitting;
    * every file modified by the commit ends in ``.py``.

    Parameters
    ----------
    repo_name : str
        Repository in ``owner/name`` form; read from
        ``https://github.com/{repo_name}``.
    commit_limit : int, optional
        Maximum number of commits to keep.
    verbose : bool, optional
        When true (the default), print the tokens of every kept commit and a
        debug line for every subject whose ``[label`` is never closed.

    Returns
    -------
    pandas.DataFrame
        One row per kept commit, with columns ``repo``, ``hash``,
        ``commit_messsage`` (spelling kept for compatibility with existing
        pickles) and ``diff``.
    """
    rows = []
    repository = Repository(
        f"https://github.com/{repo_name}",
        only_modifications_with_file_types=[".py"],
        only_no_merge=True,
        order='reverse',
    )

    for commit in repository.traverse_commits():
        if len(rows) >= commit_limit:
            break

        subject = label_filter(basic_filter(commit.msg))
        if subject is None:
            # Unclosed "[label": there is no usable subject, so skip the commit.
            if verbose:
                print(f"[DEBUG] Label filter return None for repo {repo_name} and hash {commit.hash}")
            continue

        subject = subject.replace('_', ' ')

        # Letters and whitespace only. This also rejects non-ASCII text,
        # '@' mentions and '#' issue references.
        if not subject.isascii() or not all(c.isalpha() or c.isspace() for c in subject):
            continue

        doc = spacy_tokenizer(subject)

        # VERB filter. An empty subject yields no tokens and is skipped too.
        if len(doc) == 0 or doc[0].pos_ != 'VERB':
            continue

        words = [piece for token in doc for piece in case_splitter(token.text)]
        if len(words) < 3 or len(words) > 30:
            continue

        # The Repository filter above is passed only ".py", but a commit may
        # still touch other files; keep it only if every modified file is Python.
        modified_files = commit.modified_files
        if not all(mf.filename.endswith(".py") for mf in modified_files):
            continue

        if verbose:
            print(words)

        diff = ''.join(_strip_hunk_headers(mf.diff) for mf in modified_files)
        rows.append([repo_name, commit.hash, ' '.join(words), diff])

    return pd.DataFrame(rows, columns=["repo", "hash", "commit_messsage", "diff"])

In [None]:
import time

# Time one repository end to end. perf_counter() is the clock meant for
# measuring intervals; time.time() follows the wall clock, which can be
# adjusted while the cell runs.
start = time.perf_counter()
df = parse_repo_commits("soimort/you-get")
end = time.perf_counter()
print(end - start)
df.head(3)

['skip', 'private', 'video']
['add', 'hdr', 'support', 'for', 'bilibili']
['add', 'fake', 'header']
['update', 'regex', 'to', 'match', 'vid', 'for', 'xinpianchang']
['fix', 'bilibili', 'space', 'videos']
['add', 'format', 'selection', 'for', 'ac', 'fun']
['fixed', 'tiktok', 'extraction']
['fix', 'iqiyi', 'playlist', 'extrator']
['fix', 'acfun', 'download', 'fail']
['fix', 'resuming', 'when', 'downloading', 'in', 'chunked', 'mode']
['fix', 'wrong', 'range', 'usage']
['fix', 'bilibili', 'favlist', 'download']
['add', 'support', 'for', 'bvid', 'in', 'playlist', 'mode', 'of', 'bilibili']
['add', 'support', 'for', 'bvid', 'of', 'bilibili']
['added', 'py', 'socks', 'extra', 'requirement']
['use', 'urllib', 'instead', 'of', 'requests']
['fix', 'issue', 'on', 'itag']
['fix', 'coub', 'with', 'quotes']
['get', 'all', 'streams', 'we', 'can', 'download']
['modify', 'encoding', 'with', 'open', 'cookies', 'file']
['fix', 'wrong', 'video', 'title', 'for', 'ixigua']
['pick', 'best', 'video', 'quality'

Unnamed: 0,repo,hash,commit_messsage,diff
0,soimort/you-get,144886840212d5d0ee059858e6493dd265927376,skip private video,"\n def iwara_download(url, output_dir='.', mer..."
1,soimort/you-get,439354e730d8b864de9401536c93220467ccb355,add hdr support for bilibili,\n class Bilibili(VideoExtractor):\n \n # ...
2,soimort/you-get,4a9d2c1e13b8918deba39af515d315b60e545422,add fake header,"\n def netease_song_download(song, output_dir=..."


In [None]:
import multiprocessing
import os

data = []

def f(repo):
    """Parse one repository, pickle its commits under ./repos/, and return them.

    ``repo`` is an ``owner/name`` string; the "/" is replaced by "+" to build
    the pickle's file name.
    """
    df = parse_repo_commits(repo)
    # to_pickle does not create missing directories; exist_ok makes this safe
    # when several workers reach it at the same time.
    os.makedirs("./repos", exist_ok=True)
    df.to_pickle(f"./repos/{repo.replace('/', '+')}.pkl")
    print(repo, "Done")
    return df

# The context manager shuts the worker processes down once map() has returned.
with multiprocessing.Pool() as pool:
    outputs = pool.map(f, repodf['repo'][:3])
pd.concat(outputs).to_pickle("data.pkl")

soimort/you-get Done
pytorch/vision Done
apache/airflow Done


In [None]:
# Reload the combined frame written to data.pkl by the multiprocessing cell.
# This rebinds `df`, which the timing cell above also assigns.
df = pd.read_pickle("data.pkl")
df.head(3)

Unnamed: 0,repo,hash,commit_messsage,diff
0,soimort/you-get,439354e730d8b864de9401536c93220467ccb355,add hdr support for bilibili,"@@ -12,6 +12,8 @@ class Bilibili(VideoExtracto..."
1,soimort/you-get,4a9d2c1e13b8918deba39af515d315b60e545422,add fake header,"@@ -123,10 +123,10 @@ def netease_song_downloa..."
2,soimort/you-get,1b1f1dd1181bb15dabd04f928842891ac635f49c,update regex to match vid for xinpianchang,"@@ -20,7 +20,7 @@ class Xinpianchang(VideoExtr..."
