## 1. Collect CodeSearchNet Repositories

In [1]:
import json

import pandas as pd
from pathlib import Path
pd.set_option('max_colwidth',300)
from pprint import pprint

In [None]:
!wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
!mkdir CodeSearchNet
!unzip python.zip -d CodeSearchNet

In [3]:
python_files = sorted(Path('CodeSearchNet/python').glob('**/*.gz'))

In [4]:
print(python_files)

[PosixPath('CodeSearchNet/python/final/jsonl/test/python_test_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_0.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_1.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_10.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_11.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_12.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_13.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_2.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_3.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_4.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_5.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_6.jsonl.gz'), PosixPath('CodeSearchNet/python/final/jsonl/train/python_train_7.jsonl.gz

In [5]:
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

In [10]:
columns_repo = ['repo']

pydf = jsonl_list_to_dataframe(python_files, columns=columns_repo)

In [18]:
pydf = pydf.drop_duplicates().reset_index(drop=True)

In [20]:
print(pydf.shape)
pydf.head(13590)

(13590, 1)


Unnamed: 0,repo
0,soimort/you-get
1,apache/airflow
2,pytorch/vision
3,asciimoo/searx
4,tensorflow/probability
...,...
13585,praekelt/python-ambient
13586,zenreach/py-era
13587,TakesxiSximada/custom_settings
13588,openpermissions/bass


In [17]:

pydf.to_pickle("repos.pkl")

(13590, 1)


## 2. Collect diff and commits

In [14]:
!mkdir repos
!pip install pydriller
!pip install pandas

mkdir: cannot create directory ‘repos’: File exists
Collecting nltk
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
     |████████████████████████████████| 1.5 MB 5.5 MB/s            
[?25hCollecting joblib
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
     |████████████████████████████████| 306 kB 11.0 MB/s            
[?25hCollecting click
  Downloading click-8.0.3-py3-none-any.whl (97 kB)
     |████████████████████████████████| 97 kB 5.9 MB/s             
[?25hCollecting tqdm
  Downloading tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
     |████████████████████████████████| 76 kB 3.4 MB/s             
[?25hCollecting regex>=2021.8.3
  Downloading regex-2021.11.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (763 kB)
     |████████████████████████████████| 763 kB 11.4 MB/s            
[?25hInstalling collected packages: tqdm, regex, joblib, click, nltk
Successfully installed click-8.0.3 joblib-1.1.0 nltk-3.6.5 regex-2021.11.10 tqdm-4.62.3


In [1]:
from pydriller import *
import pandas as pd

In [2]:
repodf = pd.read_pickle("repos.pkl")
print(repodf.shape)

(13590, 1)


In [3]:
def parse_repo_commits(repo_name, commit_limit=50):
    data = []
    verbs = set(["add", "fix", "use", "update", "remove", "make",
                 "change", "move", "allow", "improve", "implement", "create", "upgrade"])
    commit_count = 0
    for commit in Repository(
        f"https://github.com/{repo_name}",
        only_modifications_with_file_types=[".py"],
        only_no_merge=True,
        order='reverse'
    ).traverse_commits():
        if (commit_count >= commit_limit): break
        commit_line = commit.msg.split('\n', 1)[0].lower()
        if commit_line[-1] == '.':
            commit_line = commit_line[:-1]
        if not all(x.isalpha() or x.isspace() for x in commit_line):
            continue
        '''
        if ('#' in commit_line or '@' in commit_line): # ignore issue and mention
            continue
        if (not commit_line.isascii()):                # ignore non-english
            continue
        '''
        tokens = commit_line.split()
        if (len(tokens) <= 2):
            continue
        if (not tokens[0] in verbs):
            continue
        
        file_failed = False
        
        if (len(commit.modified_files) > 2):
            continue
        
        for mf in commit.modified_files:
            if (not mf.filename.endswith(".py")):
                file_failed = True
                break
        
        if (file_failed):
            continue
        diff = ''.join(map(lambda x: x.diff, commit.modified_files))
        
        data.append([repo_name, commit.hash, commit_line, diff])
        commit_count += 1
    
    return pd.DataFrame(data, columns=["repo", "hash", "commit_messsage", "diff"])

In [4]:
import time
start = time.time()
df = parse_repo_commits("soimort/you-get")
end = time.time()
print(end - start)
df.head(3)

6.283020257949829


Unnamed: 0,repo,hash,commit_messsage,diff
0,soimort/you-get,d28a2abe07fe5e2ce452540c0c3bc7f566b2828c,fix incorrect range response issue,"@@ -161,7 +161,10 @@ def url_save(url, filepat..."
1,soimort/you-get,ff7ef4c09aec7af10d3317730fb4e0e140a4c4d0,fix blank line in terminal output,"@@ -278,6 +278,7 @@ def download_urls(urls, ti..."
2,soimort/you-get,b138f9399a4c72812f953623d0342b0c6cc6ee57,fix assert error of file extension,"@@ -17,7 +17,7 @@ def w56_download_by_id(id, t..."


In [4]:
import multiprocessing

data = []

def f(repo):
    df = parse_repo_commits(repo)
    df.to_pickle(f"./repos/{repo.replace('/', '+')}.pkl")
    print(repo, "Done")
    return df

pool = multiprocessing.Pool()
outputs = pool.map(f, repodf['repo'][:3])
pd.concat(outputs).to_pickle("data.pkl")

soimort/you-get Done
pytorch/vision Done
apache/airflow Done


In [6]:
df = pd.read_pickle("data.pkl")
df.head(3)

Unnamed: 0,repo,hash,commit_messsage,diff
0,soimort/you-get,439354e730d8b864de9401536c93220467ccb355,add hdr support for bilibili,"@@ -12,6 +12,8 @@ class Bilibili(VideoExtracto..."
1,soimort/you-get,4a9d2c1e13b8918deba39af515d315b60e545422,add fake header,"@@ -123,10 +123,10 @@ def netease_song_downloa..."
2,soimort/you-get,1b1f1dd1181bb15dabd04f928842891ac635f49c,update regex to match vid for xinpianchang,"@@ -20,7 +20,7 @@ class Xinpianchang(VideoExtr..."
