In [2]:
# Jupyter Notebook for GitHub issue data analysis
import os
import pandas as pd

from dotenv import load_dotenv
load_dotenv()
ROOT = os.environ.get("ROOT")  # remember to set your root path in `.env`; refer to installation.

# English repo - train sets
df_tensorflow = pd.read_json(f'{ROOT}/data/eng_labelled/raw/tensorflow.json')
df_rust = pd.read_json(f'{ROOT}/data/eng_labelled/raw/rust.json')
df_kubernetes = pd.read_json(f'{ROOT}/data/eng_labelled/raw/kubernetes.json')

# English repo - test sets
df_flutter = pd.read_json(f'{ROOT}/data/eng_labelled/raw/flutter.json')
df_ohmyzsh = pd.read_json(f'{ROOT}/data/eng_labelled/raw/ohmyzsh.json')
df_electron = pd.read_json(f'{ROOT}/data/eng_labelled/raw/electron.json')

num_row_tensorflow = len(df_tensorflow.index)
num_row_rust = len(df_rust.index)
num_row_kubernetes = len(df_kubernetes.index)
num_row_flutter = len(df_flutter.index)
num_row_ohmyzsh = len(df_ohmyzsh.index)
num_row_electron = len(df_electron.index)

print('Overall')
print('-------')
print('Total number of tensorflow issues:' + str(num_row_tensorflow))
print('Total number of rust issues:' + str(num_row_rust))
print('Total number of kubernetes issues:' + str(num_row_kubernetes))
print('Total number of flutter issues:' + str(num_row_flutter))
print('Total number of ohmyzsh issues:' + str(num_row_ohmyzsh))
print('Total number of electron issues:' + str(num_row_electron))

Overall
-------
Total number of tensorflow issues:19929
Total number of rust issues:19115
Total number of kubernetes issues:19836
Total number of flutter issues:13207
Total number of ohmyzsh issues:1608
Total number of electron issues:5433


In [19]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string

stop = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

In [21]:
feature_data = []
doc_data = []
bug_data = []

for _, row in df_kubernetes.iterrows():
    # feature
    if row['labels'] == 'kind/feature' or row['labels'] == 'kind/api-change': 
        feature_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        feature_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    # bug
    if row['labels'] == 'kind/bug' or row['labels'] == 'kind/failing-test': 
        bug_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        bug_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    # doc
    if row['labels'] == 'kind/documentation': 
        doc_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        doc_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])

for _, row in df_rust.iterrows():
    # feature
    if row['labels'] == 'C-feature-request' or row['labels'] == 'C-feature-accepted' or row['labels'] == 'C-enhancement': 
        feature_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        feature_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    # bug
    if row['labels'] == 'C-bug': 
        bug_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        bug_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    # doc
    if row['labels'] == 'T-doc': 
        doc_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        doc_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])

for _, row in df_tensorflow.iterrows():
    if row['labels'] == 'type:feature':
        feature_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        feature_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    elif row['labels'] == 'type:docs-feature' or row['labels'] == 'type:docs-bug':
        doc_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        doc_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    else:
        bug_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        bug_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
        
print("Feature word frequencies:")
feature_freq = FreqDist(feature_data)
feature_freq.most_common(50)

Feature word frequencies:


[('S', 102839),
 ('std', 30577),
 ('I', 23962),
 ('1', 20116),
 ('0', 19339),
 ('rust', 16952),
 ('Ratio', 16915),
 ('num_rational', 16900),
 ('https', 16294),
 ('error', 15932),
 ('mut', 15331),
 ('io', 13541),
 ('fn', 12151),
 ('com', 11544),
 ('type', 11210),
 ('2', 10817),
 ('iter', 10749),
 ('use', 10397),
 ('would', 10074),
 ('rs', 10064),
 ('kubernetes', 9504),
 ('github', 9172),
 ('src', 8564),
 ('x', 8488),
 ('The', 8314),
 ('version', 8283),
 ('Take', 8144),
 ('like', 8060),
 ('tensorflow', 7972),
 ('This', 7863),
 ('code', 7697),
 ('let', 7690),
 ('feature', 7464),
 ('T', 7082),
 ('3', 7050),
 ('self', 6957),
 ('tf', 6904),
 ('lang', 6448),
 ('impl', 6440),
 ('main', 6112),
 ('test', 5773),
 ('str', 5445),
 ('4', 5252),
 ('note', 5235),
 ('_', 5161),
 ('rustc', 5155),
 ('pub', 5154),
 ('using', 5117),
 ('It', 4857),
 ('foo', 4808)]

In [22]:
print("Doc word frequencies:")
doc_freq = FreqDist(doc_data)
doc_freq.most_common(50)

Doc word frequencies:


[('https', 6039),
 ('tensorflow', 5905),
 ('0', 4509),
 ('I', 4366),
 ('1', 4238),
 ('tf', 4108),
 ('org', 3080),
 ('com', 2965),
 ('docs', 2482),
 ('documentation', 2449),
 ('rust', 2393),
 ('github', 2373),
 ('issue', 2341),
 ('2', 2271),
 ('python', 2174),
 ('www', 2160),
 ('The', 2140),
 ('kubernetes', 1975),
 ('example', 1901),
 ('version', 1716),
 ('code', 1659),
 ('doc', 1609),
 ('use', 1545),
 ('lang', 1448),
 ('keras', 1274),
 ('master', 1230),
 ('issues', 1224),
 ('TensorFlow', 1211),
 ('10', 1202),
 ('3', 1162),
 ('io', 1159),
 ('x', 1143),
 ('guide', 1138),
 ('blob', 1125),
 ('py', 1120),
 ('std', 1101),
 ('defined', 1038),
 ('name', 1015),
 ('html', 975),
 ('api_docs', 950),
 ('error', 922),
 ('This', 901),
 ('link', 901),
 ('md', 897),
 ('would', 887),
 ('source', 883),
 ('node', 879),
 ('go', 869),
 ('using', 864),
 ('line', 864)]

In [23]:
print("Bug word frequencies:")
bug_freq = FreqDist(bug_data)
bug_freq.most_common(50)

Bug word frequencies:


[('0', 253466),
 ('1', 194895),
 ('tensorflow', 171648),
 ('I', 102266),
 ('2', 87713),
 ('tf', 85691),
 ('lib', 80044),
 ('version', 78021),
 ('_', 76386),
 ('go', 69370),
 ('kubernetes', 66147),
 ('3', 62546),
 ('10', 62036),
 ('src', 60323),
 ('python', 58129),
 ('py', 57376),
 ('io', 54209),
 ('https', 52507),
 ('error', 49243),
 ('com', 49031),
 ('packages', 45272),
 ('k8s', 44950),
 ('cc', 44819),
 ('line', 44224),
 ('6', 42701),
 ('std', 41638),
 ('core', 41189),
 ('rustc', 40748),
 ('7', 40310),
 ('File', 39638),
 ('self', 38992),
 ('4', 38968),
 ('model', 37947),
 ('rs', 37914),
 ('5', 37169),
 ('build', 36568),
 ('x86_64', 36135),
 ('C', 35588),
 ('name', 35288),
 ('TensorFlow', 35132),
 ('code', 34867),
 ('home', 33926),
 ('site', 33896),
 ('github', 33425),
 ('16', 32758),
 ('8', 32220),
 ('11', 31765),
 ('python3', 31586),
 ('use', 31543),
 ('local', 31523)]

In [3]:
train_mappings = {
    "tf": {
        "feature": ["type:feature"],
        "bug": ["type:bug"],
        "doc": ["type:docs-feature", "type:docs-bug"],
        "repo": df_tensorflow
    },
    "rust": {
        "feature": ["C-feature-request", "C-feature-accepted", "C-enhancement"],
        "bug": ["C-bug"],
        "doc": ["T-doc"],
        "repo": df_rust
    },
    "kubernetes": {
        "feature": ["kind/feature", "kind/api-change"],
        "bug": ["kind/bug"],
        "doc": ["kind/documentation"],
        "repo": df_kubernetes
    }
}

test_mappings = {
    "flutter": {
        "feature": ['severe: new feature'],
        "bug": ["severe: crash", "severe: fatal crash", "severe: rendering"],
        "doc": ["documentation"],
        "repo": df_flutter
    },
    "ohmyzsh": {
        "feature": ["Feature", "Enhancement"],
        "bug": ["Bug"],
        "doc": ["Type: documentation"],
        "repo": df_ohmyzsh,
    },
    "electron": {
        "feature": ["enhancement :sparkles:"],
        "bug": ["bug :beetle:", "crash :boom:"],
        "doc": ["documentation :notebook:"],
        "repo": df_electron
    }
}

In [4]:
def analysis(mappings):
    total_features = 0
    total_bugs = 0
    total_docs = 0
    for repo_label, repo in mappings.items():
        repo_features = 0
        repo_bugs = 0
        repo_docs = 0
        for _, row in repo["repo"].iterrows():
            if row['labels'] in repo["feature"]: repo_features += 1
            elif row['labels'] in repo["bug"]: repo_bugs += 1
            elif row['labels'] in repo["doc"]: repo_docs += 1
        
        print(f'{repo_label} issue analysis')
        print('-------------------------')
        print('Number of feature issues:' + str(repo_features))
        print('Number of bug issues:' + str(repo_bugs))
        print('Number of doc issues:' + str(repo_docs))
        print('Total issues:' + str(repo_features + repo_bugs + repo_docs))        
        print('-------------------------')
        
        total_features += repo_features
        total_bugs += repo_bugs
        total_docs += repo_docs
        
    total = total_features + total_bugs + total_docs
    print(f'Overall issue analysis')
    print('-------------------------')
    print('Number of feature issues:' + str(total_features))
    print('Number of bug issues:' + str(total_bugs))
    print('Number of doc issues:' + str(total_docs))
    print('Total issues:' + str(total))
    
    print('-------------------------')
    print('% of feature issues:' + str(total_features / total))
    print('% of bug issues:' + str(total_bugs / total))
    print('% of doc issues:' + str(total_docs / total))
    print('-------------------------')
        
        
analysis(train_mappings)
analysis(test_mappings)

tf issue analysis
-------------------------
Number of feature issues:2461
Number of bug issues:6190
Number of doc issues:1648
Total issues:10299
-------------------------
rust issue analysis
-------------------------
Number of feature issues:10163
Number of bug issues:7049
Number of doc issues:766
Total issues:17978
-------------------------
kubernetes issue analysis
-------------------------
Number of feature issues:4302
Number of bug issues:10422
Number of doc issues:968
Total issues:15692
-------------------------
Overall issue analysis
-------------------------
Number of feature issues:16926
Number of bug issues:23661
Number of doc issues:3382
Total issues:43969
-------------------------
% of feature issues:0.38495303509290635
% of bug issues:0.5381291364370352
% of doc issues:0.07691782847005844
-------------------------
flutter issue analysis
-------------------------
Number of feature issues:4387
Number of bug issues:4318
Number of doc issues:1102
Total issues:9807
-------------

In [None]:
# German repo
df_corona_widget = pd.read_json('./data/de_unlabelled/corona-widget.json')
df_open_wb = pd.read_json('./data/de_unlabelled/openWB.json')

num_corona_widget = len(df_corona_widget.index)
num_open_wb = len(df_open_wb)

print('German repo issue analysis')
print('--------------------------')
print('Total number of corona widget issues:' + str(num_corona_widget))
print('Total number of openWB issues:' + str(num_open_wb))

In [19]:
# French repo
df_dvf_app = pd.read_json('./data/fr_unlabelled/DVF-app.json')
df_grafikart = pd.read_json('./data/fr_unlabelled/Grafikart.fr.json')
df_azure_docs = pd.read_json('./data/fr_unlabelled/azure-docs.fr-fr.json')
df_bcdlibre = pd.read_json('./data/fr_unlabelled/bcdlibre.json')

num_row_dvf_app = len(df_dvf_app.index)
num_grafikart = len(df_grafikart.index)
num_azure_docs = len(df_azure_docs.index)
num_bcdlibre = len(df_bcdlibre.index)

print('French repo issue analysis')
print('--------------------------')
print('Total number of DVF-app issues:' + str(num_row_dvf_app))
print('Total number of Grafikart issues:' + str(num_grafikart))
print('Total number of azure docs issues:' + str(num_azure_docs))
print('Total number of bcdlibre issues:' + str(num_bcdlibre))

French repo issue analysis
--------------------------
Total number of DVF-app issues:104
Total number of Grafikart issues:313
Total number of azure docs issues:247
Total number of bcdlibre issues:36
