In [8]:
!pip install dgl-cu116 dglgo -f https://data.dgl.ai/wheels/repo.html
!pip install iteration-utilities==0.11.0
!pip install sentencepiece==0.1.98
!pip install tokenizers==0.13.3
!pip install transformers==4.28.1
!pip install sentence-transformers==2.2.2
!pip install scipy==1.8.0 
!pip install networkx==2.6
!pip install pingouin==0.5.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/repo.html
Collecting dgl-cu116
  Downloading https://data.dgl.ai/wheels/dgl_cu116-0.9.1.post1-cp310-cp310-manylinux1_x86_64.whl (246.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.3/246.3 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml>=0.17.20
  Downloading ruamel.yaml-0.17.22-py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpydoc>=1.1.0
  Downloading numpydoc-1.5.0-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.4/52.4 kB[0m [31m7.0 MB/s[0m eta

In this work, we use google drive as file system. Please connect to your own Google Drive.

In [9]:
from google.colab import drive
!mkdir drive
drive.mount('drive')

Mounted at drive


In [10]:
import pandas as pd
import dgl
from dgl.data import DGLDataset
import torch
import os
from os import listdir
from os.path import isfile, join
import glob
import subprocess
import csv
import numpy as np
import pickle
from iteration_utilities import unique_everseen
import itertools
import sentencepiece as spm
import transformers
import tokenizers
from transformers import PreTrainedTokenizerFast,PreTrainedTokenizer
from tokenizers import SentencePieceBPETokenizer
import scipy.sparse as sp
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import networkx as nx
import statistics
import random
import math
import warnings
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

# set your file url here 
url = ''
base_url = './drive/MyDrive/' + url 
seed = 2
k = 5

Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


DGL backend not selected or invalid.  Assuming PyTorch for now.


We concentrate on two installations: Chromium and Qt

In [3]:
projects = ["chromium",'qt']

Generate GNN graph

In [5]:
def generateGraphGNN(project):
    print(project)
    # Load pre-trained sentencepiece model
    pre_tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'{base_url}/tokenizer/tokenizer.model', model_max_length=512, pad_token ="<pad>")
    train_dir_path = f'{base_url}/dataset/{project}/train/'

    files = [f for f in listdir(train_dir_path) if isfile(join(train_dir_path, f))]

    for file in files:
        project_name = '-'.join(file.split('-')[:-1])
        with open(f'{base_url}/dataset/{project}/train/{file}', 'rb') as f:
            content = pickle.load(f)
            edges_src = []
            edges_dst = []
            node_code = []
            node_label = []
            n_nodes = 0

            for review_item in content:
                review_function = []
                for function in review_item:
                    single_function_name = function['function_name']
                    single_function_code = function['code']
                    single_file_name = "-".join(function['file'].split("-")[1:-1])
                    unique_function_string = f"{single_function_name} {single_function_code} {single_file_name}"
                    function_label = function['label']
                    function_idx = function['function_idx']
                    if function_idx not in review_function:
                        review_function.append(function_idx)

                    
                    if unique_function_string not in node_code:
                        node_code.append(unique_function_string)
                        node_label.append(function_label)
                        if function_idx >= n_nodes:
                            n_nodes = function_idx + 1
                function_perm = list(itertools.combinations(review_function, 2))
                edges_src += [e[0] for e in function_perm]
                edges_dst += [e[1] for e in function_perm]
            
            # convert the textual information of code functions into vectors
            node_feature_tensor = pre_tokenizer(node_code,max_length=512,padding=True, truncation=True, return_tensors='pt')["input_ids"]
            node_label_tensor = torch.tensor(node_label)
            edge_src_tensor = torch.tensor(edges_src)
            edge_dst_tensor = torch.tensor(edges_dst)
            graph = dgl.graph((edges_src, edges_dst), num_nodes=n_nodes)
            graph.ndata['feat'] = node_feature_tensor
            graph.ndata['label'] = node_label_tensor
        with open(f"{base_url}/gnn-graph/{project}/{file}-graph.pkl", 'wb') as f:
            pickle.dump(graph, f)

In [28]:
for project in projects:
    generateGraphGNN(project)

chromium
qt


GraphSage configuration

In [7]:
import dgl.function as fn
from dgl.nn import SAGEConv
import random
seed = 2
def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    dgl.seed(seed)
    dgl.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
device = 'cpu'

class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
        
class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

Generate node embeddings for the knowledge graph using GraphSage model

In [30]:
def generateEmbedding(project):
    print(project)
    seed = 2
    test_dir_path = f'{base_url}/gnn-graph/{project}/'
    files = [f for f in listdir(test_dir_path) if isfile(join(test_dir_path, f))]
    setup_seed(seed)
    error = []
    for file in files:
        project_name = '-'.join(file.split('-')[:-2])
        print(file)
        with open(f'{base_url}/gnn-graph/{project}/{file}', 'rb') as f:
            g = pickle.load(f)
            g.to(device)
            u,v = g.edges()
            eids = np.arange(g.number_of_edges())
            np.random.seed(seed)
            eids = np.random.permutation(eids)
            valid_size = int(len(eids) * 0.1)
            train_size = g.number_of_edges() - valid_size
            test_pos_u, test_pos_v = u[eids[:valid_size]], v[eids[:valid_size]]
            train_pos_u, train_pos_v = u[eids[valid_size:]], v[eids[valid_size:]]
            adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(g.number_of_nodes(),g.number_of_nodes()))
            adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
            neg_u, neg_v = np.where(adj_neg != 0)
            neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
            test_neg_u, test_neg_v = neg_u[neg_eids[:valid_size]], neg_v[neg_eids[:valid_size]]
            train_neg_u, train_neg_v = neg_u[neg_eids[valid_size:]], neg_v[neg_eids[valid_size:]]
            train_g = dgl.remove_edges(g, eids[:valid_size])
            train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
            train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

            test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
            test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
            model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
            model = model.to(device)
            pred = DotPredictor()
            
            optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)
            
            train_g = train_g.to(device)
            train_pos_g = train_pos_g.to(device)
            train_neg_g = train_neg_g.to(device)
            test_pos_g = test_pos_g.to(device)
            test_neg_g = test_neg_g.to(device)

            all_logits = []
            min_auc = 0

            early_stop = 0
            best_h = h = model(train_g, train_g.ndata['feat'].float())

            for e in range(100):
                h = model(train_g, train_g.ndata['feat'].float())
                pos_score = pred(train_pos_g, h)
                neg_score = pred(train_neg_g, h)
                loss = compute_loss(pos_score, neg_score)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                with torch.no_grad():
                    pos_score = pred(test_pos_g, h)
                    neg_score = pred(test_neg_g, h)
                    auc = compute_auc(pos_score, neg_score)
                    if auc > min_auc:
                        min_auc = auc
                        best_h = h
                        early_stop = 0
                    else:
                        early_stop += 1

                if early_stop == 5:
                    break
        with open(f'{base_url}/node_embedding/{project}/{project_name}-embedding.pkl', 'wb') as f:
            pickle.dump(best_h, f)

Please use premium GPU for training, otherwise it may occur "out of memory error"

In [31]:
for project in projects:
    generateEmbedding(project)

chromium
chromium-angle_2Fangle-train.pkl-graph.pkl
chromium-aosp_2Fplatform_2Fsystem_2Fupdate_engine-train.pkl-graph.pkl
chromium-chromiumos_2Fplatform_2Fec-train.pkl-graph.pkl
chromium-chromiumos_2Fplatform2-train.pkl-graph.pkl
chromium-chromiumos_2Fthird_party_2Fflashrom-train.pkl-graph.pkl
chromium-v8_2Fv8-train.pkl-graph.pkl
qt
qt-qt_2Fqtwayland-train.pkl-graph.pkl
qt-qt_2Fqtwebengine-train.pkl-graph.pkl
qt-qbs_2Fqbs-train.pkl-graph.pkl
qt-qt_2Fqtquick3d-train.pkl-graph.pkl
qt-qt3dstudio_2Fqt3d-runtime-train.pkl-graph.pkl
qt-qt_2Fqtbase-train.pkl-graph.pkl
qt-installer-framework_2Finstaller-framework-train.pkl-graph.pkl
qt-qt_2Fqtquickcontrols2-train.pkl-graph.pkl
qt-qt3dstudio_2Fqt3dstudio-train.pkl-graph.pkl
qt-qt_2Fqt3d-train.pkl-graph.pkl
qt-qt-creator_2Fqt-creator-train.pkl-graph.pkl
qt-qt_2Fqtdeclarative-train.pkl-graph.pkl


Evaluation

In [1]:
# generate the results using CoChangeFinder
def getResult(project):
    dir_path = f'{base_url}/node_embedding/{project}/'
    projects = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] 
    graph_result = {}
    for subproject in projects:
        project_name = '-'.join(subproject.split('-')[:-1])

        with open(f"{base_url}/node_embedding/{project}/{project_name}-embedding.pkl", 'rb') as nf:
            best_h = pickle.load(nf)
            best_h = F.normalize(best_h,p=2, dim=1)
        with open(f'{base_url}/gnn-graph/{project}/{project_name}-train.pkl-graph.pkl', 'rb') as f:
            g = pickle.load(f)
        with open(f"{base_url}/dataset/{project}/test/{project_name}-test.pkl", 'rb') as f:
            test_data = pickle.load(f)
            project_score = []
            h_feature = best_h.size()[1]
            co_change_embedding = torch.zeros(1, h_feature)
            result = []
            for data in test_data:
                co_changed_data = [element['function_idx'] for element in data["co-change"]]
                target_data = [(element['function_idx'],element['label']) for element in data["to-predict"]]
                for function_idx in co_changed_data:
                    co_change_embedding = torch.add(co_change_embedding, best_h[function_idx])
                target_embedding = []
                
                target_score = []
                for target in target_data: 
                    target_index = target[0]
                    target_label = target[1]         


                    neighbors = g.predecessors(target_index).tolist()
                    intersection = list(set(co_changed_data)&set(list(neighbors)))
                    co_change_embedding = torch.zeros(1, h_feature)
                    if len(intersection) > 0:
                        score = 0   
                        target = best_h[target_index]
                        for index in intersection:
                            co_change_embedding = torch.add(co_change_embedding, best_h[index])
                        avg_co_change_embedding = co_change_embedding[0]/len(intersection)
                        dot_score = util.dot_score(avg_co_change_embedding, target)
                        target_score.append((float(dot_score), target_label))
                    else:
                        target_score.append((0, target_label))

                sorted_predictions = sorted(target_score,key=lambda tup: tup[0], reverse=True)[:5]
                result.append(sorted_predictions)
        graph_result[project_name] = result
    return graph_result

In [2]:
# generate the results using NeighborCounting approaches
def getResultNeighbor(project):
    dir_path = f'{base_url}/node_embedding/{project}/'
    projects = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] 
    project_result = {}
    for subproject in projects:
        project_name = '-'.join(subproject.split('-')[:-1])
        with open(f'{base_url}/gnn-graph/{project}/{project_name}-train.pkl-graph.pkl', 'rb') as f:
            g = pickle.load(f)
            g = dgl.to_networkx(g)
        with open(f"{base_url}/dataset/{project}/test/{project_name}-test.pkl", 'rb') as f:
            test_data = pickle.load(f)
            project_score = []
            result = []
            for data in test_data:
                co_changed_data = [element['function_idx'] for element in data["co-change"]]
                target_data = [(element['function_idx'],element['label']) for element in data["to-predict"]]
                
                target_score = []
                for target in target_data: 
                    target_index = target[0]
                    target_label = target[1]
                    if target_index in g.nodes:
                        neighbors = list(nx.neighbors(g, target_index))
                        count_score = 0
                        for neighbor in neighbors:
                            if neighbor in co_changed_data:
                                count_score+= 1
                    else:
                        count_score = 0
                    target_score.append((count_score, target_label))
                
                sorted_predictions = sorted(target_score,key=lambda tup: tup[0], reverse=True)[:5]
                result.append(sorted_predictions)
        project_result[project_name] = result
    return project_result

In [3]:
# generate the results using top-k recent functions approach
def getBaseline(project, topk):
    train_dir_path = f'{base_url}/dataset/{project}/train/'

    projects = [f for f in listdir(train_dir_path) if isfile(join(train_dir_path, f))]

    project_rec_result = {}

    for subproject in projects:
      print(f"processing {subproject}")
      sorted_base_rencent = {}

      project_recent_result = []
      project_name = '-'.join(subproject.split('-')[:-1])
      with open(f'{base_url}/dataset/{project}/train/{project_name}-train.pkl', 'rb') as f:
          train_data = pickle.load(f)
          score = 0 
          for review in list(train_data):
              for function in review:
                  label = function["label"]
                  code = function["code"]
                  if label == 1:
                      sorted_base_rencent[code] = score
              score += 1

      with open(f'{base_url}/dataset/{project}/test/{project_name}-test.pkl', 'rb') as f:
          test_data = pickle.load(f)
          topk_recent_result = []
          id = 0
          for review in test_data:
              freq_result = []
              rec_result = []
              review_code = []
              review_label = []
              temp_rec = []
              for to_predict_data in review['to-predict']:
                  test_code = to_predict_data['code']
                  test_label = to_predict_data['label']
                  sorted_base_rencent_list = list(sorted_base_rencent.keys())
                  if test_code in sorted_base_rencent_list:
                      temp_rec.append((sorted_base_rencent[test_code],test_label))
                  else:
                      temp_rec.append((0,0))
              temp_rec = sorted(temp_rec, key=lambda tup:tup[0], reverse=True)[:topk]
              topk_recent_result.append(temp_rec)
              
              id += 1
      project_rec_result[project_name] = topk_recent_result

    return project_rec_result

Top-k accuracy evaluation metric

In [4]:
def evaluateTopkAcc(result, topk):
    project_result = []
    for project in result.keys():
        eval_result = []
        for predictions in result[project]:
              predictons = predictions[:topk]
              evaluate = False
              for predict in predictons:
                  if predict[1] == 1:
                      evaluate = True
                      break
              if evaluate:
                  eval_result.append(1)
              else:
                  eval_result.append(0)
        project_result.append(sum(eval_result)/len(eval_result))
    print(statistics.median(project_result))
    return project_result

MRR evaluation metric

In [5]:
def evaluateMRR(result, topk):
    project_result = []
    for project in result.keys():
        eval_result = []
        for predictions in result[project]:
              predict_k = predictions[:topk]
              true_index = -1
              for n in range(len(predict_k)):
                  if predict_k[n][1] == 1:
                      true_index = n+1
                      break
              if true_index != -1:
                  eval_result.append(1/true_index)
              else:
                  eval_result.append(0)   
        project_result.append(sum(eval_result)/len(result[project]))
    print(statistics.median(project_result))
    return project_result

MAP evaluation metric

In [6]:
def evaluateMAP(result, topk):
    project_result = []
    for project in result.keys():
        eval_result = []
        for predictions in result[project]:
              predict_k = predictions[:topk]
              ap = 0
              count = 0
              for n in range(len(predict_k)):
                  if predict_k[n][1] == 1:
                      count += 1
                      ap += count/(n+1)
              if ap:
                  eval_result.append(ap/count)
              else:
                  eval_result.append(0)   
        project_result.append(sum(eval_result)/len(result[project]))

    print(statistics.median(project_result))
    return project_result

Results evaluation

In [11]:
from scipy.stats import ranksums
from pingouin import compute_effsize
from scipy.stats import wilcoxon

def mergeResult(project, topk, rst_g, rst_n,rst_r):
    res = []
    if project == 'chromium':
        project = 'Chromium'
    else:
        project = 'Qt'
    for data in rst_g:
        res.append({'project':project,'topk':f'Best-{topk}','approach':"CoChangeFinder", 'result':data})
    for data in rst_n:
        res.append({'project':project,'topk':f'Best-{topk}','approach':"NeighborCount", 'result':data})
    for data in rst_r:
        res.append({'project':project,'topk':f'Best-{topk}','approach':"Top-k recent", 'result':data})
    return res


projects = ["chromium","qt"]

for project in projects:
    print("eval", project)

    rst_g = getResult(project)
    rst_n = getResultNeighbor(project)
    rst_rec_ba  = getBaseline(project, k)


    print(f"\n=== topkacc ====")
    result_graph_topk = evaluateTopkAcc(rst_g, k)
    result_nb_topk = evaluateTopkAcc(rst_n, k)
    result_rec_r_topk = evaluateTopkAcc(rst_rec_ba, k)
    effect_size = compute_effsize(result_graph_topk, result_rec_r_topk, eftype='cohen')
    print("effect_size", effect_size)

    print(f"\n=== mrr ====")
    result_graph_mrr = evaluateMRR(rst_g, k)
    result_nb_mrr = evaluateMRR(rst_n, k)
    result_rec_r_mrr = evaluateMRR(rst_rec_ba, k)
    effect_size = compute_effsize(result_graph_mrr, result_rec_r_mrr, eftype='cohen')
    print("effect_size", effect_size)

    print(f"\n=== map ====")
    result_graph_map = evaluateMAP(rst_g, k)
    result_nb_map = evaluateMAP(rst_n, k)
    result_rec_r_map = evaluateMAP(rst_rec_ba, k)
    effect_size = compute_effsize(result_graph_map, result_rec_r_map, eftype='cohen')
    print("effect_size", effect_size)


eval chromium
processing chromium-angle_2Fangle-train.pkl
processing chromium-aosp_2Fplatform_2Fsystem_2Fupdate_engine-train.pkl
processing chromium-chromiumos_2Fplatform_2Fec-train.pkl
processing chromium-chromiumos_2Fplatform2-train.pkl
processing chromium-chromiumos_2Fthird_party_2Fflashrom-train.pkl
processing chromium-v8_2Fv8-train.pkl

=== topkacc ====
0.2372448979591837
0.1577708006279435
0.06018616458641845
effect_size 0.9938022849629965

=== mrr ====
0.15956632653061226
0.10828754578754579
0.055073690148790634
effect_size 1.0934771754600643

=== map ====
0.1672264739229025
0.10842490842490843
0.055073690148790634
effect_size 1.113892396062397
eval qt
processing qt-qt_2Fqtwayland-train.pkl
processing qt-qt_2Fqtwebengine-train.pkl
processing qt-qbs_2Fqbs-train.pkl
processing qt-qt_2Fqtquick3d-train.pkl
processing qt-qt3dstudio_2Fqt3d-runtime-train.pkl
processing qt-qt_2Fqtbase-train.pkl
processing qt-installer-framework_2Finstaller-framework-train.pkl
processing qt-qt_2Fqtquickc

Statistic of the studied dataset

In [39]:
# Statistic of the studied dataset
over_all_review_train = 0
over_all_function_train = 0
over_all_review_test = 0
over_all_function_test_co = 0
over_all_function_test_to = 0
over_all_project = 0
for project in projects:
    train_dir_path = f'{base_url}/dataset/{project}/train/'

    files = [f for f in listdir(train_dir_path) if isfile(join(train_dir_path, f))]

    total_review_train = 0
    total_review_test = 0
    total_function_train = 0
    total_function_test_co = 0
    total_function_test_to = 0

    
    for file in files:
        project_name = '-'.join(file.split('-')[:-1])
        with open(f'{base_url}/dataset/{project}/train/{file}', 'rb') as f:
            content = pickle.load(f)
            for review in content:
                total_review_train += 1
                for function in review:
                    total_function_train += 1
        with open(f'{base_url}/dataset/{project}/test/{project_name}-test.pkl', 'rb') as f:
            content = pickle.load(f)
            for review in content:
                total_review_test += 1
                total_function_test_co += len(review['co-change'])
                total_function_test_to += len(review['to-predict'])
    print(f"{project}, studied training:{total_review_train},studied functions:{total_function_train},studied testing:{total_review_test},changed functions:{total_function_test_co},preseved function:{total_function_test_to}, project_num:{len(files)}")
    over_all_review_train += total_review_train
    over_all_function_train += total_function_train
    over_all_review_test += total_review_test
    over_all_function_test_co += total_function_test_co
    over_all_function_test_to += total_function_test_to
    over_all_project += len(files)
print("total",over_all_review_train,over_all_function_train,over_all_review_test,over_all_function_test_co,over_all_function_test_to,over_all_project)

chromium, studied training:5729,studied functions:75728,studied testing:383,changed functions:3406,preseved function:72334, project_num:6
qt, studied training:12317,studied functions:206403,studied testing:686,changed functions:7925,preseved function:118545, project_num:12
total 18046 282131 1069 11331 190879 18


Design challenge

In [40]:
#Design challenge
import statistics
for project in projects:
    train_dir_path = f'{base_url}/dataset/{project}/train/'

    files = [f for f in listdir(train_dir_path) if isfile(join(train_dir_path, f))]

    result = []
    for file in files:
        project_name = '-'.join(file.split('-')[:-1])
        with open(f'{base_url}/dataset/{project}/train/{file}', 'rb') as f:
            train_data = pickle.load(f)
        with open(f"{base_url}/dataset/{project}/test/{project_name}-test.pkl", 'rb') as f:
            test_data = pickle.load(f)
        
        train_code_list = []
        exist_idx = []
        for review in train_data:
            for function in review:
                function_name = function['function_name']
                function_code = function['code']
                file_name = '-'.join(function['file'].split('-')[1:-1])
                unique_string = function_name + function_code + file_name
                train_code_list.append(unique_string)
        count = 0
        test_funciton = 0
        for review in test_data:
            co_changed = review['co-change']
            to_predict = review['to-predict']
            test_funciton += len(co_changed)
            test_funciton += len(to_predict)
            for function in co_changed:
                function_name = function['function_name']
                function_code = function['code']
                file_name = '-'.join(function['file'].split('-')[1:-1])
                unique_string = function_name + function_code + file_name
                if unique_string not in train_code_list:
                    count += 1
                else:
                    exist_idx.append(function['function_idx'])
            for function in to_predict:
                function_name = function['function_name']
                function_code = function['code']
                file_name = '-'.join(function['file'].split('-')[1:-1])
                unique_string = function_name + function_code + file_name
                if unique_string not in train_code_list:
                    count += 1
                else:
                    exist_idx.append(function['function_idx'])
        # print(project_name,count, test_funciton, len(exist_idx))
        result.append(count/test_funciton)
    print(f"median of percentage test data not in train dataset for {project}", statistics.median(result))

median of percentage test data not in train dataset for chromium 0.8232816056575952
median of percentage test data not in train dataset for qt 0.8223450814771656
