In [7]:
import pandas as pd
import numpy as np   
 

# Get the top 30 unique repo pairs for each model (d00 and d02)

In [2]:
def sort_repo12(row):
    """Attach an order-independent key for the pair of repos in ``row``.

    Both URLs are stripped of surrounding whitespace, put in ascending
    order and joined with ``'-'``; the result is stored under ``'repo12'``
    so that (a, b) and (b, a) get the same key.

    The row is modified in place and also returned, which makes the
    function usable with ``DataFrame.apply(..., axis=1)``.
    """
    first, second = sorted((row['repo_url1'].strip(), row['repo_url2'].strip()))
    row['repo12'] = f'{first}-{second}'
    return row
    

def clean_and_get_top_pairs(path, top):
    """Load scored repo pairs from a CSV and keep the best-scoring unique pairs.

    The rows are sorted by ``score`` (highest first) and then filtered; the
    remaining row count is printed after each step:

    1. a ``repo12`` key is added: the two whitespace-stripped URLs in
       ascending order joined with ``'-'``, so (a, b) and (b, a) share a key;
    2. pairs of a repo with itself are dropped (compared on the stripped
       URLs, consistently with how ``repo12`` is built);
    3. duplicates are dropped on ``repo12``, then on ``repo_url2``, then on
       ``repo_url1``, always keeping the first (highest-scoring) row.

    Parameters
    ----------
    path : str
        CSV file with at least ``repo_url1``, ``repo_url2`` and ``score``.
    top : int or None
        Keep only the first ``top`` rows of the cleaned frame; ``None``
        keeps all of them.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added ``repo12`` column, ``is_random`` set to
        ``False``, and without the ``'Unnamed: 0'`` column if it was present.
    """
    df = pd.read_csv(path).sort_values(by=['score'], ascending=False)
    print('original size: ', len(df))

    # Vectorised equivalent of sorting the two stripped URLs of every row.
    url_a = df['repo_url1'].str.strip()
    url_b = df['repo_url2'].str.strip()
    a_first = url_a <= url_b
    df['repo12'] = url_a.where(a_first, url_b) + '-' + url_b.where(a_first, url_a)

    df = df[url_a != url_b]
    print('after dropping pairs with same repos ', len(df))

    df = df.drop_duplicates(['repo12'], keep='first')
    print('dropping duplicate pairs: ', len(df))

    df = df.drop_duplicates(['repo_url2'], keep='first')
    print('after dropping duplicate url2 ', len(df))

    df = df.drop_duplicates(['repo_url1'], keep='first')
    print('after dropping duplicate url1 ', len(df))

    # The column only exists when the CSV was written with its index;
    # tolerate files that do not have it.
    df = df.drop(['Unnamed: 0'], axis=1, errors='ignore')

    result_df = (df if top is None else df[:top]).copy()
    result_df['is_random'] = False
    return result_df

def get_top_generated_data(top=30):
    """Return the top pairs that each algorithm proposed on its own.

    Both result files (d02 first, then d00) are cleaned with
    ``clean_and_get_top_pairs`` and tagged in an ``algorithm`` column.
    Pairs whose ``repo12`` key occurs in both cleaned frames are removed
    entirely (``keep=False``), so the two returned frames are disjoint.

    Parameters
    ----------
    top : int, default 30
        Maximum number of rows returned per algorithm.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(d00, d02)`` - independent copies without the ``repo12`` column.
    """
    lda_rwr_d02 = clean_and_get_top_pairs('../../data/evaluation/manual/lda_rwr_d02_top6_per_repo.csv', top=None)
    lda_rwr_d02['algorithm'] = 'd02'

    lda_rwr_d00 = clean_and_get_top_pairs('../../data/evaluation/manual/lda_rwr_d00_top6_per_repo.csv', top=None)
    lda_rwr_d00['algorithm'] = 'd00'

    generated_data = (
        pd.concat([lda_rwr_d00, lda_rwr_d02])
        .drop_duplicates(['repo12'], keep=False)
        .drop(['repo12'], axis=1)
    )

    # Copies, so callers can add columns without chained-assignment warnings.
    d00 = generated_data[generated_data['algorithm'] == 'd00'][:top].copy()
    d02 = generated_data[generated_data['algorithm'] == 'd02'][:top].copy()
    return d00, d02

In [8]:
# Clean both result files (no row limit) and save the cleaned frames next to the inputs.
# NOTE(review): to_csv is called without index=False, so the frame index is written as an
# extra unnamed column - it appears to be the "Unnamed: 0" column seen when the file is read back.
clean_lda_rwr_d02 = clean_and_get_top_pairs('../../data/evaluation/manual/lda_rwr_d02_top6_per_repo.csv', top=None)
clean_lda_rwr_d00 = clean_and_get_top_pairs('../../data/evaluation/manual/lda_rwr_d00_top6_per_repo.csv', top=None)
clean_lda_rwr_d02.to_csv('../../data/evaluation/manual/cleaned_lda_rwr_d02_top6_per_repo.csv')
clean_lda_rwr_d00.to_csv('../../data/evaluation/manual/cleaned_lda_rwr_d00_top6_per_repo.csv')

original size:  79584
after dropping pairs with same repos  79218
dropping duplicate pairs:  78717
after dropping duplicate url2  322
after dropping duplicate url1  219
original size:  79584
after dropping pairs with same repos  78966
dropping duplicate pairs:  78301
after dropping duplicate url2  707
after dropping duplicate url1  568


In [13]:
# Read the cleaned d02 file back to inspect what was written.
pd.read_csv('../../data/evaluation/manual/cleaned_lda_rwr_d02_top6_per_repo.csv')

Unnamed: 0.1,Unnamed: 0,repo_url1,repo_description1,repo_url2,repo_description2,score,repo12,is_random
0,2,https://api.github.com/repos/tmrowco/electrici...,A real-time visualisation of the CO2 emissions...,https://api.github.com/repos/limhenry/earthview,Earth View is a collection of the most beautif...,0.003000,https://api.github.com/repos/limhenry/earthvie...,False
1,12,https://api.github.com/repos/googlei18n/corpus...,Crawler for linguistic corpora,https://api.github.com/repos/tmrowco/electrici...,A real-time visualisation of the CO2 emissions...,0.001884,https://api.github.com/repos/googlei18n/corpus...,False
2,19,https://api.github.com/repos/chaimleib/interva...,"A mutable, self-balancing interval tree. Queri...",https://api.github.com/repos/vineetjohn/daily-...,Solutions to problems sent by dailycodingprobl...,0.001786,https://api.github.com/repos/chaimleib/interva...,False
3,21,https://api.github.com/repos/mikeorr/Unipath,An object-oriented approach to Python file/dir...,https://api.github.com/repos/abarker/pdfCropMa...,pdfCropMargins -- a program to crop the margin...,0.001757,https://api.github.com/repos/abarker/pdfCropMa...,False
4,23,https://api.github.com/repos/crazyhottommy/ChI...,ChIP-seq analysis notes from Tommy Tang,https://api.github.com/repos/kcakdemir/HiCPlotter,,0.001752,https://api.github.com/repos/crazyhottommy/ChI...,False
...,...,...,...,...,...,...,...,...
214,74582,https://api.github.com/repos/coto/gae-boilerplate,Google App Engine Boilerplate,https://api.github.com/repos/googlemaps/google...,Python client library for Google Maps API Web ...,0.000810,https://api.github.com/repos/coto/gae-boilerpl...,False
215,75930,https://api.github.com/repos/pwndbg/pwndbg,Makes debugging suck less,https://api.github.com/repos/CleanCut/green,"Green is a clean, colorful, fast python test r...",0.000800,https://api.github.com/repos/CleanCut/green-ht...,False
216,77433,https://api.github.com/repos/potatolondon/djangae,The best way to run Django on Google App Engine,https://api.github.com/repos/GoogleCloudPlatfo...,Data pipeline is a tool to run Data loading pi...,0.000796,https://api.github.com/repos/GoogleCloudPlatfo...,False
217,77808,https://api.github.com/repos/ankane/s3tk,A security toolkit for Amazon S3,https://api.github.com/repos/ets-labs/python-d...,Python dependency injection framework,0.000795,https://api.github.com/repos/ankane/s3tk-https...,False


In [3]:
# Up to 30 pairs per algorithm; pairs proposed by both algorithms are excluded inside the function.
eval_data_00, eval_data_02 = get_top_generated_data(top=30)

original size:  79584
after dropping pairs with same repos  79218
dropping duplicate pairs:  78717
after dropping duplicate url2  322
after dropping duplicate url1  219
original size:  79584
after dropping pairs with same repos  78966
dropping duplicate pairs:  78301
after dropping duplicate url2  707
after dropping duplicate url1  568


# Generate random repo pairs that neither model proposed

In [5]:
# Repo table; later cells read its 'url', 'description' and 'name' columns.
all_repos = pd.read_csv('../../data/processed/final_repo_english_whatwhy.csv')

# Raw (uncleaned) pair files of both algorithms, used to reject random pairs an algorithm already proposed.
d02 = pd.read_csv('../../data/evaluation/manual/lda_rwr_d02_top6_per_repo.csv')
d00 = pd.read_csv('../../data/evaluation/manual/lda_rwr_d00_top6_per_repo.csv')

In [6]:
# Peek at the raw d02 pairs.
d02.head()

Unnamed: 0.1,Unnamed: 0,repo_url1,repo_description1,repo_url2,repo_description2,score
0,0,https://api.github.com/repos/limhenry/earthview,Earth View is a collection of the most beautif...,https://api.github.com/repos/limhenry/earthview,Earth View is a collection of the most beautif...,0.004361
1,1,https://api.github.com/repos/madhavanmalolan/a...,Awesome React Native UI components updated daily,https://api.github.com/repos/madhavanmalolan/a...,Awesome React Native UI components updated daily,0.00336
2,2,https://api.github.com/repos/tmrowco/electrici...,A real-time visualisation of the CO2 emissions...,https://api.github.com/repos/limhenry/earthview,Earth View is a collection of the most beautif...,0.003
3,3,https://api.github.com/repos/limhenry/earthview,Earth View is a collection of the most beautif...,https://api.github.com/repos/tmrowco/electrici...,A real-time visualisation of the CO2 emissions...,0.002833
4,4,https://api.github.com/repos/googlei18n/corpus...,Crawler for linguistic corpora,https://api.github.com/repos/limhenry/earthview,Earth View is a collection of the most beautif...,0.002731


In [6]:
# Number of repos; used as the exclusive upper bound for the random row positions drawn below.
num = len(all_repos)

In [7]:
def generate_random_pair_indeces(n_pairs=35, n_repos=None, seed=None):
    """Draw random pairs of row positions into the repo table.

    Parameters
    ----------
    n_pairs : int, default 35
        Number of pairs to draw.
    n_repos : int or None
        Exclusive upper bound for every drawn position. ``None`` falls back
        to the module-level ``num``.
    seed : int or None
        When given, a private ``numpy.random.RandomState(seed)`` is used so
        the draw is reproducible. ``None`` uses NumPy's global random state.

    Returns
    -------
    list of list
        ``n_pairs`` two-element lists ``[i, j]`` with ``0 <= i, j < n_repos``.
        ``i`` may equal ``j``; nothing here filters such pairs out.
    """
    if n_repos is None:
        n_repos = num
    rng = np.random if seed is None else np.random.RandomState(seed)

    all_pairs_indeces = []
    for _ in range(n_pairs):
        indeces = rng.randint(n_repos, size=(1, 2))
        all_pairs_indeces.append(list(indeces[0]))
    return all_pairs_indeces
    
    
    

In [8]:
# Drawn once and kept in a global that generate_random_pairs reads.
# NOTE(review): no seed is set in the visible cells, so a re-run yields different pairs.
all_pairs_indeces = generate_random_pair_indeces()

In [9]:
def pair_in_orig(url1, url2, algo_pairs):
    """Tell whether the pair (url1, url2) occurs in ``algo_pairs`` in either order.

    ``algo_pairs`` must have ``repo_url1`` and ``repo_url2`` columns.
    Returns a plain ``bool``.
    """
    left = algo_pairs['repo_url1']
    right = algo_pairs['repo_url2']
    same_order = (left == url1) & (right == url2)
    swapped = (right == url1) & (left == url2)
    return bool((same_order | swapped).any())


def generate_random_pairs(top=None):
    """Turn the pre-drawn index pairs into a frame of random repo pairs.

    Reads the module-level ``all_pairs_indeces``, ``all_repos``, ``d00`` and
    ``d02``. For every index pair:

    * equal indices are skipped (``'duplicate'`` is printed);
    * pairs already present in ``d00`` or ``d02`` (either order) are skipped
      and counted;
    * otherwise a record with both URLs, both descriptions and a score of
      0.0 is collected.

    Parameters
    ----------
    top : int or None
        Keep only the first ``top`` collected pairs; ``None`` keeps all.

    Returns
    -------
    (pandas.DataFrame, int)
        The random pairs (with ``is_random=True`` and
        ``algorithm='random'``) and the number of rejected pairs that an
        algorithm had already proposed.
    """
    records = []
    already_proposed = 0

    for pair in all_pairs_indeces:
        idx_a, idx_b = pair[0], pair[1]
        if idx_a == idx_b:
            print('duplicate')
            continue

        repo_a = all_repos.iloc[idx_a]
        repo_b = all_repos.iloc[idx_b]
        url_a = repo_a['url']
        url_b = repo_b['url']

        if pair_in_orig(url_a, url_b, d00) or pair_in_orig(url_a, url_b, d02):
            already_proposed += 1
            continue

        records.append({
            'repo_url1': url_a,
            'repo_description1': repo_a['description'],
            'repo_url2': url_b,
            'repo_description2': repo_b['description'],
            'score': 0.0,
        })

    frame = pd.DataFrame(records)
    frame['is_random'] = True
    frame['algorithm'] = 'random'

    if top is None:
        return frame, already_proposed
    return frame[:top], already_proposed
    
    
    
    
    
    

In [10]:
# At most 30 random pairs; `exists` counts drawn pairs rejected because an algorithm already proposed them.
random_df, exists = generate_random_pairs(top=30)

In [11]:
# NOTE(review): redundant - generate_random_pairs already sets 'algorithm' to 'random'.
random_df['algorithm'] = 'random'


# Combine random and top


In [13]:
# Stack both algorithms' pairs and the random pairs, then rebuild the order-independent
# 'repo12' key to count distinct pairs across all three sources.
manual_eval_data = pd.concat([eval_data_00, eval_data_02, random_df])
# Not the last expression of the cell, so this result is not displayed.
manual_eval_data['is_random'].value_counts()
manual_eval_data = manual_eval_data.apply(sort_repo12, axis=1)
len(manual_eval_data['repo12'].unique())

90

In [14]:
## batches

# Three batches, each taking 10 rows from every source (d00, d02, random).
# NOTE(review): batch3 takes the LAST 10 rows ([-10:]) rather than [20:30]; the two only
# coincide when a source has exactly 30 rows - with fewer rows batch3 would overlap batch2.
batch1 = pd.concat([eval_data_00[:10], eval_data_02[:10], random_df[:10]])
batch2 = pd.concat([eval_data_00[10:20], eval_data_02[10:20], random_df[10:20]])
batch3 = pd.concat([eval_data_00[-10:], eval_data_02[-10:], random_df[-10:]])


In [15]:
# Sanity check of the batch sizes; shuffle_data below assigns 30 ids per batch by default.
print(len(batch1))
print(len(batch2))
print(len(batch3))

30
30
30


In [16]:
from sklearn.utils import shuffle

def _start(batch_num, batch_size=30):
    return (batch_size*(batch_num-1))+1

def _end(batch_num, batch_size=30):
    return _start(batch_num, batch_size) + batch_size 

def shuffle_data(df, batch, random_state=None):
    """Return a shuffled copy of ``df`` indexed by consecutive batch ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one batch. The input itself is not modified.
    batch : int
        1-based batch number; the ids are ``range(_start(batch), _end(batch))``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to make the row
        order reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The shuffled rows with an ``id`` index.

    Notes
    -----
    ``_start``/``_end`` are called with their default batch size, so the id
    range has 30 entries; assigning it to a frame of a different length is
    expected to fail with a length-mismatch error from pandas.
    """
    shuffled_df = shuffle(df.copy(), random_state=random_state)
    shuffled_df['id'] = range(_start(batch), _end(batch))
    return shuffled_df.set_index('id')

# Shuffle each batch and give it its own id range (1-30, 31-60, 61-90 with the default batch size).
# NOTE(review): the names are rebound, so re-running this cell reshuffles already shuffled batches.
batch1= shuffle_data(batch1, 1)
batch2= shuffle_data(batch2, 2)
batch3= shuffle_data(batch3,3)


In [17]:
def _replace_empty_desc(row):
    """Fall back to the repo name when a description is blank.

    For each side of the pair, a description that is empty after stripping
    is replaced by the ``name`` of the first ``all_repos`` row whose ``url``
    equals that side's URL. Reads the module-level ``all_repos``; the row is
    modified in place and returned.
    """
    column_pairs = (
        ('repo_description1', 'repo_url1'),
        ('repo_description2', 'repo_url2'),
    )
    for desc_col, url_col in column_pairs:
        if row[desc_col].strip() != '':
            continue
        matching_repos = all_repos[all_repos['url'] == row[url_col]]
        row[desc_col] = matching_repos['name'].values[0]
    return row


def replace_empty_desc(df):
    """Return a copy of ``df`` in which missing or blank descriptions are replaced.

    Missing values in ``repo_description1`` / ``repo_description2`` are
    first turned into empty strings, then every row is passed through
    ``_replace_empty_desc``, which substitutes the repo name for a blank
    description.

    The fill is done on the frame as a whole and returns a new frame:
    ``df[col].fillna(..., inplace=True)`` is chained in-place assignment,
    which does not reach ``df`` under pandas Copy-on-Write. The caller's
    frame is left untouched; use the return value.
    """
    filled = df.fillna({'repo_description1': '', 'repo_description2': ''})
    return filled.apply(_replace_empty_desc, axis=1)

# Make sure every pair in every batch has a non-blank description on both sides.
batch1= replace_empty_desc(batch1)
batch2= replace_empty_desc(batch2)
batch3= replace_empty_desc(batch3)    

In [36]:
# Tag every row with its batch number so the batches stay distinguishable once concatenated.
batch1['batch'] = 1
batch2['batch'] = 2
batch3['batch'] = 3

## Save batches 

In [37]:
# reset_index() turns the 'id' index into a regular column, so index=False does not lose it.
batch1.reset_index().copy().to_csv('../../data/evaluation/manual/batch1.csv', index=False)
batch2.reset_index().copy().to_csv('../../data/evaluation/manual/batch2.csv', index=False)
batch3.reset_index().copy().to_csv('../../data/evaluation/manual/batch3.csv', index=False)

In [38]:
# All three batches in one frame (still indexed by 'id'), saved as a single file.
final = pd.concat([batch1, batch2, batch3])
final.reset_index().copy().to_csv('../../data/evaluation/manual/manual_eval_3batches.csv', index=False)

## Copy readme files 

In [32]:
# save readme files under 1 folder
from shutil import copyfile, copy

def copy_readme(url):
    """Copy the stored readme of the repo at ``url`` into the annotation app.

    The source file name is built from the last two ``/``-separated parts
    of the URL joined with a dot, plus ``.md``.
    """
    last_two_parts = url.split('/')[-2:]
    readme_stem = '.'.join(last_two_parts)
    src = '../../data/readme_files/{}.md'.format(readme_stem)
    copy(src, '../../annotation-app/data/readme_files/')
    
def copy_readme_files(batches):
    """Copy the readme of both repos of every row in ``batches``.

    ``batches`` is a single DataFrame with ``repo_url1`` and ``repo_url2``
    columns. A repo that appears in several rows is copied once per
    occurrence.
    """
    for first_url, second_url in zip(batches['repo_url1'], batches['repo_url2']):
        copy_readme(first_url)
        copy_readme(second_url)

        

        

In [33]:
# Copy the readmes of every repo that appears in any of the three batches.
copy_readme_files(final)

In [30]:
# Unique first-side URLs followed by unique second-side URLs; a repo that occurs on both
# sides is still listed twice here.
u1 = list(set(list(final['repo_url1'].values)))
u1.extend(list(set(list(final['repo_url2'].values))))

In [31]:
# Number of distinct repos across both sides of all pairs.
len(list(set(u1)))

143

In [35]:
# Total number of pairs across the three batches.
len(final)

90