### 1. Extract all contributors

In [None]:
import pandas as pd

proj_hack_member = pd.read_csv('../data/hackathon_project_contributor.csv')

proj_hack_member

In [None]:
all_links = proj_hack_member['contributors'].dropna().str.split(',')

flat_links = [link.strip() for sublist in all_links for link in sublist]

unique_links = list(set(flat_links))

contributors_df = pd.DataFrame({
    'link': unique_links,
    'name': [link.split('/')[-1] for link in unique_links]
})

contributors_df.to_csv('contributors.csv', index=False)


### 2. Run RABBIT to identify GitHub users as Bot/Human/Unknown (in run_rabbit_parallel.py)

### 3. Process to keep Human users (or Human and Unknown users?)

Human: 34258
Human and Unknown: 174173

In [17]:
import pandas as pd

rabbit_res = pd.read_csv('../data/rabbit_output_parallel.csv')

rabbit_res

Unnamed: 0.1,Unnamed: 0,contributor,type,confidence
0,0,kylaf,Unknown,-
1,1,sophiapeckner,Unknown,-
2,2,LaExploradora,Unknown,-
3,3,ashlaycyriac,Unknown,-
4,4,Erlemar,Bot,0.757
...,...,...,...,...
195039,32499,ajaysub110,Bot,0.213
195040,32500,bodokaiser,Human,0.801
195041,32501,Omig12,Unknown,-
195042,32502,craig-ludington,Unknown,-


Confidence of users marked as Human

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# sns.histplot(data=rabbit_res[rabbit_res["type"] == "Human"], x="confidence", bins=30)
# plt.title("Confidence distribution for type=human")
# plt.show()

rabbit_res["confidence"] = pd.to_numeric(rabbit_res["confidence"], errors="coerce")
human_conf = rabbit_res[(rabbit_res["type"] == "Human") & (rabbit_res["confidence"].notna())]["confidence"]

bins = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
binned = pd.cut(human_conf, bins=bins)

print(binned.value_counts().sort_index())
print("\nPercent distribution:")
print((binned.value_counts(normalize=True) * 100).sort_index().round(2).astype(str) + "%")



confidence
(0.5, 0.6]     1840
(0.6, 0.7]     2449
(0.7, 0.8]     3493
(0.8, 0.9]     7521
(0.9, 1.0]    12662
Name: count, dtype: int64

Percent distribution:
confidence
(0.5, 0.6]     6.58%
(0.6, 0.7]     8.76%
(0.7, 0.8]    12.49%
(0.8, 0.9]    26.89%
(0.9, 1.0]    45.28%
Name: proportion, dtype: object


In [18]:
is_human = rabbit_res[rabbit_res['type'] == 'Human']

is_human

Unnamed: 0.1,Unnamed: 0,contributor,type,confidence
5,5,ncitron,Human,0.956
13,13,devantler,Human,0.819
15,15,roseeichelmann,Human,0.834
21,21,Aishwaryajakka,Human,0.793
22,22,marcosgopen,Human,0.936
...,...,...,...,...
195031,32491,jackcook,Human,0.85
195032,32492,ottomated,Human,0.813
195034,32494,adriangb,Human,0.937
195035,32495,wanglam,Human,0.921


In [20]:
not_bot = rabbit_res[rabbit_res['type'].isin(['Human', 'Unknown'])]

not_bot

Unnamed: 0.1,Unnamed: 0,contributor,type,confidence
0,0,kylaf,Unknown,-
1,1,sophiapeckner,Unknown,-
2,2,LaExploradora,Unknown,-
3,3,ashlaycyriac,Unknown,-
5,5,ncitron,Human,0.956
...,...,...,...,...
195038,32498,mastrolinux,Unknown,-
195040,32500,bodokaiser,Human,0.801
195041,32501,Omig12,Unknown,-
195042,32502,craig-ludington,Unknown,-


In [21]:
contributors_df = pd.read_csv('../contributors.csv')
contributors_df

Unnamed: 0,link,name
0,https://github.com/kylaf,kylaf
1,https://github.com/sophiapeckner,sophiapeckner
2,https://github.com/LaExploradora,LaExploradora
3,https://github.com/ashlaycyriac,ashlaycyriac
4,https://github.com/Erlemar,Erlemar
...,...,...
195039,https://github.com/ajaysub110,ajaysub110
195040,https://github.com/bodokaiser,bodokaiser
195041,https://github.com/Omig12,Omig12
195042,https://github.com/craig-ludington,craig-ludington


In [22]:
humans = pd.merge(
    is_human, contributors_df,
    how='left',
    left_on='contributor', right_on='name',
)
humans = humans[['link','name']]
humans


Unnamed: 0,link,name
0,https://github.com/ncitron,ncitron
1,https://github.com/devantler,devantler
2,https://github.com/roseeichelmann,roseeichelmann
3,https://github.com/Aishwaryajakka,Aishwaryajakka
4,https://github.com/marcosgopen,marcosgopen
...,...,...
34253,https://github.com/jackcook,jackcook
34254,https://github.com/ottomated,ottomated
34255,https://github.com/adriangb,adriangb
34256,https://github.com/wanglam,wanglam


In [23]:
human_and_unknown = pd.merge(
    not_bot, contributors_df,
    how='left',
    left_on='contributor', right_on='name',
)
human_and_unknown = human_and_unknown[['link','name']]
human_and_unknown

Unnamed: 0,link,name
0,https://github.com/kylaf,kylaf
1,https://github.com/sophiapeckner,sophiapeckner
2,https://github.com/LaExploradora,LaExploradora
3,https://github.com/ashlaycyriac,ashlaycyriac
4,https://github.com/ncitron,ncitron
...,...,...
174168,https://github.com/mastrolinux,mastrolinux
174169,https://github.com/bodokaiser,bodokaiser
174170,https://github.com/Omig12,Omig12
174171,https://github.com/craig-ludington,craig-ludington


Filter out bots by name: if it contains the word 'bot', remove it

In [24]:
humans = humans[
    ~(
        humans['link'].str.contains('bot', case=False, na=False) |
        humans['name'].str.contains('bot', case=False, na=False)
    )
]
humans

Unnamed: 0,link,name
0,https://github.com/ncitron,ncitron
1,https://github.com/devantler,devantler
2,https://github.com/roseeichelmann,roseeichelmann
3,https://github.com/Aishwaryajakka,Aishwaryajakka
4,https://github.com/marcosgopen,marcosgopen
...,...,...
34253,https://github.com/jackcook,jackcook
34254,https://github.com/ottomated,ottomated
34255,https://github.com/adriangb,adriangb
34256,https://github.com/wanglam,wanglam


In [25]:
human_and_unknown = human_and_unknown[
    ~(
        human_and_unknown['link'].str.contains('bot', case=False, na=False) |
        human_and_unknown['name'].str.contains('bot', case=False, na=False)
    )
]
human_and_unknown

Unnamed: 0,link,name
0,https://github.com/kylaf,kylaf
1,https://github.com/sophiapeckner,sophiapeckner
2,https://github.com/LaExploradora,LaExploradora
3,https://github.com/ashlaycyriac,ashlaycyriac
4,https://github.com/ncitron,ncitron
...,...,...
174168,https://github.com/mastrolinux,mastrolinux
174169,https://github.com/bodokaiser,bodokaiser
174170,https://github.com/Omig12,Omig12
174171,https://github.com/craig-ludington,craig-ludington


### 4. Hash users with SHA256

Only human:

In [26]:
import hashlib

def hash_link(link):
    return hashlib.sha256(link.encode('utf-8')).hexdigest()

humans['hash'] = humans['link'].apply(hash_link)

humans.to_csv('../data/humans_hash.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  humans['hash'] = humans['link'].apply(hash_link)


Human and unknown:

In [27]:
import hashlib

def hash_link(link):
    return hashlib.sha256(link.encode('utf-8')).hexdigest()

human_and_unknown['hash'] = human_and_unknown['link'].apply(hash_link)

human_and_unknown.to_csv('../data/humans_hash_complete.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  human_and_unknown['hash'] = human_and_unknown['link'].apply(hash_link)


### 5. Only keep Human in hackathon_project_contributor.csv

##### 5.1 Only keep human

In [34]:
proj_hack_member = pd.read_csv('../data/hackathon_project_contributor.csv')
humans_hash = pd.read_csv('../data/humans_hash.csv')

valid_links = set(humans_hash['link'])

# only keep links of human users
def filter_contributors(contributor_str):
    if pd.isna(contributor_str):
        return []
    links = [link.strip() for link in contributor_str.split(',')]
    filtered = [link for link in links if link in valid_links]
    return filtered

proj_hack_member['contributors'] = proj_hack_member['contributors'].apply(filter_contributors)

# Only keep the projects with more than 1 human contributors
proj_hack_member = proj_hack_member[proj_hack_member['contributors'].apply(len) > 1]

# Transfer the contributor list back to string
proj_hack_member['contributors'] = proj_hack_member['contributors'].apply(lambda lst: ','.join(lst))

link_to_name = dict(zip(humans_hash['link'], humans_hash['name']))

def map_links_to_names(contributor_str):
    links = contributor_str.split(',')
    names = [link_to_name.get(link.strip()) for link in links if link.strip() in link_to_name]
    return ','.join(names)

proj_hack_member['contributor_github_username'] = proj_hack_member['contributors'].apply(map_links_to_names)

proj_hack_member.to_csv('../data/proj_hack_human.csv',index=False)
proj_hack_member



Unnamed: 0,hackathon_URL,project_URL,github_links,start_date_format,end_date_format,participants,contributors,contributor_github_username
5,https://supernova.devpost.com/,https://devpost.com/software/meta-yield-liquid...,https://github.com/Narwallets/meta-yield-ic,2022-05-10,2022-06-22,"https://devpost.com/claudio744, https://devpos...","https://github.com/imcsk8,https://github.com/j...","imcsk8,josemariasosa,leomanza"
8,https://supernova.devpost.com/,https://devpost.com/software/nnsdao-protocol,"https://github.com/NnsDao/nnsdao_sdk, https://...",2022-05-10,2022-06-22,https://devpost.com/nnsdaos985,"https://github.com/chuhemiao,https://github.co...","chuhemiao,rosendolu"
18,https://supernova.devpost.com/,https://devpost.com/software/candb,https://github.com/canscale/supernova-candb-de...,2022-05-10,2022-06-22,https://devpost.com/byron-becker,"https://github.com/ByronBecker,https://github....","ByronBecker,skilesare"
21,https://supernova.devpost.com/,https://devpost.com/software/mops,https://github.com/ZenVoich/mops,2022-05-10,2022-06-22,https://devpost.com/ZenVoich,"https://github.com/letmejustputthishere,https:...","letmejustputthishere,peterpeterparker,rvanasa,..."
42,https://supernova.devpost.com/,https://devpost.com/software/deadly,"https://github.com/hiukim/mind-ar-js, https://...",2022-05-10,2022-06-22,"https://devpost.com/avirads, https://devpost.c...","https://github.com/Makio64,https://github.com/...","Makio64,jmswrnr,krpeacock,peterpeterparker,yos..."
...,...,...,...,...,...,...,...,...
76016,https://level-up-society-13094.devpost.com/,https://devpost.com/software/saviour-cwdrmz,"https://github.com/Deblina28/Saviour, https://...",2021-07-16,2021-07-18,"https://devpost.com/avijitdasxp, https://devpo...","https://github.com/Bodobolero,https://github.c...","Bodobolero,Dozingfiretruck,JohnZ03,LynnL4,MauA..."
76017,https://realityfest-exhibition-hall.devpost.com/,https://devpost.com/software/saviour-cwdrmz,"https://github.com/Deblina28/Saviour, https://...",2021-10-18,2021-10-24,"https://devpost.com/avijitdasxp, https://devpo...","https://github.com/Bodobolero,https://github.c...","Bodobolero,Dozingfiretruck,JohnZ03,LynnL4,MauA..."
76081,https://cosmos-hackatom-vi.devpost.com/,https://devpost.com/software/validators-wars-f...,https://github.com/selimerunkut/Validator_Wars,2021-11-11,2021-12-10,"https://devpost.com/alextazh, https://devpost....","https://github.com/adust09,https://github.com/...","adust09,selimerunkut"
76082,https://hackatom-ru.devpost.com/,https://devpost.com/software/validators-wars-f...,https://github.com/selimerunkut/Validator_Wars,2021-03-15,2021-03-28,"https://devpost.com/alextazh, https://devpost....","https://github.com/adust09,https://github.com/...","adust09,selimerunkut"


##### 5.2 Keep Human and Unknown

In [36]:
proj_hack_member = pd.read_csv('../data/hackathon_project_contributor.csv')
humans_hash_complete = pd.read_csv('../data/humans_hash_complete.csv')

valid_links_complete = set(humans_hash_complete['link'])

# print(len(valid_links_complete))

# only keep links of human users
def filter_contributors(contributor_str):
    if pd.isna(contributor_str):
        return []
    links = [link.strip() for link in contributor_str.split(',')]
    filtered = [link for link in links if link in valid_links_complete]
    return filtered

proj_hack_member['contributors'] = proj_hack_member['contributors'].apply(filter_contributors)

# Only keep the projects with more than 1 human contributors
proj_hack_member = proj_hack_member[proj_hack_member['contributors'].apply(len) > 1]

# Transfer the contributor list back to string
proj_hack_member['contributors'] = proj_hack_member['contributors'].apply(lambda lst: ','.join(lst))

link_to_name = dict(zip(humans_hash_complete['link'], humans_hash_complete['name']))

def map_links_to_names(contributor_str):
    links = contributor_str.split(',')
    names = [link_to_name.get(link.strip()) for link in links if link.strip() in link_to_name]
    return ','.join(names)

proj_hack_member['contributor_github_username'] = proj_hack_member['contributors'].apply(map_links_to_names)

proj_hack_member.to_csv('../data/proj_hack_human_complete.csv',index=False)
proj_hack_member



Unnamed: 0,hackathon_URL,project_URL,github_links,start_date_format,end_date_format,participants,contributors,contributor_github_username
0,https://supernova.devpost.com/,https://devpost.com/software/faefolk,"https://github.com/ICCards/faefolk, https://gi...",2022-05-10,2022-06-22,"https://devpost.com/RAW4RMCS, https://devpost....","https://github.com/ALLiDoizCode,https://github...","ALLiDoizCode,maxwisch"
3,https://supernova.devpost.com/,https://devpost.com/software/tingram,https://github.com/tingramtingram/dfinity,2022-05-10,2022-06-22,"https://devpost.com/k-tsytsyn, https://devpost...","https://github.com/stoma655,https://github.com...","stoma655,tingramtingram"
4,https://supernova.devpost.com/,https://devpost.com/software/ant-kingdom,https://github.com/NFPTU/dfinity-fu,2022-05-10,2022-06-22,"https://devpost.com/damtuankhanglm1, https://d...","https://github.com/DatTNT,https://github.com/a...","DatTNT,anhth2912,khangdthe141099,thienvip107"
5,https://supernova.devpost.com/,https://devpost.com/software/meta-yield-liquid...,https://github.com/Narwallets/meta-yield-ic,2022-05-10,2022-06-22,"https://devpost.com/claudio744, https://devpos...","https://github.com/imcsk8,https://github.com/j...","imcsk8,josemariasosa,leomanza"
6,https://supernova.devpost.com/,https://devpost.com/software/4everland,https://github.com/4everland/dashboard-website...,2022-05-10,2022-06-22,https://devpost.com/4everlandoc,"https://github.com/haifun,https://github.com/h...","haifun,hyongnim,indirasieben,saullary,thloyi,y..."
...,...,...,...,...,...,...,...,...
76255,https://hacktech-2022.devpost.com/,https://devpost.com/software/suit-yourself,"https://github.com/LilyPerr/hackscios, https:/...",2022-03-05,2022-03-06,"https://devpost.com/daneshbadlani, https://dev...","https://github.com/huntleym,https://github.com...","huntleym,kimiazargari"
76263,https://lhd-learn-day-3.devpost.com/,https://devpost.com/software/track-your-learni...,https://github.com/ektaarora16/Track-your-Lear...,2021-10-12,2021-10-13,"https://devpost.com/aaquib_dev, https://devpos...","https://github.com/Ruthvik2127,https://github....","Ruthvik2127,ektaarora16,imaaquibali"
76264,https://chainlink-fall-hackathon-2021.devpost....,https://devpost.com/software/tsunami-fler8g,https://github.com/cryptohighway/fall-2021-tsu...,2021-10-22,2021-11-29,"https://devpost.com/Kryptokelli, https://devpo...","https://github.com/PatrickAlphaC,https://githu...","PatrickAlphaC,cryptohighway,dwightjl,pappas999"
76265,https://topblockchainstartup.devpost.com/,https://devpost.com/software/tsunami-fler8g,https://github.com/cryptohighway/fall-2021-tsu...,2021-06-15,2021-08-08,"https://devpost.com/Kryptokelli, https://devpo...","https://github.com/PatrickAlphaC,https://githu...","PatrickAlphaC,cryptohighway,dwightjl,pappas999"
