# dApps' Github Repo Scraper

This scraper takes a list of Ethereum dApps, tries to find their GitHub projects, and returns the projects' metadata.

## 1. Finding Github Projects

In [1]:
from ast import literal_eval
import pandas as pd

### 1.1 Import dApp names 

In [2]:
# Read the dApp table: the first CSV column becomes the index, and the text stored in
# the "data" column is turned into a Python object by literal_eval.
df_dapps = pd.read_csv(
    "ethereum.csv",
    index_col=0,
    converters={"data": literal_eval},
)

# show the dApp names
df_dapps["name"]

0                         OpenSea
1                      Uniswap V2
2                      Uniswap V3
3                   MetaMask Swap
4                             Gem
                  ...            
3409                      LatiumX
3410    Mecenate Fine Art Gallery
3411                   NESTFI-Win
3412                        rhino
3413               TomorrowsPrice
Name: name, Length: 3414, dtype: object

### 1.2 Search for Github projects on DuckDuckGo 

Initiate a webdriver instance

In [3]:
# Set up selenium with chromedriver. The `driver` object created in this cell is the
# browser instance that find_github() below drives for every search.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import os
import time
from selenium.webdriver.common.by import By
from random import randrange, uniform

# Init webdriver - webdriver is initiated outside of the function to increase speed
# NOTE: the triple-quoted string below is a disabled headless setup (hard-coded local
# chromedriver path, 1920x1080 window); as a string literal it is never executed.
"""
    # Init webdriver headless to increase performance
# get path of webdriver
chrome_driver = webdriver.Chrome("D:/Drive/01_Promotion/31_Code/01_Python/GitHub Readme/chromedriver.exe")
    # set options of webdriver to headless
chrome_options = Options()
chrome_options.add_argument("--headless")
    # set screensize to 1920x1080
chrome_options.add_argument("--window-size=1920x1080")
"""

#driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver)

# Init webdriver normally (visible browser window) to see mistakes
# NOTE(review): "chromedriver.exe" is a relative path - presumably resolved against the
# notebook's working directory; confirm where the binary is expected to live.
driver = webdriver.Chrome("chromedriver.exe")
driver.maximize_window()


# navigate webdriver
# NOTE: this navigation is also disabled (string literal). Because it is the last
# expression of the cell, the notebook echoes the string as the cell output.
"""try:
    driver.get("https://duckduckgo.com/")
except:
    print("webdriver failure")"""

'try:\n    driver.get("https://duckduckgo.com/")\nexcept:\n    print("webdriver failure")'

Search on DuckDuckGo

In [4]:
# define find github project name function

def find_github(dapp_name):
    """
    Search DuckDuckGo for a dApp and return the GitHub accounts that show up most often.

    The query "<dapp_name>,github" is typed into the DuckDuckGo homepage through the
    notebook-level selenium ``driver``. Every result link that starts with
    "https://github.com/" is reduced to its first path segment (the part directly
    after the domain), and the three most frequent segments are returned.

    Args:
    dapp_name(str): name of a dApp
    Returns
    dapp_dict(dict): dictionary with the keys "dapp_name", "github_1", "github_2" and
        "github_3"; ranks for which nothing was found are filled with ""
    Raises
    RuntimeError: if the page cannot be loaded, the search bar or the results
        container cannot be found, or the query cannot be entered. The underlying
        error is attached as ``__cause__``.
    """

    # import libraries
    import time
    from random import uniform
    from collections import Counter

    github_prefix = "https://github.com/"

    # navigate webdriver to duckduckgo search engine.
    # Every step aborts on failure instead of printing and carrying on: continuing on a
    # browser that still shows the previous results page would attribute the previous
    # dApp's links to this one, and later steps would only fail with an unrelated
    # "variable referenced before assignment" error.
    try:
        driver.get("https://duckduckgo.com/")
    except Exception as exc:
        raise RuntimeError("webdriver failure") from exc

    try:
        search = driver.find_element(By.ID, "search_form_input_homepage")
    except Exception as exc:
        raise RuntimeError("could not find search bar") from exc

    # search for dapp name and github
    try:
        # clear whatever is in the box (select all + delete) before typing
        search.send_keys(Keys.CONTROL, 'a')
        search.send_keys(Keys.BACKSPACE)
        # short randomised pause - presumably to look less like a bot
        time.sleep(uniform(0.5, 0.7))
        search.send_keys(dapp_name + "," + "github")  # enter search terms here
        search.send_keys(Keys.RETURN)
    except Exception as exc:
        raise RuntimeError("could not enter search terms") from exc

    # give the results page time to load
    time.sleep(uniform(1.2, 1.9))

    # process results page
    try:
        results = driver.find_element(By.ID, "links")
    except Exception as exc:
        raise RuntimeError("could not access results page") from exc

    # find all links inside the results container
    try:
        anchors = results.find_elements(By.TAG_NAME, "a")
    except Exception as exc:
        raise RuntimeError("could not find a tags") from exc

    github_links = []
    for anchor in anchors:
        link = anchor.get_attribute("href")
        # anchors without an href yield None; skip them and every non-GitHub link
        if not link or not link.startswith(github_prefix):
            continue
        # "https://github.com/<owner>/..." -> "<owner>"
        owner = link.split("/")[3]
        if owner:  # a bare "https://github.com/" has no owner segment
            github_links.append(owner)

    # extract most often found names
    most_common = Counter(github_links).most_common(3)

    # create dict to return; pad with "" when fewer than three names were found
    dapp_dict = {"dapp_name": dapp_name}
    for rank in range(3):
        name = most_common[rank][0] if rank < len(most_common) else ""
        dapp_dict["github_" + str(rank + 1)] = name

    # return final result
    return dapp_dict


Test function with a single dApp name

In [104]:
# try the search on a single dApp name; the returned dict is kept in `test`
dapp_name = "Uniswap V2"

test = find_github(dapp_name)

Loop through list of dApps and store the resulting github projects in a data frame

In [5]:
# looping through all dapps takes ~220 min
# init an empty list to store the returned dicts
dapp_dict_list = []

# loop through list of dapps and create dicts.
# A dApp whose lookup fails is reported (with its position and the error) and skipped.
# `except Exception` instead of a bare `except` keeps the long-running loop
# interruptible from the keyboard, and the f-string also copes with non-string names
# (e.g. NaN), which would otherwise raise inside the error handler itself.
for i, dapp in enumerate(df_dapps.name):
    try:
        dapp_dict_list.append(find_github(dapp))
    except Exception as exc:
        print(f"could not retrieve results for: {dapp} @: {i} ({exc!r})")

# one row per successfully searched dApp
df_dapps_github = pd.DataFrame(dapp_dict_list)

df_dapps_github.to_csv("dapps_github.csv", sep = ";")

Instead of using DuckDuckGo, we can also use Google. But for Google we need to adjust the HTML elements.
We don't use Google as it has stricter bot protection.

In [53]:
# Same search flow as above, but against the Google homepage
driver.get("https://www.google.com/")
search = driver.find_element(By.NAME, "q")  # input element named "q" receives the query


dapp_name = "Uniswap V2"

# empty the input first: select everything, then delete it
for key_combo in ((Keys.CONTROL, 'a'), (Keys.BACKSPACE,)):
    search.send_keys(*key_combo)

time.sleep(uniform(0.5,0.7))

# type the search terms and submit
search.send_keys(f"{dapp_name},github")
search.send_keys(Keys.RETURN)

Manual verification of github project names is required

## 2. Retrieving Project Meta Data from Github

Authenticate with the GitHub API

In [56]:
from dotenv import load_dotenv, find_dotenv
import requests
import json

# load github credentials from .env
load_dotenv()

USER_NAME = os.getenv("USER_NAME")
TOKEN = os.getenv("TOKEN")

# os.getenv returns None for unset variables; say so here instead of letting the
# API calls further down run with empty credentials
_missing_credentials = [
    key for key, value in (("USER_NAME", USER_NAME), ("TOKEN", TOKEN)) if not value
]
if _missing_credentials:
    print("missing GitHub credentials (environment / .env): " + ", ".join(_missing_credentials))


In [57]:
# function to get github meta data for dApps

def get_dapp_github_info(dapp_name, dapp_github_name, USER_NAME, TOKEN):
    """
    Retrieve the repository, follower and member counts of a GitHub organisation.

    Two GitHub REST endpoints are queried with basic authentication:
    ``/orgs/<name>`` for the "public_repos" and "followers" fields, and
    ``/orgs/<name>/members`` for the member list. The member list is read page by
    page (100 entries per request) by following the "next" link of each response, so
    the count is not capped at the first page.

    A request that does not return status 200 is reported with ``print`` and the
    affected values are returned as ``None``; they are never filled with data from
    another response.

    NOTE(review): the members endpoint presumably lists only the members visible to
    the authenticated user - confirm before interpreting the count.

    Args:
    dapp_name (str): name of the dapp
    dapp_github_name (str): dapp's github org name
    USER_NAME (str): github username from .env
    TOKEN (str): github API key from .env

    Returns:
    dapp_github_dict (dict): dict with the keys "dapp_name", "dapp_github_name",
        "public_repos", "followers" and "members"; the three counts are ints, or
        None when the corresponding request failed
    """

    #import libraries
    import requests

    credentials = (USER_NAME, TOKEN)
    org_url = "https://api.github.com/orgs/" + dapp_github_name

    # init dict to return; the counts stay None unless the API delivers them
    dapp_github_dict = {
        "dapp_name": dapp_name,
        "dapp_github_name": dapp_github_name,
        "public_repos": None,
        "followers": None,
        "members": None,
    }

    # get organization info (number of repos and followers)
    req = requests.get(org_url, auth=credentials)
    if req.status_code == 200:
        org_info = req.json()
        dapp_github_dict["public_repos"] = org_info.get("public_repos")
        dapp_github_dict["followers"] = org_info.get("followers")
    else:
        print(req.status_code)

    # get members, following the pagination links until there is no next page
    member_count = 0
    next_url = org_url + "/members"
    query = {"per_page": 100}
    while next_url:
        req = requests.get(next_url, auth=credentials, params=query)
        if req.status_code != 200:
            print("could not get members; status code: "+str(req.status_code))
            member_count = None  # a partial count would be misleading
            break
        member_count += len(req.json())
        next_url = req.links.get("next", {}).get("url")
        query = None  # the "next" url already carries the paging parameters

    dapp_github_dict["members"] = member_count

    return dapp_github_dict
 

In [58]:
# try the metadata lookup on a single organisation; note that `test` is rebound here
# and no longer holds the result of the earlier find_github() call
dapp_name = "Uniswap V2"
dapp_github_name = "Uniswap"

test = get_dapp_github_info(dapp_name, dapp_github_name, USER_NAME, TOKEN)

## This will take a lot of time

get all repositories

In [40]:
# request the Uniswap organisation's repositories, asking for 100 entries per page
req = requests.get(
    "https://api.github.com/orgs/Uniswap/repos?per_page=100",
    auth=(USER_NAME, TOKEN),
)

# keep the decoded payload on success, otherwise only show the status code
if req.status_code != 200:
    print(req.status_code)
else:
    r_dict = json.loads(req.text)

get one repository

In [None]:
# GET https://api.github.com/repos/OWNER/REPO

get members of an organization

In [None]:
# GET https://api.github.com/orgs/ORG/members

get commits from repo

In [None]:
# list the commits of one repository: GET /repos/OWNER/REPO/commits
# (the repository segment "linux" was missing from the requested URL)
req = requests.get("https://api.github.com/repos/torvalds/linux/commits", auth=(USER_NAME,TOKEN))

if req.status_code == 200:
    r_dict = json.loads(req.text)
else:
    # same failure reporting as the other request cells
    print(req.status_code)

variables to query:
- number of participants
- number of forks
- stars/watchers

- commit history

Authenticate with Github API

In [None]:
# This cell held an R (httr) call: req <- GET(<url>, config=gtoken).
# Under the Python kernel that line is evaluated as a comparison and fails on the
# undefined name GET, so the request is written the way the rest of this notebook
# does it: requests + the USER_NAME/TOKEN pair loaded from .env.
req = requests.get("https://api.github.com/repos/torvalds/linux/commits", auth=(USER_NAME, TOKEN))