# Dapp Scraper
The purpose of this file is to retrieve all Ethereum dApps from State of the DApps (www.stateofthedapps.com) and store them in a database.

### Table of contents
1. This scraper retrieves all Ethereum Dapps listed on www.stateofthedapps.com.
To do so it proceeds in the following steps:
-   initiates a webdriver
-   loops through the list of dApps and retrieves high-level info (including the link to each subpage)
-   loops through all subpages and retrieves detailed data 

2. In the second section, the scraper inserts the data into the PostgreSQL database


## 1. Scraper functions

In [1]:
# import libraries 
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import os
import os.path
import time


In [6]:
# Initiating the webdriver (normally)
def init_webdriver(driver_path="link to webdriver"):
    """Start a maximized Chrome webdriver session.

    Args:
        driver_path: path to the chromedriver executable. The default is the
            placeholder used in the original notebook and has to be replaced
            by (or overridden with) a real path before running.

    Returns:
        The selenium Chrome webdriver object.
    """
    # Local import keeps this notebook cell runnable on its own.
    from selenium import webdriver

    driver = webdriver.Chrome(driver_path)
    driver.maximize_window()
    # Alternative to maximizing: driver.set_window_size(2560, 1440)

    return driver

def quit_webdriver(driver):
    """End the session of the given webdriver.

    Args:
        driver: webdriver object whose ``quit()`` method is called.
    """
    driver.quit()

In [9]:
# Start a Chrome session; `driver` is the webdriver object handed to the scraper functions below.
driver = init_webdriver()

In [8]:
# End the browser session held by `driver` (calls driver.quit()).
quit_webdriver(driver)

## Function 1 - loop through list of Ethereum Dapps

- navigate to Ethereum Dapp list 
- click on "view more rankings"
- loop through first 50 dApps 
- click next button
- end loop if clicking next button is not possible anymore -> print that retrieving the list of dApps is complete

In [12]:
# function 1 - dapp meta data

def get_dapp_meta_data(driver):
    """Collect the overview data of all dApps in the Ethereum ranking.

    Navigates to the ranking pages of www.stateofthedapps.com and reads one
    entry per table row on every page.

    Args:
        driver: webdriver object used to load pages and locate elements
            (selenium 3 style ``find_element_by_*`` API).

    Returns:
        A list of dictionaries, one per dApp, with the keys ``d_name``,
        ``d_teaser``, ``d_platform``, ``d_cat`` and ``d_link`` (inserted in
        this order). A field that cannot be read is stored as the string
        ``"NaN"``. An empty list is returned when the number of pages cannot
        be determined.
    """
    base_url = "https://www.stateofthedapps.com/rankings/platform/ethereum?page="

    # (dict key, CSS class) pairs. The order matters: the insert cell of this
    # file passes the dict values positionally to the SQL statement.
    text_fields = (
        ("d_name", "name"),
        ("d_teaser", "teaser"),
        ("d_platform", "RankingTablePlatform"),
        ("d_cat", "RankingTableCategory"),
    )

    # `except Exception` (not a bare except) is used throughout so that
    # KeyboardInterrupt can still stop a long scraping run.

    # go to first page
    try:
        driver.get(base_url + "1")
    except Exception:
        print("Driver could not access first page")

    # find number of last page
    try:
        last_page = int(driver.find_element_by_xpath('//*[@id="__layout"]/div/div/main/div[2]/div[2]/div/div/div[2]/div[2]/button[5]').text)
    except Exception:
        # Without a page count there is nothing to loop over; previously this
        # path crashed later with a NameError on `last_page`.
        print("Driver could not find last page")
        return []

    # list that stores all dapp data
    dapp_list = []

    for page in range(1, last_page + 1):
        driver.get(base_url + str(page))
        print("collecting page: " + str(page))

        # find table that contains all dapps; skip the page if it is missing
        # so rows of the previous page are never processed a second time
        try:
            table_body = driver.find_element_by_class_name("table-body")
        except Exception:
            print("Driver could not find table-body")
            continue

        # find all rows of the table
        try:
            rows = table_body.find_elements_by_class_name("table-row")
        except Exception:
            print("Driver could not find rows in table-body")
            continue

        # store the data of every row in a dict and append it to the result
        for row in rows:
            dapp_dict = {}

            for key, class_name in text_fields:
                try:
                    dapp_dict[key] = row.find_element_by_class_name(class_name).text
                except Exception:
                    dapp_dict[key] = "NaN"

            try:
                dapp_dict["d_link"] = row.find_element_by_tag_name('a').get_attribute("href")
            except Exception:
                dapp_dict["d_link"] = "NaN"

            dapp_list.append(dapp_dict)

    print("done with collecting all dapps")

    return dapp_list

## Function 1 for abandoned dApps
Abandoned dApps are not listed with the live, wip, beta, and prototype dApps and must be scraped from a different subpage

In [81]:
# function 1 - for abandoned dApps

def get_dapp_meta_data_abandoned(driver):
    """Collect the overview data of all abandoned Ethereum dApps.

    Navigates to the "abandoned" listing of www.stateofthedapps.com and reads
    one entry per ``li`` element of the card list on every page. The returned
    keys match those of ``get_dapp_meta_data`` because both results are
    concatenated and inserted into the same table later on.

    Args:
        driver: webdriver object used to load pages and locate elements
            (selenium 3 style ``find_element_by_*`` API).

    Returns:
        A list of dictionaries, one per card, with the keys ``d_name``,
        ``d_teaser``, ``d_platform``, ``d_cat`` and ``d_link`` (inserted in
        this order). ``d_platform`` is always ``"Ethereum"``; any other field
        that cannot be read is stored as the string ``"NaN"``. An empty list
        is returned when the number of pages cannot be determined.
    """
    base_url = "https://www.stateofthedapps.com/dapps/platform/ethereum?status=abandoned&page="

    # `except Exception` (not a bare except) is used throughout so that
    # KeyboardInterrupt can still stop a long scraping run.

    # go to first page
    try:
        driver.get(base_url + "1")
    except Exception:
        print("Driver could not access first page")

    # find number of last page
    try:
        last_page = int(driver.find_element_by_xpath('/html/body/div/div/div/div/main/div[2]/div/div[2]/div[3]/div/div[2]/button[5]/span').text)
    except Exception:
        # Without a page count there is nothing to loop over; previously this
        # path crashed later with a NameError on `last_page`.
        print("Driver could not find last page")
        return []

    # list that stores all dapp data
    dapp_list = []

    for page in range(1, last_page + 1):
        driver.get(base_url + str(page))
        print("collecting page: " + str(page))

        # find the list that contains all dapp cards; skip the page if it is
        # missing so cards of the previous page are never processed again
        try:
            card_list = driver.find_element_by_class_name("DappCardList")
        except Exception:
            print("Driver could not find DappCardList")
            continue

        # find all dApp cards
        try:
            cards = card_list.find_elements_by_tag_name('li')
        except Exception:
            print("Driver could not find cards in DappCardList")
            continue

        # store the data of every card in a dict and append it to the result.
        # Key order matters: the insert cell of this file passes the dict
        # values positionally to the SQL statement.
        for card in cards:
            dapp_dict = {}

            try:
                dapp_dict["d_name"] = card.find_element_by_class_name("title-4").text
            except Exception:
                dapp_dict["d_name"] = "NaN"
            try:
                dapp_dict["d_teaser"] = card.find_element_by_class_name("description").text
            except Exception:
                dapp_dict["d_teaser"] = "NaN"

            # the listing is already filtered by platform, so this is constant
            dapp_dict["d_platform"] = "Ethereum"

            try:
                dapp_dict["d_cat"] = card.find_element_by_class_name("category").text
            except Exception:
                dapp_dict["d_cat"] = "NaN"
            try:
                dapp_dict["d_link"] = card.find_element_by_tag_name('a').get_attribute("href")
            except Exception:
                dapp_dict["d_link"] = "NaN"

            dapp_list.append(dapp_dict)

    print("done with collecting all dapps")

    return dapp_list

In [None]:
# test abandoned dapp function

## Function 2 - loop through dapp subpages 

This function collects all data from all dapp subpages

the data comprise:
- status
- author
- license
- description
- last update
- deployed
- development activity
- active users 
- transactions
- volume 
- mainnet contracts (expand contracts)
- critic review 
- critic review link
- page views
- page clicks
- related dapps 
- comparable dapp incl. link
- meta mask recommended? 
- dapp webpage
- social media
    - github, reddit, chat, blog, twitter
- profile strength
- recommendations
    - positive
    - neutral
    - negative
- ponzi scheme warning

In [82]:
def get_dapp_details(driver, dapp_link):
    """Open a dApp's subpage and collect its detail data.

    Args:
        driver: webdriver object (selenium 3 style ``find_element_by_*`` API).
        dapp_link: string, link to the dApp's subpage.

    Returns:
        A dict with the scraped values. The keys are inserted in a fixed
        order because the insert cell of this file passes ``dict.values()``
        positionally to the SQL statement. Scalar fields that cannot be read
        are stored as the string ``"NaN"``; list-like fields (contracts,
        reviews, related dApps, social links, tags) are stored as
        ``str(list)`` and fall back to ``"[]"``.

    Raises:
        Whatever ``driver.get`` raises when the subpage cannot be loaded.
    """
    import datetime

    # `except Exception` (not a bare except) is used throughout so that
    # KeyboardInterrupt can still stop a long scraping run.

    def _or_nan(read):
        """Return ``read()``, or the "NaN" placeholder if the lookup fails."""
        try:
            return read()
        except Exception:
            return "NaN"

    def _ctr_value(position, label_length):
        """Return the text of the n-th <strong> in ctr-info without its trailing label."""
        strong_tags = driver.find_element_by_class_name("ctr-info").find_elements_by_tag_name("strong")
        return strong_tags[position].text[:-label_length]

    # init return dict
    dapp_dict = {}

    # navigate to subpage
    driver.get(dapp_link)

    # get status data
    # the slices cut a fixed-length prefix off the element text
    # (presumably the field label -- TODO confirm against the live page)
    dapp_dict["d_descr"] = _or_nan(lambda: driver.find_element_by_class_name("description").text)
    dapp_dict["d_status"] = _or_nan(lambda: driver.find_element_by_class_name("DappDetailBodyContentModulesStatus").text[7:])
    dapp_dict["d_author"] = _or_nan(lambda: driver.find_element_by_class_name("author-data").text)
    dapp_dict["d_license"] = _or_nan(lambda: driver.find_element_by_class_name("license-data").text)
    dapp_dict["d_updated"] = _or_nan(lambda: driver.find_element_by_class_name("DappDetailBodyContentModulesUpdated").text[21:])
    dapp_dict["d_submitted"] = _or_nan(lambda: driver.find_element_by_class_name("DappDetailBodyContentModulesSubmitted").text[13:])
    dapp_dict["d_website"] = _or_nan(lambda: driver.find_element_by_class_name("DappDetailBodyContentCtas").find_element_by_tag_name('a').get_attribute('href'))
    dapp_dict["d_profile_str"] = _or_nan(lambda: driver.find_element_by_class_name("DappProfile").find_element_by_class_name("description").text)

    # get development data (both values are "NaN" if either one is missing)
    try:
        dev_data = driver.find_elements_by_class_name("dev-data")
        push, pull = dev_data[0].text, dev_data[1].text
    except Exception:
        push = pull = "NaN"
    dapp_dict["d_push"] = push
    dapp_dict["d_pull"] = pull

    # get usage data (all nine values are "NaN" if any one is missing)
    stat_keys = (
        "d_users_d", "d_users_w", "d_users_m",
        "d_txn_1", "d_txn_7", "d_txn_30",
        "d_eth_1", "d_eth_7", "d_eth_30",
    )
    try:
        stats_data = driver.find_elements_by_class_name("DappDetailBodyContentModulesStats")
        users = stats_data[0].text.split()
        txns = stats_data[1].text.split()
        eth = stats_data[2].text.split()
        stat_values = [
            users[4], users[6], users[8],
            txns[3], txns[6], txns[9],
            eth[4], eth[7], eth[10],
        ]
    except Exception:
        stat_values = ["NaN"] * len(stat_keys)
    dapp_dict.update(zip(stat_keys, stat_values))

    # get contract addresses (mainnet)
    # expand list of contracts; a missing toggle just means nothing to expand
    try:
        driver.find_element_by_class_name("show-hide").click()
    except Exception:
        pass

    # store all contract addresses in a list
    mainnet_labels = (
        "Mainnet Vertrags (Ethereum)",
        "Mainnet Vertrag (Ethereum)",
        "Mainnet contract (Ethereum)",
        "Mainnet contracts (Ethereum)",
    )
    contract_list = []
    try:
        contract_name = driver.find_element_by_class_name("contract-name").text

        # check if contracts are mainnet contracts
        if contract_name in mainnet_labels:
            contract_addresses = driver.find_element_by_class_name("contract-addresses")
            contract_list = [
                contract.text
                for contract in contract_addresses.find_elements_by_class_name("contract-address-value")
            ]
        else:
            print("no mainnet contracts found")
    except Exception:
        contract_list = []
        print("Driver could not find contract list element")

    dapp_dict["contract_list"] = str(contract_list)

    # get critical reviews (one unreadable review discards the whole list)
    d_review_list = []
    try:
        review_section = driver.find_element_by_class_name("review-list")
        for review in review_section.find_elements_by_class_name("review-item"):
            d_review_list.append({
                "summary": review.find_element_by_class_name("summary").text,
                "link": review.find_element_by_tag_name('a').get_attribute('href'),
            })
    except Exception:
        d_review_list = []

    dapp_dict["d_review_list"] = str(d_review_list)

    # get page views and clicks
    dapp_dict["d_views"] = _or_nan(lambda: _ctr_value(0, 6))
    dapp_dict["d_clicks"] = _or_nan(lambda: _ctr_value(1, 7))
    dapp_dict["d_ctr"] = _or_nan(lambda: _ctr_value(2, 5))

    # get related dapps (one unreadable card discards the whole list)
    rel_dapp_list = []
    try:
        for card in driver.find_elements_by_class_name("DappCardListItem"):
            anchor = card.find_element_by_tag_name('a')
            card_lines = anchor.find_elements_by_tag_name('div')[1].text.split("\n")
            rel_dapp_list.append({
                "name": card_lines[0],
                "descr": card_lines[1],
                "card_link": anchor.get_attribute('href'),
            })
    except Exception:
        rel_dapp_list = []

    dapp_dict["rel_dapp_list"] = str(rel_dapp_list)

    # get social media channels
    # The former "comparable dapp" lookup lived inside this try block: its
    # result was never stored, but its failure wiped the collected links.
    # It was removed so social links survive on pages without that link.
    try:
        social_list = [
            social.find_element_by_tag_name('a').get_attribute('href')
            for social in driver.find_elements_by_class_name("social-item")
        ]
    except Exception:
        social_list = []

    dapp_dict["social_list"] = str(social_list)

    # get reactions (all three values are "NaN" if any one is missing)
    try:
        reactions = driver.find_elements_by_class_name("reaction-item")
        positive, neutral, negative = reactions[0].text, reactions[1].text, reactions[2].text
    except Exception:
        positive = neutral = negative = "NaN"
    dapp_dict["d_reaction_pos"] = positive
    dapp_dict["d_reaction_neu"] = neutral
    dapp_dict["d_reaction_neg"] = negative

    # get tag list
    try:
        tag_list = [item.text for item in driver.find_elements_by_class_name("tag-item")]
    except Exception:
        tag_list = []

    dapp_dict["tag_list"] = str(tag_list)

    # get meta mask recommendation: 1 if the software-wrapper element exists
    try:
        driver.find_element_by_class_name("DappDetailBodyContentPlatform").find_element_by_class_name("software-wrapper")
        dapp_dict["d_metamask_recom"] = 1
    except Exception:
        dapp_dict["d_metamask_recom"] = 0

    # get ponzi warning
    try:
        dapp_dict["d_ponzi_warning"] = driver.find_element_by_class_name("alert-wrapper").text
    except Exception:
        dapp_dict["d_ponzi_warning"] = "No warning"

    # date on which the record was scraped
    dapp_dict["added"] = str(datetime.date.today())

    return dapp_dict



# 2. Insert data into DB

In [20]:
# NOTE(review): the following cell calls init_webdriver() again. Running both cells opens two
# browser sessions and rebinds `driver`; the first session is then presumably left open.
driver = init_webdriver()

In [None]:
# Run the overview scrapers:
#   first the regularly listed dApps, then the abandoned ones,
#   and keep both result sets together in `dapp_list`.
driver = init_webdriver()
dapp_list = get_dapp_meta_data(driver)
abandonded_dapp_list = get_dapp_meta_data_abandoned(driver)
dapp_list.extend(abandonded_dapp_list)

In [None]:
# call second function
    # every overview entry is merged with the detail data of its subpage
    # entries whose link could not be scraped carry the placeholder "NaN";
    # they are skipped because "NaN" is not a loadable URL and the failing
    # driver.get call would abort the whole run
res_list = []
for dapp_link in dapp_list:
    if dapp_link["d_link"] == "NaN":
        print("skipping dapp without link: " + str(dapp_link["d_name"]))
        continue

    dapp_detail_dict = get_dapp_details(driver, dapp_link["d_link"])

    res = {**dapp_link, **dapp_detail_dict}
    res_list.append(res)
    

In [None]:
# create insert data 
    # how to concat two dicts and get the values as a list
#insert_data = list({**dapp_dict_1,**dapp_dict_2}.values())

## Create the DB insert call

In [97]:
import psycopg2

# Connect to the local Postgres database.
# The connection string ends in "password=", so the password is appended to it.
password = ""  # load password from the environment or hard code it (not recommended)
connect = "host=localhost dbname=postgres user=postgres password="

conn = psycopg2.connect(f"{connect}{password}")
cur = conn.cursor()

In [98]:
# SQL insert statement for the dapp_data table.
    # The %s placeholders are filled positionally, so the insert data has to be
    # a list whose order matches the column list below.
    # A row whose d_name already exists is skipped (ON CONFLICT ... DO NOTHING).
dapp_data_insert = ("""INSERT INTO dapp_data (
                            d_name,
                            d_teaser, 
                            d_platform, 
                            d_cat,
                            d_link,
                            d_descr,
                            d_status,
                            d_author,
                            d_license,
                            d_updated,
                            d_submitted,
                            d_website,
                            d_profile_str,
                            d_push,
                            d_pull,
                            d_users_d,
                            d_users_w,
                            d_users_m,
                            d_txn_1,
                            d_txn_7,
                            d_txn_30,
                            d_eth_1,
                            d_eth_7,
                            d_eth_30,
                            d_contract_list,
                            d_review_list,
                            d_views,
                            d_clicks,
                            d_ctr,
                            d_rel_dapp_list,
                            d_social_list,
                            d_reaction_pos,
                            d_reaction_neu,
                            d_reaction_neg,
                            d_tag_list,
                            d_metamask_recom,
                            d_ponzi_warning,
                            added) \
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                            ON CONFLICT (d_name) DO NOTHING
""")

In [99]:
# Write every scraped record to the database.
    # The statement takes its parameters positionally, so the values of each
    # record are handed over as a list in dict insertion order.
for result in res_list:
    insert_values = list(result.values())
    cur.execute(dapp_data_insert, insert_values)
print("data successfully inserted")

data successfully inserted


In [100]:
# Commit the pending inserts so they are persisted, then close the connection.
# After this cell `conn` and `cur` are closed; the next section opens a new connection.
conn.commit()
conn.close()


## 3. Function that populates the dapp contract link table

In [None]:
import psycopg2

# Open a fresh connection to the local Postgres database for the link table.
# The connection string ends in "password=", so the password is appended to it.
password = ""  # load password from the environment or hard code it (not recommended)
connect = "host=localhost dbname=postgres user=postgres password="

conn = psycopg2.connect(f"{connect}{password}")
cur = conn.cursor()

In [None]:
# populate the contract link table
    # d_contract_list in dapp_data holds the string form of a Python list;
    # it is parsed back into a list and split into one row per contract address
import ast  # was used below without ever being imported


# create the sql insert command
    # an address that is already linked is skipped (ON CONFLICT ... DO NOTHING)
dapp_contract_link_insert = ("""INSERT INTO dapp_contract_link (
                            c_address,
                            d_name
                            ) \
                            VALUES (%s, %s)
                            ON CONFLICT (c_address) DO NOTHING
""")


# execute contract address query
cur.execute("""SELECT d_name, d_contract_list FROM dapp_data""")
sql_results = cur.fetchall()

# loop through all dapps
for dapp_entry in sql_results:

    # name of the dapp
    dapp_name = dapp_entry[0]
    # contract addresses of the dapp as a real list
    # (literal_eval only accepts Python literals, it does not execute code)
    contract_list = ast.literal_eval(dapp_entry[1])

    # store one (contract address, dapp name) row per address
    for contract_address in contract_list:
        insert_list = [contract_address, dapp_name]

        # insert into db
        cur.execute(dapp_contract_link_insert, insert_list)

# the inserts have to be committed before closing, otherwise closing the
# connection discards them
conn.commit()
conn.close()

print("data successfully inserted")

#### Example query to check if contract links are stored in DB

In [None]:
# Spot check: fetch all rows of the contract link table and print one of them.
# NOTE(review): the populate cell above ends with conn.close(), so a new connection and
# cursor are needed before this cell can run.
cur.execute("""SELECT * FROM dapp_contract_link""")

sql_results = cur.fetchall()
# NOTE(review): index 1000 raises IndexError if the table holds fewer than 1001 rows.
print(sql_results[1000])

## Test queries to check if the insert worked

In [None]:
# Fetch every row of dapp_data and print it, one row per line.
# NOTE(review): requires an open connection; earlier cells close `conn` after writing.
cur.execute("""SELECT * FROM dapp_data""")
sql_results = cur.fetchall()
for result in sql_results:
    print(result)

In [103]:
# check results
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [None]:
%sql SELECT d_name, d_status FROM dapp_data;

In [None]:
%sql SELECT d_name FROM dapp_data;