# Etherscan Contract Scraper

The purpose of this scraper is to download the information available about a dapp's smart contract from etherscan.io.

To do so the scraper proceeds as follows:
- first, it gets the contract address from the ethereum_data database
- it navigates to the etherscan.io subpage of the contract and retrieves all relevant data
- it downloads the verified source code (if available) from the Etherscan API
- finally, it stores all data in the contract_data table of the ethereum_data database

## 1. Load libraries

In [1]:
# import libraries 
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import os
import os.path
import time
import pandas as pd

## 2. Init Webdriver

In [1]:
# Initiating the webdriver (normally)
def init_webdriver(chromedriver_path=r"insert location of chromedriver"):
    """Start a Chrome webdriver session and maximize its window.

    arguments:
    - chromedriver_path (string): location of the chromedriver executable. The default is the
      placeholder that used to be hard-coded here; replace it or pass the real path.

    returns:
    - driver (webdriver object): the running Chrome webdriver
    """
    # imported locally so the function also works when the import cell has not been run
    from selenium import webdriver

    driver = webdriver.Chrome(chromedriver_path)
    driver.maximize_window()
    # alternative to maximizing: driver.set_window_size(2560, 1440)

    return driver

def quit_webdriver(driver):
    """End the given webdriver session by calling its quit() method.

    arguments:
    - driver (webdriver object): webdriver to shut down
    """
    driver.quit()

In [29]:
# start the browser session that is passed as `driver` to the scraping functions
driver = init_webdriver()

## 3. Write a function that collects contract info

In [2]:
# define get_contract_info function

def _first_match(driver, xpaths, read):
    """Return read(element) for the first xpath that can be read, or "notFound" if none can."""
    for xpath in xpaths:
        try:
            # locator form of find_element; "xpath" is the value of selenium's By.XPATH
            return read(driver.find_element("xpath", xpath))
        except Exception:
            # element not present under this page layout -> try the next candidate
            continue
    return "notFound"


def get_contract_info(contract_address, driver):

    """
    this function takes a contract address as argument, navigates to the etherscan subpage and returns
    four values describing the contract: nameTag, website, tokenName, contractName

    arguments:
    - contract_address (string): address of a smart contract
    - driver (webdriver object): webdriver

    returns:
    - tuple (nameTag, website, tokenName, contractName); a field that cannot be located is "notFound",
      and all four are "notAccessed" when the subpage itself cannot be opened

    Only Exception subclasses are treated as failures, so KeyboardInterrupt still stops a long-running
    scrape instead of being reported as an inaccessible page.
    """

    # import libraries
    import time

    # create query link
    link = "https://etherscan.io/address/"+contract_address+"#code"

    # try to access the contract subpage
    try:
        driver.get(link)
    except Exception:
        # if the subpage cannot be accessed, print an error message and return marker strings
        print("could not access contract subpage: "+contract_address)
        return "notAccessed", "notAccessed", "notAccessed", "notAccessed"

    def element_text(element):
        return element.text

    def element_href(element):
        return element.get_attribute('href')

    # every field is looked up under the main/div[4] layout first and the main/div[5] layout second
    nameTag = _first_match(driver, (
        "/html/body/div[1]/main/div[4]/div[1]/div[1]/div/div[1]/div/span/span",
        "/html/body/div[1]/main/div[4]/div[1]/div[1]/div/div[1]/div/span",
        "/html/body/div[1]/main/div[5]/div[1]/div[1]/div/div[1]/div/span/span",
    ), element_text)

    website = _first_match(driver, (
        "/html/body/div[1]/main/div[4]/div[1]/div[1]/div/div[1]/div/span/a",
        "/html/body/div[1]/main/div[5]/div[1]/div[1]/div/div[1]/div/span/a",
    ), element_href)

    tokenName = _first_match(driver, (
        "/html/body/div[1]/main/div[4]/div[1]/div[2]/div/div[2]/div[3]/div/div[2]/a",
        "/html/body/div[1]/main/div[5]/div[1]/div[2]/div/div[2]/div[3]/div/div[2]/a",
    ), element_text)

    # wait until the contract subpage is completely loaded
    time.sleep(1)
    contractName = _first_match(driver, (
        "/html/body/div[1]/main/div[4]/div[3]/div[2]/div/div[8]/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/span",
        "/html/body/div[1]/main/div[5]/div[3]/div[2]/div/div[8]/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/span",
    ), element_text)

    return nameTag, website, tokenName, contractName


 

## 4. Scrape the data and save it

### 4.1 Prepare the initial dataset

In [27]:
# read the semicolon-separated contract list
df_contracts = pd.read_csv(r"location of csv file", sep=";")
#contract_list = df_contracts["to_address"].to_list()

# add the bookkeeping column and the (still empty) result columns in one step
df_contracts = df_contracts.assign(
    collected="no",
    nametag="",
    website="",
    tokenName="",
    contractName="",
)

# write the extended dataframe to disk without the index
df_contracts.to_csv('2021-12 contracts_outside_sample_collected.csv', sep=";",index=False)

df_contracts

Unnamed: 0,to_address,from_count,collected,nametag,website,tokenName,contractName
0,0x7a250d5630b4cf539739df2c5dacb4c659f2488d,51159574,no,,,,
1,0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48,16349094,no,,,,
2,0x174bfa6600bf90c885c7c01c7031389ed1461ab9,9088337,no,,,,
3,0xe592427a0aece92de3edee1f18e0157c05861564,4413904,no,,,,
4,0x00000000003b3cc22af3ae1eac0440bcee416b40,4125682,no,,,,
...,...,...,...,...,...,...,...
97035,0x2431b64cdd6d7e9a630046d225ba4f01b3ac9d3b,101,no,,,,
97036,0x3e650349fd52bc9d7fcd4fb1737e269d1e667ddf,101,no,,,,
97037,0xe5f06db4f3473e7e35490f1f98017728496fe81e,101,no,,,,
97038,0x5dc3aab4da6e02088e8abeeb689b7f1a25e00002,101,no,,,,


In [163]:
# open a webdriver session for the trial loop in the next cell
driver = init_webdriver()

In [None]:
# trial run: print the scraped tuple for every contract that is still marked as not collected
for index, row in df_contracts.iterrows():
    if row["collected"] != "no":
        continue
    print(str(get_contract_info(row["to_address"], driver)))

In [49]:
df_contracts = pd.read_csv(
    r"C:\Users\DanielObermeier\OneDrive\Dokumente\PhD\02 - Forschung\00 - Projects\02 Market for Transactions\Code\Scraping\To change\2021-12 contracts_outside_sample_test.csv",
    sep=";",
)

# count starts at the smallest index label whose row is still marked "no"
count = df_contracts.index[df_contracts["collected"].eq("no")].min()
count

40

### 4.2 Loop through the dataset for the first time (can be restarted)

In [4]:
import pandas as pd
# import libraries
from datetime import datetime

# load dataframe
# only for first run
#df_contracts = pd.read_csv(r"C:\Users\DanielObermeier\OneDrive\Dokumente\PhD\02 - Forschung\00 - Projects\02 Market for Transactions\Code\Scraping\To change\2021-12 contracts_outside_sample_collected.csv", sep=";")
df_contracts = pd.read_csv(r"C:\Users\DanielObermeier\OneDrive\Dokumente\PhD\02 - Forschung\00 - Projects\02 Market for Transactions\Code\Scraping\To change\2021-12 contracts_outside_sample_test.csv", sep=";")

# result columns in the order in which get_contract_info returns its values
result_columns = ["nametag", "website", "tokenName", "contractName"]

# columns that are completely empty in the csv are read as float; cast them so that strings can be stored
for column in result_columns:
    df_contracts[column] = df_contracts[column].astype(object)

# initiate webdriver (a regular, non-headless Chrome window)
driver = init_webdriver()

# initiate count as first index that has not been collected yet.
# count is incremented before it is printed, so it is a progress counter rather than the exact row index
count = df_contracts.index[df_contracts['collected'] == "no"].min()

try:
    # loop through rows
    for index, row in df_contracts.iterrows():

        # collect data only for contracts that have not been collected yet
        if row["collected"] != "no":
            continue

        count += 1
        # call function and collect info from etherscan
        result = get_contract_info(row["to_address"], driver)

        # assign results by column name: positional indices silently hit the wrong columns
        # as soon as the csv carries an extra leading column
        df_contracts.loc[index, "collected"] = "yes"
        for column, value in zip(result_columns, result):
            df_contracts.loc[index, column] = value

        print("index: "+str(count)+" collected contract: "+ row["to_address"])

        # save the result every 100
        if count % 100 == 0:
            df_contracts.to_csv('2021-12 contracts_outside_sample_scraped.csv', sep=";", index=False)
            print("results have been saved at: "+str(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))

    print("all contract info has been collected")
finally:
    # runs after a clean finish and after an error or manual interrupt alike:
    # keep what has been scraped so far and do not leave the browser running
    df_contracts.to_csv('2021-12 contracts_outside_sample_scraped.csv', sep=";", index=False)
    print("results have been saved at: "+str(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
    # quit webdriver
    quit_webdriver(driver)


index: 97001 collected contract: 0xb35cece152bc59f95e01e6c65936e069b9f4d326
index: 97002 collected contract: 0xabd10eed378c11e02cb2fac8ba7251aed247cc22
index: 97003 collected contract: 0x8bf3873722b8b643ac9e47e3350336954615c796
index: 97004 collected contract: 0xd5af8c362d39589aeab0ba8ee5b6bbcbbc6dcd03
index: 97005 collected contract: 0x60ad94f70883e78feff2ce9aaaaaab82d2d467ff
index: 97006 collected contract: 0xd85dac80243b1c9d23bbec96c77bdb2eca05d2c9
index: 97007 collected contract: 0xbd880c025e49b903ff8ac8fdaf9aba02340177b3
index: 97008 collected contract: 0x14aa2d0eb40a5b19b67e0ed6644e6538061b4cf6
index: 97009 collected contract: 0x9880531938e409ff35cb52de9c1478efe6c69aa1
index: 97010 collected contract: 0x072a1ba9fa10dea38ad2f4043a8385fbbf108cb2
index: 97011 collected contract: 0x100b8dac74aea27e71c3ec8ed65b9507381125f5
index: 97012 collected contract: 0xb17ec70779a3817281f1b8927b24e26d08ae8997
index: 97013 collected contract: 0x9d0fc4b8e8a501eb664e2270a8581a2c98946fc7
index: 97014

### 4.3 Loop through the dataset a second time to double-check contracts that could not be accessed

In [15]:
# spot check: print the cell at row position 1, column position 3
print(df_contracts.iloc[1,3])

nes


# 5. Matching missing contracts

In [5]:
# import the dataset
import pandas as pd

# scraped contracts that still have to be matched to a dApp
df_contracts_to_match = pd.read_csv(
    r"C:\Users\DanielObermeier\OneDrive\Dokumente\PhD\02 - Forschung\00 - Projects\02 Market for Transactions\Code\Scraping\To change\2021-12 contracts_outside_sample_scraped.csv",
    sep=";",
)

# dApps that serve as matching candidates; each distinct d_name is kept once
df_dapps_to_match = pd.read_csv(
    r"C:\Users\DanielObermeier\OneDrive\Dokumente\PhD\02 - Forschung\00 - Projects\02 Market for Transactions\Code\Scraping\To change\2021-12 dapp_contract_link_sotd_defilama_to_match.csv",
    sep=";",
)
dapps_to_match_list = [*df_dapps_to_match["d_name"].unique()]


In [6]:
# import library

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def fuzzy_matching(match_string, match_list):
    """Look up the best candidates for match_string in match_list with two scorers.

    arguments:
    - match_string (string): text to match; it is lower-cased before matching
    - match_list (list): candidate names

    returns:
    - list with two process.extractOne results:
        [0] match of the whole lower-cased string with extractOne's default scorer
        [1] match of the first word before the first colon, scored with fuzz.partial_ratio
    """
    lowered = match_string.lower()

    # text before the first colon, cut at the first space (i.e. only its first word)
    first_word = lowered.split(":")[0].split(" ")[0]

    whole_string_match = process.extractOne(lowered, match_list)
    first_word_match = process.extractOne(first_word, match_list, scorer=fuzz.partial_ratio)

    # fuzz.token_sort_ratio and fuzz.token_set_ratio were tried as further scorers but are not used
    return [whole_string_match, first_word_match]


In [None]:
"""# test matching function
fuzzy_matching("Shiba Inu: SHIB Token", dapps_to_match_list)"""

In [None]:
"""# test different matching modes
match_string = "Shiba Inu: SHIB Token"
match_list = dapps_to_match_list

x = process.extractOne(match_string, match_list)
print(x)

match_string_pre = match_string.split(":")[0].split(" ")[0] # only take name before colon
y = process.extractOne(match_string_pre, match_list, scorer=fuzz.partial_ratio)
print(y)"""


In [7]:
# pre-filter df: keep only contracts that have been collected and for which a name tag was found
df_contracts_to_match = df_contracts_to_match.loc[
    (df_contracts_to_match["collected"] == "yes")
    & (df_contracts_to_match["nametag"] != "notFound")
]

# show data frame
#df_contracts_to_match

# show number of contracts to match
df_contracts_to_match.shape[0]

11702

In [9]:
import swifter
# run the fuzzy matching on every name tag; each result is a list of two (name, certainty) matches
df_contracts_to_match["match"] = df_contracts_to_match["nametag"].swifter.apply(lambda x: fuzzy_matching(x, dapps_to_match_list))

# spread the two matches over the columns match1 and match2
df_contracts_to_match[["match1", "match2"]] = pd.DataFrame(
    df_contracts_to_match["match"].tolist(),
    index=df_contracts_to_match.index,
)

# split every match into its name and its certainty score
for match_column in ("match1", "match2"):
    df_contracts_to_match[[match_column + "_name", match_column + "_certainty"]] = pd.DataFrame(
        df_contracts_to_match[match_column].tolist(),
        index=df_contracts_to_match.index,
    )
#df_contracts_to_match

Pandas Apply: 100%|██████████| 11702/11702 [26:18<00:00,  7.41it/s]


In [11]:
# export matched data (semicolon-separated, without the dataframe index)
df_contracts_to_match.to_csv('2021-12 contracts_matched_all.csv', sep=";", index=False)

# OLD To Be Changed for MfT Project

## 3. Get contract addresses from the ethereum_data database

In [30]:
def get_contract_addresses(password=None):
    """
    this function connects to the ethereum_data database, retrieves all contract addresses, and returns them in a list

    arguments:
    - password (string, optional): password of the postgres user. When omitted, no password is put into
      the connection parameters and the PostgreSQL client library falls back to its own sources
      (PGPASSWORD environment variable, password file).

    returns:
    - contract_address_list (list): all addresses found in the d_contract_list column of dapp_data

    NOTE(review): the password used to be written into this function in plain text; it should be
    treated as exposed and rotated.
    """

    # import libraries
    import ast
    import psycopg2

    # connect to db without embedding credentials in the source
    connection_options = {} if password is None else {"password": password}
    conn = psycopg2.connect("host=localhost dbname=ethereum_data user=postgres", **connection_options)

    # init return list
    contract_address_list = []

    try:
        cur = conn.cursor()
        print("has successfully connected to db")

        # execute contract address query
        cur.execute("""SELECT d_contract_list FROM dapp_data""")

        for sql_row in cur.fetchall():
            # the column holds the textual form of a python list; literal_eval only accepts literals
            contract_address_list.extend(ast.literal_eval(sql_row[0]))
    finally:
        # close the connection to the database even if the query or the parsing fails
        conn.close()

    print("has collected all "+str(len(contract_address_list)) +" contract addresses")

    return contract_address_list

In [31]:
# load all contract addresses from the database into a list
res = get_contract_addresses()

has successfully connected to db
has collected all 3855 contract addresses


In [33]:
# look at the first address as a sanity check
res[0]

'0x514910771af9ca656af840dff83e8264ecf986ca'

## 4. Get data from etherscan

### 4.1 Get data from website

variables to collect:
- contract address
- contract name
- contract creator
- token tracker
- compiler version
- source code verified 
- contract creation

In [103]:
# function to get data from website

def _element_text_or_none(driver, xpath):
    """Return the text of the element at xpath, or None when the element cannot be read."""
    try:
        # locator form of find_element; "xpath" is the value of selenium's By.XPATH
        return driver.find_element("xpath", xpath).text
    except Exception:
        return None


def get_contract_data(contract_address, driver):

    """
    this function takes a contract address as argument, navigates to the etherscan subpage and returns a dict
    containing technical contract details

    arguments:
    - contract_address (string): address of a smart contract
    - driver (webdriver object): webdriver

    returns:
    - contract_dict (dict) with the keys, in this order: c_address, c_name, c_creator, c_token_traker,
      c_compiler, c_verified, c_creation, c_source_code. A value that cannot be collected is the string "NaN".

    The API key for the source code request is read from the environment variable ETHERSCAN_API_KEY
    (empty string when unset).
    """
    # imported locally so the function does not depend on other cells having been run
    import os
    import requests

    # create url
    link = "https://etherscan.io/address/"+contract_address+"#code"

    try:
        driver.get(link)
        page_loaded = True
    except Exception:
        print("could not access webpage")
        # do not scrape below: the browser would still show whatever page was loaded before
        page_loaded = False

    def page_text(xpath):
        """Text of the element at xpath on the contract page, "NaN" if unavailable."""
        text = _element_text_or_none(driver, xpath) if page_loaded else None
        return "NaN" if text is None else text

    # init return dict; the key order matters to callers that use list(contract_dict.values())
    contract_dict = {}

    # store contract address
    contract_dict["c_address"] = contract_address

    # get contract name
    contract_dict["c_name"] = page_text('//*[@id="ContentPlaceHolder1_contractCodeDiv"]/div[2]/div[1]/div[1]/div[2]/span')

    # get contract creator
    contract_dict["c_creator"] = page_text('//*[@id="ContentPlaceHolder1_trContract"]/div/div[2]/a')

    # get token tracker
    contract_dict["c_token_traker"] = page_text('//*[@id="ContentPlaceHolder1_tr_tokeninfo"]/div/div[2]/a')

    # get compiler
    contract_dict["c_compiler"] = page_text('//*[@id="ContentPlaceHolder1_contractCodeDiv"]/div[2]/div[1]/div[2]/div[2]/span')

    # get sc verified: "1" for an exact match, "0" for any other label, "NaN" if the label is missing
    verified_label = page_text('//*[@id="ContentPlaceHolder1_contractCodeDiv"]/div[1]/div/h3/strong/span') if page_loaded else None
    if verified_label is None or verified_label == "NaN":
        contract_dict["c_verified"] = "NaN"
    elif verified_label == '(Exact Match)':
        contract_dict["c_verified"] = "1"
    else:
        contract_dict["c_verified"] = "0"

    # get contract creation
    c_creation = "NaN"
    if page_loaded:
        try:
            # navigate to creation transaction
            c_creation_link = driver.find_element("xpath", '//*[@id="ContentPlaceHolder1_trContract"]/div/div[2]/span/a').get_attribute('href')
            driver.get(c_creation_link)

            # get creation string
            c_creation = driver.find_element("xpath", '//*[@id="ContentPlaceHolder1_maintable"]/div[3]/div[2]').text
        except Exception:
            c_creation = "NaN"
    contract_dict["c_creation"] = c_creation

    # get source code from api
    try:
        ApiToken = os.environ.get("ETHERSCAN_API_KEY", "")  # load API token from environment

        url = 'https://api.etherscan.io/api?module=contract&action=getsourcecode&address='+contract_address+'&apikey='+ApiToken
        # the timeout keeps one unresponsive request from blocking the whole run
        r = requests.get(url, timeout=30)

        contract_dict["c_source_code"] = str(r.json())
    except Exception:
        contract_dict["c_source_code"] = "NaN"

    return contract_dict


### 4.2 Get data from API

In [None]:
import json
import requests

# request the source code of one contract from the etherscan API and print the decoded JSON
# NOTE(review): contract_address and ApiToken are only assigned inside get_contract_data in the visible
# cells -- both have to be defined at notebook level before this cell can run
url = 'https://api.etherscan.io/api?module=contract&action=getsourcecode&address='+contract_address+'&apikey='+ApiToken

r = requests.get(url)
source_code = r.json()
print(source_code)

## Insert into the Database

In [108]:
# NOTE(review): res has to be a dict here (presumably a result of get_contract_data); the list returned
# by get_contract_addresses has no .values() -- confirm which cell assigns it
insert_data = list(res.values())

In [111]:
import psycopg2

# connect to db
# the connection string ends in "password=" and the password is appended when connecting;
# "insert password" is a placeholder that has to be replaced before running
connect = "host=localhost dbname=ethereum_data user=postgres password="
password = "insert password"

conn = psycopg2.connect(connect+password)
cur = conn.cursor()

In [112]:
# insert query: eight placeholders, one per column, in the order listed below;
# a row whose c_address already exists is skipped (ON CONFLICT ... DO NOTHING)
contract_data_insert = ("""INSERT INTO contract_data (
                            c_address,
                            c_name, 
                            c_creator,
                            c_token_traker,
                            c_compiler,
                            c_verified,
                            c_creation,
                            c_source_code) \
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                            ON CONFLICT (c_address) DO NOTHING
""")

In [114]:
# run the insert; the values of res are passed in dict order, which has to match the column order of the query
cur.execute(contract_data_insert, list(res.values()))

In [115]:
# commit the transaction on the connection
conn.commit()

In [117]:
# close the database connection
conn.close()

In [None]:
# print every row of the contract_data table
# NOTE(review): in notebook order the connection is closed in the cell above, so conn and cur
# presumably have to be re-created before this cell can run -- confirm
cur.execute("""SELECT * FROM contract_data""")
#cur.execute(song_select, (row.song, row.artist, row.length))
sql_results = cur.fetchall()
for result in sql_results:
    print(result)