In [13]:
import json
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Build `df_combined`: the scraped items joined with CoinGecko's coin list and
# reduced to the rows that carry an Ethereum contract address.

REQUEST_TIMEOUT_S = 30  # seconds; requests.get waits indefinitely without a timeout
SCRAPE_DELAY_S = 3      # pause between coin-page requests
LOOKUP_COLUMNS = ['api_id', 'symbol', 'platforms']

# JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
with open('items.json', encoding='utf-8') as f:
    items = json.load(f)

df = pd.DataFrame(items)
# remove duplicates
df = df.drop_duplicates(subset=['href_id'])

coins_response = requests.get(
    "https://api.coingecko.com/api/v3/coins/list?include_platform=true",
    timeout=REQUEST_TIMEOUT_S,
)
# Fail loudly on an HTTP error instead of building a DataFrame from an error payload.
coins_response.raise_for_status()
coins = pd.DataFrame(coins_response.json())
# Rename id to api_id
coins = coins.rename(columns={'id': 'api_id'})
# Merge coins and df on id, but leave out name
df = df.merge(coins[LOOKUP_COLUMNS], left_on='href_id', right_on='api_id', how='left')

# Rows whose href_id matched no api_id have no symbol after the left merge.
mask = df['symbol'].isna()
missing_symbols = df[mask]
href_ids = missing_symbols['href_id'].tolist()
href_urls = [f"https://www.coingecko.com/en/coins/{href_id}" for href_id in href_ids]

# Scrape the API id from each unmatched coin's page.
api_ids = []
for href_id, href_url in zip(href_ids, href_urls):
    try:
        response = requests.get(href_url, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the "API id" label; its value is in the next sibling div.
        # If either lookup returns None, the attribute access below raises
        # AttributeError, which is handled as a failed scrape.
        api_id = soup.find('span', string='API id')
        api_id = api_id.find_next_sibling('div').text.strip()
        # Keep the original href_id as the merge key rather than re-deriving
        # it from the URL, so it always matches df['href_id'].
        api_ids.append([api_id, href_id])
    except (requests.RequestException, AttributeError) as exc:
        # Best effort: report which page failed and carry on with the rest.
        print(f"ERROR: {href_url}: {exc!r}")
    # Pause after every attempt, successful or not.
    time.sleep(SCRAPE_DELAY_S)

# Create a dataframe from api_ids
api_ids = pd.DataFrame(api_ids, columns=['api_id', 'href_id'])
# Merge with coins
api_ids = api_ids.merge(coins[LOOKUP_COLUMNS], on='api_id', how='left')
# Merge with df rows missing symbols. Drop the three placeholder columns by
# name so the merge cannot produce *_x / *_y suffixed duplicates and so that
# unrelated columns that happen to contain NaN are kept.
df_merged = pd.merge(df[mask].drop(columns=LOOKUP_COLUMNS), api_ids, on='href_id', how='left')
# Combine the two dataframes with a fresh 0..n-1 index
df_combined = pd.concat([df[~mask], df_merged], ignore_index=True)
# Turn platforms column into just the string of ethereum contract address.
# Rows whose scrape failed (or whose scraped id is absent from the coin list)
# hold NaN here rather than a dict; map those to None so they are dropped below.
df_combined['platforms'] = df_combined['platforms'].apply(
    lambda x: x.get('ethereum') if isinstance(x, dict) else None
)
# Drop rows without ethereum contract
df_combined = df_combined.dropna(subset=['platforms'])
df_combined = df_combined[df_combined['platforms'] != '']
# Test that every platform is an ethereum address, i.e. a string that starts with 0x and is 42 characters long
assert df_combined['platforms'].apply(lambda x: x.startswith('0x') and len(x) == 42).all(), \
    "platforms contains a value that is not a 42-character 0x-prefixed address"

# Yay! 
# Now we have a dataframe with all the coins that gained and have an Ethereum contract address


In [1]:
import scraping

ModuleNotFoundError: No module named 'scraping'

In [42]:
# Display df_test.
# NOTE(review): df_test is not assigned in any cell visible here — confirm
# where it is created before relying on this cell after a kernel restart.
df_test

Unnamed: 0,name,price,change,href_id,api_id,symbol,platforms
1,Lido DAO,$2.47,7.9%,lido-dao,lido-dao,ldo,0x5a98fcbea516cf06857215779fd812ca3bef1b32
5,Render,$1.33,5.9%,render-token,render-token,rndr,0x6de037ef9ad2725eb40118bb1702ebb27e4aeb24
7,Chainlink,$7.64,5.7%,chainlink,chainlink,link,0x514910771af9ca656af840dff83e8264ecf986ca
9,Frax Share,$8.83,4.2%,frax-share,frax-share,fxs,0x3432b6a60d23ca0dfca7761b7ab56459d9c964d0
15,ApeCoin,$4.18,3.0%,apecoin,apecoin,ape,0x4d224452801aced8b2f0aebe155379bb5d594381
18,Uniswap,$6.05,2.5%,uniswap,uniswap,uni,0x1f9840a85d5af5bf1d1762f925bdaddc4201f984
22,Shiba Inu,$0.000010837354,2.4%,shiba-inu,shiba-inu,shib,0x95ad61b0a150d79219dcf64e1e6cc01f0b64c4ce
23,Aave,$73.21,2.4%,aave,aave,aave,0x7fc66500c84a76ad7e9c93437bfc5ac33e2ddae9
24,Chiliz,$0.120544,2.4%,chiliz,chiliz,chz,0x3506424f91fd33084466f402d5d97f05f8e3b4af
25,Rocket Pool,$43.95,2.3%,rocket-pool,rocket-pool,rpl,0xd33526068d116ce69f19a9ee46f0bd304f21a51f


In [20]:
# Renumber the rows 0..n-1, discarding the old index. The assignment form
# returns a new frame instead of mutating the existing one in place.
df_combined = df_combined.reset_index(drop=True)

In [40]:
# Sanity check: every value in df_test['platforms'] must be a string that
# starts with '0x' and is exactly 42 characters long.
assert all(
    address.startswith('0x') and len(address) == 42
    for address in df_test['platforms']
)


In [41]:
# NOTE(review): `false` is not a defined name in Python (the boolean literal is
# `False`), so this cell raises NameError before the assert is evaluated.
# Looks like a leftover scratch cell — confirm and remove.
assert false

NameError: name 'false' is not defined

In [33]:
# Show the coin-list entry whose API id is 'xdce-crowd-sale'.
coins.loc[coins['api_id'].eq('xdce-crowd-sale')]

Unnamed: 0,api_id,symbol,name,platforms
10684,xdce-crowd-sale,xdc,XDC Network,{'ethereum': ''}


In [104]:
# Rows that have no symbol still need their API id looked up.
missing_symbols = df[df['symbol'].isna()]

# Scrape missing symbols from href_id
href_ids = missing_symbols['href_id'].tolist()
href_urls = [f"https://www.coingecko.com/en/coins/{href_id}" for href_id in href_ids]

# Send a request to each href_url and scrape the api_id, pausing 3 seconds
# between requests.
import time
import requests
from bs4 import BeautifulSoup

api_ids = []
for href_id, href_url in zip(href_ids, href_urls):
    try:
        # requests.get waits indefinitely without a timeout.
        response = requests.get(href_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the "API id" label; its value is in the next sibling div.
        # If either lookup returns None, the attribute access below raises
        # AttributeError, which is handled as a failed scrape.
        api_id = soup.find('span', string='API id')
        api_id = api_id.find_next_sibling('div').text.strip()
        # Keep the original href_id as the key rather than re-deriving it
        # from the URL, so it always matches df['href_id'].
        api_ids.append([api_id, href_id])
    except (requests.RequestException, AttributeError) as exc:
        # Best effort: report which page failed and carry on with the rest.
        print(f"ERROR: {href_url}: {exc!r}")
    # Pause after every attempt, successful or not.
    time.sleep(3)

In [132]:
# Create a dataframe from api_ids
api_ids = pd.DataFrame(api_ids, columns=['api_id', 'href_id'])
# Merge with coins
api_ids = api_ids.merge(coins[['api_id','symbol','platforms']], on='api_id', how='left')
# Merge with df rows missing symbols
mask = pd.isna(df['symbol'])

# Drop the three placeholder columns by name before merging. dropna(axis=1)
# would also discard any other column that happens to contain a NaN, and
# would drop nothing when no rows are selected, leaving *_x / *_y suffixed
# duplicates after the merge.
df_merged = pd.merge(
    df[mask].drop(columns=['api_id', 'symbol', 'platforms']),
    api_ids,
    on='href_id',
    how='left',
)

In [134]:
# Stack the rows selected by ~mask on top of the rows in df_merged.
df_combined = pd.concat((df.loc[~mask], df_merged))

In [142]:
# Count the missing values in each column of df_combined
df_combined.isna().sum(axis=0)

name         0
price        0
change       0
href_id      0
api_id       0
symbol       0
platforms    0
dtype: int64

In [127]:
# Display df_merged.
# NOTE(review): the recorded output has *_x / *_y suffixed columns, i.e. both
# merge inputs carried api_id/symbol/platforms when this ran — presumably from
# an earlier version of the merge cell; confirm.
df_merged

Unnamed: 0,name,price,change,href_id,api_id_x,symbol_x,platforms_x,api_id_y,symbol_y,platforms_y
0,Hedera,$0.073822899946,12.6%,hedera,,,,hedera-hashgraph,hbar,{}
1,Toncoin,$2.26,7.5%,toncoin,,,,the-open-network,ton,{'ethereum': '0x582d872a1b094fc48f5de31d3b73f2...
2,Synthetix Network,$2.51,3.5%,synthetix-network-token,,,,havven,snx,{'ethereum': '0xc011a73ee8576fb46f5e1c5751ca3b...
3,XDC Network,$0.042008191359,2.7%,xdc-network,,,,xdce-crowd-sale,xdc,{'ethereum': ''}
4,Conflux,$0.412902,2.1%,conflux,,,,conflux-token,cfx,{}
5,SXP,$0.644865,34.8%,sxp,,,,swipe,sxp,{'ethereum': '0x8ce9137d39326ad0cd6491fb5cc0cb...
6,Nexa,$0.000020625638,24.3%,nexa,,,,nexacoin,nexa,{}
7,OAX,$0.355253,23.9%,oax,,,,openanx,oax,{'ethereum': '0x701c244b988a513c945973defa05de...
8,DSLA Protocol,$0.002478343260,19.3%,dsla-protocol,,,,stacktical,dsla,{'ethereum': '0x3affcca64c2a6f4e3b6bd9c64cd2c9...
9,Horizen,$11.01,6.1%,horizen,,,,zencash,zen,{}
