Find the top 100 coins listed on https://coinmarketcap.com/

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from os import listdir
import re


# Header fields intended for the HTTP requests issued by this notebook.
# NOTE(review): the Access-Control-* entries look like CORS *response* headers;
# presumably only User-Agent matters for scraping -- confirm before pruning.
headers = dict([
    ('Access-Control-Allow-Origin', '*'),
    ('Access-Control-Allow-Methods', 'GET'),
    ('Access-Control-Allow-Headers', 'Content-Type'),
    ('Access-Control-Max-Age', '3600'),
    ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'),
])

# Scrape the coinmarketcap front page and collect the detail-page URL of
# every coin listed in its table.
url = "https://coinmarketcap.com"
# `headers` must be passed by keyword: the second positional parameter of
# requests.get is `params`, which would put the dict into the query string
# instead of sending it as HTTP headers (so the User-Agent was never sent).
req = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty coin list.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')
rows = soup.find_all("tr")
top_currencies = []
# rows[0] is skipped -- presumably the table's header row.
for row in rows[1:]:
    link = row.find("a", href=True)
    if link is None:
        # Row without any link (e.g. an ad/placeholder row): nothing to record.
        continue
    # hrefs are appended to the site root, so they are assumed to be
    # site-relative paths such as "/currencies/bitcoin/".
    top_currencies.append(url + link["href"])

In [8]:
# Build the set of coins whose whitepaper file is already on disk.
# Each file name is reduced to the text before its first ".", lower-cased,
# and stripped of any "_<digits>" fragments.
path = "../data/whitepapers/coinmarketcap/"
existing_coins = {
    re.sub("_[0-9]+", "", filename.split(".")[0].lower())
    for filename in listdir(path)
}
print(sorted(existing_coins))

['aave', 'avalanche', 'axie_infinity', 'bitcoin', 'bnb', 'cardano_ada', 'chainlink', 'cosmos', 'dai', 'decentraland', 'dogecoin', 'elrond', 'eos', 'ethereum', 'filecoin', 'flow', 'ftx_token', 'hedera', 'litecoin', 'maker', 'polkadot', 'polygon', 'quant', 'shiba_inu', 'solana', 'stellar', 'tether', 'tezos', 'the_sandbox', 'theta', 'tron', 'uniswap', 'unus_sed_leo', 'usd_coin_centre', 'vechain', 'wrapped_bitcoin', 'xrp', 'zcash']


In [9]:
# Create a dataframe of coins with the whitepaper link if available.
# One record per coin page: [coin name, ticker symbol, page URL, PDF link or ""].
# Records are collected in a list and the frame is built once at the end,
# instead of growing the DataFrame row by row.
coin_records = []
for coin in top_currencies:
    try:
        # `headers` must be passed by keyword; positionally it would be
        # interpreted as `params` (query string) and never sent as headers.
        req = requests.get(coin, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print("skipping " + coin + ": request failed (" + repr(exc) + ")")
        continue
    soup = BeautifulSoup(req.content, 'html.parser')
    symbol = soup.find("small", attrs={"class": "nameSymbol"})
    if symbol is None:
        # Page layout differs or the request was blocked; without the symbol
        # element neither the name nor the ticker can be extracted.
        print("skipping " + coin + ": no nameSymbol element found")
        continue
    # The coin name is the first child of the element that wraps the symbol;
    # it is stored as a plain str.
    name = str(symbol.parent.contents[0])
    whitepaper_tags = soup.find_all("a", attrs={"class": "link-button", "rel": "nofollow noopener"}, href=True)
    # The last matching link button is taken as the whitepaper candidate and
    # is only kept when it points directly at a PDF.
    whitepaper_link = whitepaper_tags[-1]["href"] if whitepaper_tags else ""
    if not whitepaper_link.endswith(".pdf"):
        whitepaper_link = ""
    coin_records.append([name, symbol.text, coin, whitepaper_link])

df = pd.DataFrame(coin_records, columns=["coin", "symbol", "link", "whitepaper_link"])


Unnamed: 0,coin,symbol,link,whitepaper_link
0,Bitcoin,BTC,https://coinmarketcap.com/currencies/bitcoin/,https://bitcoin.org/bitcoin.pdf
1,Ethereum,ETH,https://coinmarketcap.com/currencies/ethereum/,
2,Tether,USDT,https://coinmarketcap.com/currencies/tether/,https://tether.to/wp-content/uploads/2016/06/T...
3,USD Coin,USDC,https://coinmarketcap.com/currencies/usd-coin/,https://f.hubspotusercontent30.net/hubfs/93046...
4,BNB,BNB,https://coinmarketcap.com/currencies/bnb/,
...,...,...,...,...
95,Holo,HOT,https://coinmarketcap.com/currencies/holo/,
96,Qtum,QTUM,https://coinmarketcap.com/currencies/qtum/,https://qtum.org/user/pages/01.home/Qtum%20whi...
97,XDC Network,XDC,https://coinmarketcap.com/currencies/xinfin/,https://xinfin.org/docs/whitepaper-tech.pdf
98,Gala,GALA,https://coinmarketcap.com/currencies/gala/,


In [39]:
# Download each coin's whitepaper PDF unless it is already on disk, and record
# per row whether a whitepaper is available locally afterwards.
#   True  -> already in `existing_coins`, or downloaded successfully now
#   False -> no PDF link known, or the download/write failed
# Make sure the target directory exists; otherwise every write would fail.
os.makedirs("papers", exist_ok=True)

whitepaper_downloaded = []
for _, row in df.iterrows():
    name = row["coin"].lower().replace(" ", "_")
    if name in existing_coins:
        whitepaper_downloaded.append(True)
        continue
    if not row["whitepaper_link"]:
        whitepaper_downloaded.append(False)
        continue
    try:
        response = requests.get(row["whitepaper_link"], timeout=60)
        # Do not save an HTTP error page under a .pdf name.
        response.raise_for_status()
        with open("papers/" + str(name) + ".pdf", 'wb') as pdf:
            pdf.write(response.content)
        whitepaper_downloaded.append(True)
    except (requests.RequestException, OSError) as exc:
        # Network/HTTP problems and file-system errors are expected here;
        # anything else (including KeyboardInterrupt) should propagate.
        print(name + " download failed (" + repr(exc) + ")")
        whitepaper_downloaded.append(False)

df["whitepaper_obtained"] = whitepaper_downloaded

bitcoin  exists
ethereum  exists
tether  exists
usd_coin  downloaded
bnb  exists
xrp  exists
cardano  not available
binance_usd  not available
solana  exists
polkadot  exists
dogecoin  exists
avalanche  exists
dai  exists
polygon  exists
shiba_inu  exists
uniswap  exists
tron  exists
wrapped_bitcoin  exists
ethereum_classic  not available
unus_sed_leo  exists
litecoin  exists
near_protocol  not available
chainlink  exists
ftx_token  exists
cronos  downloaded
cosmos  exists
stellar  exists
flow  exists
monero  downloaded
bitcoin_cash  not available
algorand  not available
vechain  exists
filecoin  exists
apecoin  not available
internet_computer  downloaded
decentraland  exists
hedera  exists
tezos  exists
the_sandbox  exists
theta_network  downloaded
axie_infinity  exists
elrond  exists
aave  exists
quant  exists
eos  exists
trueusd  not available
bitcoin_sv  not available
zcash  exists
okb  not available
helium  not available
maker  exists
kucoin_token  downloaded
fantom  not available

Unnamed: 0,coin,symbol,link,whitepaper_link,whitepaper_obtained
0,Bitcoin,BTC,https://coinmarketcap.com/currencies/bitcoin/,https://bitcoin.org/bitcoin.pdf,True
1,Ethereum,ETH,https://coinmarketcap.com/currencies/ethereum/,,True
2,Tether,USDT,https://coinmarketcap.com/currencies/tether/,https://tether.to/wp-content/uploads/2016/06/T...,True
3,USD Coin,USDC,https://coinmarketcap.com/currencies/usd-coin/,https://f.hubspotusercontent30.net/hubfs/93046...,True
4,BNB,BNB,https://coinmarketcap.com/currencies/bnb/,,True
5,XRP,XRP,https://coinmarketcap.com/currencies/xrp/,https://ripple.com/files/ripple_consensus_whit...,True
6,Cardano,ADA,https://coinmarketcap.com/currencies/cardano/,,False
7,Binance USD,BUSD,https://coinmarketcap.com/currencies/binance-usd/,,False
8,Solana,SOL,https://coinmarketcap.com/currencies/solana/,https://solana.com/solana-whitepaper.pdf,True
9,Polkadot,DOT,https://coinmarketcap.com/currencies/polkadot-...,https://polkadot.network/PolkaDotPaper.pdf,True


In [1]:
#import PyPDF2
from os import listdir
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# Convert every PDF in `pdf_path` to plain text with pdfminer and write the
# result to ./text/<name>.txt. Files that cannot be converted are collected
# in `failed_conversions`.
pdf_path = "./papers"
pdfs = [f for f in listdir(pdf_path) if f.endswith(".pdf")]

failed_conversions = []

for pdf in pdfs:
    output_string = StringIO()
    # Strip only the trailing ".pdf" so names that contain other dots keep
    # their full stem (splitting on the first "." would truncate them and
    # point at a different file).
    name = pdf[:-len(".pdf")]
    try:
        with open(pdf_path + "/" + pdf, 'rb') as in_file:
            parser = PDFParser(in_file)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
        # Explicit UTF-8 so extracted non-ASCII text does not depend on the
        # platform's default encoding.
        with open("./text/" + name + ".txt", "w", encoding="utf-8") as out_file:
            out_file.write(output_string.getvalue())
        print(pdf + " finished")
    except Exception as exc:
        # Keep going on a bad PDF, but show why it failed; unlike a bare
        # `except:` this still lets KeyboardInterrupt stop the loop.
        failed_conversions.append(pdf)
        print(pdf + " pls bro no work", "(" + repr(exc) + ")")



# with open("./papers/qtum.pdf", "rb") as f:
#     try:
#         pdfreader=PyPDF2.PdfFileReader(f)
#         x=pdfreader.numPages
#         pageobj=pdfreader.getPage(1)
#         text=pageobj.extractText()
#         #print(pdfreader.numPages)
#         print(text)
#         f.close()
#     except:
#         print("wtf man cmon",pdf)

#print(pdfs)
# for pdf in sorted(pdfs):
#     with open("./papers/" + pdf, "rb") as f:
#         try:
#             pdfreader=PyPDF2.PdfFileReader(f)
#             x=pdfreader.numPages
#             pageobj=pdfreader.getPage(0)
#             text=pageobj.extractText()
#             #print(pdfreader.numPages)
#             print(text)
#             f.close()
#         except:
#             print("wtf man cmon",pdf)

qtum.pdf finished
bittorrent-new.pdf finished
stacks.pdf finished
dash.pdf finished
theta_network.pdf finished
oasis_network.pdf finished
nexo.pdf finished
THORChain-Whitepaper-May2020.pdf finished
basic_attention_token.pdf finished
fantom.pdf finished
chiliz.pdf finished
gala.pdf finished
tusd.pdf finished
klaytn.pdf finished
neo.pdf finished
usdd.pdf finished
nem.pdf finished
ethereum_classic.pdf finished
usd_coin.pdf finished
cronos.pdf pls bro no work
algorand.pdf finished
enjin_coin.pdf finished
arweave.pdf finished
mina.pdf finished
the-graph-whitepaper.pdf finished
pax_dollar.pdf finished
decred.pdf finished
huobi_token.pdf finished
convex_finance.pdf finished
curve_dao_token.pdf finished
ravencoin.pdf finished
internet_computer.pdf finished
zilliqa.pdf finished
ankr.pdf finished
holo.pdf finished
pax_gold.pdf finished
waves.pdf finished
bitcoin_gold.pdf finished
near_protocol.pdf finished
lido_dao.pdf finished
loopring.pdf finished
xdc_network.pdf finished
trust_wallet_token.pd