In [1]:
import datetime
import json
import os
import re
import time
from timeit import default_timer as timer
from urllib.parse import urljoin

import joblib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import Tag, NavigableString
from soup2dict import convert

from utils import decrypt_CryptoTotem
from utils import scrape_info_cryptototem

In [2]:
# set folders
# Built with os.path.join so the separator matches the host OS; on Windows the
# resulting strings are identical to the previous hard-coded '.\\...' values.
CHECKPOINT_FOLDER = os.path.join('.', 'Checkpoints')
RESULTS_FOLDER = os.path.join('.', 'Results')
CRYTOTOT_FOLDER = os.path.join(CHECKPOINT_FOLDER, 'Cryptototem')

# exist_ok=True keeps the cell idempotent without a separate existence check
for folder in (CHECKPOINT_FOLDER, RESULTS_FOLDER, CRYTOTOT_FOLDER):
    os.makedirs(folder, exist_ok=True)


## Get ICO URLs

In [6]:
# Download categories
# Builds df_cat: one row per category with its listing url and number of pages.
MAIN_PAGE='https://cryptototem.com/'
CATEGORY_PAGE='https://cryptototem.com/ico-list/'


page = requests.get(CATEGORY_PAGE)
soup = BeautifulSoup(page.content, 'html.parser')

# the 'option' entries of the "sort-elements" block carry the category name and url
tt=convert(soup.find("div", class_="sort-elements"))
option_list=tt['option']

cat_rows=[]   # collected as plain dicts; the frame is built once after the loop
start = timer()
for i, el in enumerate(option_list):

    print(f'- Downloading: {i+1} / {len(option_list)}   ', end='\r')

    url=urljoin(MAIN_PAGE, el['@data-url'])
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    total_pages=1
    page_html=soup.prettify()   # rendered once and reused for both marker checks
    if '         Next\n' in page_html and '\n         Previous' in page_html:
        tt=convert(soup)
        ss=tt['html'][0]['#text']
        # the pager text sits between "Previous 1" and "Next"
        sta_ind=ss.find('Previous 1')
        end_ind=ss[sta_ind:].find('Next')
        pages=ss[sta_ind:(sta_ind+end_ind)].replace('Previous', '').strip()
        # take the last whole number, not the last character, so that
        # categories with 10 or more pages are not truncated to one digit
        page_numbers=re.findall(r'\d+', pages)
        if page_numbers:
            total_pages=int(page_numbers[-1])
    cat_rows.append({'Category': el['#text'], 'url': url, 'pages': total_pages})
print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))
df_cat=pd.DataFrame(cat_rows, columns=['Category', 'url', 'pages'])
df_cat=df_cat.sort_values(by='pages', ascending=False).reset_index(drop=True)
display(df_cat)

- Downloading: 81 / 81   
Total elapsed time: 0:01:48


Unnamed: 0,Category,url,pages
0,Other,https://cryptototem.com/other/,8
1,Social Network & Communication,https://cryptototem.com/social-network-and-com...,7
2,Crowdfunding & Lending,https://cryptototem.com/crowdfunding-and-lending/,6
3,Exchanges & Wallets,https://cryptototem.com/exchanges-and-wallets/,5
4,Gambling & Betting,https://cryptototem.com/gambling-and-betting/,5
...,...,...,...
76,Meme,https://cryptototem.com/meme/,1
77,Metaverse,https://cryptototem.com/metaverse/,1
78,Cosmos Ecosystem,https://cryptototem.com/cosmos-ecosystem/,1
79,Near Ecosystem,https://cryptototem.com/near-ecosystem/,1


In [4]:
# apply category in search query and get ICO list
# Walks every page of every category in df_cat, collects one record per listed
# ICO, pickles the resulting df_list and reports names that map to several rows.

ico_rows=[]   # one dict per table row; the frame is built once after the loops
start=timer()
download_date=datetime.datetime.now().strftime("%d/%m/%Y")
for index, row in df_cat.iterrows():
    categ=row['Category']
    # page 1 is the category url itself; the following pages live under page/<n>/
    url_list=[row['url']]
    url_list.extend(urljoin(row['url'], f'page/{page_no}/') for page_no in range(2, row['pages']+1))

    for page_i, url in enumerate(url_list):

        print(f'- Downloading Category: {categ} ({index+1} / {len(df_cat)})  - page: {page_i+1} / {len(url_list)}   ', end='\r')

        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        table=convert(soup.find("table", class_="ico-table sortable"))['tbody'][0]['tr']

        for tt_row in table:
            link=urljoin(MAIN_PAGE, tt_row['td'][1]['a'][0]['@href'])
            # drop the text after the last '(' (when there is one) from the name cell
            name_parts=tt_row['td'][1]['#text'].split('(')
            if len(name_parts) > 1:
                name=''.join(name_parts[:-1])
            else:
                name=name_parts[0]
            name=name.strip()
            try:
                interest=tt_row['td'][3]['div'][1]['#text']
            except Exception:
                # the interest cell is optional; Exception (not a bare except)
                # so that Ctrl-C still interrupts the download
                interest=''
            ico_rows.append({'Category': categ,
                             'url': link,
                             'name': name,
                             'interest': interest})
    print('')
# NOTE: rows now get a 0..n-1 index instead of every row being labelled 0
df_list=pd.DataFrame(ico_rows, columns=['Category', 'url', 'name', 'interest'])
df_list['ListDownloadedOn']=download_date
print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

pkl_path=os.path.join(CRYTOTOT_FOLDER, 'cryptototem_url_list.pkl')
joblib.dump(df_list, pkl_path, compress=('lzma', 3))
print('\nData saved in', pkl_path)

display(df_list.head(5))
print('Total rows:', len(df_list))
print('\nTotal Categories:', df_list['Category'].nunique())
print('Total unique urls:', df_list['url'].nunique())
print('Total unique names:', df_list['name'].nunique())

# names that still appear more than once after ignoring the category column;
# filtering the counts Series directly avoids relying on the column name that
# value_counts().to_frame() produces, which differs between pandas versions
name_counts = df_list.drop(columns='Category').drop_duplicates()['name'].value_counts()
multiple_names = name_counts[name_counts > 1].index.tolist()
if len(multiple_names) > 0:
    df_multiple = df_list[df_list['name'].isin(multiple_names)].sort_values(by='name')
    n_uniq = df_multiple['name'].nunique()
    print(f'\n\n- {n_uniq} ICOs with multiple names found')
    display(df_multiple.drop(columns='ListDownloadedOn'))

    # single path variable so the printed location always matches the written file
    csv_path = os.path.join(RESULTS_FOLDER, '01b_CryptoTotem_ico_list_multiple_names.csv')
    df_multiple.to_csv(csv_path, index=False, sep=';')
    print('\nMultiple ICOs saved in ', csv_path)

- Downloading Category: Other (81 / 81)  - page: 8 / 8   
- Downloading Category: Social Network & Communication (65 / 81)  - page: 4 / 7   

KeyboardInterrupt: 

## Scrape information from each ICO URL

In [5]:
# Scrape every unique ICO url, caching each result as its own pickle so an
# interrupted run can be resumed.
URL_ROOT = "https://cryptototem.com/"
RELOAD_PKL=True   # True: reuse the per-url pickle when it already exists


df_list=joblib.load(os.path.join(CRYTOTOT_FOLDER, 'cryptototem_url_list.pkl'))
scraped_rows = []   # one single-row frame per url, concatenated once after the loop
unique_urls = df_list['url'].unique()
for index, url in enumerate(unique_urls):

    print(f'- Scraping: {str(index + 1)} / {len(unique_urls)}   last interaction: {datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")}      ', end='\r')
    # file name derived from the url path: root removed, '/' -> '_', '|' dropped
    save_path=os.path.join(CRYTOTOT_FOLDER, url.replace(URL_ROOT, '').strip("/").replace('/', '_')+'.pkl').replace('|', '')

    if not RELOAD_PKL or not os.path.exists(save_path):
        try:
            start = timer()
            add_row = scrape_info_cryptototem(url)
            add_row.insert(1, 'scrape_status', 'OK')
            add_row['PklPath']=save_path
            add_row['TotTimeSec']=datetime.timedelta(seconds=round(timer()-start)).total_seconds()

            joblib.dump(add_row, save_path, compress=('lzma', 3))
        except Exception as err:
            # failures are not pickled, so they are retried on the next run
            add_row = pd.DataFrame({'url': url, 'scrape_status': 'ERROR', 'scrape_error': err}, index=[0])

    else:
        add_row=joblib.load(save_path)

    scraped_rows.append(add_row)

# a single concat instead of one per url (which copies the whole frame each time)
df_scrape_raw = pd.concat(scraped_rows, ignore_index=True)
display(df_scrape_raw['scrape_status'].value_counts().to_frame())
print('Check "scrape_error"')
for err_col in ['website_err', 'whitepaper_err', 'overview_err', 'description_err',
                'info_err', 'milestone_err', 'team_err', 'social_err']:
    # a column is absent when no scraped row produced it (e.g. every url failed)
    if err_col in df_scrape_raw.columns:
        display(df_scrape_raw[err_col].value_counts().to_frame())

# rows loaded from a pickle carry the time of their original scrape, so this is
# the cumulative scraping time rather than the duration of this run
total_sec = df_scrape_raw['TotTimeSec'].sum() if 'TotTimeSec' in df_scrape_raw.columns else 0
print('\n\nTotal elapsed time:', str(datetime.timedelta(seconds=round(total_sec))))

# save
pkl_path=os.path.join(CRYTOTOT_FOLDER, 'cryptototem_scrape_df_raw.pkl')
joblib.dump(df_scrape_raw, pkl_path, compress=('lzma', 3))
print(f'\nData saved in {pkl_path}')

- Scraping: 5142 / 5142   last interaction: 12/04/2024 12:51:39      

Unnamed: 0,scrape_status
OK,5141
ERROR,1


Check "scrape_error"


Unnamed: 0,website_err
,5139
Missing matched typeclass instance for type: NoneType,1
Missing matched typeclass instance for type: NoneType,1


Unnamed: 0,whitepaper_err
,4585
NOT AVAILABLE,556


Unnamed: 0,overview_err
,5141


Unnamed: 0,description_err
,5141


Unnamed: 0,info_err
,5141


Unnamed: 0,milestone_err
,5139
'div',1
'div',1


Unnamed: 0,team_err
,5141


Unnamed: 0,social_err
,5086
only website,55




Total elapsed time: 3:13:49

Data saved in .\Checkpoints\cryptototem_scrape_df_raw.pkl


In [28]:
pd.set_option('display.max_columns', None)
df_scrape_raw

Unnamed: 0,url,scrape_status,last_update,tag,status,website,website_err,whitepaper_url,whitepaper_err,bounty_dummy,MVP_dummy,overview_block,overview_err,price_series_avail,description,description_err,info_block,info_err,milestone_block,milestone_err,team_block,team_err,social_info,social_err,PklPath,TotTimeSec,scrape_error
0,https://cryptototem.com/lido-ldo/,OK,"Aug 11, 2023",[],,https://lido.fi/?utm_source=CryptoTotem,,https://cryptototem.com/wp-ico/img/new project...,,0.0,1.0,Field Value 0 Project...,,1.0,"What is Lido Lido""s liquid staking allows usin...",,category label ...,,Empty DataFrame Columns: [] Index: [],,Member Name \ 0 Team ...,,Social ...,,.\Checkpoints\Cryptototem\lido-ldo.pkl,2.0,
1,https://cryptototem.com/subdao-gov/,OK,"Dec 01, 2022",[DAICO],,https://www.subdao.network/,,https://cryptototem.com/wp-ico/img/new project...,,0.0,0.0,Field Value 0 Project i...,,0.0,What is SubDAO SubDAO is a DAO infrastructure ...,,category label \ 0 D...,,Empty DataFrame Columns: [] Index: [],,Member Name Role Links 0 ...,,Social Lin...,,.\Checkpoints\Cryptototem\subdao-gov.pkl,2.0,
2,https://cryptototem.com/story-protocol/,OK,"Sep 08, 2023",[],,https://www.storyprotocol.xyz/,,,NOT AVAILABLE,0.0,0.0,Field Value 0 Project ...,,0.0,What is Story Protocol Story Protocol was foun...,,category label value...,,Empty DataFrame Columns: [] Index: [],,Member Name ...,,Social ...,,.\Checkpoints\Cryptototem\story-protocol.pkl,1.0,
3,https://cryptototem.com/solv-protocol-solv/,OK,"Dec 31, 2022",[],,https://solv.finance/?utm_source=CryptoTotem,,https://docs.solv.finance/,,0.0,1.0,Field Value 0 Project in...,,0.0,What is Solv Protocol Solv Protocol is a decen...,,category label \ 0 D...,,Empty DataFrame Columns: [] Index: [],,Member Name ...,,Social ...,,.\Checkpoints\Cryptototem\solv-protocol-solv.pkl,2.0,
4,https://cryptototem.com/unstoppable-domains/,OK,"Nov 21, 2022",[],,https://unstoppabledomains.com/?utm_source=Cry...,,https://docs.unstoppabledomains.com/,,0.0,0.0,Field Value 0 Project ...,,0.0,What is Unstoppable Domains Unstoppable Domain...,,category label value...,,Empty DataFrame Columns: [] Index: [],,Member Name ...,,Social ...,,.\Checkpoints\Cryptototem\unstoppable-domains.pkl,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5137,https://cryptototem.com/sexservice-ssio-ico/,OK,"May 04, 2018",[ICO],,https://sexservice.io&utm_medium=listing&utm_c...,,https://sexservice.io/wp/ssio-whitepaper-en-1....,,0.0,0.0,Field Value 0 Project industry...,,0.0,"What is Sexservice Wallet, Explorer and Live n...",,category label \ 0 Toke...,,date ...,,Empty DataFrame Columns: [] Index: [],,Social ...,,.\Checkpoints\Cryptototem\sexservice-ssio-ico.pkl,2.0,
5138,https://cryptototem.com/vanillaplay-vpy-ico/,OK,"May 04, 2018",[ICO],,https://www.vanillaplay.io/&utm_medium=listing...,,https://www.vanillaplay.io/whitepaper/VanillaP...,,0.0,0.0,Field Value 0 Project industry...,,0.0,What is VanillaPlay VanillaPlay eliminates the...,,category label \ 0 Toke...,,date ...,,Empty DataFrame Columns: [] Index: [],,Social ...,,.\Checkpoints\Cryptototem\vanillaplay-vpy-ico.pkl,2.0,
5139,https://cryptototem.com/okoin-okoin-ico/,OK,"May 04, 2018",[ICO],,https://okoin.io/&utm_medium=listing&utm_campa...,,https://cryptototem.com/wp-ico/img/files/7gr4n...,,0.0,0.0,Field Value 0 Project...,,0.0,What is OKOIN Decentralized p2p distribution o...,,category label \ 0 ...,,date ...,,Empty DataFrame Columns: [] Index: [],,Social Link...,,.\Checkpoints\Cryptototem\okoin-okoin-ico.pkl,2.0,
5140,https://cryptototem.com/erotix--erx-ico/,OK,"May 04, 2018",[ICO],,https://erotix.io/&utm_medium=listing&utm_camp...,,https://cryptototem.com/wp-ico/img/files/AyaeT...,,0.0,0.0,Field Value 0 Project indus...,,0.0,What is Erotix The main application of the pla...,,category label \ 0 ...,,date ...,,Empty DataFrame Columns: [] Index: [],,Social Lin...,,.\Checkpoints\Cryptototem\erotix--erx-ico.pkl,2.0,


In [6]:
df_scrape_raw['milestone_err'].value_counts()

    60
Name: milestone_err, dtype: int64

In [8]:
# Scrape a single ICO page and display the returned frame for inspection.
# NOTE(review): the stored output below lists https://cryptototem.com/sheng-ieo/
# rather than this url, so it looks stale — re-run the cell to confirm.
url='https://cryptototem.com/breederdao-breed/'
pd.set_option('display.max_columns', None)   # show every column of the wide frame
aa=scrape_info_cryptototem(url)
aa

Unnamed: 0,url,last_update,tag,status,website,website_err,whitepaper_url,whitepaper_err,bounty_dummy,MVP_dummy,overview_block,overview_err,price_series_avail,description,description_err,info_block,info_err,milestone_block,milestone_err,team_block,team_err,social_info,social_err
0,https://cryptototem.com/sheng-ieo/,"Jun 14, 2020",[IEO],,https://www.sheng.asia/,,https://cryptototem.com/wp-ico/img/new project...,,0,0,Field Value 0 Project indu...,,0,What is SHENG Shengworld is a Business-to-Busi...,,category label ...,,date ...,,Member Name \ 0 Team ...,,Social ...,


In [10]:
aa['milestone_block'][0]

Unnamed: 0,date,milestone
0,Map_June 2019,Customer journey mapping
1,MDP_September 2019,Demo Solutions
2,MVP_May 2020,e-Wallet
3,Admin Interface_August 2020,Web-based Interface
4,MVP_October 2020,E-store setup on web


In [60]:
url='https://cryptototem.com/infinidium-ico/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [61]:
# Scratch run of the social-links parsing on the current `soup`.
# Both result variables are initialised here so the cell does not depend on
# values left in the kernel by an earlier run.
social_err=''
social_info=pd.DataFrame()
try:
    tt=convert(soup.find("div", class_="soc-urls"))
    if len(tt) > 0:
        # a missing 'a' key raises KeyError, which ends up in social_err
        for el in tt['a']:
            try:
                social_name=el['@title'].split(':')[0]
            except (KeyError, TypeError, AttributeError):
                social_name=''
            try:
                social_url=el['@href']
            except (KeyError, TypeError):
                social_url=''
            social_info=pd.concat([social_info, pd.DataFrame({'Social': social_name, 'Link': social_url}, index=[0])])
except Exception as err:
    social_err=err

In [62]:
social_err

KeyError('a')

In [59]:
tt=convert(soup.find("div", class_="soc-urls"))
tt

{'@class': ['soc-urls'],
 '#text': '',
 'span': [{'@class': ['decr'],
   '@data-u': '{"ct":"OwmH3tDsEpgeX+N58iGvHzrrFaNBRwxU4eBBj76T83tESIq9wQ\\/Pu\\/tvaabFh+P6w1QbMxoEVyoYyL\\/rzkzbdA==","iv":"9e6e5bd93e83cc190a26c3a828fc764c","s":"fa144f18a47ebd28"}',
   '#text': '',
   'img': [{'@src': '/wp-content/themes/ultimate-conversion/assets/img/logos/website.svg',
     '@alt': 'SHENG web-site',
     '@width': '38',
     '@height': '38',
     '#text': ''}]}],
 'a': [{'@href': 'https://www.reddit.com/r/Sheng_Global/',
   '@title': 'Reddit: SHENG',
   '@rel': ['noopener', 'noreferrer', 'nofollow'],
   '@onclick': "this.target='_blank'",
   '#text': '',
   'img': [{'@src': '/wp-content/themes/ultimate-conversion/assets/img/loading-arrow.svg',
     '@data-src': '/wp-content/themes/ultimate-conversion/assets/img/logos/reddit.svg',
     '@alt': 'SHENG Reddit',
     '@class': ['lazyload'],
     '@width': '38',
     '@height': '38',
     '#text': ''}]},
  {'@href': 'https://medium.com/@sheng_world',


In [43]:
# Scratch run of the milestone parsing on the current `soup`: one row per
# entry of the "milestones" box, with its date and first milestone text.
milestone_err=''
milestone_rows=[]   # collected as dicts; the frame is built once at the end
try:
    tt=soup.find("div", class_="milestones")
    if tt:
        tt=convert(tt.find("div", class_="box"))
        for i, el in enumerate(tt['div']):
            try:
                bubble=el['div'][1]
                # first key that is not '@class' / '#text' / 'div'; taken in dict
                # order so the choice is reproducible (a set has no stable order)
                ref=next(key for key in bubble if key not in ('@class', '#text', 'div'))  # can be 'p' or 'li' or 'ul'
                milestone=bubble[ref][0]['#text']
            except Exception:
                milestone=''
            try:
                date=el['div'][1]['div'][1]['#text']
            except Exception:
                date=''
            milestone_rows.append({'date': date, 'milestone': milestone})
except Exception as err:
    milestone_err=err
# built outside the try so rows parsed before a failure are kept
milestone_block=pd.DataFrame(milestone_rows)
milestone_block

Unnamed: 0,date,milestone
0,Q4 2020,Founded Locklet Lite Paper


In [47]:
# Scratch: extract the milestone text for `el`, which must already exist in the
# kernel (this cell does not define it).
bubble=el['div'][1]
# first key that is not '@class' / '#text' / 'div', taken in dict order so the
# choice is reproducible (a set has no stable iteration order)
ref=next(key for key in bubble if key not in ('@class', '#text', 'div'))  # can be 'p' or 'li' or 'ul'
first_item=bubble[ref][0]
if '#text' in first_item:
    milestone=first_item['#text']
else:
    # no text on the item itself: descend one level using the same key rule
    ref2=next(key for key in first_item if key not in ('@class', '#text', 'div'))
    milestone=first_item[ref2][0]['#text']

In [54]:
el['div'][1]['ul']#['div'][1]['#text']

[{'#text': 'Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding',
  'div': [{'@class': ['condition'],
    '#text': 'Q1 2021',
    'navigablestring': ['Q1 2021']}],
  'li': [{'#text': 'Locklet Testnet for ERC20 and BEP20 tokens',
    'navigablestring': ['Locklet Testnet for ERC20 and BEP20 tokens']},
   {'#text': 'Advisors Onboarding',
    'navigablestring': ['Advisors Onboarding']}]}]

In [50]:
ref

'ul'

In [49]:
el

{'@class': ['row'],
 '#text': '2 Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding',
 'div': [{'@class': ['number'], '#text': '2', 'navigablestring': ['2']},
  {'@class': ['bubble'],
   '#text': 'Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding',
   'div': [{'@class': ['arrow'], '#text': ''}],
   'ul': [{'#text': 'Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding',
     'div': [{'@class': ['condition'],
       '#text': 'Q1 2021',
       'navigablestring': ['Q1 2021']}],
     'li': [{'#text': 'Locklet Testnet for ERC20 and BEP20 tokens',
       'navigablestring': ['Locklet Testnet for ERC20 and BEP20 tokens']},
      {'#text': 'Advisors Onboarding',
       'navigablestring': ['Advisors Onboarding']}]}]}]}

In [48]:
milestone

'Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding'

In [42]:
ref2=list(set(el['div'][1][ref][0].keys()) - set(['@class', '#text', 'div']))[0]
ref2

'li'

In [46]:
el['div'][1][ref][0][ref2]

[{'#text': 'Locklet Testnet for ERC20 and BEP20 tokens',
  'navigablestring': ['Locklet Testnet for ERC20 and BEP20 tokens']},
 {'#text': 'Advisors Onboarding', 'navigablestring': ['Advisors Onboarding']}]

In [41]:
el['div'][1]['ul'][0]

{'#text': 'Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding',
 'div': [{'@class': ['condition'],
   '#text': 'Q1 2021',
   'navigablestring': ['Q1 2021']}],
 'li': [{'#text': 'Locklet Testnet for ERC20 and BEP20 tokens',
   'navigablestring': ['Locklet Testnet for ERC20 and BEP20 tokens']},
  {'#text': 'Advisors Onboarding',
   'navigablestring': ['Advisors Onboarding']}]}

In [37]:
el

{'@class': ['row'],
 '#text': '2 Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding',
 'div': [{'@class': ['number'], '#text': '2', 'navigablestring': ['2']},
  {'@class': ['bubble'],
   '#text': 'Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding',
   'div': [{'@class': ['arrow'], '#text': ''}],
   'ul': [{'#text': 'Q1 2021 Locklet Testnet for ERC20 and BEP20 tokens Advisors Onboarding',
     'div': [{'@class': ['condition'],
       '#text': 'Q1 2021',
       'navigablestring': ['Q1 2021']}],
     'li': [{'#text': 'Locklet Testnet for ERC20 and BEP20 tokens',
       'navigablestring': ['Locklet Testnet for ERC20 and BEP20 tokens']},
      {'#text': 'Advisors Onboarding',
       'navigablestring': ['Advisors Onboarding']}]}]}]}

In [36]:
milestone_err

IndexError('list index out of range')

In [15]:
el['div'][1]['div'][1]['#text']

'Map_June 2019'

In [26]:
list(set(el['div'][1].keys()) - set(['@class', '#text', 'div']))[0]

'li'

In [23]:
el['div'][1]#['li'][0]['#text']

{'@class': ['bubble'],
 '#text': 'Map_June 2019 Customer journey mapping Wireframes mapping',
 'div': [{'@class': ['arrow'], '#text': ''},
  {'@class': ['condition'],
   '#text': 'Map_June 2019',
   'navigablestring': ['Map_June 2019']}],
 'li': [{'#text': 'Customer journey mapping',
   'navigablestring': ['Customer journey mapping']},
  {'#text': 'Wireframes mapping', 'navigablestring': ['Wireframes mapping']}]}

In [18]:
el['div'][1]#['p'][0]['#text']

{'@class': ['bubble'],
 '#text': 'Map_June 2019 Customer journey mapping Wireframes mapping',
 'div': [{'@class': ['arrow'], '#text': ''},
  {'@class': ['condition'],
   '#text': 'Map_June 2019',
   'navigablestring': ['Map_June 2019']}],
 'li': [{'#text': 'Customer journey mapping',
   'navigablestring': ['Customer journey mapping']},
  {'#text': 'Wireframes mapping', 'navigablestring': ['Wireframes mapping']}]}

In [14]:
el

{'@class': ['row'],
 '#text': '1 Map_June 2019 Customer journey mapping Wireframes mapping',
 'div': [{'@class': ['number'], '#text': '1', 'navigablestring': ['1']},
  {'@class': ['bubble'],
   '#text': 'Map_June 2019 Customer journey mapping Wireframes mapping',
   'div': [{'@class': ['arrow'], '#text': ''},
    {'@class': ['condition'],
     '#text': 'Map_June 2019',
     'navigablestring': ['Map_June 2019']}],
   'li': [{'#text': 'Customer journey mapping',
     'navigablestring': ['Customer journey mapping']},
    {'#text': 'Wireframes mapping',
     'navigablestring': ['Wireframes mapping']}]}]}

In [12]:
tt

{'@class': ['box'],
 '#text': '1 Map_June 2019 Customer journey mapping Wireframes mapping 2 MDP_September 2019 Demo Solutions Wireframes 3 MVP_May 2020 e-Wallet Internal exchange Internal exchange Integrated merchant platform Merchant admin portal Marketplace merchant admin portal 4 Admin Interface_August 2020 Web-based Interface Authentication & authorisation and enhance security protocol Web-based integrated merchant platform Web-based merchant admin platform 5 MVP_October 2020 E-store setup on web Product ratings and review Creation of categories Optimisation module Social management portal',
 'div': [{'@class': ['row'],
   '#text': '1 Map_June 2019 Customer journey mapping Wireframes mapping',
   'div': [{'@class': ['number'], '#text': '1', 'navigablestring': ['1']},
    {'@class': ['bubble'],
     '#text': 'Map_June 2019 Customer journey mapping Wireframes mapping',
     'div': [{'@class': ['arrow'], '#text': ''},
      {'@class': ['condition'],
       '#text': 'Map_June 2019',
 

In [6]:
pd.set_option('display.max_columns', None)
aa1

Unnamed: 0,url,last_update,tag,status,website,website_err,whitepaper_url,whitepaper_err,bounty_dummy,MVP_dummy,overview_block,overview_err,price_series_avail,description,description_err,info_block,info_err,team_block,team_err,social_info,social_err
0,https://cryptototem.com/lido-ldo/,"Aug 11, 2023",[],,https://lido.fi/?utm_source=CryptoTotem,,https://cryptototem.com/wp-ico/img/new project...,,0,1,Field Value 0 Project...,,1,"What is Lido Lido""s liquid staking allows usin...",,Empty DataFrame Columns: [] Index: [],name 'Tag' is not defined,Member Name \ 0 Team ...,,Social ...,


In [22]:
df_scrape_raw

Unnamed: 0,url,scrape_status,last_update,tag,status,website,website_err,whitepaper_url,whitepaper_err,bounty_dummy,MVP_dummy,overview_block,overview_err,price_series_avail,description,description_err,info_block,info_err,team_block,team_err,social_info,social_err,PklPath,TotTimeSec,scrape_error
0,https://cryptototem.com/lido-ldo/,OK,"Aug 11, 2023",[],,https://lido.fi/?utm_source=CryptoTotem,,https://cryptototem.com/wp-ico/img/new project...,,0.0,1.0,Field Value 0 Project...,,1.0,"What is Lido Lido""s liquid staking allows usin...",,category label ...,,Member Name \ 0 Team ...,,Social ...,,.\Checkpoints\Cryptototem\lido-ldo.pkl,2.0,
1,https://cryptototem.com/subdao-gov/,OK,"Dec 01, 2022",[DAICO],,https://www.subdao.network/,,https://cryptototem.com/wp-ico/img/new project...,,0.0,0.0,Field Value 0 Project i...,,0.0,What is SubDAO SubDAO is a DAO infrastructure ...,,category label \ 0 D...,,Member Name Role Links 0 ...,,Social Lin...,,.\Checkpoints\Cryptototem\subdao-gov.pkl,2.0,
2,https://cryptototem.com/story-protocol/,OK,"Sep 08, 2023",[],,https://www.storyprotocol.xyz/,,,NON AVAILABLE,0.0,0.0,Field Value 0 Project ...,,0.0,What is Story Protocol Story Protocol was foun...,,category label value...,,Member Name ...,,Social ...,,.\Checkpoints\Cryptototem\story-protocol.pkl,2.0,
3,https://cryptototem.com/solv-protocol-solv/,OK,"Dec 31, 2022",[],,https://solv.finance/?utm_source=CryptoTotem,,https://docs.solv.finance/,,0.0,1.0,Field Value 0 Project in...,,0.0,What is Solv Protocol Solv Protocol is a decen...,,category label \ 0 D...,,Member Name ...,,Social ...,,.\Checkpoints\Cryptototem\solv-protocol-solv.pkl,2.0,
4,https://cryptototem.com/unstoppable-domains/,OK,"Nov 21, 2022",[],,https://unstoppabledomains.com/?utm_source=Cry...,,https://docs.unstoppabledomains.com/,,0.0,0.0,Field Value 0 Project ...,,0.0,What is Unstoppable Domains Unstoppable Domain...,,category label value...,,Member Name ...,,Social ...,,.\Checkpoints\Cryptototem\unstoppable-domains.pkl,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5137,https://cryptototem.com/sexservice-ssio-ico/,OK,"May 04, 2018",[ICO],,https://sexservice.io&utm_medium=listing&utm_c...,,https://sexservice.io/wp/ssio-whitepaper-en-1....,,0.0,0.0,Field Value 0 Project industry...,,0.0,"What is Sexservice Wallet, Explorer and Live n...",,category label \ 0 Toke...,,Empty DataFrame Columns: [] Index: [],,Social ...,,.\Checkpoints\Cryptototem\sexservice-ssio-ico.pkl,2.0,
5138,https://cryptototem.com/vanillaplay-vpy-ico/,OK,"May 04, 2018",[ICO],,https://www.vanillaplay.io/&utm_medium=listing...,,https://www.vanillaplay.io/whitepaper/VanillaP...,,0.0,0.0,Field Value 0 Project industry...,,0.0,What is VanillaPlay VanillaPlay eliminates the...,,category label \ 0 Toke...,,Empty DataFrame Columns: [] Index: [],,Social ...,,.\Checkpoints\Cryptototem\vanillaplay-vpy-ico.pkl,2.0,
5139,https://cryptototem.com/okoin-okoin-ico/,OK,"May 04, 2018",[ICO],,https://okoin.io/&utm_medium=listing&utm_campa...,,https://cryptototem.com/wp-ico/img/files/7gr4n...,,0.0,0.0,Field Value 0 Project...,,0.0,What is OKOIN Decentralized p2p distribution o...,,category label \ 0 ...,,Empty DataFrame Columns: [] Index: [],,Social Link...,,.\Checkpoints\Cryptototem\okoin-okoin-ico.pkl,2.0,
5140,https://cryptototem.com/erotix--erx-ico/,OK,"May 04, 2018",[ICO],,https://erotix.io/&utm_medium=listing&utm_camp...,,https://cryptototem.com/wp-ico/img/files/AyaeT...,,0.0,0.0,Field Value 0 Project indus...,,0.0,What is Erotix The main application of the pla...,,category label \ 0 ...,,Empty DataFrame Columns: [] Index: [],,Social Lin...,,.\Checkpoints\Cryptototem\erotix--erx-ico.pkl,1.0,


In [72]:
url='https://cryptototem.com/lido-ldo/'#'https://cryptototem.com/maidaan-mnd/'# 'https://cryptototem.com/aethir/' #'https://cryptototem.com/corestarter-cstr/'# 'https://cryptototem.com/anote-music/'# 'https://cryptototem.com/coin98-c98/'# 'https://cryptototem.com/bitwings-bwn/'# 'https://cryptototem.com/lido-ldo/'

def scrape_info_cryptototem(url):
    """Scrape one CryptoTotem ICO page into a single-row DataFrame.

    Each section of the page is parsed independently; when a section fails, the
    exception is stored in its ``*_err`` column and the remaining sections are
    still parsed.

    NOTE(review): this definition shadows the ``scrape_info_cryptototem``
    imported from ``utils`` at the top of the notebook, and it does not produce
    the ``milestone_block`` / ``milestone_err`` columns visible in the cached
    results shown earlier — confirm which version is meant to be canonical.

    Parameters
    ----------
    url : str
        Address of the ICO page to download.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns ``url``, ``last_update``, ``tag``,
        ``status``, ``website``, ``whitepaper_url``, ``bounty_dummy``,
        ``MVP_dummy``, ``overview_block``, ``price_series_avail``,
        ``description``, ``info_block``, ``team_block``, ``social_info`` plus
        one ``*_err`` column per parsed section. The ``*_block`` and
        ``social_info`` cells each hold a nested DataFrame.

    Raises
    ------
    Exception
        The page download and the lookup of the ``ico-title`` heading are not
        wrapped in a ``try``; whatever they raise propagates to the caller.
    """

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')


    # get website
    website=''
    website_err=''
    try:
        tt=convert(soup.find("td", class_="thumbs-holder").find('span', class_='decr'))
        if tt['#text'] == 'Website':
            # strip the tracking parameter whatever its capitalisation (cached
            # results contain both 'cryptototem' and 'CryptoTotem')
            website=re.sub(r'\?utm_source=cryptototem', '', _decrypt_data_u(tt['@data-u']),
                           flags=re.IGNORECASE)
    except Exception as err:
        website_err=err

    # get whitepaper
    whitepaper_url=''
    whitepaper_err=''
    if soup.find("td", string="Whitepaper") is not None:
        try:
            tt=convert(soup.find("td", string="Whitepaper").find_next_sibling("td").span)
            if tt['#text'] == 'Open':
                whitepaper_url=_decrypt_data_u(tt['@data-u'])
        except Exception as err:
            whitepaper_err=err

    else:
        # same marker as in the latest cached results ('NOT', not 'NON')
        whitepaper_err='NOT AVAILABLE'

    # get Bounty program
    bounty_dummy=0
    if soup.find("div", string="Bounty program") is not None:
        bounty_dummy=1

    # get MVP
    MVP_dummy=0
    if soup.find("h2", string="MVP") is not None:
        MVP_dummy=1

    # get Overview
    overview_err=''
    try:
        tt=convert(soup.find("td", class_="overview-td"))['table'][0]['tr']
        overview_block=pd.DataFrame()
        for el in tt:
            # every table row is expected to hold exactly two cells (field, value)
            add_row=[x['#text'] for x in el['td']]
            overview_block=pd.concat([overview_block, pd.DataFrame(add_row, index=['Field', 'Value']).T])
    except Exception as err:
        overview_err=err
        overview_block=pd.DataFrame()

    # check if price chart is available (in case, use coingecko API  https://rapidapi.com/collection/coinmarketcap-api)
    price_avail=int(soup.find("div", class_="chart-holder") is not None)

    # get status and tags (ICO, IEO, ...)
    # not wrapped in try: a page without the title heading raises to the caller
    tt=convert(soup.find("h1", class_="ico-title").parent)['div']
    try:
        last_update=tt[0]['#text'].split('Last updated:')[-1].strip()
    except Exception:
        last_update=None
    try:
        tag=[x['#text'] for x in tt[0]['div']]
    except Exception:
        tag=None
    try:
        status=[x['div'][0]['@class'][-1] for x in tt[2]['div']]
    except Exception:
        status=None

    # get description
    description=''
    description_err=''
    try:
        for el in soup.find_all("div", class_="align-left"):
            if el.find('h2') is not None:
                if "What is" in str(el.find('h2')):
                    description=convert(el)['#text']
    except Exception as err:
        description_err=err

    # get Info
    # Walks the token-info table as a small state machine: <h2> sets the
    # category, <strong> sets the label, and the following text node (or an
    # <address> tag) supplies the value and triggers a new output row.
    info_err=''
    info_block=pd.DataFrame()
    try:
        for child in soup.find("table", class_="ico-main-table token-info-table").children:
            for td_column in child:
                wait_for_value=True
                for td in td_column:
                    if isinstance(td, Tag):
                        tag_type=td.prettify().split('\n')[0]
                        if tag_type == '<h2>':
                            main_category=td.text
                        elif tag_type == '<strong>':
                            label=td.text
                            wait_for_value=True
                        elif tag_type == '<address>':
                            label='Office address'
                            value=td.text.split('Office address: ')[-1]
                            wait_for_value=False
                        elif tag_type == '<br/>':
                            continue
                    else:   # NavigableString
                        value=td.text.strip()
                        wait_for_value=False
                    if not wait_for_value:
                        info_block=pd.concat([info_block, pd.DataFrame({'category': main_category, 'label': label,
                                                                        'value': value}, index=[0])])
    except Exception as err:
        info_err=err

    # get Team and Advisors
    team_err=''
    team_block=pd.DataFrame()
    try:
        for label, cls  in zip(['Team', 'Advisor'], ['team-members align-center', 'advisors align-center']):
            tt=soup.find("div", class_=cls)
            if tt is not None:
                tt=convert(tt)
                for el in tt['div']:
                    try:
                        pers_name=el['div'][0]['#text']
                    except Exception:
                        pers_name=None
                    try:
                        pers_role = el['div'][1]['#text']
                    except Exception:
                        pers_role=None
                    # fresh list per member: without this the first member's
                    # links were lost and later members shared one growing list
                    pers_url=[]
                    try:
                        for el1 in el['span']:
                            pers_url.append(_decrypt_data_u(el1['@data-u']).replace('\\r', ''))
                    except Exception:
                        # no links or an undecodable one: keep an empty list
                        pers_url=[]
                    team_block=pd.concat([team_block, pd.DataFrame({'Member': label, 'Name': pers_name, 'Role': pers_role,
                                                                    'Links': [pers_url]}, index=[0])])
    except Exception as err:
        team_err=err

    # social info
    social_err=''
    social_info=pd.DataFrame()
    try:
        tt=convert(soup.find("div", class_="soc-urls"))
        for el in tt['a']:
            try:
                social_name=el['@title'].split(':')[0]
            except Exception:
                social_name=''
            try:
                social_url=el['@href']
            except Exception:
                social_url=''
            social_info=pd.concat([social_info, pd.DataFrame({'Social': social_name, 'Link': social_url}, index=[0])])
    except Exception as err:
        social_err=err


    # list-valued and frame-valued cells are wrapped in a list so that each
    # one is stored as a single cell of the one-row frame
    add_row=pd.DataFrame({'url': url,
                          'last_update': last_update,
                          'tag': [tag],
                          'status': [status],
                          'website': website,
                          'website_err': website_err,
                          'whitepaper_url': whitepaper_url,
                          'whitepaper_err': whitepaper_err,
                          'bounty_dummy': bounty_dummy,
                          'MVP_dummy': MVP_dummy,
                          'overview_block': [overview_block],
                          'overview_err': overview_err,
                          'price_series_avail': price_avail,
                          'description': description,
                          'description_err': description_err,
                          'info_block': [info_block],
                          'info_err': info_err,
                          'team_block': [team_block],
                          'team_err': team_err,
                          'social_info': [social_info],
                          'social_err': social_err}, index=[0])

    return add_row


def _decrypt_data_u(crypted):
    """Parse the JSON payload of a ``data-u`` attribute and decrypt it."""
    # json.loads replaces exec('dd='+crypted): the payload comes from a remote
    # page, and exec() inside a function cannot bind a local name in Python 3,
    # so `dd` resolved to a stale global or raised NameError.
    return decrypt_CryptoTotem(json.loads(crypted))

In [270]:
# Peek at the nested table kept in the 'InfoBlock' column for the first ICO
# (row position 0). The saved output lists BlockName / Item / ItemUrl / ItemValue.
df_final['InfoBlock'].iloc[0]

Unnamed: 0,BlockName,Item,ItemUrl,ItemValue
0,General,Website,http://factom.org?utm_source=icomarks,Visit
1,General,White paper,https://www.factom.com/devs/docs/guide/factom-...,Read
2,General,ICO Time,,30 Mar 2015 - 14 May 2015
3,General,Country,,USA
4,Token info,Ticker,,Factoid
5,Token info,Platform,,Blockchain
6,Token info,Total supply,,0.00 Factoid
7,Financial,Raised,,"$ 538,000"
8,Financial,ICO Price,,1 Factoid = 0.00588 BTC
9,Financial,Accepting,,BTC


In [271]:
# Peek at the nested table kept in the 'TeamBlock' column for the first ICO
# (row position 0). The saved output lists Member / Name / Role / Extra / Links.
df_final['TeamBlock'].iloc[0]

Unnamed: 0,Member,Name,Role,Extra,Links
0,Team,Peter Kirby,President,Takes part in this project only,[https://www.linkedin.com/in/peter-kirby-50034a6]
1,Team,Paul Snow,CEO,Takes part in this project only,[https://www.linkedin.com/in/paulsn]
2,Team,David Johnston,Chairman of the Board,Takes part in this project only,[https://www.linkedin.com/in/davidajohnston]
3,Team,Brian Deery,Chief Scientist,Participates in a number of projects,
4,Team,Zachary Lynde,CFO,Participates in a number of projects,
5,Team,Andrew Yashchuk,VP of Product Development,Takes part in this project only,[https://www.linkedin.com/in/andrewyashchuk]
6,Team,Abhi Dobhal,VP of Business Development,Takes part in this project only,[https://www.linkedin.com/in/abhidobhal]
7,Team,Zeen Zhang,China General Manager,Takes part in this project only,[https://www.linkedin.com/in/zeen-zhang-3a41a619]
8,Team,Mahesh Paolini-Subramanya,EVP of Research and Development,Takes part in this project only,[https://www.linkedin.com/in/dieswaytoofast]
9,Team,Laurie Pyle,Executive Vice President,Takes part in this project only,[https://www.linkedin.com/in/lauriepyle/]


In [275]:
# Peek at the 'SocialBlock' entry of the second ICO (row position 1).
# The saved output shows a list holding a dict with 'stats' and 'timeseries' entries.
df_final['SocialBlock'].iloc[1]

[{'stats':      Social  Users     Rating
  0   Twitter   7273  Very High
  1  Facebook   1009     Medium,
  'timeseries': {'Facebook':                          Date  Users
   0    2018-05-17T07:00:00.000Z    893
   1    2018-05-18T07:00:00.000Z    893
   2    2018-05-19T07:00:00.000Z    893
   3    2018-05-20T07:00:00.000Z    894
   4    2018-05-21T07:00:00.000Z    894
   ..                        ...    ...
   380  2019-07-06T07:00:00.000Z   1011
   381  2019-07-23T07:00:00.000Z   1010
   382  2019-08-05T07:00:00.000Z   1009
   383  2019-08-26T07:00:00.000Z   1009
   384  2019-09-04T07:00:00.000Z   1009
   
   [385 rows x 2 columns],
   'Twitter':                           Date  Users
   0     2018-05-17T07:00:00.000Z   6108
   1     2018-05-18T07:00:00.000Z   6108
   2     2018-05-19T07:00:00.000Z   6106
   3     2018-05-20T07:00:00.000Z   6097
   4     2018-05-21T07:00:00.000Z   6102
   ...                        ...    ...
   1104  2022-01-28T08:00:00.000Z   7240
   1105  2022-01-2

In [269]:
# List every column label of df_final as a NumPy array.
df_final.columns.to_numpy()

array(['url', 'ListDownloadedOn', 'NViews', 'VerifiedEmailDummy',
       'IsSTODummy', 'IsIEODummy', 'Status', 'StartDate', 'EndDate',
       'LogDurationDays', 'RatingIcomarks', 'TeamSize', 'AdvisorSize',
       'BountyDummy', 'MVPDummy', 'CountryOriginal', 'Country', 'Region',
       'SubRegion', 'Dates', 'ICODateStart', 'ICODateEnd', 'IEODateStart',
       'IEODateEnd', 'IEOLaunchpad', 'STODateStart', 'STODateEnd',
       'PreSaleDummy', 'PreSaleDateStart', 'PreSaleDateEnd',
       'WebsiteDummy', 'WebsiteUrl', 'WhitepaperDummy', 'WhitepaperUrl',
       'WhitepaperDownloaded', 'KYCDummy', 'WhitelistDummy', 'Ticker',
       'Platform', 'TokenType', 'ERC20Dummy', 'TokenAvailForSale',
       'TokenTotSupply', 'FundRaisedUSD', 'FundHardCap', 'FundHardCapUSD',
       'FundSoftCap', 'FundSoftCapUSD', 'CapUSD', 'SuccessPerc',
       'AcceptedCurr', 'BonusDummy', 'ICOPrice', 'IEOPrice', 'STOPrice',
       'PriceUSD', 'PreSalePrice', 'PreSalePriceUSD',
       'SocialSeriesDownloaded', 'Price

In [267]:
# Load the final dataset checkpoint from CHECKPOINT_FOLDER and display it.
# joblib.load unpickles the file, so only point it at checkpoints you trust.
# NOTE(review): the inspection cells above read `df_final` but carry higher
# execution counts than this cell; unless `df_final` is defined earlier in the
# notebook, run this cell first on a fresh kernel.
df_final = joblib.load(
    os.path.join(CHECKPOINT_FOLDER, 'df_final_Scientific_Data.pkl')
)
df_final

Unnamed: 0,url,ListDownloadedOn,NViews,VerifiedEmailDummy,IsSTODummy,IsIEODummy,Status,StartDate,EndDate,LogDurationDays,...,SocialTwitterDummy,SocialMediumDummy,SocialYoutubeDummy,SocialRedditDummy,SocialGithubDummy,SocialDiscordDummy,SocialVKDummy,SocialLinkedinDummy,SocialSlackDummy,SocialInstagramDummy
0,https://icomarks.ai/ico/factom,06/02/2023,14866,0,0,0,Ended,2015-03-30,14 May 2015,1.662758,...,1,0,1,0,1,1,0,1,0,0
1,https://icomarks.ai/ico/bitcrystals,06/02/2023,15255,0,0,0,Trading,2015-08-04,15 Sep 2015,1.633468,...,1,0,0,0,0,0,0,0,0,0
2,https://icomarks.ai/ico/augur,06/02/2023,19598,0,0,0,Trading,2015-08-17,01 Oct 2015,1.662758,...,1,1,1,1,1,0,0,0,1,0
3,https://icomarks.ai/ico/safex,06/02/2023,9577,0,0,0,Ended,2016-01-01,31 Jan 2016,1.491362,...,1,0,1,1,1,0,0,0,0,0
4,https://icomarks.ai/ico/digixdao,06/02/2023,17118,0,0,0,Trading,2016-03-29,30-mar-16,0.301030,...,1,1,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8274,https://icomarks.ai/ico/anuvia,06/02/2023,122,1,0,0,Upcoming,2150-01-01,TBA,,...,0,0,0,0,0,0,0,0,0,0
8275,https://icomarks.ai/ico/mthub,06/02/2023,93,0,0,0,Upcoming,2150-01-01,TBA,,...,0,0,0,0,0,0,0,0,0,0
8276,https://icomarks.ai/ico/calvaria,06/02/2023,2238,1,0,0,Upcoming,2150-01-01,TBA,,...,1,1,1,1,0,1,0,0,0,0
8277,https://icomarks.ai/ico/host-games,06/02/2023,17636,0,0,0,Upcoming,2150-01-01,TBA,,...,1,1,1,1,0,0,0,0,0,0
