In [1]:
from bs4 import BeautifulSoup
from utils import get_chromedriver, adjust_date, scrape_info_icomarks, extract_scaping_icomarks, summary_stats
import requests
import pandas as pd
from urllib.parse import urljoin
from soup2dict import convert
import time
from timeit import default_timer as timer
import datetime
import numpy as np
import re
import pandas as pd
import os
import joblib
import sys
from thefuzz import fuzz
from thefuzz import process

In [2]:
CHROMEDRIVER_PATH = r"C:\Users\Alessandro Bitetto\Downloads\UniPV\ICOs\WebDriver\chromedriver"

In [3]:
# set folders
CHECKPOINT_FOLDER = '.\\Checkpoints'
RESULTS_FOLDER = '.\\Results'

if not os.path.exists(CHECKPOINT_FOLDER):
    os.makedirs(CHECKPOINT_FOLDER)
if not os.path.exists(RESULTS_FOLDER):
    os.makedirs(RESULTS_FOLDER)

## Get ICOs url

In [5]:
MAIN_PAGE = "https://icomarks.com/"                # to be added to single ICO url
CATEGORY_PAGE = "https://icomarks.com/icos/"       # used to query the category to be downloaded

# get html
page = requests.get(CATEGORY_PAGE)
soup = BeautifulSoup(page.content, 'html.parser')

# extract list of categories
tag = soup.find_all('div', class_="icoTop__selects", recursive=True)
conv_dict = convert(tag)
while conv_dict['div'][0]['@class'][0] != 'icoTop__selects':
    conv_dict = conv_dict['div'][0]
category_list = conv_dict['div'][0]['form'][0]['select'][0]['option']
category = pd.DataFrame([(v['@value'], v['#text']) for v in category_list if '@value' in v.keys()],
                        columns =['url_ref', 'Category'])
category[['Category', 'Count']] = category['Category'].str.split('(', 1, expand=True)
category['Count'] = category['Count'].apply(lambda x: int(x.replace(')', '')))
display(category)

Unnamed: 0,url_ref,Category,Count
0,artificial-intelligence,AI,505
1,art,Art,99
2,banking,Banking,645
3,big-data,Big Data,412
4,business-services,Business,1341
5,charity,Charity,156
6,communication,Communication,451
7,cryptocurrency,Cryptocurrency,2959
8,defi,DeFi,489
9,education,Education,217


In [20]:
# apply category in search query and get ICO list

cat_list = pd.DataFrame(columns=['Category', 'url', 'n_views', 'verified_email', 'is_STO', 'is_IEO',
                                 'status', 'start_date', 'end_date'])
start = timer()
download_date=datetime.datetime.now().strftime("%d/%m/%Y")
for index, row in category.iterrows():
    
    url_categ = row['url_ref']
    expected_count = row['Count']
    categ = row['Category']
    
    print('\n- Downloading: ' + categ + '  ('+ str(expected_count) + ' expected) ' + str(index + 1) + ' / ' + str(len(category)))
    
    # scroll down till "Show more" button disappear
    show_more_path = '/html/body/section/div[2]/div[2]/div[2]/a'

    print('   - Scrolling down...', end ='')

    driver = get_chromedriver(chromedriver_path = CHROMEDRIVER_PATH)
    driver.get(urljoin(CATEGORY_PAGE, url_categ))

    try:
        while driver.find_element("xpath", show_more_path).is_displayed():

            driver.execute_script("arguments[0].scrollIntoView(true);", driver.find_element("xpath", show_more_path))
            driver.find_element("xpath", show_more_path).click()
            time.sleep(3)
        print('OK')
    except:
        print('SKIPPED')

    # get html
    print('   - Downloading html...', end='')
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    print('OK')

    # extract information from web list
    print('   - Parsing info...', end='')
    tag = soup.find_all('div', class_="icoListContent", recursive=True)
    tag_list = []
    for t in tag[0]:
        if 'div class="newItems"' not in str(t):
            tag_list.append(t)
    # if show more, html structure changes
    nested_tags=soup.find_all('div', class_="newItems", recursive=True)
    if len(nested_tags) > 0:
        for x in soup.find_all('div', class_="newItems", recursive=True):
            tag_list.extend([y for y in x])

    temp_list = pd.DataFrame(columns=cat_list.columns)
    for t in tag_list:

        if 'START' in str(t):

            conv_dict = convert(t)['div']
            for x in conv_dict:

                if x['@class'][0] == 'icoListItem__info':
                    sup = x['a'][0]['sup']
                    n_views = [y['#text'] for y in sup if y['@class'][0] == "sup_views"][0]
                    is_sto = int(any([True if y['@class'][0] == "sup_is_sto" else False for y in sup]))
                    is_ieo = int(any([True if y['@class'][0] == "sup_is_ieo" else False for y in sup]))
                    ver_email = int(any([True if y['@class'][0] == "sup_email_confirmed" else False for y in sup]))
                    url = x['a'][0]['@href']
                if x['@class'][0] == 'icoListItem__raised':
                    status = x['#text']#[v['#text'] for k, v in x['span'][0].items() if ]
                if x['@class'][0] == 'icoListItem__start':
                    start_date = x['navigablestring'][0]
                if x['@class'][0] == 'icoListItem__end':
                    end_date = x['navigablestring'][0]

            temp_list = temp_list.append(pd.DataFrame({
                'Category':categ,
                'url': urljoin(MAIN_PAGE, url),
                'n_views': int(n_views.replace(' Views', '').replace(',', '')),
                'verified_email': ver_email,
                'is_STO': is_sto,
                'is_IEO': is_ieo,
                'status': status.replace('STATUS ', ''),
                'start_date': start_date,
                'end_date': end_date
            }, index = [0]))

    temp_list['List downloaded on']=download_date
    cat_list = cat_list.append(temp_list)
    
    # save results
    cat_list.to_csv(os.path.join(RESULTS_FOLDER,'01a_ICOmarks_ico_list.csv'), index=False, sep=';')

    if temp_list.shape[0] != expected_count:
        print('   ####### warning, expected number of elements (' + str(expected_count) + ') mismatch. Found ' + str(temp_list.shape[0]))
    else:
        print('OK')

    driver.close()
    
print('\n\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))
print('\nData saved in ', os.path.join(RESULTS_FOLDER,'01a_ICOmarks_ico_list.csv'))


- Downloading: AI   (505 expected) 1 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Art   (99 expected) 2 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Banking   (645 expected) 3 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Big Data   (412 expected) 4 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Business   (1341 expected) 5 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Charity   (156 expected) 6 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Communication   (451 expected) 7 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Cryptocurrency   (2959 expected) 8 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downlo

### Check downloaded list and remove duplicates

In [7]:
cat_list=pd.read_csv(os.path.join(RESULTS_FOLDER,'01a_ICOmarks_ico_list_raw.csv'), sep=';')

cat_list.drop_duplicates(inplace=True)

# adjust dates
cat_list['start_date']=cat_list['start_date'].map(adjust_date)
cat_list['end_date']=cat_list['end_date'].map(adjust_date)

# find url with multiple entries (due to IEO/STO) and keep all categories and minimum start date and max end date
multiple_url=cat_list[['url', 'n_views']].drop_duplicates()['url'].value_counts().to_frame().reset_index().query('url > 1')['index']
if len(multiple_url) > 0:
    
    print('\n-- Url with multiple entries found. Keeping single information only')
    new_df=pd.DataFrame(columns=cat_list.columns)
    for t_url in multiple_url:
        t_df=cat_list[cat_list['url']==t_url].copy()

        status=t_df['status'].value_counts().index[0]
        if t_df['status'].nunique() > 1:
            u_val=t_df['status'].unique()
            if 'Ended' in u_val:
                status='Ended'
            if 'Active' in u_val:
                status='Active'
            print(f"    - {t_url}: Multiple status found: {u_val}. Keeping '{status}'")

        try:
            start_date=pd.to_datetime(t_df['start_date'].loc[lambda x : x != 'TBA'], infer_datetime_format=True).min().strftime('%d %b %Y')
        except:
            start_date=t_df['start_date'].unique()[0]
        try:
            end_date=pd.to_datetime(t_df['end_date'].loc[lambda x : x != 'TBA'], infer_datetime_format=True).max().strftime('%d %b %Y')
        except:
            end_date=t_df['end_date'].unique()[0]

        add_df=pd.DataFrame({'Category': t_df['Category'].unique(),
                            'url': t_url,
                            'n_views': t_df['n_views'].max(),
                            'verified_email': t_df['verified_email'].max(),
                            'is_STO': t_df['is_STO'].max(),
                            'is_IEO': t_df['is_IEO'].max(),
                            'status': status,
                            'start_date': start_date,
                            'end_date': end_date,
                            'List downloaded on': t_df['List downloaded on'].values[0]})

        new_df=pd.concat([new_df, add_df])
        
    cat_list=cat_list[~cat_list['url'].isin(multiple_url)]
    cat_list=pd.concat([cat_list, new_df]) 

# get dummy for category
cat_list['Category']=cat_list['Category'].str.replace(' ', '')
cat_dummy=pd.concat([cat_list['url'], pd.get_dummies(cat_list['Category'], drop_first=False, prefix='CATEGORY')], axis=1)
cat_dummy=cat_dummy.groupby('url').sum()
if cat_dummy.max().max() != 1:
    print('\n ### "Category" dummy variable has value greater than 1')
cat_dummy.reset_index(inplace=True)

# create final dataset
cat_list=cat_list.drop(columns='Category').drop_duplicates()
cat_list=cat_list.merge(cat_dummy, on='url', how='left')

if cat_list['url'].nunique() != cat_list.shape[0]:
    print('\n ##### Unique urls do not match number of rows')

print('\n-- Total ICOs found:', cat_list['url'].nunique())
display(cat_list['status'].value_counts().to_frame())
    
# save csv
cat_list.to_csv(os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted.csv'), index=False, sep=';')
print('\nData saved in ', os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted.csv'))


-- Url with multiple entries found. Keeping single information only
    - https://icomarks.com/ico/ins: Multiple status found: ['Trading' 'Ended']. Keeping 'Ended'
    - https://icomarks.com/ico/unifox: Multiple status found: ['Pre-Sale Ended' 'Ended']. Keeping 'Ended'

-- Total ICOs found: 8279


Unnamed: 0,status
Ended,5034
Upcoming,1890
Trading,726
Pre-Sale Ended,399
Active,164
Pre-Sale,66



Data saved in  .\Results\01b_ICOmarks_ico_list_adjusted.csv


## Scrape information from url

In [4]:
ICOMARKS_FOLDER=os.path.join(CHECKPOINT_FOLDER, 'Icomarks')
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to save pickle in ICOMARKS_FOLDER
RELOAD_PKL=True
SKIP_MISSING=False     # if True skip attempt to scrape missing pickles

if not os.path.exists(ICOMARKS_FOLDER):
    os.makedirs(ICOMARKS_FOLDER)

cat_list=pd.read_csv(os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted.csv'), sep=';')

In [5]:
scrape_df=pd.DataFrame()
for index, row in cat_list.iterrows():
    
    print(f'- Scraping: {str(index + 1)} / {len(cat_list)}   last interaction: {datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")}', end='\r')
    
    url=row['url']
    save_path=os.path.join(ICOMARKS_FOLDER, url.replace(URL_ROOT, '')+'.json').replace('|', '')
    
    if not RELOAD_PKL or not os.path.exists(save_path):
    
        if SKIP_MISSING and not os.path.exists(save_path):
            add_row=pd.DataFrame({'url': url, 'ScrapeStatus': 'ERROR'}, index=[0])
            scrape_df=pd.concat([scrape_df, add_row])
            continue
    
        try:
            start = timer()
            add_row=scrape_info_icomarks(url=url, chromedriver_path=CHROMEDRIVER_PATH, skip_social=False)
            add_row.insert(1, 'ScrapeStatus', 'OK')
            add_row['PklPath']=save_path
            add_row['TotTimeSec']=datetime.timedelta(seconds=round(timer()-start)).total_seconds()
            add_row.to_json(save_path, orient='table')
        except:
            add_row=pd.DataFrame({'url': url, 'ScrapeStatus': 'ERROR'}, index=[0])
    
    else:
        add_row=pd.read_json(save_path, orient='table')
        # re-format nested dataframe from json schema
        add_row['InfoBlock']=[pd.DataFrame(add_row['InfoBlock'][0])]
        if 'TeamBlock' in add_row.columns:
            add_row['TeamBlock']=[pd.DataFrame(add_row['TeamBlock'][0])]
        if 'SocialBlock' in add_row.columns:
            social_df=pd.DataFrame(add_row['SocialBlock'][0][0]['stats'])
            series_dict={}
            for k in add_row['SocialBlock'][0][0]['timeseries'].keys():
                series_dict[k]=pd.DataFrame(add_row['SocialBlock'][0][0]['timeseries'][k])
            add_row['SocialBlock']=[[{'stats': social_df, 'timeseries': series_dict}]]
        
    scrape_df=pd.concat([scrape_df, add_row])

scrape_df.reset_index(drop=True, inplace=True)
display(scrape_df['ScrapeStatus'].value_counts().to_frame())
   
print('\n\nTotal elapsed time:', str(datetime.timedelta(seconds=round(scrape_df['TotTimeSec'].sum()))))

# save
pkl_path=os.path.join(CHECKPOINT_FOLDER, 'scrape_df_raw.pkl')
joblib.dump(scrape_df, pkl_path, compress=('lzma', 3))
print(f'\nData saved in {pkl_path}')

- Scraping: 8279 / 8279   last interaction: 08/02/2023 23:09:02

Unnamed: 0,ScrapeStatus
OK,8279




Total elapsed time: 1 day, 9:00:33

Data saved in .\Checkpoints\scrape_df_raw.pkl


## Format scraped information and save final dataset

In [None]:
pkl_path=os.path.join(CHECKPOINT_FOLDER, 'scrape_df_raw.pkl')
scrape_df=joblib.load(pkl_path)

# extract from raw data
print('-- Extracting information from raw scraped data')
format_df=extract_scaping_icomarks(scrape_df)

pkl_path=os.path.join(CHECKPOINT_FOLDER, 'scrape_df_extracted.pkl')
joblib.dump(format_df, pkl_path, compress=('lzma', 3))
print(f'\n   - Data saved in {pkl_path}')

# format nested column and merge with cat_list dataset
cat_list=pd.read_csv(os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted.csv'), sep=';')

In [43]:
add_row=pd.read_json(save_path, orient='table')
add_row

Unnamed: 0,url,ScrapeStatus,PageScreenshot,Rating_TOTAL_SCORE,Rating_PROFILE_(weight:_45%),Rating_SOCIAL_ACTIVITY_(weight:_35%),Rating_TEAM_PROOF_(weight:_20%),InfoBlock,TeamSize,AdvisorSize,TeamBlock,PklPath,TotTimeSec
0,https://icomarks.com/ico/baby-token,OK,"[\nLast screenshot taken on, Last screenshot t...",5.7,5.8,3,10,"[{'BlockName': 'General', 'Item': 'Website', '...",4,3,"[{'Member': 'Team', 'Name': 'Evgeniy Zhiharev'...",.\Checkpoints\Icomarks\baby-token.json,2.0


In [30]:
url='https://icomarks.com/ico/777-bingo'
save_path=os.path.join(ICOMARKS_FOLDER, url.replace(URL_ROOT, '')+'.json')
add_row=pd.read_json(save_path, orient='table')
add_row

Unnamed: 0,url,ScrapeStatus,PageScreenshot,Rating_TOTAL_SCORE,Rating_PROFILE_(weight:_45%),Rating_SOCIAL_ACTIVITY_(weight:_35%),Rating_TEAM_PROOF_(weight:_20%),InfoBlock,TeamSize,AdvisorSize,TeamBlock,SocialWithRating,SocialSeriesStatus,SocialBlock,PklPath,TotTimeSec
0,https://icomarks.com/ico/777-bingo,OK,[\nLast screenshot taken on],5.2,7.3,5,1,"[{'BlockName': 'General', 'Item': 'Website', '...",4,2,"[{'Member': 'Team', 'Name': 'Cris Shintae Park...",2,DOWNLOADED,"[{'stats': [{'Social': 'Telegram', 'Users': 36...",.\Checkpoints\Icomarks\777-bingo.json,9.0


In [31]:
add_row['InfoBlock']=[pd.DataFrame(add_row['InfoBlock'][0])]
add_row['TeamBlock']=[pd.DataFrame(add_row['TeamBlock'][0])]
social_df=pd.DataFrame(add_row['SocialBlock'][0][0]['stats'])
series_dict={}
for k in add_row['SocialBlock'][0][0]['timeseries'].keys():
    series_dict[k]=pd.DataFrame(add_row['SocialBlock'][0][0]['timeseries'][k])
add_row['SocialBlock']=[[{'stats': social_df, 'timeseries': series_dict}]]

In [44]:
add_row

Unnamed: 0,url,ScrapeStatus,PageScreenshot,Rating_TOTAL_SCORE,Rating_PROFILE_(weight:_45%),Rating_SOCIAL_ACTIVITY_(weight:_35%),Rating_TEAM_PROOF_(weight:_20%),InfoBlock,TeamSize,AdvisorSize,TeamBlock,PklPath,TotTimeSec
0,https://icomarks.com/ico/baby-token,OK,"[\nLast screenshot taken on, Last screenshot t...",5.7,5.8,3,10,"[{'BlockName': 'General', 'Item': 'Website', '...",4,3,"[{'Member': 'Team', 'Name': 'Evgeniy Zhiharev'...",.\Checkpoints\Icomarks\baby-token.json,2.0


In [40]:
add_row['SocialBlock'][0][0]['timeseries']['Twitter']

Unnamed: 0,Date,Users
0,2018-05-17T07:00:00.000Z,622
1,2018-05-18T07:00:00.000Z,619
2,2018-05-19T07:00:00.000Z,621
3,2018-05-20T07:00:00.000Z,619
4,2018-05-21T07:00:00.000Z,621
...,...,...
1104,2022-01-28T08:00:00.000Z,523
1105,2022-01-29T08:00:00.000Z,523
1106,2022-01-30T08:00:00.000Z,523
1107,2022-01-31T08:00:00.000Z,523


In [10]:
datetime.timedelta(seconds=round(timer()-start)).total_seconds()

32.0

In [5]:
url='https://icomarks.com/ico/coti'
add_row=scrape_info_icomarks(url=url, chromedriver_path=CHROMEDRIVER_PATH)

In [6]:
add_row

Unnamed: 0,url,PageScreenshot,Rating_TOTAL_SCORE,Rating_PROFILE_(weight:_45%),Rating_SOCIAL_ACTIVITY_(weight:_35%),Rating_TEAM_PROOF_(weight:_20%),InfoBlock,TeamSize,AdvisorSize,TeamBlock,SocialWithRating,SocialSeriesStatus,SocialBlock
0,https://icomarks.com/ico/coti,"[\nLast screenshot taken on, Last screenshot t...",9.1,8.1,10,10,BlockName Item \ 0 Token ...,23,17,Member Name \ 0 Tea...,3,DOWNLOADED,"[{'stats': ['Social', 'Users', 'Rating'], 'tim..."


In [None]:
sys.setrecursionlimit(10000)
add_row.to_pickle('raw.pkl', protocol=-1)

In [1]:
scrape_df.TotTime.sum()

NameError: name 'scrape_df' is not defined

In [38]:
scrape_df

Unnamed: 0,url,ScrapeStatus,PageScreenshot,Rating_TOTAL_SCORE,Rating_PROFILE_(weight:_45%),Rating_SOCIAL_ACTIVITY_(weight:_35%),Rating_TEAM_PROOF_(weight:_20%),InfoBlock,TeamSize,AdvisorSize,TeamBlock,SocialWithRating,SocialSeriesStatus,SocialBlock,PklPath,TotTime
0,https://icomarks.com/ico/synthetics-ai,OK,[\nLast screenshot taken on],7.0,5.8,8,8,BlockName Item \ 0 ...,5,0,Member Name Role ...,2,DOWNLOADED,"[{'stats': ['Social', 'Users'], 'timeseries': ...",.\Checkpoints\Icomarks\00000_synthetics-ai.pkl,0 days 00:00:27
0,https://icomarks.com/ico/777-bingo,OK,[\nLast screenshot taken on],5.2,7.3,5,1,BlockName Item \ 0 ...,4,2,Member Name Role ...,2,DOWNLOADED,"[{'stats': ['Social', 'Users'], 'timeseries': ...",.\Checkpoints\Icomarks\00001_777-bingo.pkl,0 days 00:00:09
0,https://icomarks.com/ico/sonic,OK,[\nLast screenshot taken on],4.7,5.0,4,5,BlockName Item \ 0 ...,4,0,Member Name Rol...,2,DOWNLOADED,"[{'stats': ['Social', 'Users'], 'timeseries': ...",.\Checkpoints\Icomarks\00002_sonic.pkl,0 days 00:00:31
0,https://icomarks.com/ico/botchain,OK,[\nLast screenshot taken on],7.2,6.2,7,10,BlockName Item \ 0 ...,9,4,Member Name \ 0 T...,2,DOWNLOADED,"[{'stats': ['Social', 'Users'], 'timeseries': ...",.\Checkpoints\Icomarks\00003_botchain.pkl,0 days 00:00:19
0,https://icomarks.com/ico/eclipse,OK,[\nLast screenshot taken on],5.1,6.5,1,9,BlockName Item \ 0 ...,12,0,Member Name \ 0 Te...,2,DOWNLOADED,"[{'stats': ['Social', 'Users'], 'timeseries': ...",.\Checkpoints\Icomarks\00004_eclipse.pkl,0 days 00:00:40
0,https://icomarks.com/ico/the-mill-of-blood,OK,[\nLast screenshot taken on],4.2,3.1,3,9,BlockName Item \ 0 Genera...,6,0,Member Name Ro...,2,DOWNLOADED,"[{'stats': ['Social', 'Users'], 'timeseries': ...",.\Checkpoints\Icomarks\00005_the-mill-of-blood...,0 days 00:00:29


In [10]:
index=10
save_path=os.path.join(ICOMARKS_FOLDER, str(index).zfill(5)+'_'+url.replace(URL_ROOT, '')+'.pkl')

In [11]:
save_path

'.\\Checkpoints\\Icomarks\\00010_azbit.pkl'

In [14]:
add_row

Unnamed: 0,url,PageScreenshot,Rating_TOTAL_SCORE,Rating_PROFILE_(weight:_45%),Rating_SOCIAL_ACTIVITY_(weight:_35%),Rating_TEAM_PROOF_(weight:_20%),InfoBlock,TeamSize,AdvisorSize,TeamBlock,SocialWithRating,SocialSeriesStatus,SocialBlock
0,https://icomarks.com/ico/azbit,[\nLast screenshot taken on],8.1,8.5,10,4,BlockName Item \ 0 ...,11,8,Member Name \ 0 ...,3,DOWNLOADED,"[{'stats': ['Social', 'Users', 'Rating'], 'tim..."


In [7]:
sys.setrecursionlimit(10000)

In [24]:
vv=sys.getrecursionlimit()
vv

10000

In [21]:
import sys
print(sys.getrecursionlimit())

10000


In [25]:
print(sys.setrecursionlimit(3000))

None


In [26]:
add_row.to_pickle(save_path, protocol=-1)

RecursionError: maximum recursion depth exceeded

In [12]:
joblib.dump(add_row, save_path, compress=('lzma', 3))

RecursionError: maximum recursion depth exceeded

In [27]:
aa=joblib.load(save_path)
aa

EOFError: 

In [6]:
url="https://icomarks.com/ico/azbit"
add_row=scrape_info_icomarks(url=url, chromedriver_path=CHROMEDRIVER_PATH)
add_row

Unnamed: 0,url,PageScreenshot,Rating_TOTAL_SCORE,Rating_PROFILE_(weight:_45%),Rating_SOCIAL_ACTIVITY_(weight:_35%),Rating_TEAM_PROOF_(weight:_20%),InfoBlock,TeamSize,AdvisorSize,TeamBlock,SocialWithRating,SocialSeriesStatus,SocialBlock
0,,[\nLast screenshot taken on],8.1,8.5,10,4,BlockName Item \ 0 ...,11,8,Member Name \ 0 ...,3,DOWNLOADED,"[{'stats': ['Social', 'Users', 'Rating'], 'tim..."


In [33]:
display(cat_list['status'].value_counts().to_frame())

Unnamed: 0,status
Ended,5034
Upcoming,1890
Trading,726
Pre-Sale Ended,399
Active,164
Pre-Sale,66


In [22]:
cat_dummy.reset_index()

Unnamed: 0,url,CATEGORY_AI,CATEGORY_Art,CATEGORY_Banking,CATEGORY_BigData,CATEGORY_Business,CATEGORY_Charity,CATEGORY_Communication,CATEGORY_Cryptocurrency,CATEGORY_DeFi,...,CATEGORY_Other,CATEGORY_Platform,CATEGORY_Realestate,CATEGORY_Retail,CATEGORY_SmartContract,CATEGORY_Software,CATEGORY_Sports,CATEGORY_Tourism,CATEGORY_Unknown,CATEGORY_VirtualReality
0,https://icomarks.com/ico/034ego034-coin,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,https://icomarks.com/ico/0chain,1,0,1,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,https://icomarks.com/ico/0penproductdb,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,https://icomarks.com/ico/0x,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4,https://icomarks.com/ico/0xcert,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8274,https://icomarks.com/ico/zupply,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8275,https://icomarks.com/ico/zuum,1,0,0,1,1,0,0,1,0,...,0,1,0,0,1,1,0,0,0,0
8276,https://icomarks.com/ico/zwoop,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
8277,https://icomarks.com/ico/zxc-exchange,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [18]:
cat_list[cat_list['url']=='https://icomarks.com/ico/vezet']

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date,List downloaded on
1735,Business,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023
1736,Business,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023
3798,Cryptocurrency,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023
3799,Cryptocurrency,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023
8936,Infrastructure,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023
11323,Manufacturing,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023
11324,Manufacturing,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023
12626,Platform,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023
12627,Platform,https://icomarks.com/ico/vezet,12325,0,0,0,Upcoming,TBA,TBA,06/02/2023


In [21]:
cat_dummy.max().max()

1

In [9]:
# find url with multiple entries (due to IEO/STO) and keep all categories and minimum start date and max end date
if len(multiple_url) > 0:
    
    print('\n-- Url with multiple entries found. Keeping single information only')
    multiple_url=cat_list[['url', 'n_views']].drop_duplicates()['url'].value_counts().to_frame().reset_index().query('url > 1')['index']
    new_df=pd.DataFrame(columns=cat_list.columns)
    for t_url in multiple_url:
        t_df=cat_list[cat_list['url']==t_url].copy()

        status=t_df['status'].value_counts().index[0]
        if t_df['status'].nunique() > 1:
            u_val=t_df['status'].unique()
            if 'Ended' in u_val:
                status='Ended'
            if 'Active' in u_val:
                status='Active'
            print(f"    - {t_url}: Multiple status found: {u_val}. Keeping '{status}'")

        try:
            start_date=pd.to_datetime(t_df['start_date'].loc[lambda x : x != 'TBA'], infer_datetime_format=True).min().strftime('%d %b %Y')
        except:
            start_date=t_df['start_date'].unique()[0]
        try:
            end_date=pd.to_datetime(t_df['end_date'].loc[lambda x : x != 'TBA'], infer_datetime_format=True).max().strftime('%d %b %Y')
        except:
            end_date=t_df['end_date'].unique()[0]

        add_df=pd.DataFrame({'Category': t_df['Category'].unique(),
                            'url': t_url,
                            'n_views': t_df['n_views'].max(),
                            'verified_email': t_df['verified_email'].max(),
                            'is_STO': t_df['is_STO'].max(),
                            'is_IEO': t_df['is_IEO'].max(),
                            'status': status,
                            'start_date': start_date,
                            'end_date': end_date,
                            'List downloaded on': t_df['List downloaded on'].values[0]})

        new_df=pd.concat([new_df, add_df])
        
    cat_list=cat_list[~cat_list['url'].isin(multiple_url)]
    cat_list=pd.concat([cat_list, new_df])

- https://icomarks.com/ico/ins: Multiple status found: ['Trading' 'Ended']. Keeping 'Ended'
- https://icomarks.com/ico/unifox: Multiple status found: ['Pre-Sale Ended' 'Ended']. Keeping 'Ended'


In [13]:
cat_list1=cat_list[~cat_list['url'].isin(multiple_url)]
cat_list1

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date,List downloaded on
1,AI,https://icomarks.com/ico/synthetics-ai,17727,0,0,0,Upcoming,TBA,TBA,06/02/2023
2,AI,https://icomarks.com/ico/777-bingo,18810,0,0,0,Upcoming,TBA,TBA,06/02/2023
3,AI,https://icomarks.com/ico/sonic,19082,0,0,0,Upcoming,TBA,TBA,06/02/2023
4,AI,https://icomarks.com/ico/botchain,16721,0,0,0,Upcoming,TBA,TBA,06/02/2023
5,AI,https://icomarks.com/ico/eclipse,17390,0,0,0,Upcoming,TBA,TBA,06/02/2023
...,...,...,...,...,...,...,...,...,...,...
18579,VirtualReality,https://icomarks.com/ico/varcrypt,14572,0,0,0,Ended,18 Dec 2017,14 Jan 2018,06/02/2023
18580,VirtualReality,https://icomarks.com/ico/vr-platform-oko,14819,0,0,0,Ended,14 Dec 2017,14 Jan 2018,06/02/2023
18581,VirtualReality,https://icomarks.com/ico/spectiv,15026,0,0,0,Ended,08 Dec 2017,31 Dec 2017,06/02/2023
18582,VirtualReality,https://icomarks.com/ico/terra-virtua,12713,0,0,0,Ended,30 Nov 2017,30 Nov 2017,06/02/2023


In [14]:
cat_list.shape

(18584, 10)

In [31]:
t_df['status'].unique()[0]

'Ended'

In [26]:
t_df['status'].value_counts().index[0]

'Ended'

In [29]:
pd.to_datetime(t_df['start_date'].loc[lambda x : x == ''], infer_datetime_format=True).min()

NaT

In [8]:
t_url='https://icomarks.com/ico/ins'
t_df=cat_list[cat_list['url']==t_url].copy()

status=t_df['status'].value_counts().index[0]
if t_df['status'].nunique() > 1:
    u_val=t_df['status'].unique()
    if 'Ended' in u_val:
        status='Ended'
    if 'Active' in u_val:
        status='Active'
    print(f"- {t_url}: Multiple status found: {u_val}. Keeping '{status}'")

try:
    start_date=pd.to_datetime(t_df['start_date'].loc[lambda x : x != 'TBA'], infer_datetime_format=True).min().strftime('%d %b %Y')
except:
    start_date=t_df['start_date'].unique()[0]
try:
    end_date=pd.to_datetime(t_df['end_date'].loc[lambda x : x != 'TBA'], infer_datetime_format=True).max().strftime('%d %b %Y')
except:
    end_date=t_df['end_date'].unique()[0]

add_df=pd.DataFrame({'Category': t_df['Category'].unique(),
                    'url': t_url,
                    'n_views': t_df['n_views'].max(),
                    'verified_email': t_df['verified_email'].max(),
                    'is_STO': t_df['is_STO'].max(),
                    'is_IEO': t_df['is_IEO'].max(),
                    'status': status,
                    'start_date': start_date,
                    'end_date': end_date,
                    'List downloaded on': t_df['List downloaded on'].values[0]})

add_df

- https://icomarks.com/ico/ins: Multiple status found: ['Trading' 'Ended']. Keeping 'Ended'


Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date,List downloaded on
0,Platform,https://icomarks.com/ico/ins,22965,0,0,0,Ended,04 Dec 2017,25 Dec 2017,06/02/2023
1,Retail,https://icomarks.com/ico/ins,22965,0,0,0,Ended,04 Dec 2017,25 Dec 2017,06/02/2023


In [7]:
t_df

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date,List downloaded on
15521,Platform,https://icomarks.com/ico/ins,22965,0,0,0,Trading,04 Dec 2017,25 Dec 2017,06/02/2023
15534,Platform,https://icomarks.com/ico/ins,10374,0,0,0,Ended,04 Dec 2017,25 Dec 2017,06/02/2023
16443,Retail,https://icomarks.com/ico/ins,22965,0,0,0,Trading,04 Dec 2017,25 Dec 2017,06/02/2023
16444,Retail,https://icomarks.com/ico/ins,10374,0,0,0,Ended,04 Dec 2017,25 Dec 2017,06/02/2023


In [44]:
t_df['List downloaded on'].values[0]

'06/02/2023'

In [36]:
status=t_df['status'].value_counts().index[0]

In [40]:
uu=t_df['status'].unique()
print(f'- {t_url}: Multiple status found: {uu}. Keeping {status}')

- https://icomarks.com/ico/vendicoins: Multiple status found: ['Ended']. Keeping Ended


In [33]:
t_df['status'].unique()

array(['Ended'], dtype=object)

In [17]:
cat_dummy.sort_values(by='CATEGORY_Business', ascending=False)

Unnamed: 0_level_0,CATEGORY_AI,CATEGORY_Art,CATEGORY_Banking,CATEGORY_BigData,CATEGORY_Business,CATEGORY_Charity,CATEGORY_Communication,CATEGORY_Cryptocurrency,CATEGORY_DeFi,CATEGORY_Education,...,CATEGORY_Other,CATEGORY_Platform,CATEGORY_Realestate,CATEGORY_Retail,CATEGORY_SmartContract,CATEGORY_Software,CATEGORY_Sports,CATEGORY_Tourism,CATEGORY_Unknown,CATEGORY_VirtualReality
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://icomarks.com/ico/vezet,0,0,0,0,2,0,0,2,0,0,...,0,2,0,0,0,0,0,0,0,0
https://icomarks.com/ico/saolachain,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://icomarks.com/ico/oath-protocol,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,1,1,0,0,0,0
https://icomarks.com/ico/t8ex,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://icomarks.com/ico/taas,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://icomarks.com/ico/finix,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
https://icomarks.com/ico/finiteyfi,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
https://icomarks.com/ico/finecryptonetwork,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
https://icomarks.com/ico/finebit-token,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
cat_list['Category'].unique()

array(['AI', 'Art', 'Banking', 'BigData', 'Business', 'Charity',
       'Communication', 'Cryptocurrency', 'DeFi', 'Education',
       'Electronics', 'Energy', 'Entertainment', 'Exchange&Launchpad',
       'Gambling', 'Health', 'Infrastructure', 'Internet', 'Investment',
       'Legal', 'Manufacturing', 'MarketingAgency', 'Media', 'Other',
       'Platform', 'Realestate', 'Retail', 'SmartContract', 'Software',
       'Sports', 'Tourism', 'Unknown', 'VirtualReality'], dtype=object)

In [46]:
pd.get_dummies(cat_list['Category'], drop_first=False, prefix='CATEGORY')

Unnamed: 0,CATEGORY_AI,CATEGORY_Art,CATEGORY_Banking,CATEGORY_BigData,CATEGORY_Business,CATEGORY_Charity,CATEGORY_Communication,CATEGORY_Cryptocurrency,CATEGORY_DeFi,CATEGORY_Education,...,CATEGORY_Other,CATEGORY_Platform,CATEGORY_Realestate,CATEGORY_Retail,CATEGORY_SmartContract,CATEGORY_Software,CATEGORY_Sports,CATEGORY_Tourism,CATEGORY_Unknown,CATEGORY_VirtualReality
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18581,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18582,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
cat_list.url.value_counts()

https://icomarks.com/ico/qkip            21
https://icomarks.com/ico/iou             19
https://icomarks.com/ico/madana          17
https://icomarks.com/ico/govearn         17
https://icomarks.com/ico/e2c             16
                                         ..
https://icomarks.com/ico/tokenstub        1
https://icomarks.com/ico/dnet             1
https://icomarks.com/ico/dolzio           1
https://icomarks.com/ico/nobu-finance     1
https://icomarks.com/ico/terra-virtua     1
Name: url, Length: 8279, dtype: int64

In [57]:
cat_list=pd.read_csv('./Results/01a_ICOmarks_ico_list.csv', sep=';')
cat_list

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date,List downloaded on
0,AI,https://icomarks.com/ico/niqbix,19069,0,0,0,Upcoming,TBA,TBA,06/02/2023
1,AI,https://icomarks.com/ico/synthetics-ai,17727,0,0,0,Upcoming,TBA,TBA,06/02/2023
2,AI,https://icomarks.com/ico/777-bingo,18810,0,0,0,Upcoming,TBA,TBA,06/02/2023
3,AI,https://icomarks.com/ico/sonic,19082,0,0,0,Upcoming,TBA,TBA,06/02/2023
4,AI,https://icomarks.com/ico/botchain,16721,0,0,0,Upcoming,TBA,TBA,06/02/2023
...,...,...,...,...,...,...,...,...,...,...
18579,Virtual Reality,https://icomarks.com/ico/varcrypt,14572,0,0,0,Ended,18 Dec 2017,14 Jan 2018,06/02/2023
18580,Virtual Reality,https://icomarks.com/ico/vr-platform-oko,14819,0,0,0,Ended,14 Dec 2017,14 Jan 2018,06/02/2023
18581,Virtual Reality,https://icomarks.com/ico/spectiv,15026,0,0,0,Ended,08 Dec 2017,31 Dec 2017,06/02/2023
18582,Virtual Reality,https://icomarks.com/ico/terra-virtua,12713,0,0,0,Ended,30 Nov 2017,30 Nov 2017,06/02/2023


In [4]:
url_categ = 'art'#'artificial-intelligence'
expected_count=98
categ = 'AI'

cat_list = pd.DataFrame(columns=['Category', 'url', 'n_views', 'verified_email', 'is_STO', 'is_IEO',
                                 'status', 'start_date', 'end_date'])

In [8]:
# scroll down till "Show more" button disappear
show_more_path = '/html/body/section/div[2]/div[2]/div[2]/a'

print('   - Scrolling down')

driver = get_chromedriver(chromedriver_path = CHROMEDRIVER_PATH)
driver.get(urljoin(CATEGORY_PAGE, url_categ))

while driver.find_element("xpath", show_more_path).is_displayed():

    driver.execute_script("arguments[0].scrollIntoView(true);", driver.find_element("xpath", show_more_path))
    driver.find_element("xpath", show_more_path).click()
    time.sleep(2)

# get html
print('   - Downloading html')
soup = BeautifulSoup(driver.page_source, 'html.parser')

# extract information from web list
print('   - Parsing info')
tag = soup.find_all('div', class_="icoListContent", recursive=True)
tag_list = []
for t in tag[0]:
    if 'div class="newItems"' not in str(t):
        tag_list.append(t)
# if show more, html structure changes
nested_tags=soup.find_all('div', class_="newItems", recursive=True)
if len(nested_tags) > 0:
    for x in soup.find_all('div', class_="newItems", recursive=True):
        tag_list.extend([y for y in x])

temp_list = pd.DataFrame(columns=cat_list.columns)

for t in tag_list:

#     print('-----')
#     print(str(t))
#     print(conv_dict)
    
#     if 'STATUS' in str(t):
#     if any([i in str(t) for i in category['url_ref'].values]):
    if 'START' in str(t):
        
        conv_dict = convert(t)['div']
#         print('-----')
#         print(type(conv_dict))
#         print(conv_dict)
        for x in conv_dict:
    
            if x['@class'][0] == 'icoListItem__info':
                sup = x['a'][0]['sup']
                n_views = [y['#text'] for y in sup if y['@class'][0] == "sup_views"][0]
                is_sto = int(any([True if y['@class'][0] == "sup_is_sto" else False for y in sup]))
                is_ieo = int(any([True if y['@class'][0] == "sup_is_ieo" else False for y in sup]))
                ver_email = int(any([True if y['@class'][0] == "sup_email_confirmed" else False for y in sup]))
                url = x['a'][0]['@href']
            if x['@class'][0] == 'icoListItem__raised':
                status = x['#text']#[v['#text'] for k, v in x['span'][0].items() if ]
            if x['@class'][0] == 'icoListItem__start':
                start_date = x['navigablestring'][0]
            if x['@class'][0] == 'icoListItem__end':
                end_date = x['navigablestring'][0]

        temp_list = temp_list.append(pd.DataFrame({
            'Category':categ,
            'url': urljoin(MAIN_PAGE, url),
            'n_views': int(n_views.replace(' Views', '').replace(',', '')),
            'verified_email': ver_email,
            'is_STO': is_sto,
            'is_IEO': is_ieo,
            'status': status.replace('STATUS ', ''),
            'start_date': start_date,
            'end_date': end_date
        }, index = [0]))

cat_list = cat_list.append(temp_list)

if temp_list.shape[0] != expected_count:
    print('   ####### warning, expected number of elements (' + str(expected_count) + ') mismatch. Found ' + str(temp_list.shape[0]))

driver.close()

   - Scrolling down
   - Downloading html
   - Parsing info


NameError: name 'drive' is not defined

In [9]:
cat_list

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date
0,AI,https://icomarks.com/ico/flyguyz,861,0,0,0,Upcoming,01 Nov 2022,TBA
0,AI,https://icomarks.com/ico/lookscoin,13037,0,0,0,Active,12 May 2021,12 May 2023
0,AI,https://icomarks.com/ico/sonic,18773,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/all-of-art,17211,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/allpublicart,15949,0,0,0,Upcoming,TBA,TBA
...,...,...,...,...,...,...,...,...,...
0,AI,https://icomarks.com/ico/pixinch,14624,0,0,0,Ended,27 Feb 2018,13 Apr 2018
0,AI,https://icomarks.com/ico/fenix-cash,15171,0,0,0,Ended,21 Mar 2018,06 Apr 2018
0,AI,https://icomarks.com/ico/pibble,16257,0,0,0,Trading,14 Mar 2018,26 Mar 2018
0,AI,https://icomarks.com/ico/movieschain,14535,0,0,0,Ended,01 Feb 2018,15 Mar 2018


In [452]:
temp_list.shape

(98, 9)

In [455]:
pd.set_option('display.max_rows', 400)
temp_list

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date
0,AI,https://icomarks.com/ico/flyguyz,861,0,0,0,Upcoming,01 Nov 2022,TBA
0,AI,https://icomarks.com/ico/lookscoin,13037,0,0,0,Active,12 May 2021,12 May 2023
0,AI,https://icomarks.com/ico/sonic,18773,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/all-of-art,17211,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/allpublicart,15949,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/artchain-global,13921,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/codex,16208,1,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/dresscode,13355,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/newcater,14024,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/ethergo,12649,0,0,0,Upcoming,TBA,TBA


In [389]:
convert(tag_list[99])

{'@class': ['icoListItem'],
 '#text': 'Saisho 13,502 Views Saisho is the organized, decentralized, liquid and stable market ecosystem that connects artists, collectors and patrons to facilitate the creation of... Ended START 23 July 2018 END 23 Mar 2019 6.7 undefined undefined',
 'div': [{'@class': ['icoListItem__img'],
   '#text': '',
   'img': [{'@src': '/icache/files/companies/42/83db5a2cd1d6a2a6a5c108140a7a299e_50x50.jpg',
     '@alt': 'icoList',
     '#text': ''}]},
  {'@class': ['icoListItem__info'],
   '#text': 'Saisho 13,502 Views Saisho is the organized, decentralized, liquid and stable market ecosystem that connects artists, collectors and patrons to facilitate the creation of...',
   'a': [{'@class': ['icoListItem__title'],
     '@href': '/ico/saisho',
     '#text': 'Saisho 13,502 Views',
     'navigablestring': ['Saisho'],
     'sup': [{'@class': ['sup_views'],
       '#text': '13,502 Views',
       'navigablestring': ['13,502 Views']}]}],
   'div': [{'@class': ['icoListIte

In [390]:
convert(tag_list[41])

{'@class': ['icoListItem'],
 '#text': 'Buffy Inu 504 Views EMAIL confirmed The new Buffy meme token that will be investigating the true identity! Buffy Inu uses blockchain to align incentives of different stakeholders, developers,... STATUS Upcoming START TBA END TBA 1.6 undefined undefined',
 'div': [{'@class': ['icoListItem__img'],
   '#text': '',
   'img': [{'@src': '/icache/files/companies/84/8374155bcdb9ee16606c93a0889c0681b831_50x50.jpeg',
     '@class': ['lazyload'],
     '#text': ''}]},
  {'@class': ['icoListItem__info'],
   '#text': 'Buffy Inu 504 Views EMAIL confirmed The new Buffy meme token that will be investigating the true identity! Buffy Inu uses blockchain to align incentives of different stakeholders, developers,...',
   'a': [{'@class': ['icoListItem__title'],
     '@href': '/ico/buffy-inu',
     '#text': 'Buffy Inu 504 Views EMAIL confirmed',
     'navigablestring': ['Buffy Inu'],
     'sup': [{'@class': ['sup_views'],
       '#text': '504 Views',
       'navigables

In [380]:
temp_list

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date
0,AI,https://icomarks.com/ico/flyguyz,860,0,0,0,Upcoming,01 Nov 2022,TBA
0,AI,https://icomarks.com/ico/lookscoin,13037,0,0,0,Active,12 May 2021,12 May 2023
0,AI,https://icomarks.com/ico/sonic,18773,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/all-of-art,17211,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/allpublicart,15949,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/artchain-global,13921,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/codex,16208,1,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/dresscode,13355,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/newcater,14024,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/ethergo,12649,0,0,0,Upcoming,TBA,TBA


In [348]:
soup.find_all('div', class_="newItems", recursive=True)

[]

In [347]:
[x for x in soup.find_all('div', class_="newItems", recursive=True)[0]]

IndexError: list index out of range

In [313]:
temp_list = pd.DataFrame(columns=cat_list.columns)

for t in tag[0]:

    if 'STATUS' in str(t):
        ll.append(convert(t)['div'])
        conv_dict = convert(t)['div']
        for x in conv_dict:
    
            if x['@class'][0] == 'icoListItem__info':
                sup = x['a'][0]['sup']
                n_views = [y['#text'] for y in sup if y['@class'][0] == "sup_views"][0]
                is_sto = int(any([True if y['@class'][0] == "sup_is_sto" else False for y in sup]))
                is_ieo = int(any([True if y['@class'][0] == "sup_is_ieo" else False for y in sup]))
                ver_email = int(any([True if y['@class'][0] == "sup_email_confirmed" else False for y in sup]))
                url = x['a'][0]['@href']
            if x['@class'][0] == 'icoListItem__raised':
                status = x['#text']#[v['#text'] for k, v in x['span'][0].items() if ]
            if x['@class'][0] == 'icoListItem__start':
                start_date = x['navigablestring'][0]
            if x['@class'][0] == 'icoListItem__end':
                end_date = x['navigablestring'][0]

        temp_list = temp_list.append(pd.DataFrame({
            'Category':categ,
            'url': urljoin(MAIN_PAGE, url),
            'n_views': int(n_views.replace(' Views', '').replace(',', '')),
            'verified_email': ver_email,
            'is_STO': is_sto,
            'is_IEO': is_ieo,
            'status': status.replace('STATUS ', ''),
            'start_date': start_date,
            'end_date': end_date
        }, index = [0]))

In [314]:
temp_list

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date
0,AI,https://icomarks.com/ico/flyguyz,860,0,0,0,Upcoming,01 Nov 2022,TBA
0,AI,https://icomarks.com/ico/lookscoin,13037,0,0,0,Active,12 May 2021,12 May 2023
0,AI,https://icomarks.com/ico/sonic,18773,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/all-of-art,17211,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/allpublicart,15949,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/artchain-global,13920,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/codex,16207,1,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/dresscode,13354,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/newcater,14023,0,0,0,Upcoming,TBA,TBA
0,AI,https://icomarks.com/ico/ethergo,12648,0,0,0,Upcoming,TBA,TBA


In [315]:
temp_list.shape

(40, 9)

In [302]:
print('\n   ####### warning, expected number of elements (' + str(expected_count) + ') mismatch. Found ' + str(cat_list.shape[0]))




In [307]:
soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.prettify())

<html lang="en-EN">
 <head>
  <meta charset="utf-8"/>
  <title>
   Art ICOs | ICOmarks
  </title>
  <link href="/assets/img/favicon.png?11" rel="shortcut icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
  <meta content="Art ICOs - Get all information about cryptocurrency ICOs (Initial Coin Offering) from category - Art." name="description"/>
  <meta content="ico, cryptocurrency, blockchain, token, market, rating, price, initial, coin, offering, list, Art" name="keywords"/>
  <link href="/assets/css/main.min.css?v27" rel="stylesheet"/>
  <link href="/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
  <link href="/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
  <link href="/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
  <link href="/manifest.json" rel="manifest"/>
  <link color="#3b4c85" href="/safari-pinned-tab.svg" rel="mask-icon"/>
  <meta content="#f

In [283]:
tag = soup.find_all('div', class_="icoListContent", recursive=True)

In [318]:
len(tag[0])

84

In [317]:
# extract information from web list
tag = soup.find_all('div', class_="icoListContent", recursive=True)

temp_list = pd.DataFrame(columns=['Category', 'url', 'n_views', 'verified_email', 'is_STO', 'is_IEO',
                                 'status', 'start_date', 'end_date'])

cc = 0
ll=[]
for t in tag[0]:

    if 'STATUS' in str(t):
        ll.append(convert(t)['div'])
        conv_dict = convert(t)['div']
        for x in conv_dict:
    
            if x['@class'][0] == 'icoListItem__info':
                sup = x['a'][0]['sup']
                n_views = [y['#text'] for y in sup if y['@class'][0] == "sup_views"][0]
                is_sto = int(any([True if y['@class'][0] == "sup_is_sto" else False for y in sup]))
                is_ieo = int(any([True if y['@class'][0] == "sup_is_ieo" else False for y in sup]))
                ver_email = int(any([True if y['@class'][0] == "sup_email_confirmed" else False for y in sup]))
                url = x['a'][0]['@href']
            if x['@class'][0] == 'icoListItem__raised':
                status = x['#text']#[v['#text'] for k, v in x['span'][0].items() if ]
            if x['@class'][0] == 'icoListItem__start':
                start_date = x['navigablestring'][0]
            if x['@class'][0] == 'icoListItem__end':
                end_date = x['navigablestring'][0]

        temp_list = temp_list.append(pd.DataFrame({
            'Category':categ,
            'url': urljoin(MAIN_PAGE, url),
            'n_views': int(n_views.replace(' Views', '').replace(',', '')),
            'verified_email': ver_email,
            'is_STO': is_sto,
            'is_IEO': is_ieo,
            'status': status.replace('STATUS ', ''),
            'start_date': start_date,
            'end_date': end_date
        }, index = [cc]))
        cc += 1
temp_list

Unnamed: 0,Category,url,n_views,verified_email,is_STO,is_IEO,status,start_date,end_date
0,AI,https://icomarks.com/ico/flyguyz,860,0,0,0,Upcoming,01 Nov 2022,TBA
1,AI,https://icomarks.com/ico/lookscoin,13037,0,0,0,Active,12 May 2021,12 May 2023
2,AI,https://icomarks.com/ico/sonic,18773,0,0,0,Upcoming,TBA,TBA
3,AI,https://icomarks.com/ico/all-of-art,17211,0,0,0,Upcoming,TBA,TBA
4,AI,https://icomarks.com/ico/allpublicart,15949,0,0,0,Upcoming,TBA,TBA
5,AI,https://icomarks.com/ico/artchain-global,13920,0,0,0,Upcoming,TBA,TBA
6,AI,https://icomarks.com/ico/codex,16207,1,0,0,Upcoming,TBA,TBA
7,AI,https://icomarks.com/ico/dresscode,13354,0,0,0,Upcoming,TBA,TBA
8,AI,https://icomarks.com/ico/newcater,14023,0,0,0,Upcoming,TBA,TBA
9,AI,https://icomarks.com/ico/ethergo,12648,0,0,0,Upcoming,TBA,TBA


In [294]:
driver.find_element("xpath", show_more_path).is_displayed()

True

In [299]:
# scroll down till "Show more" button disappear
show_more_path = '/html/body/section/div[2]/div[2]/div[2]/a'

driver = get_chromedriver(chromedriver_path = CHROMEDRIVER_PATH)
driver.get(urljoin(MAIN_PAGE, url_categ))


while driver.find_element("xpath", show_more_path).is_displayed():

    driver.execute_script("arguments[0].scrollIntoView(true);", driver.find_element("xpath", show_more_path))
    driver.find_element("xpath", show_more_path).click()
    time.sleep(2)

In [None]:
/html/body/section/div[2]/div[2]/div[2]/a

In [275]:
driver.find_element("xpath", '/html/body/section/div[2]/div[2]/div[2]/a').click()

In [269]:
driver.find_element("xpath", '/html/body/section/div[2]/div[2]/div[2]/a')

<selenium.webdriver.remote.webelement.WebElement (session="e51878aec4964ad1ec669181cbfabf84", element="34d2b556-b1c4-491b-a232-c32472484592")>

In [277]:
options = driver.find_element("xpath", '/html/body/section/div[2]/div[2]/div[2]/a')
driver.execute_script("arguments[0].scrollIntoView(true);",options)

In [None]:
/html/body/section/div[2]/div[2]/div[2]/a

In [252]:
int('18,790,000 Views'.replace(' Views', '').replace(',', ''))

18790000

In [233]:
conv_dict = ll[38]
sup = conv_dict[1]['a'][0]['sup']
sup

[{'@class': ['sup_is_ieo'], '#text': 'IEO', 'navigablestring': ['IEO']},
 {'@class': ['sup_views'],
  '#text': '2,238 Views',
  'navigablestring': ['2,238 Views']},
 {'@class': ['sup_email_confirmed'],
  '#text': 'EMAIL confirmed',
  'navigablestring': ['EMAIL confirmed']}]

In [247]:
ll[38]

[{'@class': ['icoListItem__img'],
  '#text': '',
  'img': [{'@src': '/icache/files/companies/73/7299f16d3faa498ba654115d79b8022dc81f_50x50.jpg',
    '@class': ['lazyload'],
    '#text': ''}]},
 {'@class': ['icoListItem__info'],
  '#text': 'Baby Token IEO 2,238 Views EMAIL confirmed Baby Token project aims to use blockchain and AI technologies in telemedicine, employment and financial services for expectant mothers and families through...',
  'a': [{'@class': ['icoListItem__title'],
    '@href': '/ico/baby-token',
    '#text': 'Baby Token IEO 2,238 Views EMAIL confirmed',
    'navigablestring': ['Baby Token'],
    'sup': [{'@class': ['sup_is_ieo'],
      '#text': 'IEO',
      'navigablestring': ['IEO']},
     {'@class': ['sup_views'],
      '#text': '2,238 Views',
      'navigablestring': ['2,238 Views']},
     {'@class': ['sup_email_confirmed'],
      '#text': 'EMAIL confirmed',
      'navigablestring': ['EMAIL confirmed']}]}],
  'div': [{'@class': ['icoListItem__description'],
    '#t

In [224]:
ll[38]

[{'@class': ['icoListItem__img'],
  '#text': '',
  'img': [{'@src': '/icache/files/companies/73/7299f16d3faa498ba654115d79b8022dc81f_50x50.jpg',
    '@class': ['lazyload'],
    '#text': ''}]},
 {'@class': ['icoListItem__info'],
  '#text': 'Baby Token IEO 2,238 Views EMAIL confirmed Baby Token project aims to use blockchain and AI technologies in telemedicine, employment and financial services for expectant mothers and families through...',
  'a': [{'@class': ['icoListItem__title'],
    '@href': '/ico/baby-token',
    '#text': 'Baby Token IEO 2,238 Views EMAIL confirmed',
    'navigablestring': ['Baby Token'],
    'sup': [{'@class': ['sup_is_ieo'],
      '#text': 'IEO',
      'navigablestring': ['IEO']},
     {'@class': ['sup_views'],
      '#text': '2,238 Views',
      'navigablestring': ['2,238 Views']},
     {'@class': ['sup_email_confirmed'],
      '#text': 'EMAIL confirmed',
      'navigablestring': ['EMAIL confirmed']}]}],
  'div': [{'@class': ['icoListItem__description'],
    '#t

In [217]:
x

{'@class': ['icoListItem__start'],
 '#text': 'START 01 July 2022',
 'div': [{'@class': ['mobile'],
   '#text': 'START',
   'a': [{'@class': ['icoTop__sort', 'js-sort'],
     '#text': 'START',
     'navigablestring': ['START']}]}],
 'navigablestring': ['01 July 2022']}

In [214]:
t

<div class="icoListItem">
<div class="icoListItem__img">
<img class="lazyload" src="/assets/img/noicon.png"/>
</div>
<div class="icoListItem__info">
<a class="icoListItem__title" href="/ico/justbam">JustBam                                                                                                                                                 <sup class="sup_views">333 Views</sup>
</a>
<div class="icoListItem__description">BAM is the first Social Media to Earn Token, users Can socialize to earn BAM through a point system.</div>
</div>
<div class="icoListItem__raised">
<div class="mobile">
<a class="icoTop__sort js-sort" data-field="started">STATUS</a>
</div>
<span class="circle-active"></span>
<span>Active</span>
</div>
<div class="icoListItem__start">
<div class="mobile">
<a class="icoTop__sort js-sort">START</a>
</div>
01 July 2022
</div>
<div class="icoListItem__end">
<div class="mobile">
<a class="icoTop__sort js-sort">END</a>
</div>
28 Dec 2022
</div>
<div class="icoListItem

In [208]:
conv_dict

[{'@class': ['icoListItem__img'],
  '#text': '',
  'img': [{'@src': '/assets/img/noicon.png',
    '@class': ['lazyload'],
    '#text': ''}]},
 {'@class': ['icoListItem__info'],
  '#text': 'JustBam 333 Views BAM is the first Social Media to Earn Token, users Can socialize to earn BAM through a point system.',
  'a': [{'@class': ['icoListItem__title'],
    '@href': '/ico/justbam',
    '#text': 'JustBam 333 Views',
    'navigablestring': ['JustBam'],
    'sup': [{'@class': ['sup_views'],
      '#text': '333 Views',
      'navigablestring': ['333 Views']}]}],
  'div': [{'@class': ['icoListItem__description'],
    '#text': 'BAM is the first Social Media to Earn Token, users Can socialize to earn BAM through a point system.',
    'navigablestring': ['BAM is the first Social Media to Earn Token, users Can socialize to earn BAM through a point system.']}]},
 {'@class': ['icoListItem__raised'],
  '#text': 'STATUS Active',
  'div': [{'@class': ['mobile'],
    '#text': 'STATUS',
    'a': [{'@clas

In [206]:
t

<div class="icoListItem">
<div class="icoListItem__img">
<img class="lazyload" src="/assets/img/noicon.png"/>
</div>
<div class="icoListItem__info">
<a class="icoListItem__title" href="/ico/justbam">JustBam                                                                                                                                                 <sup class="sup_views">333 Views</sup>
</a>
<div class="icoListItem__description">BAM is the first Social Media to Earn Token, users Can socialize to earn BAM through a point system.</div>
</div>
<div class="icoListItem__raised">
<div class="mobile">
<a class="icoTop__sort js-sort" data-field="started">STATUS</a>
</div>
<span class="circle-active"></span>
<span>Active</span>
</div>
<div class="icoListItem__start">
<div class="mobile">
<a class="icoTop__sort js-sort">START</a>
</div>
01 July 2022
</div>
<div class="icoListItem__end">
<div class="mobile">
<a class="icoTop__sort js-sort">END</a>
</div>
28 Dec 2022
</div>
<div class="icoListItem

In [200]:
x

{'@class': ['icoListItem__img'],
 '#text': '',
 'img': [{'@src': '/assets/img/noicon.png',
   '@class': ['lazyload'],
   '#text': ''}]}

In [196]:
x

{'@class': ['icoListItem__img'],
 '#text': '',
 'img': [{'@src': '/assets/img/noicon.png',
   '@class': ['lazyload'],
   '#text': ''}]}

In [197]:
'STATUS' in str(x)

False

In [185]:
conv_dict = convert(cc[1])['div']
conv_dict

[{'@class': ['icoListItem__img'],
  '#text': '',
  'img': [{'@src': '/icache/files/companies/91/90607213eda00d3e7ad44c102ad84121d042_50x50.png',
    '@class': ['lazyload'],
    '#text': ''}]},
 {'@class': ['icoListItem__info'],
  '#text': 'Vietnam Smarthub Logistics 435 Views EMAIL confirmed Vietnam Smarthub Logistics (VSL) - The technology ecosystem connects logistics activities, operating 24/24. Create an intermediary operating center, handle...',
  'a': [{'@class': ['icoListItem__title'],
    '@href': '/ico/vietnam-smarthub-logistics',
    '#text': 'Vietnam Smarthub Logistics 435 Views EMAIL confirmed',
    'navigablestring': ['Vietnam Smarthub Logistics'],
    'sup': [{'@class': ['sup_views'],
      '#text': '435 Views',
      'navigablestring': ['435 Views']},
     {'@class': ['sup_email_confirmed'],
      '#text': 'EMAIL confirmed',
      'navigablestring': ['EMAIL confirmed']}]}],
  'div': [{'@class': ['icoListItem__description'],
    '#text': 'Vietnam Smarthub Logistics (VSL) -

In [None]:
for x in conv_dict:
    
    if x['@class'][0] == 'icoListItem__info':
        n_views = x['a'][0]['sup'][0]['#text']
    if x['@class'][0] == 'icoListItem__raised':
        status = x['span'][0]['#text']
    if x['@class'][0] == 'icoListItem__start':
        start_date = x['div'][0]['navigablestring'][0]
    if x['@class'][0] == 'icoListItem__end':
        start_date = x['div'][0]['navigablestring'][0]

In [170]:
dd.keys()

dict_keys(['@class', '#text', 'div'])

In [178]:
tag = soup.find_all('div', class_="icoListContent", recursive=True)
tag

[<div class="icoListContent">
 <div class="icoListItem">
 <div class="icoListItem__img">
 <img class="lazyload" src="/assets/img/noicon.png"/>
 </div>
 <div class="icoListItem__info">
 <a class="icoListItem__title" href="/ico/justbam">JustBam                                                                                                                                                 <sup class="sup_views">333 Views</sup>
 </a>
 <div class="icoListItem__description">BAM is the first Social Media to Earn Token, users Can socialize to earn BAM through a point system.</div>
 </div>
 <div class="icoListItem__raised">
 <div class="mobile">
 <a class="icoTop__sort js-sort" data-field="started">STATUS</a>
 </div>
 <span class="circle-active"></span>
 <span>Active</span>
 </div>
 <div class="icoListItem__start">
 <div class="mobile">
 <a class="icoTop__sort js-sort">START</a>
 </div>
 01 July 2022
 </div>
 <div class="icoListItem__end">
 <div class="mobile">
 <a class="icoTop__sort js-sort">EN

In [5]:
tag = soup.find_all('div', class_='companyTab active', recursive=True)
tag

[<div class="companyTab active" id="ico">
 <div class="container">
 <div class="icoinfo">
 <div class="icoinfo-left">
 <div class="icoinfo-block">
 <div class="icoinfo-block__title">General</div>
 <div class="icoinfo-block-content">
 <div class="icoinfo-block__item">
 <span>Website:</span>
 <a class="icoinfo-block__view" href="https://mindsync.ai?utm_source=icomarks" target="_blank">
 <svg class="icon icon-link">
 <use xlink:href="/assets/img/spritesvg.svg#link"></use>
 </svg>Visit
 </a>
 </div>
 <div class="icoinfo-block__item">
 <span>White paper:</span>
 <a class="icoinfo-block__read" href="https://mindsync.ai/docs/whitepaper.pdf" target="_blank">
 <svg class="icon icon-whitepaper">
 <use xlink:href="/assets/img/spritesvg.svg#whitepaper"></use>
 </svg>Read
 </a>
 </div>
 <div class="icoinfo-block__item">
 <span>Bounty:</span>
 <a class="icoinfo-block__read" href="https://bitcointalk.org/index.php?topic=5077069.0" style="color:#9c27b0" target="_blank">
 <svg class="icon icon-bounty" 

In [7]:
tag = soup.find_all('div', class_='container')[5]
tag

<div class="container">
<ul class="swimm-menu js-swimm-menu">
<li><a class="active" href="#ico">ICO Details</a></li>
<li><a href="#social">Social Stats</a></li> <li><a href="#images">Images (7)</a></li> <li><a href="#milestones">Milestones (13)</a></li> <li><a href="#team">Team (7)</a></li> <li><a href="#news">News</a></li> <li><a href="#widget">Widget</a></li>
<li><a href="#comments">Comments</a></li>
</ul>
</div>

In [6]:
from soup2dict import convert

In [None]:
dict_tag = convert(tag)


In [45]:
convert(tag)['div'][0]['div'][0]['div'][0]

{'@class': ['icoinfo'],
 '#text': 'General Website: Visit White paper: Read Bounty: Bounty MVP: Available Pre-sale Time: 15 Dec 2018  -  15 Jan 2019 ICO Time: 01 Mar 2019  -  30 Apr 2021 Whitelist/KYC: Whitelist Country: UK Token info Ticker: MAI Platform: Ethereum Token Type: Utility Available for sale: 75,000,000 MAI (50%) Total supply: 150,000,000.00 MAI Financial Raised $ 4,900,000 ICO Price: 1 MAI = 0.14 USD Accepting: ETH, BTC, USDT, LTC & etc Soft cap: 30,000,000 MAI Hard cap: 75,000,000 MAI Social media Links: Bitcointalk Facebook Twitter Telegram Reddit Github Medium Instagram Youtube',
 'div': [{'@class': ['icoinfo-left'],
   '#text': 'General Website: Visit White paper: Read Bounty: Bounty MVP: Available Pre-sale Time: 15 Dec 2018  -  15 Jan 2019 ICO Time: 01 Mar 2019  -  30 Apr 2021 Whitelist/KYC: Whitelist Country: UK Token info Ticker: MAI Platform: Ethereum Token Type: Utility Available for sale: 75,000,000 MAI (50%) Total supply: 150,000,000.00 MAI',
   'div': [{'@class

In [37]:
dd = convert(tag)['div'][0]['div'][0]['div'][0]['div']
dd

[{'@class': ['icoinfo-left'],
  '#text': 'General Website: Visit White paper: Read Bounty: Bounty MVP: Available Pre-sale Time: 15 Dec 2018  -  15 Jan 2019 ICO Time: 01 Mar 2019  -  30 Apr 2021 Whitelist/KYC: Whitelist Country: UK Token info Ticker: MAI Platform: Ethereum Token Type: Utility Available for sale: 75,000,000 MAI (50%) Total supply: 150,000,000.00 MAI',
  'div': [{'@class': ['icoinfo-block'],
    '#text': 'General Website: Visit White paper: Read Bounty: Bounty MVP: Available Pre-sale Time: 15 Dec 2018  -  15 Jan 2019 ICO Time: 01 Mar 2019  -  30 Apr 2021 Whitelist/KYC: Whitelist Country: UK',
    'div': [{'@class': ['icoinfo-block__title'],
      '#text': 'General',
      'navigablestring': ['General']},
     {'@class': ['icoinfo-block-content'],
      '#text': 'Website: Visit White paper: Read Bounty: Bounty MVP: Available Pre-sale Time: 15 Dec 2018  -  15 Jan 2019 ICO Time: 01 Mar 2019  -  30 Apr 2021 Whitelist/KYC: Whitelist Country: UK',
      'div': [{'@class': ['ico

In [41]:
dd[0]

{'@class': ['icoinfo-left'],
 '#text': 'General Website: Visit White paper: Read Bounty: Bounty MVP: Available Pre-sale Time: 15 Dec 2018  -  15 Jan 2019 ICO Time: 01 Mar 2019  -  30 Apr 2021 Whitelist/KYC: Whitelist Country: UK Token info Ticker: MAI Platform: Ethereum Token Type: Utility Available for sale: 75,000,000 MAI (50%) Total supply: 150,000,000.00 MAI',
 'div': [{'@class': ['icoinfo-block'],
   '#text': 'General Website: Visit White paper: Read Bounty: Bounty MVP: Available Pre-sale Time: 15 Dec 2018  -  15 Jan 2019 ICO Time: 01 Mar 2019  -  30 Apr 2021 Whitelist/KYC: Whitelist Country: UK',
   'div': [{'@class': ['icoinfo-block__title'],
     '#text': 'General',
     'navigablestring': ['General']},
    {'@class': ['icoinfo-block-content'],
     '#text': 'Website: Visit White paper: Read Bounty: Bounty MVP: Available Pre-sale Time: 15 Dec 2018  -  15 Jan 2019 ICO Time: 01 Mar 2019  -  30 Apr 2021 Whitelist/KYC: Whitelist Country: UK',
     'div': [{'@class': ['icoinfo-block

In [42]:
dd[1]

{'@class': ['icoinfo-right'],
 '#text': 'Financial Raised $ 4,900,000 ICO Price: 1 MAI = 0.14 USD Accepting: ETH, BTC, USDT, LTC & etc Soft cap: 30,000,000 MAI Hard cap: 75,000,000 MAI Social media Links: Bitcointalk Facebook Twitter Telegram Reddit Github Medium Instagram Youtube',
 'div': [{'@class': ['icoinfo-block'],
   '#text': 'Financial Raised $ 4,900,000 ICO Price: 1 MAI = 0.14 USD Accepting: ETH, BTC, USDT, LTC & etc Soft cap: 30,000,000 MAI Hard cap: 75,000,000 MAI',
   'div': [{'@class': ['icoinfo-block__title'],
     '#text': 'Financial',
     'navigablestring': ['Financial']},
    {'@class': ['icoinfo-block-content'],
     '#text': 'Raised $ 4,900,000 ICO Price: 1 MAI = 0.14 USD Accepting: ETH, BTC, USDT, LTC & etc Soft cap: 30,000,000 MAI Hard cap: 75,000,000 MAI',
     'div': [{'@class': ['icoinfo-block__item'],
       '#text': 'Raised $ 4,900,000',
       'span': [{'#text': 'Raised', 'navigablestring': ['Raised']}],
       'i': [{'@data-position': 'up',
         '@style

In [38]:
len(dd)

2

In [40]:
dd.keys()

AttributeError: 'list' object has no attribute 'keys'

In [10]:
children = tag.findChildren("a" , recursive=False)
for child in children:
    print(child)

AttributeError: ResultSet object has no attribute 'findChildren'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [44]:
['ICO Details' in str(x) for x in soup.find_all('div', class_='container')] 

[False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [43]:
[x for x in soup.find_all('div', class_='container') if 'ICO Details' in str(x)] 

[<div class="container">
 <ul class="swimm-menu js-swimm-menu">
 <li><a class="active" href="#ico">ICO Details</a></li>
 <li><a href="#social">Social Stats</a></li> <li><a href="#images">Images (7)</a></li> <li><a href="#milestones">Milestones (13)</a></li> <li><a href="#team">Team (7)</a></li> <li><a href="#news">News</a></li> <li><a href="#widget">Widget</a></li>
 <li><a href="#comments">Comments</a></li>
 </ul>
 </div>]

In [40]:
'Submit ICO' in str(soup.find_all('div', class_='container')[0])

True

In [25]:
soup.find(id="ico")

<div class="companyTab active" id="ico">
<div class="container">
<div class="icoinfo">
<div class="icoinfo-left">
<div class="icoinfo-block">
<div class="icoinfo-block__title">General</div>
<div class="icoinfo-block-content">
<div class="icoinfo-block__item">
<span>Website:</span>
<a class="icoinfo-block__view" href="https://mindsync.ai?utm_source=icomarks" target="_blank">
<svg class="icon icon-link">
<use xlink:href="/assets/img/spritesvg.svg#link"></use>
</svg>Visit
</a>
</div>
<div class="icoinfo-block__item">
<span>White paper:</span>
<a class="icoinfo-block__read" href="https://mindsync.ai/docs/whitepaper.pdf" target="_blank">
<svg class="icon icon-whitepaper">
<use xlink:href="/assets/img/spritesvg.svg#whitepaper"></use>
</svg>Read
</a>
</div>
<div class="icoinfo-block__item">
<span>Bounty:</span>
<a class="icoinfo-block__read" href="https://bitcointalk.org/index.php?topic=5077069.0" style="color:#9c27b0" target="_blank">
<svg class="icon icon-bounty" style="fill:#9c27b0">
<use 

In [21]:
list(soup.children)

['html',
 '\n',
 <html lang="en-EN">
 <head>
 <meta charset="utf-8"/>
 <title>Mindsync (MAI) - ICO Rating and Overview | ICOmarks</title>
 <link href="/assets/img/favicon.png?11" rel="shortcut icon" type="image/png"/>
 <meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
 <meta content="Mindsync ICO ✅ Get full information about Mindsync - ICO details, Rating, (MAI) Token price, White paper, Team and more." name="description">
 <meta content="ico, cryptocurrency, blockchain, token, market, rating, price, initial, coin, offering, list, Mindsync" name="keywords">
 <link href="/assets/css/main.min.css?v27" rel="stylesheet"/>
 <link href="/assets/libs/PhotoSwipe/photoswipe.css" rel="stylesheet" type="text/css"><link href="/assets/libs/PhotoSwipe/default-skin/default-skin.css" rel="stylesheet" type="text/css"> <link href="/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
 <link href="/favicon-32x32.png" rel="icon" sizes="3

In [19]:
soup1

<!DOCTYPE html>

<html lang="en-EN">
<head>
<meta charset="utf-8"/>
<title>Mindsync (MAI) - ICO Rating and Overview | ICOmarks</title>
<link href="/assets/img/favicon.png?11" rel="shortcut icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="Mindsync ICO ✅ Get full information about Mindsync - ICO details, Rating, (MAI) Token price, White paper, Team and more." name="description">
<meta content="ico, cryptocurrency, blockchain, token, market, rating, price, initial, coin, offering, list, Mindsync" name="keywords">
<link href="/assets/css/main.min.css?v27" rel="stylesheet"/>
<link href="/assets/libs/PhotoSwipe/photoswipe.css" rel="stylesheet" type="text/css"><link href="/assets/libs/PhotoSwipe/default-skin/default-skin.css" rel="stylesheet" type="text/css"> <link href="/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/favicon-32x32.png" rel="icon" sizes="32x32" type

In [8]:
html = driver.page_source
soup = BeautifulSoup(html)

In [10]:
for script in soup(["script", "style"]):
    script.extract()

In [13]:
soup

<html lang="en-EN"><head>
<meta charset="utf-8"/>
<title>Mindsync (MAI) - ICO Rating and Overview | ICOmarks</title>
<link href="/assets/img/favicon.png?11" rel="shortcut icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="Mindsync ICO ✅ Get full information about Mindsync - ICO details, Rating, (MAI) Token price, White paper, Team and more." name="description"/>
<meta content="ico, cryptocurrency, blockchain, token, market, rating, price, initial, coin, offering, list, Mindsync" name="keywords"/>
<link href="/assets/css/main.min.css?v27" rel="stylesheet"/>
<link href="/assets/libs/PhotoSwipe/photoswipe.css" rel="stylesheet" type="text/css"/><link href="/assets/libs/PhotoSwipe/default-skin/default-skin.css" rel="stylesheet" type="text/css"/> <link href="/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>

In [12]:
text = soup.get_text()
text

"\n\nMindsync (MAI) - ICO Rating and Overview | ICOmarks\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThis website uses cookies to ensure you get the best experience on our website. Learn moreGot it!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n BTC $ 38,308.28 -0.138315%\n ETH $ 2,751.54  0.461085%\n MARKET CAP $  0%\n ICOs 8,196\n\n\nSubmit ICO /\nAirdrop\n★ Get Promotion\n\n\n\n\n\n\n \n\nNFTsProjectsDeFiAirdropsSTOsPromotionMarks newsWatchlist (0)\n\n\n\n\n\n\n\n\n\n\n\n\nNFTs\nProjects\n★ DeFi\nAirdrops\nSTOs\n★ Promotion\nMarks news\n\n\nEmail*Subscribe \n\n\n\n\n\n0\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nICOmarks\n\n\n\n\n\nICOs\n\n\n\n\nMindsync\n\n\n\n\n\nFollow us:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMindsync\nICO Ended 50,965 Views\n\n\n\n\n\nVisit Website\n\n\n\n\n\nLast screenshot taken on20 Mar 2020.\nWebsite  is active by the 23 May 2021\n\n\n\n\n\n\n\n\n\n\nAdd to Watchlist\n\n\n\n\n\n\n\n\n\n\n\n\n\nMindsync is a platform to solve customer's tasks

In [52]:
HTMLFile = open('C:\\Users\\Alessandro Bitetto\\Downloads\\Ended ICO & IEO (Token Sale) List with ratings and analysis - ICO Drops.html', "r", encoding='utf-8')
index = HTMLFile.read()
soup = BeautifulSoup(index, 'html.parser') 

In [53]:
print(soup.prettify())

<!DOCTYPE html>
<!-- saved from url=(0040)https://icodrops.com/category/ended-ico/ -->
<html id="admin" lang="en-US" prefix="og: http://ogp.me/ns#">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1, minimum-scale=1" name="viewport"/>
  <meta content="summary" name="twitter:card"/>
  <meta content="@ICODrops" name="twitter:site"/>
  <meta content="ICO Drops" name="twitter:title"/>
  <meta content="The Simple ICO List." name="twitter:description"/>
  <meta content="http://icodrops.com/wp-content/uploads/2017/08/cropped-Group.png" name="twitter:image"/>
  <link href="https://gmpg.org/xfn/11" rel="profile"/>
  <title>
   Ended ICO &amp; IEO (Token Sale) List with ratings and analysis - ICO Drops
  </title>
  <meta content="max-image-preview:large" name="robots"/>
  <meta content="The only complete Ended ICOs &amp; IEO (Initial Exchange Offering) list. All ended token sales are sorted by date, have our rating