In [1]:
from bs4 import BeautifulSoup
from utils import get_chromedriver, adjust_date, eval_duration, scrape_info_icomarks
from utils import extract_scaping_icomarks, summary_stats, format_columns
import requests
import pandas as pd
from urllib.parse import urljoin
from soup2dict import convert
import time
from timeit import default_timer as timer
import datetime
import numpy as np
import re
import pandas as pd
import os
import joblib
import sys
from thefuzz import fuzz
from thefuzz import process

In [2]:
# Location of the ChromeDriver executable that is passed to the scraping helpers.
# Set the CHROMEDRIVER_PATH environment variable to override it on another machine;
# the fallback is the path originally hard-coded here, kept for backward compatibility.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\Alessandro Bitetto\Downloads\UniPV\ICOs\WebDriver\chromedriver",
)

In [3]:
# set folders
# Paths are built with os.path.join so the notebook also runs outside Windows;
# on Windows the resulting strings are the same as the former '.\\Checkpoints' literals.
CHECKPOINT_FOLDER = os.path.join('.', 'Checkpoints')      # intermediate pickles / per-ICO cache
RESULTS_FOLDER = os.path.join('.', 'Results')             # csv outputs
ICOMARKS_FOLDER = os.path.join(CHECKPOINT_FOLDER, 'Icomarks')

# exist_ok=True keeps the cell idempotent: re-running it never fails on existing folders
for folder in (CHECKPOINT_FOLDER, RESULTS_FOLDER, ICOMARKS_FOLDER):
    os.makedirs(folder, exist_ok=True)

## Get ICO URLs

In [6]:
MAIN_PAGE = "https://icomarks.com/"                # to be added to single ICO url
CATEGORY_PAGE = "https://icomarks.com/icos/"       # used to query the category to be downloaded

# get html (fail early with a clear HTTP error instead of a confusing parsing error later on)
page = requests.get(CATEGORY_PAGE, timeout=60)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# extract list of categories
tag = soup.find_all('div', class_="icoTop__selects", recursive=True)
conv_dict = convert(tag)
# walk down the first-child chain until the div carrying the category <select> is reached
while conv_dict['div'][0]['@class'][0] != 'icoTop__selects':
    conv_dict = conv_dict['div'][0]
category_list = conv_dict['div'][0]['form'][0]['select'][0]['option']

# keep only the <option> entries that carry a value attribute
category = pd.DataFrame([(v['@value'], v['#text']) for v in category_list if '@value' in v],
                        columns=['url_ref', 'Category'])

# option text looks like "<name> (<count>)": split on the first '(' only.
# `n` is passed by keyword: the positional form is deprecated in pandas.
category[['Category', 'Count']] = category['Category'].str.split('(', n=1, expand=True)
category['Count'] = category['Count'].apply(lambda x: int(x.replace(')', '')))
display(category)

  category[['Category', 'Count']] = category['Category'].str.split('(', 1, expand=True)


Unnamed: 0,url_ref,Category,Count
0,artificial-intelligence,AI,505
1,art,Art,99
2,banking,Banking,645
3,big-data,Big Data,412
4,business-services,Business,1340
5,charity,Charity,156
6,communication,Communication,451
7,cryptocurrency,Cryptocurrency,2958
8,defi,DeFi,489
9,education,Education,217


In [5]:
# apply category in search query and get ICO list
#
# For every category: open the category page in Chrome, click "Show more" until the
# full list is loaded, parse every ICO entry and checkpoint the cumulative list to csv.

LIST_COLUMNS = ['Category', 'url', 'NViews', 'VerifiedEmailDummy', 'IsSTODummy', 'IsIEODummy',
                'Status', 'StartDate', 'EndDate']
# single source of truth for the output file: this is the name read back by the clean-up cell
RAW_LIST_PATH = os.path.join(RESULTS_FOLDER, '01a_ICOmarks_ico_list_raw.csv')
# xpath of the "Show more" button
show_more_path = '/html/body/section/div[2]/div[2]/div[2]/a'

category_frames = []                      # one DataFrame per category, concatenated on demand
cat_list = pd.DataFrame(columns=LIST_COLUMNS + ['ListDownloadedOn'])
start = timer()
download_date = datetime.datetime.now().strftime("%d/%m/%Y")
for index, row in category.iterrows():

    url_categ = row['url_ref']
    expected_count = row['Count']
    categ = row['Category']

    print('\n- Downloading: ' + categ + '  ('+ str(expected_count) + ' expected) ' + str(index + 1) + ' / ' + str(len(category)))

    # scroll down till "Show more" button disappear
    print('   - Scrolling down...', end ='')

    driver = get_chromedriver(chromedriver_path = CHROMEDRIVER_PATH)
    try:
        driver.get(urljoin(CATEGORY_PAGE, url_categ))

        try:
            while driver.find_element("xpath", show_more_path).is_displayed():

                driver.execute_script("arguments[0].scrollIntoView(true);", driver.find_element("xpath", show_more_path))
                driver.find_element("xpath", show_more_path).click()
                time.sleep(3)      # give the page time to load the next batch
            print('OK')
        except Exception:
            # best effort: the button lookup/click failed, go on with what has been loaded so far
            print('SKIPPED')

        # get html
        print('   - Downloading html...', end='')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print('OK')
    finally:
        # always release the browser, even when loading fails; quit() ends the whole
        # WebDriver session (assumes get_chromedriver returns a Selenium WebDriver)
        driver.quit()

    # extract information from web list
    print('   - Parsing info...', end='')
    tag = soup.find_all('div', class_="icoListContent", recursive=True)
    tag_list = [t for t in tag[0] if 'div class="newItems"' not in str(t)]
    # if show more, html structure changes: extra entries are nested in "newItems" containers
    for x in soup.find_all('div', class_="newItems", recursive=True):
        tag_list.extend([y for y in x])

    records = []
    for t in tag_list:

        if 'START' not in str(t):
            continue

        # NOTE(review): the fields below are not reset per entry, so an entry lacking one of
        # the sections silently keeps the value of the previous entry (same as before) - confirm
        for x in convert(t)['div']:
            section = x['@class'][0]

            if section == 'icoListItem__info':
                sup = x['a'][0]['sup']
                sup_classes = [y['@class'][0] for y in sup]
                n_views = [y['#text'] for y in sup if y['@class'][0] == "sup_views"][0]
                is_sto = int("sup_is_sto" in sup_classes)
                is_ieo = int("sup_is_ieo" in sup_classes)
                ver_email = int("sup_email_confirmed" in sup_classes)
                url = x['a'][0]['@href']
            elif section == 'icoListItem__raised':
                status = x['#text']
            elif section == 'icoListItem__start':
                start_date = x['navigablestring'][0]
            elif section == 'icoListItem__end':
                end_date = x['navigablestring'][0]

        records.append({
            'Category': categ,
            'url': urljoin(MAIN_PAGE, url),
            'NViews': int(n_views.replace(' Views', '').replace(',', '')),
            'VerifiedEmailDummy': ver_email,
            'IsSTODummy': is_sto,
            'IsIEODummy': is_ieo,
            'Status': status.replace('STATUS ', ''),
            'StartDate': start_date,
            'EndDate': end_date
        })

    # build the category frame once (DataFrame.append was removed in pandas 2.0)
    temp_list = pd.DataFrame(records, columns=LIST_COLUMNS)
    temp_list['ListDownloadedOn'] = download_date
    if not temp_list.empty:
        category_frames.append(temp_list)
    if category_frames:
        cat_list = pd.concat(category_frames, ignore_index=True)

    # save results (cumulative checkpoint after every category)
    cat_list.to_csv(RAW_LIST_PATH, index=False, sep=';')

    if temp_list.shape[0] != expected_count:
        print('   ####### warning, expected number of elements (' + str(expected_count) + ') mismatch. Found ' + str(temp_list.shape[0]))
    else:
        print('OK')

print('\n\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))
print('\nData saved in ', RAW_LIST_PATH)


- Downloading: AI   (505 expected) 1 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Art   (99 expected) 2 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Banking   (645 expected) 3 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Big Data   (412 expected) 4 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Business   (1341 expected) 5 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Charity   (156 expected) 6 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Communication   (451 expected) 7 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downloading: Cryptocurrency   (2959 expected) 8 / 33
   - Scrolling down...OK
   - Downloading html...OK
   - Parsing info...OK

- Downlo

### Check downloaded list and remove duplicates

In [12]:
# Clean the raw ICO list: drop duplicates, collapse urls listed more than once, build the
# category dummies, evaluate the ICO duration and save one row per url.
cat_list=pd.read_csv(os.path.join(RESULTS_FOLDER,'01a_ICOmarks_ico_list_raw.csv'), sep=';')

cat_list=cat_list.drop_duplicates()

# adjust dates
cat_list['StartDate']=cat_list['StartDate'].map(adjust_date)
cat_list['EndDate']=cat_list['EndDate'].map(adjust_date)


def _extreme_date(dates, pick):
    """Return the earliest (pick='min') or latest (pick='max') date of `dates`, ignoring 'TBA'.

    The result is formatted as '%d %b %Y'. When no date can be parsed (e.g. all values
    are 'TBA') the first unique raw value is returned unchanged.
    """
    try:
        parsed=pd.to_datetime(dates.loc[lambda x : x != 'TBA'])
        return getattr(parsed, pick)().strftime('%d %b %Y')
    except Exception:
        return dates.unique()[0]


# find url with multiple entries (due to IEO/STO) and keep all categories and minimum start date and max end date
# (boolean mask on the counts: independent of how value_counts() names its output in each pandas version)
url_counts=cat_list[['url', 'NViews']].drop_duplicates()['url'].value_counts()
multiple_url=url_counts[url_counts > 1].index
if len(multiple_url) > 0:

    print('\n-- Url with multiple entries found. Keeping single information only')
    merged_frames=[]
    for t_url in multiple_url:
        t_df=cat_list[cat_list['url']==t_url].copy()

        # default: most frequent status; on conflict 'Ended' wins and 'Active' wins over 'Ended'
        status=t_df['Status'].value_counts().index[0]
        if t_df['Status'].nunique() > 1:
            u_val=t_df['Status'].unique()
            if 'Ended' in u_val:
                status='Ended'
            if 'Active' in u_val:
                status='Active'
            print(f"    - {t_url}: Multiple status found: {u_val}. Keeping '{status}'")

        start_date=_extreme_date(t_df['StartDate'], 'min')
        end_date=_extreme_date(t_df['EndDate'], 'max')

        # one row per category of the url; every other field is shared (scalars are broadcast)
        merged_frames.append(pd.DataFrame({'Category': t_df['Category'].unique(),
                                           'url': t_url,
                                           'NViews': t_df['NViews'].max(),
                                           'VerifiedEmailDummy': t_df['VerifiedEmailDummy'].max(),
                                           'IsSTODummy': t_df['IsSTODummy'].max(),
                                           'IsIEODummy': t_df['IsIEODummy'].max(),
                                           'Status': status,
                                           'StartDate': start_date,
                                           'EndDate': end_date,
                                           'ListDownloadedOn': t_df['ListDownloadedOn'].values[0]}))

    # single concat: avoids seeding with an empty object-dtype frame, which degraded numeric dtypes
    cat_list=cat_list[~cat_list['url'].isin(multiple_url)]
    cat_list=pd.concat([cat_list] + merged_frames)

# get dummy for category
cat_list['Category']=cat_list['Category'].str.replace(' ', '')
cat_dummy=pd.concat([cat_list['url'], pd.get_dummies(cat_list['Category'], drop_first=False, prefix='Category', prefix_sep='')], axis=1)
cat_dummy=cat_dummy.groupby('url').sum()
cat_dummy.columns=cat_dummy.columns+'Dummy'
if cat_dummy.max().max() != 1:
    print('\n ### "Category" dummy variable has value greater than 1')
cat_dummy=cat_dummy.reset_index()

# evaluate duration
# manual overrides of single dates, applied in order: (ICO slug, column, value)
date_fixes=[
    ('kaizen-coin', 'StartDate', '18 Aug 2017'),
    ('curveblock', 'EndDate', '31-mar-19'),
    ('0chain', 'EndDate', '19-feb-18'),
    ('hunibit', 'StartDate', '21 apr 2019'),
    ('hunibit', 'EndDate', '03 may 2019'),
    ('clearaid', 'EndDate', '01 may 2019'),
    ('ultrashares', 'StartDate', '23 apr 2018'),
    ('ultrashares', 'EndDate', '30 jun 2018'),
    ('dentix', 'StartDate', '01-mar-18'),
    ('eos', 'EndDate', '04 jun 2018'),
    ('mundus', 'StartDate', '31 aug 2017'),
    ('mundus', 'EndDate', '30 oct 2017'),
    ('vr-park', 'StartDate', '24 aug 2019'),
    ('vr-park', 'EndDate', '17 apr 2020'),
    ('goldminecoin', 'EndDate', '15 mar 2018'),
    ('horsechain', 'EndDate', '14 Jul 2019'),
]
for fix_slug, fix_col, fix_value in date_fixes:
    cat_list.loc[cat_list['url'] == 'https://icomarks.com/ico/' + fix_slug, fix_col]=fix_value
cat_list['LogDurationDays']=cat_list.apply(eval_duration, axis=1)
# place the duration right after "EndDate"
move_col = cat_list.pop('LogDurationDays')
cat_list.insert(cat_list.columns.get_loc("EndDate")+1, 'LogDurationDays', move_col)
# negative values are logged as errors
check_error=cat_list[cat_list['LogDurationDays'] < 0][['url', 'Status', 'StartDate', 'EndDate']].drop_duplicates()
if len(check_error):
    print(f'\n-- {len(check_error)} rows with error in "LogDurationDays":')
    display(check_error['Status'].value_counts().to_frame())
    check_error.to_csv(os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted_DurationError.csv'), index=False, sep=';')
    print('Log saved in ', os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted_DurationError.csv'))


# create final dataset: one row per url, categories moved to the dummy columns
cat_list=cat_list.drop(columns='Category').drop_duplicates()
cat_list=cat_list.merge(cat_dummy, on='url', how='left')
move_col = cat_list.pop('ListDownloadedOn')
cat_list.insert(cat_list.columns.get_loc("url")+1, 'ListDownloadedOn', move_col)

if cat_list['url'].nunique() != cat_list.shape[0]:
    print('\n ##### Unique urls do not match number of rows')

print('\n-- Total ICOs found:', cat_list['url'].nunique())
display(cat_list['Status'].value_counts().to_frame())

# save csv
cat_list.to_csv(os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted.csv'), index=False, sep=';')
print('\nData saved in ', os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted.csv'))


-- Url with multiple entries found. Keeping single information only
    - https://icomarks.com/ico/ins: Multiple status found: ['Trading' 'Ended']. Keeping 'Ended'
    - https://icomarks.com/ico/unifox: Multiple status found: ['Pre-Sale Ended' 'Ended']. Keeping 'Ended'

-- 7 rows with error in "LogDurationDays":


Unnamed: 0,Status
Upcoming,7


Log saved in  .\Results\01b_ICOmarks_ico_list_adjusted_DurationError.csv

-- Total ICOs found: 8279


Unnamed: 0,Status
Ended,5034
Upcoming,1890
Trading,726
Pre-Sale Ended,399
Active,164
Pre-Sale,66



Data saved in  .\Results\01b_ICOmarks_ico_list_adjusted.csv


## Scrape information from each ICO URL

In [4]:
# Settings for the per-ICO scraping loop
URL_ROOT = 'https://icomarks.com/ico/'    # will be removed from url to save pickle in ICOMARKS_FOLDER
RELOAD_PKL = True
SKIP_MISSING = False     # if True skip attempt to scrape missing pickles

# make sure the per-ICO cache folder is available
if not os.path.exists(ICOMARKS_FOLDER):
    os.makedirs(ICOMARKS_FOLDER)

# adjusted ICO list: provides the urls to be scraped
adjusted_list_path = os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted.csv')
cat_list = pd.read_csv(adjusted_list_path, sep=';')

In [5]:
# Scrape every ICO page of cat_list, or reload its cached json when RELOAD_PKL is set and
# the file exists. Urls that cannot be scraped are kept as rows with ScrapeStatus='ERROR'.
scraped_rows=[]      # one single-row DataFrame per url, concatenated once after the loop
for index, row in cat_list.iterrows():

    print(f'- Scraping: {str(index + 1)} / {len(cat_list)}   last interaction: {datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")}', end='\r')

    url=row['url']
    save_path=os.path.join(ICOMARKS_FOLDER, url.replace(URL_ROOT, '')+'.json').replace('|', '')
    is_cached=os.path.exists(save_path)

    if RELOAD_PKL and is_cached:
        add_row=pd.read_json(save_path, orient='table')
        # re-format nested dataframe from json schema
        add_row['InfoBlock']=[pd.DataFrame(add_row['InfoBlock'][0])]
        if 'TeamBlock' in add_row.columns:
            add_row['TeamBlock']=[pd.DataFrame(add_row['TeamBlock'][0])]
        if 'SocialBlock' in add_row.columns:
            social_block=add_row['SocialBlock'][0][0]
            social_df=pd.DataFrame(social_block['stats'])
            series_dict={k: pd.DataFrame(v) for k, v in social_block['timeseries'].items()}
            add_row['SocialBlock']=[[{'stats': social_df, 'timeseries': series_dict}]]
        if 'MarketPriceSeries' in add_row.columns:
            add_row['MarketPriceSeries']=[pd.DataFrame(add_row['MarketPriceSeries'][0])]

    elif SKIP_MISSING and not is_cached:
        # no cached file and scraping disabled: flag the url and move on
        add_row=pd.DataFrame({'url': url, 'ScrapeStatus': 'ERROR'}, index=[0])

    else:
        try:
            start = timer()
            add_row=scrape_info_icomarks(url=url, chromedriver_path=CHROMEDRIVER_PATH, skip_social=False, skip_price=False)
            add_row.insert(1, 'ScrapeStatus', 'OK')
            add_row['PklPath']=save_path
            add_row['TotTimeSec']=datetime.timedelta(seconds=round(timer()-start)).total_seconds()
            add_row.to_json(save_path, orient='table')
        except Exception as exc:
            # keep going with the other urls, but do not hide why this one failed
            print(f'\n   ### Scraping failed for {url}: {type(exc).__name__}: {exc}')
            add_row=pd.DataFrame({'url': url, 'ScrapeStatus': 'ERROR'}, index=[0])

    scraped_rows.append(add_row)

# single concat instead of growing the frame row by row (quadratic over thousands of urls)
if scraped_rows:
    scrape_df=pd.concat(scraped_rows, ignore_index=True)
else:
    scrape_df=pd.DataFrame(columns=['url', 'ScrapeStatus'])
display(scrape_df['ScrapeStatus'].value_counts().to_frame())

# 'TotTimeSec' only exists when at least one url was scraped or reloaded successfully
total_sec=scrape_df['TotTimeSec'].sum() if 'TotTimeSec' in scrape_df.columns else 0
print('\n\nTotal elapsed time:', str(datetime.timedelta(seconds=round(total_sec))))

# save
pkl_path=os.path.join(CHECKPOINT_FOLDER, 'scrape_df_raw.pkl')
joblib.dump(scrape_df, pkl_path, compress=('lzma', 3))
print(f'\nData saved in {pkl_path}')

- Scraping: 8279 / 8279   last interaction: 08/02/2023 23:09:02

Unnamed: 0,ScrapeStatus
OK,8279


- Social Media time series status:


Unnamed: 0,SocialSeriesStatus
DOWNLOADED,6398
SOCIAL_TAB_MISSING,1654
DOWNLOAD_NOT_AVAILABLE,227


- Market Price time series status:


Unnamed: 0,MarketPriceSeriesStatus
DOWNLOAD_NOT_AVAILABLE,7555
DOWNLOADED,724




Total elapsed time: 1 day, 9:00:33

Data saved in .\Checkpoints\scrape_df_raw.pkl


## Format scraped information and save final dataset

In [7]:
# Turn the raw scraped checkpoint into the final dataset: extract the nested information,
# format the columns, merge with the category list and save csv / pickle / summary stats.
pkl_path=os.path.join(CHECKPOINT_FOLDER, 'scrape_df_raw.pkl')
scrape_df=joblib.load(pkl_path)

start=timer()

#### extract from raw data
print('----######   Extracting information from raw scraped data   ######----\n')
format_df=extract_scaping_icomarks(scrape_df).reset_index(drop=True)

pkl_path=os.path.join(CHECKPOINT_FOLDER, 'scrape_df_extracted.pkl')
joblib.dump(format_df, pkl_path, compress=('lzma', 3))
print(f'\n   - Data saved in {pkl_path}')


#### format nested column and merge with cat_list dataset
cat_list=pd.read_csv(os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_adjusted.csv'), sep=';')
print('\n\n\n----######   Formatting columns   ######----')
if len(cat_list) != len(format_df):
    raise ValueError('\n\n ########### Error: "cat_list" and "format_df" must have same rows')
format_df_rows=format_df.shape[0]
format_df=format_columns(format_df, cat_list=cat_list, format_df_rows=format_df_rows, results_folder=RESULTS_FOLDER)
## todo
# - platform
# - TokenAvailForSale
# - TokenTotSupply
# - AcceptedCurr ?
print('\n\n\n----######   Merging with category dataset   ######----')
cat_list_rows=cat_list.shape[0]
final_df=cat_list.copy().merge(format_df, on='url', how='left')
# strip the referral tag from the website url; this must run after the merge, because
# "final_df" does not exist before it (it used to raise NameError on a fresh kernel)
final_df['WebsiteUrl']=final_df['WebsiteUrl'].str.replace('?utm_source=icomarks', '', regex=False)
display(final_df['Status'].value_counts().to_frame())
print(f'   - Total rows: {len(final_df)}')
if final_df.shape[0] != cat_list_rows:
    print('########## "final_df" expected rows do not match')
# save file
save_path=os.path.join(RESULTS_FOLDER, '01d_ICOmarks_ico_list_scraped_formatted.csv')
save_path_pkl=os.path.join(CHECKPOINT_FOLDER, 'formatted_df.pkl')
final_df.to_csv(save_path, index=False, sep=';')
final_df.to_pickle(save_path_pkl, protocol=-1)
print(f'\n   - Data saved in {save_path}')
print(f'   - Pickle saved in {save_path_pkl}')
# save stats (message printed only once the file has actually been written)
save_path=os.path.join(RESULTS_FOLDER, '01d_ICOmarks_ico_list_scraped_formatted_Stats.csv')
summary_stats(final_df).to_csv(save_path, index=False, sep=';')
print(f'   - Stats saved in {save_path}')

print('\n\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

----######   Extracting information from raw scraped data   ######----

   - Processing 8279 / 8279
   - Data saved in .\Checkpoints\scrape_df_extracted.pkl



----######   Formatting columns   ######----

** Formatting "FundRaised"

** Formatting "Country"
- Mapped countries with low accuracy:


Unnamed: 0,Country,Country_adj,country,accuracy
98,Grand Cayman,Grand Cayman,Cayman Islands,73
231,Singapura,Singapura,Singapore,78
256,Singarope,Singarope,Singapore,78
160,Malte,Malte,Malta,80
167,Nederland,Nederland,Netherlands,80
222,Melta,Melta,Malta,80
85,Brasil,Brasil,Brazil,83
159,Сanada,Сanada,Canada,83
259,Tunis,Tunis,Tunisia,83
174,St Vincent,St Vincent,Saint Vincent and the Grenadines,86



** Formatting "SocialMedia"
- Counts for SocialMedia dummy:


Unnamed: 0,val,count
10,Twitter,7677
9,Telegram,6915
2,Facebook,6030
6,Medium,4619
12,Youtube,4001
0,Bitcointalk,3836
7,Reddit,3588
3,Github,3214
5,Linkedin,1169
4,Instagram,1041



** Formatting "ICOPrice", "IEOPrice", "STOPrice"
- Errors when parsing "ICOPrice", "IEOPrice", "STOPrice":


                                                                    6398
currency range error missing label-missing currency unit numeric      13
missing currency unit numeric                                         10
multiple token or currency index                                       3
currency error float-missing currency unit numeric                     2
multiple token or currency index -missing currency unit numeric        1
Name: error, dtype: int64

- Error log saved in .\Results\01c_ICOmarks_formatted_price_error_log.csv
- 17 rows removed because currency FX rate not available


QTUM       1
GOFGOLD    1
WAN        1
CENTS      1
USA        1
CAD        1
BSCX       1
FTM        1
ET         1
TTC        1
TH         1
RMB        1
BTCM       1
XEM        1
JPY        1
GA         1
ADA        1
Name: currency_lab, dtype: int64

- Taking closest available FX rate for 7 rows. Currency: ['VET' 'KRW' 'USDT']
- Price available for 6381 entries
- Price log saved in .\Results\01c_ICOmarks_formatted_price_log.csv

** Formatting "PreSalePrice"
- Errors when parsing "PreSalePrice":


                                                                    1697
currency range error missing label-missing currency unit numeric       8
missing currency unit numeric                                          4
multiple token or currency index                                       1
Name: error, dtype: int64

- Error log saved in .\Results\01c_ICOmarks_formatted_PreSaleprice_error_log.csv
- 3 rows removed because currency FX rate not available
- Taking closest available FX rate for 1 rows. Currency: ['USDT']
- Price available for 1694 entries
- Price log saved in .\Results\01c_ICOmarks_formatted_PreSaleprice_log.csv

** Formatting "FundHardCap" and "FundSoftCap"
- Errors when parsing "FundHardCap" and "FundSoftCap":


                                                                      7769
missing currency unit numeric                                           26
currency range error missing label-missing currency unit numeric        23
currency range error multiple labels-missing currency unit numeric       1
multiple token or currency index                                         1
Name: error, dtype: int64

- Error log saved in .\Results\01c_ICOmarks_formatted_HardSoftCap_error_log.csv
- 25 rows in "only_token_df" (with Hard/Soft Cap in tokens) skipped because of missing "PriceUSD"
- 475 rows remaing in "only_token_df" (with Hard/Soft Cap in tokens)
- 23 rows in "only_currency_df" (with Hard/Soft Cap in currency) skipped because FX rate not available


MILLION          4
WEEDO            2
PLN              2
MATRIX           2
DCO              1
BCC              1
MATC             1
GC               1
USO              1
OFTOTALSUPPLY    1
DTC              1
US               1
M                1
TOKENS           1
QTUM             1
IDAP             1
PXS              1
Name: currency_lab, dtype: int64

- Taking closest available FX rate for 1 rows. Currency: ['KRW']
- 7242 rows remaing in "only_currency_df" (with Hard/Soft Cap in currency)
- All 4 rows in "both_df" (with Hard/Soft Cap in currency AND token) skipped because of mismatch

- Hard/Soft Cap available for 7717 entries
- Price log saved in .\Results\01c_ICOmarks_formatted_HardSoftCap_log.csv



----######   Merging with category dataset   ######----


Unnamed: 0,Status
Ended,5034
Upcoming,1890
Trading,726
Pre-Sale Ended,399
Active,164
Pre-Sale,66


   - Total rows: 8279

   - Data saved in .\Results\01d_ICOmarks_ico_list_scraped_formatted.csv
   - Pickle saved in .\Checkpoints\formatted_df.pkl
   - Stats saved in .\Results\01d_ICOmarks_ico_list_scraped_formatted_Stats.csv


Total elapsed time: 0:12:10


## Extract Market Price and Social Users list

In [41]:
# Reload the raw scraping checkpoint (written with joblib.dump by the scraping cell)
pkl_path = os.path.join(CHECKPOINT_FOLDER, 'scrape_df_raw.pkl')
scrape_df = joblib.load(pkl_path)

In [69]:
# Flatten the nested time series of scrape_df into two long tables:
#   price_df  - market price series, one block of rows per url
#   social_df - social users series, one block of rows per url and social network
SAVE_SOCIAL_CSV=False     # the csv export of the social series is switched off; set True to write it

price_frames=[]
social_frames=[]
for index, row in scrape_df.iterrows():

    # price
    if row['MarketPriceSeriesStatus'] == 'DOWNLOADED':
        tt=row['MarketPriceSeries'].copy()
        tt.insert(0, 'url', row['url'])
        price_frames.append(tt)

    # social users
    if row['SocialSeriesStatus'] == 'DOWNLOADED':
        for social_name, social_data in row['SocialBlock'][0]['timeseries'].items():
            df=social_data.copy()
            df.insert(0, 'Social', social_name)
            df.insert(0, 'url', row['url'])
            social_frames.append(df)

# single concat per table instead of growing the frames inside the loop (quadratic);
# the empty fallbacks keep the summary below working when nothing was downloaded
price_df=pd.concat(price_frames) if price_frames else pd.DataFrame(columns=['url'])
social_df=pd.concat(social_frames) if social_frames else pd.DataFrame(columns=['url', 'Social'])

tot_url=price_df['url'].nunique()
print(f'- Downloaded Market Price: {tot_url}  Total rows: {len(price_df)}')
save_path=os.path.join(RESULTS_FOLDER, '01e_ICOmarks_market_price_series.csv')
save_path_pkl=os.path.join(CHECKPOINT_FOLDER, 'ICOmarks_market_price_series.pkl')
price_df.to_csv(save_path, index=False, sep=';')
price_df.to_pickle(save_path_pkl, protocol=-1)
print(f'   - Data saved in {save_path}')
print(f'   - Pickle saved in {save_path_pkl}')


tot_url=social_df['url'].nunique()
print(f'\n- Downloaded Social Users: {tot_url}  Total rows: {len(social_df)}')
save_path=os.path.join(RESULTS_FOLDER, '01e_ICOmarks_social_users_series.csv')
save_path_pkl=os.path.join(CHECKPOINT_FOLDER, 'ICOmarks_social_users_series.pkl')
if SAVE_SOCIAL_CSV:
    social_df.to_csv(save_path, index=False, sep=';')
    print(f'   - Data saved in {save_path}')
else:
    # do not claim that a file was saved when the export is disabled
    print(f'   - CSV export disabled ({save_path} not written)')
social_df.to_pickle(save_path_pkl, protocol=-1)
print(f'   - Pickle saved in {save_path_pkl}')

- Downloaded Market Price: 724  Total rows: 335746
   - Data saved in .\Results\01e_ICOmarks_market_price_series.csv
   - Pickle saved in .\Checkpoints\ICOmarks_market_price_series.pkl

- Downloaded Social Users: 6398  Total rows: 7276469
   - Data saved in .\Results\01e_ICOmarks_social_users_series.csv
   - Pickle saved in .\Checkpoints\ICOmarks_social_users_series.pkl
