In [2]:
import os
import requests
import pandas as pd
import numpy as np
from timeit import default_timer as timer
import datetime
from unidecode import unidecode
import pickle
import pdfkit
import joblib
import func_timeout
# from lingua import Language, LanguageDetectorBuilder
from utils import get_chromedriver, download_from_drive_dropbox, pdf_to_text

# Location of the Tesseract OCR executable; passed as `tesseract_path` to pdf_to_text().
# NOTE(review): machine-specific absolute Windows path - adjust before running elsewhere.
TESSERACT_PATH=r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [3]:
# Path to the chromedriver binary; passed as `chromedriver_path` to
# download_from_drive_dropbox() when recovering Google Drive / Dropbox whitepapers.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
CHROMEDRIVER_PATH = r"C:\Users\Alessandro Bitetto\Downloads\UniPV\ICOs\WebDriver\chromedriver"

In [4]:
# set folders
WHITEPAPER_FOLDER = '.\\Whitepaper'                       # root of everything downloaded
ORIGINAL_FOLDER = '.\\Whitepaper\\Original'               # files exactly as downloaded
CONVERTED_FOLDER = '.\\Whitepaper\\Converted_to_txt'      # .txt / .pkl produced by pdf_to_text()
RECOVERED_FOLDER = '.\\Whitepaper\\Recovered'             # files obtained by the recovery steps
CHECKPOINT_FOLDER = '.\\Checkpoints'                      # intermediate csv / pickle checkpoints
RESULTS_FOLDER = '.\\Results'                             # final csv outputs

# Create every folder the notebook writes into. CHECKPOINT_FOLDER and RESULTS_FOLDER are
# included because the later cells save into them and would fail on a fresh checkout.
# exist_ok=True makes the cell safe to re-run.
for folder in (WHITEPAPER_FOLDER, ORIGINAL_FOLDER, CONVERTED_FOLDER, RECOVERED_FOLDER,
               CHECKPOINT_FOLDER, RESULTS_FOLDER):
    os.makedirs(folder, exist_ok=True)

In [4]:
# Domain suffix that replaces the old ".com" in URL_ROOT when file names are built
# (the listing site changed domain after the first run).
NEW_DOMAIN = ".ai"

In [21]:
# Read the scraped ICO list and keep only the rows that have a whitepaper url
scraped_csv_path = os.path.join(RESULTS_FOLDER, '01g_ICOmarks_ico_list_scraped_formatted.csv')
scraped_df = pd.read_csv(scraped_csv_path, sep = ";")

download_df = scraped_df[['url', 'WhitepaperUrl']].dropna().reset_index(drop = True)

# bookkeeping columns filled in by the download loop
download_df = download_df.assign(Status="", Error="", Path_Original="")

download_df

Unnamed: 0,url,WhitepaperUrl,Status,Error,Path_Original
0,https://icomarks.ai/ico/tokelite,https://www.docdroid.net/xvdu93N/tokelite-whit...,,,
1,https://icomarks.ai/ico/ucbi-banking,https://ucbibanking.com/UCBI_Whitepaper_EN.pdf,,,
2,https://icomarks.ai/ico/capitual,https://capitual.io/whitepaper.pdf,,,
3,https://icomarks.ai/ico/btccredit,https://btccredit.io/pdf/BTCCredit_Whitepaper_...,,,
4,https://icomarks.ai/ico/ledgerium,https://whitepaper.ledgerium.io/,,,
...,...,...,...,...,...
7741,https://icomarks.ai/ico/deepcloud,https://www.dropbox.com/s/st6ldsd5shfdz3y/Deep...,,,
7742,https://icomarks.ai/ico/tap4-menu,http://tap4.menu/wp-content/uploads/2019/02/Wh...,,,
7743,https://icomarks.ai/ico/ultrablock,https://ultrablock.io/pdf/whitepaper.pdf,,,
7744,https://icomarks.ai/ico/avinoc,https://static.avinoc.cloud/downloads/AVINOC_W...,,,


## Download pdf

In [4]:
HEADERS = {"User-Agent": "Chrome/51.0.2704.103"}
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
RELOAD_PDF=True      # True: reuse a file already on disk instead of downloading it again
REQUEST_TIMEOUT = 60 # seconds; without a timeout a single stalled server blocks the whole run

# make sure the checkpoint can be written even on a fresh checkout
os.makedirs(CHECKPOINT_FOLDER, exist_ok=True)
checkpoint_path = os.path.join(CHECKPOINT_FOLDER, 'whitepaper_download.csv')
url_prefix = URL_ROOT.replace(".com", NEW_DOMAIN)    # loop-invariant, computed once

start = timer()
OK_count = 0
for index, row in download_df.iterrows():

    url = row['WhitepaperUrl']
    # file name = ICO slug (listing url without its prefix), '|' stripped
    file_name = os.path.join(ORIGINAL_FOLDER, row['url'].replace(url_prefix, '').replace('|', '') + '.pdf')

    print('Downloading ' + str(index + 1) + ' / ' + str(len(download_df)) + '  - Total OK: ' + str(OK_count), end = '\r')

    if RELOAD_PDF and os.path.exists(file_name):
        # already downloaded in a previous run
        download_df.loc[index, 'Status'] = 'OK'
        download_df.loc[index, 'Path_Original'] = os.path.join(os.getcwd(), file_name)
        OK_count += 1

    else:
        try:
            # connect
            response = requests.get(url, headers = HEADERS, timeout = REQUEST_TIMEOUT)

            # check response and save the payload (not validated: it may be html rather than a pdf)
            if response.status_code == 200:
                with open(file_name, "wb") as f:
                    f.write(response.content)
                download_df.loc[index, 'Status'] = 'OK'
                download_df.loc[index, 'Path_Original'] = os.path.join(os.getcwd(), file_name)
                OK_count += 1
            else:
                download_df.loc[index, 'Status'] = response.status_code

        except Exception as e:
            # best effort: record the failure and move on to the next url
            download_df.loc[index, 'Status'] = 'ERROR'
            download_df.loc[index, 'Error'] = str(e)    # store the message, not the exception object

    # save checkpoint
    download_df.to_csv(checkpoint_path, index=False, sep=';')

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

# save results
download_df.to_csv(os.path.join(RESULTS_FOLDER ,'00a_whitepaper_download_original.csv'), index=False, sep=';')

Downloading 7513 / 7513  - Total OK: 3046
Total elapsed time: 11:06:25


## Check downloaded files and convert to txt

In [17]:
# Reload the download report and show how many urls ended in each status
download_report_path = os.path.join(RESULTS_FOLDER, '00a_whitepaper_download_original.csv')
download_df = pd.read_csv(download_report_path, sep = ";")
print(download_df['Status'].value_counts())

# Keep only the successful downloads and add the columns filled in by the parsing loop
downloaded_ok = download_df['Status'] == 'OK'
final_df = download_df[downloaded_ok].reset_index(drop = True)
final_df = final_df.assign(
    Path_Recovered="",
    Path_txt="",
    Status_txt="",
    Length_txt=0,
    Length_txt_clean=0,
    Content_txt="",
    Metadata="",
)

OK       3048
ERROR    2710
404      1227
403       165
410        91
522        86
520        73
500        23
521        17
530        12
502        11
503        10
504         8
526         7
523         6
301         5
525         4
401         2
524         2
406         2
400         2
402         1
423         1
Name: Status, dtype: int64


In [18]:
# Convert every downloaded whitepaper to plain text, caching each parse in a .pkl file
URL_ROOT='https://icomarks.com/ico/'    # prefix stripped from the listing url to build file names
RELOAD_PKL=True                         # True: reuse an existing .pkl instead of parsing again

tot_time=0
for index, row in final_df.iterrows():

    file_path = row['Path_Original']
    # ICO slug shared by the .txt and .pkl outputs
    short_name = row['url'].replace(URL_ROOT.replace(".com", NEW_DOMAIN), '').replace('|', '')
    file_path_txt = os.path.join(os.getcwd(), CONVERTED_FOLDER, short_name + '.txt')
    file_path_pkl = os.path.join(os.getcwd(), CONVERTED_FOLDER, short_name + '.pkl')

    print(f'Parsing {index + 1} / {len(final_df)}', end = '\r')

    if RELOAD_PKL and os.path.exists(file_path_pkl):
        # cached result from a previous run
        cached = joblib.load(file_path_pkl)
        txt = cached['txt']
        meta = cached['meta']
        status = cached['status']
        eval_time = cached['eval_time']

    else:
        # pdf to txt
        start = timer()
        txt, meta, parsed_pdf = pdf_to_text(file_path=file_path, tesseract_path=TESSERACT_PATH, lang='eng')
        status = parsed_pdf['status']
        eval_time = datetime.timedelta(seconds=round(timer()-start)).total_seconds()

        # save .pkl
        joblib.dump({'txt': txt, 'meta': meta, 'status': status, 'eval_time': eval_time}, file_path_pkl)

    # txt is None when nothing could be extracted (e.g. pdf saved as image)
    if txt is None:
        length_txt = 0
        length_txt_clean = 0
    else:
        length_txt = len(txt)
        length_txt_clean = len(txt.replace('\n','').replace(' ', ''))    # whitespace removed, used to spot empty files

    final_df.loc[index, 'Status_txt'] = status
    final_df.loc[index, 'Length_txt'] = length_txt
    final_df.loc[index, 'Length_txt_clean'] = length_txt_clean
    final_df.at[index, 'Content_txt'] = meta['Content-Type']
    final_df.loc[index, 'Metadata'] = [meta]
    tot_time += eval_time

    # save txt
    if txt is not None:
        final_df.loc[index, 'Path_txt'] = file_path_txt
        with open(file_path_txt, 'w') as f:
            f.write(unidecode(txt))

    # save checkpoint every 300 rows and on the last row
    is_last_row = index == (len(final_df) - 1)
    if index % 300 == 0 or is_last_row:
        final_df.drop(columns='Metadata').to_csv(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.csv'), index=False, sep=';')
        with open(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle'), 'wb') as handle:
            pickle.dump(final_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(tot_time))))

# save results
final_df.drop(columns='Metadata').to_csv(os.path.join(RESULTS_FOLDER, '00b_whitepaper_parsing.csv'), index=False, sep=';')

Parsing 3048 / 3048
Total elapsed time: 1:53:39


In [19]:
# Summary statistics of the parsing step, read back from the pickled checkpoint
parsing_pickle_path = os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle')
with open(parsing_pickle_path, 'rb') as handle:
    final_df = pickle.load(handle)

# status
print('-- Status --\n')
print(final_df['Status'].value_counts())

# text length
print('\n\n-- Text length --\n')
length_percentiles = [.5, .10, .15, .2, .25, .3, .4, .75, .95]
print(final_df[['Length_txt', 'Length_txt_clean']].describe(percentiles = length_percentiles))

# pdf content: 'Content_txt' holds either a single string or a non-string value (see the count of "multiple values" below)
cnt = final_df['Content_txt'].values
cnt_single = []
cnt_multiple = []
for x in cnt:
    if type(x) == str:
        cnt_single.append(x)
    else:
        cnt_multiple.append(x)

print('\n\n-- Pdf content --')
print('\n- Single values in "Content_txt" (' + str(len(cnt_single)) + '):')
single_counts = pd.DataFrame({'Content_txt': cnt_single})['Content_txt'].value_counts()
print(single_counts)
print('\n- Multiple values in "Content_txt" (' + str(len(cnt_multiple)) +') first column is the number of multiple values in list:')
multiple_sizes = [len(x) for x in cnt_multiple]
print(pd.DataFrame({'aa': multiple_sizes})['aa'].value_counts())

-- Status --

OK    3048
Name: Status, dtype: int64


-- Text length --

          Length_txt  Length_txt_clean
count    3048.000000       3048.000000
mean    22257.032152      17168.373360
std     35151.340586      27752.542292
min         0.000000          0.000000
10%        71.000000         33.000000
15%        83.000000         44.000000
20%       133.600000         69.000000
25%       320.750000        106.000000
30%       508.200000        286.200000
40%      2323.600000       1264.800000
50%      6021.000000       3638.500000
75%     34135.500000      26832.750000
95%     88365.600000      71564.550000
max    485969.000000     411334.000000


-- Pdf content --

- Single values in "Content_txt" (2940):
text/html; charset=UTF-8                       1184
application/pdf                                1160
text/html; charset=ISO-8859-1                   387
application/xhtml+xml; charset=UTF-8            133
application/xhtml+xml; charset=ISO-8859-1        36
application/octet-st

## Try to recover empty pdf

In [20]:
# Rows whose clean text length is at or below this value are sent through the recovery loop
LENGTH_TXT_CLEAN_THRSH = 4000

# Start from the parsing checkpoint and add the columns filled in by the recovery loop
parsing_pickle_path = os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle')
with open(parsing_pickle_path, 'rb') as handle:
    final_df_recover = pickle.load(handle)

recover_defaults = {
    'Recover_action': "SKIP",
    'Recover_Length_txt': -1,
    'Recover_Length_txt_clean': -1,
    'Recover_Path_txt': "",
}
for column_name, default_value in recover_defaults.items():
    final_df_recover[column_name] = default_value

In [21]:
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
PATH_TO_WHHTMLTOPDF = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe' # define path to wkhtmltopdf.exe, see https://python-bloggers.com/2022/06/convert-html-to-pdf-using-python/
TEMP_DOWNLOAD_FOLDER = "C:\\Users\\Alessandro Bitetto\\Downloads\\UniPV\\ICOs\\temp_download"
ID_TO_SKIP = []                 # short names left untouched (Recover_action stays "SKIP")
RELOAD_PDF = True               # True: reuse a recovered pdf already on disk
RELOAD_PKL = True               # True: reuse a cached parse (.pkl) already on disk
HTML_TO_PDF_TIMEOUT = 80        # seconds allowed for wkhtmltopdf to render one page

# Point pdfkit configuration to wkhtmltopdf.exe
config = pdfkit.configuration(wkhtmltopdf=PATH_TO_WHHTMLTOPDF)


def _render_url_to_pdf(url, output_path, max_wait):
    """Render the page at `url` to `output_path` with wkhtmltopdf.

    Returns 'ok' when the conversion finished within `max_wait` seconds and
    'timeout' otherwise. Any other error raised by pdfkit propagates to the caller.
    """
    try:
        func_timeout.func_timeout(max_wait, pdfkit.from_url, args=(url,),
                                  kwargs={'output_path': output_path, 'configuration': config})
        return 'ok'
    except func_timeout.FunctionTimedOut:
        return 'timeout'


def _parse_pdf_cached(file_path_pdf, file_path_pkl):
    """Return (txt, meta) for a pdf, using the .pkl cache when RELOAD_PKL allows it.

    Both values are always taken from the same source, so the metadata can never
    belong to a different document than the text.
    """
    if RELOAD_PKL and os.path.exists(file_path_pkl):
        cached = joblib.load(file_path_pkl)
        return cached['txt'], cached['meta']

    start_t = timer()
    txt, meta, parsed_pdf = pdf_to_text(file_path=file_path_pdf, tesseract_path=TESSERACT_PATH, lang='eng')
    eval_time = datetime.timedelta(seconds=round(timer()-start_t)).total_seconds()
    joblib.dump({'txt': txt, 'meta': meta, 'status': parsed_pdf['status'], 'eval_time': eval_time}, file_path_pkl)
    return txt, meta


def _store_recovered(index, action, txt, meta, len_txt_clean, file_path_pdf, file_path_txt):
    """Record a successful recovery in final_df_recover and write the recovered text to disk."""
    final_df_recover.loc[index, 'Recover_action'] = action
    final_df_recover.loc[index, 'Path_Recovered'] = file_path_pdf
    final_df_recover.loc[index, 'Recover_Path_txt'] = file_path_txt
    final_df_recover.loc[index, 'Recover_Length_txt'] = len(txt)
    final_df_recover.loc[index, 'Recover_Length_txt_clean'] = len_txt_clean
    final_df_recover.at[index, 'Content_txt'] = meta['Content-Type']
    final_df_recover.loc[index, 'Metadata'] = [meta]

    # save txt
    with open(file_path_txt, 'w') as f:
        f.write(unidecode(txt))


start = timer()
for index, row in final_df_recover.iterrows():

    short_name=row['url'].replace(URL_ROOT.replace(".com", NEW_DOMAIN), '').replace('|', '')

    print('Recovering ' + str(index + 1) + ' / ' + str(len(final_df_recover)) + ' (' + short_name + ')' +' '*30, end = '\r')

    file_path_pdf = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.pdf')
    file_path_txt = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.txt')
    file_path_pkl = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.pkl')
    original_len_txt_clean = row['Length_txt_clean']
    url = row['WhitepaperUrl']

    # only documents whose parsed text is at or below the threshold are worth recovering
    if row['Length_txt_clean'] <= LENGTH_TXT_CLEAN_THRSH:

        if short_name in ID_TO_SKIP:
            pass    # Recover_action keeps its default "SKIP"

        ##### download pdf from google.drive or dropbox

        elif any(x in url for x in ['google', 'goo.gl', 'dropbox']):

            source = 'drive' if any(x in url for x in ['google', 'goo.gl']) else 'dropbox'

            if RELOAD_PDF and os.path.exists(file_path_pdf):
                out = "ok"
            else:
                out = download_from_drive_dropbox(chromedriver_path=CHROMEDRIVER_PATH, download_url=url,
                                                  download_folder=TEMP_DOWNLOAD_FOLDER, temp_folder=TEMP_DOWNLOAD_FOLDER,
                                                  pdf_name=short_name + '.pdf',
                                                  move_folder=os.path.join(os.getcwd(), RECOVERED_FOLDER), source=source)

            if out == "ok":
                txt, meta = _parse_pdf_cached(file_path_pdf, file_path_pkl)
                len_txt_clean = len(txt.replace('\n','').replace(' ', '')) if txt is not None else 0

                # keep the recovered version only when it holds more text than the original
                if len_txt_clean > original_len_txt_clean:
                    _store_recovered(index, 'DOWNLOAD ' + source.upper() + ' - OK', txt, meta,
                                     len_txt_clean, file_path_pdf, file_path_txt)

            else:
                # `out` carries the failure reason returned by download_from_drive_dropbox()
                final_df_recover.loc[index, 'Recover_action'] = 'DOWNLOAD ' + source.upper() + ' - ' + out


        ##### try to download pdf from html page   e.g. https://www.quasa.io/white-paper

        else:

            try:
                # Convert Webpage to PDF
                if not RELOAD_PDF or not os.path.exists(file_path_pdf):
                    out = _render_url_to_pdf(url, file_path_pdf, HTML_TO_PDF_TIMEOUT)
                    if out == 'timeout':
                        with open(file_path_pdf, 'w') as outfile:     # save empty pdf so speed up when RELOAD_PDF=True
                            outfile.write("")

                txt, meta = _parse_pdf_cached(file_path_pdf, file_path_pkl)
                len_txt_clean = len(txt.replace('\n','').replace(' ', '')) if txt is not None else 0

                # keep the recovered version only when it holds more text than the original
                if len_txt_clean > original_len_txt_clean:
                    _store_recovered(index, "CONVERT FROM HTML", txt, meta,
                                     len_txt_clean, file_path_pdf, file_path_txt)

            except Exception:
                # best effort: any conversion/parsing error marks the row as failed.
                # `Exception` (not a bare except) so that Ctrl-C still stops the loop.
                final_df_recover.loc[index, 'Recover_action'] = "CONVERT FROM HTML - FAILED"

    else:
        final_df_recover.loc[index, 'Recover_action'] = "KEEP ORIGINAL"

    # save checkpoint every 300 rows and on the last row
    if index % 300 == 0 or index == (len(final_df_recover) - 1):
        final_df_recover.drop(columns='Metadata').to_csv(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_recover.csv'), index=False, sep=';')
        with open(os.path.join(CHECKPOINT_FOLDER,'whitepaper_recover.pickle'), 'wb') as handle:
            pickle.dump(final_df_recover, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

# save results
final_df_recover.drop(columns='Metadata').to_csv(os.path.join(RESULTS_FOLDER, '00c_whitepaper_recover.csv'), index=False, sep=';')

Recovering 3048 / 3048 (santiment)                                                                      
Total elapsed time: 0:50:42


In [5]:
# Reload the recovery checkpoint and preview its first rows
recover_pickle_path = os.path.join(CHECKPOINT_FOLDER,'whitepaper_recover.pickle')
with open(recover_pickle_path, 'rb') as handle:
    final_df_recover = pickle.load(handle)
final_df_recover.head(4)

Unnamed: 0,url,WhitepaperUrl,Status,Error,Path_Original,Path_Recovered,Path_txt,Status_txt,Length_txt,Length_txt_clean,Content_txt,Metadata,Recover_action,Recover_Length_txt,Recover_Length_txt_clean,Recover_Path_txt,Final_Path_txt,Final_Length_txt,Final_Length_txt_clean
0,https://icomarks.ai/ico/the-mill-of-blood,https://millofblood.com/white-paper.php,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,23152.0,8655.0,text/html; charset=UTF-8,"{'Content-Encoding': 'UTF-8', 'Content-Languag...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,23152,8655
1,https://icomarks.ai/ico/xenchain,https://cryptototem.com/wp-ico/img/files/65tpX...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,316.0,88.0,text/html; charset=ISO-8859-1,"{'Content-Encoding': 'ISO-8859-1', 'Content-La...",FROM CRYPTOTOTEM,65927,53136,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,65928,53136
2,https://icomarks.ai/ico/moonlight,https://cryptototem.com/wp-ico/img/files/ANkwp...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,64263.0,53667.0,application/pdf,"{'Author': 'moonlight.io', 'Content-Type': 'ap...",FROM CRYPTOTOTEM,64638,53980,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,64639,53980
3,https://icomarks.ai/ico/shipnext,https://cryptototem.com/wp-ico/img/files/0DAC0...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,3841.0,3157.0,text/html; charset=UTF-8,"{'Content-Encoding': 'UTF-8', 'Content-Languag...",FROM CRYPTOTOTEM,82113,68250,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,82114,68250


In [8]:
# Reload the recovery checkpoint and count how each whitepaper was obtained
recover_pickle_path = os.path.join(CHECKPOINT_FOLDER,'whitepaper_recover.pickle')
with open(recover_pickle_path, 'rb') as handle:
    final_df_recover = pickle.load(handle)

# literal (non-regex) replacement of every ".com" in the url with the new domain
updated_urls = final_df_recover['url'].str.replace(".com", NEW_DOMAIN, regex=False)
final_df_recover['url'] = updated_urls

print('-- Recover actions --\n')
recover_action_counts = final_df_recover['Recover_action'].value_counts()
print(recover_action_counts)

-- Recover actions --

KEEP ORIGINAL                            1096
FROM CRYPTOTOTEM                          823
SKIP                                      440
CONVERT FROM HTML                         322
FROM SECOND DOWNLOAD                      164
DOWNLOAD DRIVE - OK                       158
DOWNLOAD DRIVE - page not available       106
CONVERT FROM HTML - FAILED                 80
DOWNLOAD DROPBOX - OK                      11
DOWNLOAD DROPBOX - page not available       9
DOWNLOAD DRIVE - out of time                2
DOWNLOAD DROPBOX - out of time              1
Name: Recover_action, dtype: int64


In [9]:
# set final path for txt files
# Recover actions whose recovered txt replaces the original one
RECOVER_TO_KEEP = ['CONVERT FROM HTML', 'DOWNLOAD DRIVE - OK', 'DOWNLOAD DROPBOX - OK', 'FROM CRYPTOTOTEM', 'FROM SECOND DOWNLOAD']
LENGTH_TXT_CLEAN_THRSH = 4000     # clean-character count used in the summary printed below

keep_original = final_df_recover['Recover_action'] == 'KEEP ORIGINAL'
use_recovered = final_df_recover['Recover_action'].isin(RECOVER_TO_KEEP)

# Path: original txt for "KEEP ORIGINAL", recovered txt for the kept recover actions, '' otherwise
final_df_recover['Final_Path_txt'] = np.where(keep_original, final_df_recover['Path_txt'], '')
final_df_recover['Final_Path_txt'] = np.where(use_recovered, final_df_recover['Recover_Path_txt'], final_df_recover['Final_Path_txt'])

# Lengths: -1 marks "no usable txt". A numeric sentinel is used for BOTH length columns:
# mixing numbers with '' in np.where silently turns the whole column into strings.
final_df_recover['Final_Length_txt'] = np.where(keep_original, final_df_recover['Length_txt'], -1)
final_df_recover['Final_Length_txt'] = np.where(use_recovered, final_df_recover['Recover_Length_txt'], final_df_recover['Final_Length_txt'])
final_df_recover['Final_Length_txt_clean'] = np.where(keep_original, final_df_recover['Length_txt_clean'], -1)
final_df_recover['Final_Length_txt_clean'] = np.where(use_recovered, final_df_recover['Recover_Length_txt_clean'], final_df_recover['Final_Length_txt_clean'])


final_df_recover.drop(columns='Metadata').to_csv(os.path.join(RESULTS_FOLDER, '00d_whitepaper_final.csv'), index=False, sep=';')

print('Total available whitepapers:', sum(final_df_recover['Final_Path_txt'] != ''))
print(f'   - with more than {LENGTH_TXT_CLEAN_THRSH} clean characters:', sum(final_df_recover['Final_Length_txt_clean'] >= LENGTH_TXT_CLEAN_THRSH))
with open(os.path.join(CHECKPOINT_FOLDER,'whitepaper_final.pickle'), 'wb') as handle:
    pickle.dump(final_df_recover, handle, protocol=pickle.HIGHEST_PROTOCOL)

Total available whitepapers: 2574
   - with more than 4000 clean characters: 2202


In [7]:
import pycld2 as cld2
import regex


# Load the final txt of every whitepaper that is long enough for language detection
LENGTH_TXT_CLEAN_THRSH = 2000     # documents with fewer clean characters than this are left out

# NOTE(review): the Final_* columns read below are created by the "set final path" cell, which
# saves them to 'whitepaper_final.pickle' - confirm that this checkpoint also contains them.
recover_pickle_path = os.path.join(CHECKPOINT_FOLDER, 'whitepaper_recover.pickle')
with open(recover_pickle_path, 'rb') as handle:
    final_df = pickle.load(handle)

text_columns = ['url', 'Final_Length_txt', 'Final_Length_txt_clean', 'Final_Path_txt']
long_enough = final_df['Final_Length_txt_clean'] >= LENGTH_TXT_CLEAN_THRSH
df_text = final_df[long_enough][text_columns]
has_txt_file = df_text['Final_Path_txt'] != ''
df_text = df_text[has_txt_file].reset_index(drop = True)
df_text['text'] = ''

# read each txt file into the 'text' column
start = timer()
for index, row in df_text.iterrows():

    print(f'Reading {index + 1} / {len(df_text)}', end = '\r')

    with open(row['Final_Path_txt']) as f:
        df_text.loc[index, 'text'] = f.read()
elapsed = datetime.timedelta(seconds=round(timer()-start))
print('\nTotal elapsed time:', str(elapsed))

Reading 2301 / 2301
Total elapsed time: 0:00:53


In [28]:
# Debug leftover: displays `row`, a loop variable left behind by an earlier cell
# (presumably the txt-reading loop - its value depends on which cell ran last).
row

url                          https://icomarks.ai/ico/hanuman-universe-token
Final_Length_txt                                                       3384
Final_Length_txt_clean                                                 2701
Final_Path_txt            C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...
text                      \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
Name: 2300, dtype: object

In [39]:
# Unicode categories Cc (control characters) and Cs (surrogates)
RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")
def remove_bad_chars(text):
    """Return `text` with every run of control / surrogate characters removed.

    Applied to the text before it is handed to cld2.detect().
    """
    return RE_BAD_CHARS.sub("", text)

# One single-row frame per document, concatenated once after the loop
# (growing df_report with pd.concat inside the loop is quadratic).
report_rows = []
tot_time=0
for index, row in df_text.iterrows():

    print(f'Parsing {index+1} / {len(df_text)}   last interaction: {datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")}', end = '\r')

    text = row['text']
    start=timer()
    isReliable, _, details = cld2.detect(remove_bad_chars(text), returnVectors=False)
    add_row=pd.DataFrame({'url': row['url'],
                          'reliable': isReliable,
                          'total_page_lang': len([x for x in details if x[0] != 'Unknown'])}, index=[index])
    # default when cld2 reports no known language in first position
    add_row['Lang1']=['undetected']
    # for each detected language: name (v[0]) plus v[2] / v[3], stored as "accuracy" and "score"
    for i, v in enumerate(details):
        if v[0] != 'Unknown':
            add_row['Lang'+str(i+1)]=[v[0]]
            add_row['Lang'+str(i+1)+'_accuracy']=[v[2]]
            add_row['Lang'+str(i+1)+'_score']=[v[3]]

    # keep fractional seconds: rounding every document to whole seconds made the total read 0:00:00
    eval_time=timer()-start
    add_row['eval_time']=eval_time
    # share of rows per first language, most frequent first. add_row holds a single row, so this
    # is always (Lang1, 1.0). value_counts(normalize=True) does not depend on the name pandas
    # gives to the counts column, so it works on both pandas 1.x and 2.x.
    lang_preval=add_row['Lang1'].value_counts(normalize=True)
    add_row.insert(add_row.columns.get_loc("total_page_lang")+1, 'Lang1_preval', lang_preval.index[0])
    add_row.insert(add_row.columns.get_loc("Lang1_preval")+1, 'Lang1_preval_perc', lang_preval.iloc[0])
    add_row['Final_Path_txt']=row['Final_Path_txt']
    tot_time+=eval_time
    report_rows.append(add_row)

df_report=pd.concat(report_rows)

# move 'Final_Path_txt' to the last column
move_col = df_report.pop('Final_Path_txt')
df_report.insert(df_report.shape[1], 'Final_Path_txt', move_col)

print('\n\nTotal elapsed time:', str(datetime.timedelta(seconds=round(tot_time))))

df_report.to_csv(os.path.join(RESULTS_FOLDER, '00e_whitepaper_language_report.csv'), index=False, sep=';')
print('\nData saved in', os.path.join(RESULTS_FOLDER, '00e_whitepaper_language_report.csv'))

Parsing 2301 / 2301   last interaction: 18/03/2024 18:43:38

Total elapsed time: 0:00:00

Data saved in .\Results\00e_whitepaper_language_report.csv


In [40]:
# Reload the language report and summarise it per prevalent language
language_report_path = os.path.join(RESULTS_FOLDER, '00e_whitepaper_language_report.csv')
df_report = pd.read_csv(language_report_path, sep=';')

# one row per (document, prevalent language), keeping the lowest prevalence found
prevalence_by_document = (
    df_report[['url', 'Lang1_preval', 'Lang1_preval_perc']]
    .drop_duplicates()
    .groupby(['url', 'Lang1_preval'], as_index=False)
    .min()
    .drop(columns='url')
)

# per language: number of documents, minimum and 5th-percentile prevalence (in %)
language_summary = (
    prevalence_by_document
    .groupby('Lang1_preval', as_index=False)
    .agg(Tot_Documents = ('Lang1_preval', lambda x: len(x)),
         Min_Prevalence = ('Lang1_preval_perc', lambda x: round(min(x)*100, 2)),
         Min5_Prevalence = ('Lang1_preval_perc', lambda x: round(np.quantile(x, 0.05)*100, 2)))
    .rename(columns={'Lang1_preval': 'Prevalent Language', 'Tot_Documents': 'Number of documents',
                     'Min_Prevalence': 'Minimum Prevalence %', 'Min5_Prevalence': '5th Percentile Prevalence %'})
    .sort_values(by='Number of documents', ascending=False)
)
display(language_summary)

# how many detections cld2 flagged as reliable, and which first language the unreliable ones got
reliable_counts = df_report['reliable'].value_counts().to_frame()
display(reliable_counts)
print('Details of "reliable"==False:')
unreliable_rows = df_report[df_report['reliable']==False]
display(unreliable_rows['Lang1'].value_counts().to_frame())

Unnamed: 0,Prevalent Language,Number of documents,Minimum Prevalence %,5th Percentile Prevalence %
3,ENGLISH,2215,100.0,100.0
17,undetected,33,100.0,100.0
1,DANISH,11,100.0,100.0
7,INDONESIAN,7,100.0,100.0
15,SPANISH,6,100.0,100.0
6,GERMAN,5,100.0,100.0
12,PORTUGUESE,4,100.0,100.0
8,IRISH,4,100.0,100.0
13,SERBIAN,3,100.0,100.0
9,ITALIAN,3,100.0,100.0


Unnamed: 0,reliable
True,2238
False,63


Details of "reliable"==False:


Unnamed: 0,Lang1
undetected,33
ENGLISH,14
IRISH,4
SERBIAN,3
DUTCH,2
GERMAN,2
INDONESIAN,1
DANISH,1
ZULU,1
SPANISH,1


In [22]:
# Scratch cell: repeats statements from the body of the language-detection loop on the
# `add_row` / `eval_time` variables that loop leaves behind, then displays the row.
# NOTE(review): it cannot run on a fresh kernel before that loop, and the insert() calls
# will presumably fail if `add_row` already has the 'Lang1_preval' columns - confirm or delete.
add_row['eval_time']=eval_time
# share of rows per first language, sorted so the most frequent comes first
lang_preval=(add_row['Lang1'].value_counts().to_frame() / add_row.shape[0]).sort_values(by='Lang1', ascending=False)
# place the prevalent language and its share right after 'total_page_lang'
add_row.insert(add_row.columns.get_loc("total_page_lang")+1, 'Lang1_preval', lang_preval.index[0])
add_row.insert(add_row.columns.get_loc("Lang1_preval")+1, 'Lang1_preval_perc', lang_preval['Lang1'][0])
add_row

Unnamed: 0,url,reliable,total_page_lang,Lang1_preval,Lang1_preval_perc,Lang1,Lang1_accuracy,Lang1_score,Lang2,Lang2_accuracy,Lang2_score,eval_time
2300,https://icomarks.ai/ico/hanuman-universe-token,True,2,ENGLISH,1.0,ENGLISH,94,908.0,LATIN,5,915.0,0.0


## Recover missing pdf with Cryptototem url

In [5]:
# Final whitepaper table produced by the "set final path" cell
final_pickle_path = os.path.join(CHECKPOINT_FOLDER, 'whitepaper_final.pickle')
with open(final_pickle_path, 'rb') as handle:
    final_df = pickle.load(handle)

# Alternative whitepaper urls ('pdfWP'), one row per distinct (url, pdfWP) pair
robin = pd.read_csv('IcoDataRobin.csv', sep = ";")[['url', 'pdfWP']].drop_duplicates()

# Current results, suffixed with '_original' so they can be compared with the new download
original_columns = ['url', 'Final_Path_txt', 'Final_Length_txt' ,'Final_Length_txt_clean']
dd = (
    final_df[original_columns]
    .add_suffix('_original')
    .rename(columns={'url_original': 'url'})
    .merge(robin, on='url', how='left')
)

# keep only the ICOs that have an alternative url
dd = dd[dd['pdfWP'].notna()]
# dd.to_csv('dd.csv', index=False, sep=';')
# dd = dd[dd['Final_Length_txt_clean_original'] <= 10000]
dd = dd.reset_index(drop=True)

In [12]:
# Download the alternative ('pdfWP') whitepaper of each ICO, parse it, and keep it only when
# it yields more clean text than the version currently in use
HEADERS = {"User-Agent": "Chrome/51.0.2704.103"}
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
RELOAD_PKL=True         # NOTE: not used in this cell - every file is downloaded and parsed again
REQUEST_TIMEOUT = 60    # seconds; without a timeout a single stalled server blocks the whole run

OK_count = 0
tot_time = 0
dd['Status'] = ''
dd['Error'] = ''
dd['Final_Path_txt'] = ''
dd['Final_Length_txt'] = ''
dd['Final_Length_txt_clean'] = ''
for index, row in dd.iterrows():

    url = row['pdfWP']
    # ICO slug shared by the .pdf, .txt and .pkl outputs
    short_name = row['url'].replace(URL_ROOT.replace(".com", NEW_DOMAIN), '').replace('|', '')
    file_name = os.path.join(RECOVERED_FOLDER, short_name + '.pdf')
    file_path_txt = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.txt')
    file_path_pkl = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.pkl')

    print('Downloading ' + str(index + 1) + ' / ' + str(len(dd)) + '  - Total OK: ' + str(OK_count), end = '\r')

    # Set only when THIS row's file was downloaded and written. Testing `response` after the
    # try block would read the previous row's response whenever the request itself raised.
    downloaded = False
    try:
        # connect
        response = requests.get(url, headers = HEADERS, timeout = REQUEST_TIMEOUT)

        # check response and save pdf
        if response.status_code == 200:
            with open(file_name, "wb") as f:
                f.write(response.content)
            dd.loc[index, 'Status'] = 'OK'
            OK_count += 1
            downloaded = True
        else:
            dd.loc[index, 'Status'] = response.status_code

    except Exception as e:
        # best effort: record the failure and move on to the next url
        dd.loc[index, 'Status'] = 'ERROR'
        dd.loc[index, 'Error'] = str(e)    # store the message, not the exception object

    if not downloaded:
        continue

    try:
        # pdf to txt
        start = timer()
        txt, meta, parsed_pdf = pdf_to_text(file_path=file_name, tesseract_path=TESSERACT_PATH, lang='eng')
        status=parsed_pdf['status']
        eval_time=datetime.timedelta(seconds=round(timer()-start)).total_seconds()
        tot_time+=eval_time

        clean_text = len(txt.replace('\n','').replace(' ', '')) if txt is not None else 0    # clear whitespace to filter empty files
        dd.loc[index, 'Final_Length_txt'] = len(txt) if txt is not None else 0    # case of pdf saved as image
        dd.loc[index, 'Final_Length_txt_clean'] = clean_text

        # keep the new file only when it holds more text than the one currently in use
        if clean_text > row['Final_Length_txt_clean_original']:

            # save .pkl
            joblib.dump({'txt': txt, 'meta': meta, 'status': status, 'eval_time': eval_time}, file_path_pkl)

            # save txt
            if txt is not None:
                dd.loc[index, 'Final_Path_txt'] = file_path_txt
                with open(file_path_txt, 'w') as f:
                    f.write(unidecode(txt))

        else:
            dd.loc[index, 'Status'] = 'SKIPPED'

    except Exception as e:
        dd.loc[index, 'Status'] = 'ERROR'
        dd.loc[index, 'Error'] = str(e)

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(tot_time))))

dd.to_csv(os.path.join(RESULTS_FOLDER, '00zz_whitepaper_recover_Robin.csv'), index=False, sep=';')

Downloading 981 / 981  - Total OK: 977
Total elapsed time: 0:58:33


In [13]:
# Breakdown of the 'Status' values logged in the recovery csv
recover_log_path = os.path.join(RESULTS_FOLDER, '00zz_whitepaper_recover_Robin.csv')
dd = pd.read_csv(recover_log_path, sep = ";")
status_counts = dd['Status'].value_counts()
display(status_counts.to_frame())

Unnamed: 0,Status
OK,821
SKIPPED,155
ERROR,3
404,1
520,1


In [22]:
# Fold the successfully recovered whitepapers (Status == 'OK' in the recovery csv)
# into the recover checkpoint, tagging them as 'FROM CRYPTOTOTEM'.
checkpoint_path = os.path.join(CHECKPOINT_FOLDER,'whitepaper_recover.pickle')
with open(checkpoint_path, 'rb') as handle:
    final_df_recover = pickle.load(handle)

recover_log = pd.read_csv(os.path.join(RESULTS_FOLDER, '00zz_whitepaper_recover_Robin.csv'), sep = ";")
dd = recover_log[recover_log['Status'] == 'OK']

for index, row in final_df_recover.iterrows():

    # rows of the recovery log referring to the same ICO page
    ref_dd = dd[dd['url'] == row['url']]
    if ref_dd.empty:
        continue    # nothing recovered for this ICO: leave the checkpoint row untouched

    # when several rows match, the first one is used
    updates = {
        'Status': 'OK',
        'Recover_action': 'FROM CRYPTOTOTEM',
        'WhitepaperUrl': ref_dd['pdfWP'].iloc[0],
        'Recover_Length_txt': int(ref_dd['Final_Length_txt'].iloc[0]),
        'Recover_Length_txt_clean': int(ref_dd['Final_Length_txt_clean'].iloc[0]),
        'Recover_Path_txt': ref_dd['Final_Path_txt'].iloc[0],
    }
    for column, value in updates.items():
        final_df_recover.loc[index, column] = value

# overwrite the checkpoint with the updated table
with open(checkpoint_path, 'wb') as handle:
    pickle.dump(final_df_recover, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Recover whitepaper from second download

In [9]:
# ICO page urls listed in the "new only" csv
new_only_path = os.path.join(RESULTS_FOLDER, '01b_ICOmarks_ico_list_new_only.csv')
url_new = pd.read_csv(new_only_path, sep=';')['url']

# formatted scrape (alternative source: pd.read_pickle(os.path.join(CHECKPOINT_FOLDER, 'formatted_df.pkl')))
formatted_path = os.path.join(RESULTS_FOLDER, '01g_ICOmarks_ico_list_scraped_formatted.csv')
scraped_df = pd.read_csv(formatted_path, sep = ";")

# keep only the new ICOs that have a whitepaper link, with a fresh 0..n-1 index
keep_mask = scraped_df['url'].isin(url_new) & scraped_df['WhitepaperUrl'].notna()
dd = scraped_df.loc[keep_mask, ['url', 'WhitepaperUrl']].reset_index(drop=True)

In [11]:
# Parse pdf and convert to txt
# For every row of `dd`: fetch the whitepaper pdf (Drive/Dropbox links through download_from_drive_dropbox(),
# any other link through requests), convert it with pdf_to_text(), cache the parsed result as .pkl,
# write the .txt and log the outcome (Status / Error / text lengths / txt path) back into `dd`.
HEADERS = {"User-Agent": "Chrome/51.0.2704.103"}
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
RELOAD_PKL=True    # if True and the .pkl already exists, reload it instead of downloading/parsing again
TEMP_DOWNLOAD_FOLDER = "C:\\Users\\Alessandro Bitetto\\Downloads\\UniPV\\ICOs\\temp_download"
REQUEST_TIMEOUT = 60    # seconds to wait for the server (connect / between bytes); without it a stalled host blocks the loop forever

# the root to strip is the same for every row: compute it once
url_root = URL_ROOT.replace(".com", NEW_DOMAIN)

tot_time = 0
dd['Status'] = ''
dd['Error'] = ''
dd['Final_Path_txt'] = ''
dd['Final_Length_txt'] = ''
dd['Final_Length_txt_clean'] = ''
for index, row in dd.iterrows():

    url = row['WhitepaperUrl']
    # ICO name used for the .pdf / .txt / .pkl file names ('|' is stripped from the name)
    ico_name = row['url'].replace(url_root, '').replace('|', '')
    recovered_dir = os.path.join(os.getcwd(), RECOVERED_FOLDER)
    file_name = os.path.join(RECOVERED_FOLDER, ico_name + '.pdf')
    file_path_txt = os.path.join(recovered_dir, ico_name + '.txt')
    file_path_pkl = os.path.join(recovered_dir, ico_name + '.pkl')

    print('Downloading ' + str(index + 1) + ' / ' + str(len(dd)) + '      '+ ico_name + ' '*30, end = '\r')

    eval_time=0
    error=''
    txt=None
    if not RELOAD_PKL or not os.path.exists(file_path_pkl):

        try:
            # try to download from Dropbox or Drive
            out=''
            if any(x in url for x in ['google', 'goo.gl', 'dropbox']):

                source = 'drive' if any(x in url for x in ['google', 'goo.gl']) else 'dropbox'

                status = 'OK ALREADY DOWNLOADED'
                if not os.path.exists(file_name):

                    out = download_from_drive_dropbox(chromedriver_path=CHROMEDRIVER_PATH, download_url=url,
                                                      download_folder=TEMP_DOWNLOAD_FOLDER, temp_folder=TEMP_DOWNLOAD_FOLDER,
                                                      pdf_name=os.path.basename(file_name),
                                                      move_folder=recovered_dir, source=source)
                    # anything different from 'ok' is kept as the status message
                    if out != 'ok':
                        status = out
                    else:
                        status = 'DOWNLOAD ' + source.upper() + ' - OK'

            else:
                status = 'OK ALREADY DOWNLOADED'
                if not os.path.exists(file_name):
                    # connect (a timeout raises and is logged as 'ERROR' by the outer except)
                    response = requests.get(url, headers = HEADERS, timeout = REQUEST_TIMEOUT)

                    # check response and save pdf
                    if response.status_code == 200:
                        with open(file_name, "wb") as f:
                            f.write(response.content)
                        status = 'OK FROM REQUEST'
                    else:
                        status = str(response.status_code)

            # only statuses containing 'OK' have a pdf on disk to parse
            if 'OK' in status:

                try:
                    # pdf to txt
                    start = timer()
                    txt, meta, parsed_pdf = pdf_to_text(file_path=file_name, tesseract_path=TESSERACT_PATH, lang='eng')
                    status_pars=parsed_pdf['status']
                    # NOTE: the suffix is appended without separator on purpose: the resulting strings
                    # (e.g. 'OK FROM REQUESTOK PDF TO TXT') are matched literally by STATUS_TO_KEEP
                    # and are stored in the cached .pkl files
                    if status_pars == 200:
                        status += 'OK PDF TO TXT'
                    else:
                        status += str(status_pars)
                    eval_time=datetime.timedelta(seconds=round(timer()-start)).total_seconds()

                    # save .pkl
                    joblib.dump({'txt': txt, 'meta': meta, 'status': status, 'eval_time': eval_time}, file_path_pkl)

                except Exception as e:
                    status += ' - ERROR IN PDF TO TXT'
                    error = e

        except Exception as e:
            status = 'ERROR'
            error = e

    else:
        # reuse the cached parsing result
        rr=joblib.load(file_path_pkl)
        txt=rr['txt']
        status=rr['status']
        eval_time=rr['eval_time']


    dd.loc[index, 'Status'] = status
    dd.loc[index, 'Error'] = error
    clean_text = len(txt.replace('\n','').replace(' ', '')) if txt is not None else 0    # clear whitespace to filter empty files
    dd.loc[index, 'Final_Length_txt'] = len(txt) if txt is not None else 0    # case of pdf saved as image
    dd.loc[index, 'Final_Length_txt_clean'] = clean_text

    # save txt
    if txt is not None:
        dd.loc[index, 'Final_Path_txt'] = file_path_txt
        with open(file_path_txt, 'w') as f:
            f.write(unidecode(txt))

    tot_time+=eval_time

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(tot_time))))

dd.to_csv(os.path.join(RESULTS_FOLDER, '00zz_whitepaper_recover_second_download.csv'), index=False, sep=';')

display(dd['Status'].value_counts().to_frame())

Downloading 233 / 233      hanuman-universe-token                                                 
Total elapsed time: 0:34:21


Unnamed: 0,Status
OK FROM REQUESTOK PDF TO TXT,151
ERROR,33
404,19
OK ALREADY DOWNLOADEDOK PDF TO TXT,10
403,5
OK ALREADY DOWNLOADED - ERROR IN PDF TO TXT,4
522,3
DOWNLOAD DRIVE - OKOK PDF TO TXT,3
500,2
page not available,2


In [49]:
# Append the whitepapers recovered by the second download to the recover checkpoint.
# The statuses below are matched literally against the 'Status' column of the second-download csv
# (download outcome and parsing outcome are concatenated there without separator).
STATUS_TO_KEEP = ['OK FROM REQUESTOK PDF TO TXT', 'OK ALREADY DOWNLOADEDOK PDF TO TXT', 'DOWNLOAD DRIVE - OKOK PDF TO TXT']
SECOND_DOWNLOAD_ACTION = 'FROM SECOND DOWNLOAD'

checkpoint_path = os.path.join(CHECKPOINT_FOLDER,'whitepaper_recover.pickle')
with open(checkpoint_path, 'rb') as handle:
    final_df_recover = pickle.load(handle)

dd = pd.read_csv(os.path.join(RESULTS_FOLDER, '00zz_whitepaper_recover_second_download.csv'), sep = ";")
# .copy(): the assignments below must act on an independent frame, not on a slice of the csv frame
# (avoids SettingWithCopyWarning / writes that may not stick)
dd = dd[dd['Status'].isin(STATUS_TO_KEEP)].copy()
dd['Recover_action'] = SECOND_DOWNLOAD_ACTION
dd['Status'] = 'OK'
dd['Recover_Length_txt'] = dd['Final_Length_txt']
dd['Recover_Length_txt_clean'] = dd['Final_Length_txt_clean']
dd['Recover_Path_txt'] = dd['Final_Path_txt']

# This cell reads and rewrites the same checkpoint: drop the rows appended by a previous run
# so that re-running it does not duplicate the second-download rows.
if 'Recover_action' in final_df_recover.columns:
    final_df_recover = final_df_recover[final_df_recover['Recover_action'] != SECOND_DOWNLOAD_ACTION]

# NOTE(review): index labels of `dd` are kept as they are, so they may collide with labels
# already present in final_df_recover - confirm whether downstream code relies on a unique index
final_df_recover = pd.concat([final_df_recover, dd])

with open(checkpoint_path, 'wb') as handle:
    pickle.dump(final_df_recover, handle, protocol=pickle.HIGHEST_PROTOCOL)