In [1]:
import os
import requests
import pandas as pd
import numpy as np
from timeit import default_timer as timer
import datetime
from unidecode import unidecode
import pickle
import pdfkit
import joblib
import func_timeout
from utils import get_chromedriver, download_from_drive_dropbox, pdf_to_text

TESSERACT_PATH=r'C:\Program Files\Tesseract-OCR\tesseract.exe'   # used for tesseract OCR. See pdf_to_text()

In [2]:
# set folders
WHITEPAPER_FOLDER = '.\\Whitepaper'
ORIGINAL_FOLDER = '.\\Whitepaper\\Original'
CONVERTED_FOLDER = '.\\Whitepaper\\Converted_to_txt'
RECOVERED_FOLDER = '.\\Whitepaper\\Recovered'
CHECKPOINT_FOLDER = '.\\Checkpoints'
RESULTS_FOLDER = '.\\Results'

if not os.path.exists(WHITEPAPER_FOLDER):
    os.makedirs(WHITEPAPER_FOLDER)
if not os.path.exists(ORIGINAL_FOLDER):
    os.makedirs(ORIGINAL_FOLDER)
if not os.path.exists(CONVERTED_FOLDER):
    os.makedirs(CONVERTED_FOLDER)
if not os.path.exists(RECOVERED_FOLDER):
    os.makedirs(RECOVERED_FOLDER)

In [3]:
# load whitepaper url
download_df = pd.read_csv(os.path.join(RESULTS_FOLDER, '01c_ICOmarks_ico_list_scraped_formatted.csv'), sep = ";")

download_df = download_df[['url', 'WhitepaperUrl']].dropna().reset_index(drop = True)
download_df['Status'] = ""
download_df['Error'] = ""
download_df['Path_Original'] = ""

download_df

Unnamed: 0,url,WhitepaperUrl,Status,Error,Path_Original
0,https://icomarks.com/ico/synthetics-ai,https://drive.google.com/file/d/1K7TkqYgCtiarZ...,,,
1,https://icomarks.com/ico/777-bingo,https://777.bingo/paper/Whitepaper.EN.pdf,,,
2,https://icomarks.com/ico/sonic,https://img1.wsimg.com/blobby/go/634ec806-c74c...,,,
3,https://icomarks.com/ico/botchain,botchain.talla.com/whitepaper,,,
4,https://icomarks.com/ico/eclipse,https://eclipsetoken.io/wp-content/uploads/201...,,,
...,...,...,...,...,...
7508,https://icomarks.com/ico/vanhealthing,https://vanhealthing.com/vanhealthing_wp_en.pdf,,,
7509,https://icomarks.com/ico/consensus,https://consensus.ai/whitepaper.pdf,,,
7510,https://icomarks.com/ico/kahnchat,https://www.kahnchat.com/docs/KahnChat-Whitepa...,,,
7511,https://icomarks.com/ico/santiment,https://docs.google.com/document/d/1hHmJQWrPrO...,,,


## Download pdf

In [4]:
HEADERS = {"User-Agent": "Chrome/51.0.2704.103"}
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
RELOAD_PDF=True

start = timer()
OK_count = 0
for index, row in download_df.iterrows():
    
    url = row['WhitepaperUrl']
    file_name = os.path.join(ORIGINAL_FOLDER, row['url'].replace(URL_ROOT, '').replace('|', '') + '.pdf')

    print('Downloading ' + str(index + 1) + ' / ' + str(len(download_df)) + '  - Total OK: ' + str(OK_count), end = '\r')
    
    if not RELOAD_PDF or not os.path.exists(file_name):
    
        try:
            # connect
            response = requests.get(url, headers = HEADERS)

            # check response and save pdf
            if response.status_code == 200:
                with open(file_name, "wb") as f:
                    f.write(response.content)
                download_df.loc[index, 'Status'] = 'OK'
                download_df.loc[index, 'Path_Original'] = os.path.join(os.getcwd(), file_name)
                OK_count += 1
            else:
                download_df.loc[index, 'Status'] = response.status_code

        except Exception as e:
            download_df.loc[index, 'Status'] = 'ERROR'
            download_df.loc[index, 'Error'] = e
    
    else:
        download_df.loc[index, 'Status'] = 'OK'
        download_df.loc[index, 'Path_Original'] = os.path.join(os.getcwd(), file_name)
        OK_count += 1
    
    # save checkpoint
    download_df.to_csv(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_download.csv'), index=False, sep=';')
            
print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

# save results
download_df.to_csv(os.path.join(RESULTS_FOLDER ,'00a_whitepaper_download_original.csv'), index=False, sep=';')

Downloading 7513 / 7513  - Total OK: 3046
Total elapsed time: 11:06:25


## Check downloaded files and convert to txt

In [3]:
download_df = pd.read_csv(os.path.join(RESULTS_FOLDER, '00a_whitepaper_download_original.csv'), sep = ";")
print(download_df['Status'].value_counts())

final_df = download_df.copy()
final_df = final_df[final_df['Status'] == 'OK'].reset_index(drop = True)
final_df['Path_Recovered'] = ""
final_df['Path_txt'] = ""
final_df['Status_txt'] = ""
final_df['Length_txt'] = 0
final_df['Length_txt_clean'] = 0
final_df['Content_txt'] = ""
final_df['Metadata'] = ""

OK       3046
ERROR    2712
404      1227
403       165
410        91
522        86
520        73
500        23
521        17
530        12
502        11
503        10
504         8
526         7
523         6
301         5
525         4
401         2
524         2
406         2
400         2
402         1
423         1
Name: Status, dtype: int64


In [6]:
# Parse pdf and convert to txt
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
RELOAD_PKL=True

tot_time=0
for index, row in final_df.iterrows():
    
    file_path = row['Path_Original']
    file_path_txt = os.path.join(os.getcwd(), CONVERTED_FOLDER, row['url'].replace(URL_ROOT, '').replace('|', '') + '.txt')
    file_path_pkl = os.path.join(os.getcwd(), CONVERTED_FOLDER, row['url'].replace(URL_ROOT, '').replace('|', '') + '.pkl')
    
    print('Parsing ' + str(index + 1) + ' / ' + str(len(final_df)), end = '\r')
    
    if not RELOAD_PKL or not os.path.exists(file_path_pkl):
        
        # pdf to txt
        start = timer()
        txt, meta, parsed_pdf = pdf_to_text(file_path=file_path, tesseract_path=TESSERACT_PATH, lang='eng')
        status=parsed_pdf['status']
        eval_time=datetime.timedelta(seconds=round(timer()-start)).total_seconds()
        
        # save .pkl
        joblib.dump({'txt': txt, 'meta': meta, 'status': status, 'eval_time': eval_time}, file_path_pkl)
    
    else:
        rr=joblib.load(file_path_pkl)
        txt=rr['txt']
        meta=rr['meta']
        status=rr['status']
        eval_time=rr['eval_time'] 
        
    final_df.loc[index, 'Status_txt'] = status
    final_df.loc[index, 'Length_txt'] = len(txt) if txt is not None else 0    # case of pdf saved as image
    final_df.loc[index, 'Length_txt_clean'] = len(txt.replace('\n','').replace(' ', '')) if txt is not None else 0    # clear whitespace to filter empty files
    final_df.at[index, 'Content_txt'] = meta['Content-Type']
    final_df.loc[index, 'Metadata'] = [meta]
    tot_time+=eval_time
        
    # save txt
    if txt is not None:
        final_df.loc[index, 'Path_txt'] = file_path_txt
        with open(file_path_txt, 'w') as f:
            f.write(unidecode(txt))
    
    # save checkpoint
    if index % 300 == 0 or index == (len(final_df) - 1):
        final_df.drop(columns='Metadata').to_csv(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.csv'), index=False, sep=';')
        with open(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle'), 'wb') as handle:
                    pickle.dump(final_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(tot_time))))

# save results
final_df.drop(columns='Metadata').to_csv(os.path.join(RESULTS_FOLDER, '00b_whitepaper_parsing.csv'), index=False, sep=';')

Parsing 3046 / 3046
Total elapsed time: 1:53:33


In [9]:
# stats
with open(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle'), 'rb') as handle:
    final_df = pickle.load(handle)

# status
print('-- Status --\n')
print(final_df['Status'].value_counts())
# text length
print('\n\n-- Text length --\n')
print(final_df[['Length_txt', 'Length_txt_clean']].describe(percentiles = [.5, .10, .15, .2, .25, .3, .4, .75, .95]))
# pdf content
cnt = final_df['Content_txt'].values
cnt_single = [x for x in cnt if type(x) == str]
cnt_multiple = [x for x in cnt if type(x) != str]
print('\n\n-- Pdf content --')
print('\n- Single values in "Content_txt" (' + str(len(cnt_single)) + '):')
print(pd.DataFrame({'Content_txt': cnt_single})['Content_txt'].value_counts())
print('\n- Multiple values in "Content_txt" (' + str(len(cnt_multiple)) +') first column is the number of multiple values in list:')
print(pd.DataFrame({'aa': [len(x) for x in cnt_multiple]})['aa'].value_counts())

-- Status --

OK    3046
Name: Status, dtype: int64


-- Text length --

          Length_txt  Length_txt_clean
count    3046.000000       3046.000000
mean    22247.210112      17159.006894
std     35160.519442      27759.354711
min         0.000000          0.000000
10%        71.000000         33.000000
15%        82.750000         44.000000
20%       132.000000         69.000000
25%       320.250000        106.000000
30%       507.500000        286.000000
40%      2322.000000       1256.000000
50%      6009.500000       3623.500000
75%     34120.750000      26762.750000
95%     88420.000000      71595.250000
max    485969.000000     411334.000000


-- Pdf content --

- Single values in "Content_txt" (2938):
text/html; charset=UTF-8                       1185
application/pdf                                1157
text/html; charset=ISO-8859-1                   387
application/xhtml+xml; charset=UTF-8            133
application/xhtml+xml; charset=ISO-8859-1        36
application/octet-st

## Try to recover empty pdf

In [4]:
Length_txt_clean_thrsh = 4000     # threshold for maximum non-empty characters in parsed txt file


with open(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle'), 'rb') as handle:
    final_df_recover = pickle.load(handle)
final_df_recover['Recover_action'] = "SKIP"
final_df_recover['Recover_Length_txt'] = -1
final_df_recover['Recover_Length_txt_clean'] = -1
final_df_recover['Recover_Path_txt'] = ""

In [5]:
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
PATH_TO_WHHTMLTOPDF = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe' # define path to wkhtmltopdf.exe, see https://python-bloggers.com/2022/06/convert-html-to-pdf-using-python/
CHROMEDRIVER_PATH = r"C:\Users\Alessandro Bitetto\Downloads\UniPV\ICOs\WebDriver\chromedriver"
TEMP_DOWNLOAD_FOLDER = "C:\\Users\\Alessandro Bitetto\\Downloads\\UniPV\\ICOs\\temp_download"
ID_TO_SKIP = ['494', '1369', '2527', '2548', '4100', '6138']
RELOAD_PDF = True
RELOAD_PKL = True

# Point pdfkit configuration to wkhtmltopdf.exe
config = pdfkit.configuration(wkhtmltopdf=PATH_TO_WHHTMLTOPDF)
start = timer()
for index, row in final_df_recover.iterrows():

    short_name=row['url'].replace(URL_ROOT, '').replace('|', '')
    
    print('Recovering ' + str(index + 1) + ' / ' + str(len(final_df_recover)) + ' (' + short_name + ')' +' '*30, end = '\r')

    file_path_pdf = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.pdf')
    file_path_txt = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.txt')
    file_path_pkl = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.pkl')
    original_len_txt = row['Length_txt']
    original_len_txt_clean = row['Length_txt_clean']
    url = row['WhitepaperUrl']
    
    # check if txt length is below threshold
    if row['Length_txt_clean'] <= Length_txt_clean_thrsh:
        
        if short_name in ID_TO_SKIP:
            pass
        
        ##### download pdf from google.drive or dropbox

        elif any(x in url for x in ['google', 'goo.gl', 'dropbox']):

            source = 'drive' if any(x in url for x in ['google', 'goo.gl']) else 'dropbox'

            if not RELOAD_PDF or not os.path.exists(file_path_pdf):
            
                out = download_from_drive_dropbox(chromedriver_path=CHROMEDRIVER_PATH, download_url=url,
                                                  download_folder=TEMP_DOWNLOAD_FOLDER, temp_folder=TEMP_DOWNLOAD_FOLDER,
                                                  pdf_name=short_name + '.pdf',
                                                  move_folder=os.path.join(os.getcwd(), RECOVERED_FOLDER), source=source)
            else:
                out="ok"

            if out == "ok":

                # pdf to txt
                if not RELOAD_PKL or not os.path.exists(file_path_pkl):
        
                    # pdf to txt
                    start_t = timer()
                    txt, meta, parsed_pdf = pdf_to_text(file_path=file_path_pdf, tesseract_path=TESSERACT_PATH, lang='eng')
                    status=parsed_pdf['status']
                    eval_time=datetime.timedelta(seconds=round(timer()-start_t)).total_seconds()

                    # save .pkl
                    joblib.dump({'txt': txt, 'meta': meta, 'status': status, 'eval_time': eval_time}, file_path_pkl)

                else:
                    rr=joblib.load(file_path_pkl)
                    txt=rr['txt']
                    meta=rr['meta']

                len_txt_clean = len(txt.replace('\n','').replace(' ', '')) if txt is not None else 0 

                # update files
                if len_txt_clean > original_len_txt_clean:

                    final_df_recover.loc[index, 'Recover_action'] = 'DOWNLOAD ' + source.upper() + ' - OK'
                    final_df_recover.loc[index, 'Path_Recovered'] = file_path_pdf
                    final_df_recover.loc[index, 'Recover_Path_txt'] = file_path_txt
                    final_df_recover.loc[index, 'Recover_Length_txt'] = len(txt)
                    final_df_recover.loc[index, 'Recover_Length_txt_clean'] = len_txt_clean
                    final_df_recover.at[index, 'Content_txt'] = meta['Content-Type']
                    final_df_recover.loc[index, 'Metadata'] = [meta]

                    # save txt
                    with open(file_path_txt, 'w') as f:
                        f.write(unidecode(txt))

            else:
                final_df_recover.loc[index, 'Recover_action'] = 'DOWNLOAD ' + source.upper() + ' - ' + out
        
        
        ##### try to download pdf from html page   e.g. https://www.quasa.io/white-paper
        
        else:

            try:
                # Convert Webpage to PDF
                if not RELOAD_PDF or not os.path.exists(file_path_pdf):
                    def download_web(url, output_path):
                        pdfkit.from_url(url, output_path=output_path, configuration=config)
                    def run_function(f, max_wait):
                        try:
                            func_timeout.func_timeout(max_wait, download_web, args=(url, file_path_pdf))
                            return 'ok'
                        except func_timeout.FunctionTimedOut:
                            pass
                        return 'timeout'
                    out = run_function(download_web, 80)    # stop running after 60*2 seconds
                    if out == 'timeout':
                        with open(file_path_pdf, 'w') as outfile:     # save empty pdf so speed up when RELOAD_PDF=True
                            outfile.write("")

                # pdf to txt
                if not RELOAD_PKL or not os.path.exists(file_path_pkl):
        
                    # pdf to txt
                    start_t = timer()
                    txt, meta, parsed_pdf = pdf_to_text(file_path=file_path_pdf, tesseract_path=TESSERACT_PATH, lang='eng')
                    status=parsed_pdf['status']
                    eval_time=datetime.timedelta(seconds=round(timer()-start_t)).total_seconds()

                    # save .pkl
                    joblib.dump({'txt': txt, 'meta': meta, 'status': status, 'eval_time': eval_time}, file_path_pkl)

                else:
                    rr=joblib.load(file_path_pkl)
                    txt=rr['txt']

                len_txt_clean = len(txt.replace('\n','').replace(' ', '')) if txt is not None else 0 

                # update files
                if len_txt_clean > original_len_txt_clean:

                    final_df_recover.loc[index, 'Recover_action'] = "CONVERT FROM HTML"
                    final_df_recover.loc[index, 'Path_Recovered'] = file_path_pdf
                    final_df_recover.loc[index, 'Recover_Path_txt'] = file_path_txt
                    final_df_recover.loc[index, 'Recover_Length_txt'] = len(txt)
                    final_df_recover.loc[index, 'Recover_Length_txt_clean'] = len_txt_clean
                    final_df_recover.at[index, 'Content_txt'] = meta['Content-Type']
                    final_df_recover.loc[index, 'Metadata'] = [meta]

                    # save txt
                    with open(file_path_txt, 'w') as f:
                        f.write(unidecode(txt))
            except:
                final_df_recover.loc[index, 'Recover_action'] = "CONVERT FROM HTML - FAILED"

    else:
        final_df_recover.loc[index, 'Recover_action'] = "KEEP ORIGINAL"
        
    # save checkpoint
    if index % 300 == 0 or index == (len(final_df_recover) - 1):
        final_df_recover.drop(columns='Metadata').to_csv(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_recover.csv'), index=False, sep=';')
        with open(os.path.join(CHECKPOINT_FOLDER,'whitepaper_recover.pickle'), 'wb') as handle:
                    pickle.dump(final_df_recover, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

# save results
final_df_recover.drop(columns='Metadata').to_csv(os.path.join(RESULTS_FOLDER, '00c_whitepaper_recover.csv'), index=False, sep=';')

Recovering 3046 / 3046 (santiment)                                                                      
Total elapsed time: 2:40:53


In [None]:
# recover actions stats
with open('./Checkpoints/whitepaper_recover.pickle', 'rb') as handle:
    final_df_recover = pickle.load(handle)

print('-- Recover actions --\n')
print(final_df_recover['Recover_action'].value_counts())

In [19]:
# recover actions stats
with open('./Checkpoints/whitepaper_recover.pickle', 'rb') as handle:
    final_df_recover = pickle.load(handle)

print('-- Recover actions --\n')
print(final_df_recover['Recover_action'].value_counts())

-- Recover actions --

KEEP ORIGINAL                            1049
SKIP                                      345
CONVERT FROM HTML                         249
DOWNLOAD DRIVE - OK                       149
CONVERT FROM HTML - FAILED                124
DOWNLOAD DRIVE - page not available       114
DOWNLOAD DROPBOX - page not available      17
DOWNLOAD DROPBOX - OK                       9
DOWNLOAD DRIVE - out of time                2
Name: Recover_action, dtype: int64


In [20]:
# set final path for txt files
recover_to_keep = ['CONVERT FROM HTML', 'DOWNLOAD DRIVE - OK', 'DOWNLOAD DROPBOX - OK']

final_df_recover['Final_Path_txt'] = np.where(final_df_recover['Recover_action'] == 'KEEP ORIGINAL', final_df_recover['Path_txt'], '')
final_df_recover['Final_Path_txt'] = np.where(final_df_recover['Recover_action'].isin(recover_to_keep), final_df_recover['Recover_Path_txt'], final_df_recover['Final_Path_txt'])
final_df_recover['Final_Length_txt'] = np.where(final_df_recover['Recover_action'] == 'KEEP ORIGINAL', final_df_recover['Length_txt'], '')
final_df_recover['Final_Length_txt'] = np.where(final_df_recover['Recover_action'].isin(recover_to_keep), final_df_recover['Recover_Length_txt'], final_df_recover['Final_Length_txt'])
final_df_recover['Final_Length_txt_clean'] = np.where(final_df_recover['Recover_action'] == 'KEEP ORIGINAL', final_df_recover['Length_txt_clean'], '')
final_df_recover['Final_Length_txt_clean'] = np.where(final_df_recover['Recover_action'].isin(recover_to_keep), final_df_recover['Recover_Length_txt_clean'], final_df_recover['Final_Length_txt_clean'])


final_df_recover.drop(columns='Metadata').to_csv('./Results/00d_whitepaper_final.csv', index=False, sep=';')

print('Total available whitepapers:', sum(final_df_recover['Final_Path_txt'] != ''))
with open('./Checkpoints/whitepaper_final.pickle', 'wb') as handle:
    pickle.dump(final_df_recover, handle, protocol=pickle.HIGHEST_PROTOCOL)

Total available whitepapers: 1456


In [21]:
final_df_recover.head(5)

Unnamed: 0,id,link_white_paper,Status,Error,Path_Original,Path_Recovered,Path_txt,Status_txt,Length_txt,Length_txt_clean,Content_txt,Metadata,Recover_action,Recover_Length_txt,Recover_Length_txt_clean,Recover_Path_txt,Final_Path_txt,Final_Length_txt,Final_Length_txt_clean
0,2,https://www.tycoon.io/whitepaper.pdf,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,58501,48450,application/pdf,"{'Content-Type': 'application/pdf', 'Last-Modi...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,58501,48450
1,3,https://mindsync.ai/docs/whitepaper.pdf,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,73801,61180,application/pdf,"{'Content-Type': 'application/pdf', 'Creation-...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,73801,61180
2,5,https://lohncontrol.com/down/LOHN-white-paper-...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,74525,59500,application/pdf,"{'Author': 'Vali', 'Content-Type': 'applicatio...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,74525,59500
3,6,https://emanate.live/pdf/mn8-whitepaper-v12.pdf,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,245,166,application/pdf,"{'Content-Type': 'application/pdf', 'Creation-...",CONVERT FROM HTML,365,292,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,365,292
4,7,https://hashbon.com/info/whitepaper_eng.pdf,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,11197,9114,application/pdf,"{'Content-Type': 'application/pdf', 'Creation-...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,11197,9114
