In [1]:
import os
import requests
import pandas as pd
import numpy as np
from timeit import default_timer as timer
import datetime
from unidecode import unidecode
import pickle
import pdfkit
import joblib
import func_timeout
from utils import get_chromedriver, download_from_drive_dropbox, pdf_to_text

TESSERACT_PATH=r'C:\Program Files\Tesseract-OCR\tesseract.exe'   # Tesseract OCR executable; forwarded to pdf_to_text() as `tesseract_path`

In [2]:
# set folders
WHITEPAPER_FOLDER = '.\\Whitepaper'
ORIGINAL_FOLDER = '.\\Whitepaper\\Original'
CONVERTED_FOLDER = '.\\Whitepaper\\Converted_to_txt'
RECOVERED_FOLDER = '.\\Whitepaper\\Recovered'
CHECKPOINT_FOLDER = '.\\Checkpoints'
RESULTS_FOLDER = '.\\Results'

# Create every folder the notebook writes to. CHECKPOINT_FOLDER and RESULTS_FOLDER
# are included because the cells below save csv/pickle files into them and would
# fail on a fresh checkout if the folders were missing.
# exist_ok=True makes the cell safe to re-run.
for folder in (WHITEPAPER_FOLDER, ORIGINAL_FOLDER, CONVERTED_FOLDER,
               RECOVERED_FOLDER, CHECKPOINT_FOLDER, RESULTS_FOLDER):
    os.makedirs(folder, exist_ok=True)

In [3]:
# Read the scraped ICO list, keep only rows that have both the ICO page url and a
# whitepaper url, and add the (initially empty) columns filled by the download loop.
download_df = (
    pd.read_csv(os.path.join(RESULTS_FOLDER, '01c_ICOmarks_ico_list_scraped_formatted.csv'), sep=";")
    .loc[:, ['url', 'WhitepaperUrl']]
    .dropna()
    .reset_index(drop=True)
    .assign(Status="", Error="", Path_Original="")
)

download_df

Unnamed: 0,url,WhitepaperUrl,Status,Error,Path_Original
0,https://icomarks.com/ico/synthetics-ai,https://drive.google.com/file/d/1K7TkqYgCtiarZ...,,,
1,https://icomarks.com/ico/777-bingo,https://777.bingo/paper/Whitepaper.EN.pdf,,,
2,https://icomarks.com/ico/sonic,https://img1.wsimg.com/blobby/go/634ec806-c74c...,,,
3,https://icomarks.com/ico/botchain,botchain.talla.com/whitepaper,,,
4,https://icomarks.com/ico/eclipse,https://eclipsetoken.io/wp-content/uploads/201...,,,
...,...,...,...,...,...
7508,https://icomarks.com/ico/vanhealthing,https://vanhealthing.com/vanhealthing_wp_en.pdf,,,
7509,https://icomarks.com/ico/consensus,https://consensus.ai/whitepaper.pdf,,,
7510,https://icomarks.com/ico/kahnchat,https://www.kahnchat.com/docs/KahnChat-Whitepa...,,,
7511,https://icomarks.com/ico/santiment,https://docs.google.com/document/d/1hHmJQWrPrO...,,,


## Download pdf

In [4]:
HEADERS = {"User-Agent": "Chrome/51.0.2704.103"}
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
RELOAD_PDF=True                         # True: reuse a file already on disk instead of downloading it again
REQUEST_TIMEOUT = 60                    # seconds to wait for a server before giving up on a url

start = timer()
OK_count = 0
for index, row in download_df.iterrows():

    url = row['WhitepaperUrl']
    # local file name is the ICO slug (page url without URL_ROOT and without '|')
    file_name = os.path.join(ORIGINAL_FOLDER, row['url'].replace(URL_ROOT, '').replace('|', '') + '.pdf')

    print('Downloading ' + str(index + 1) + ' / ' + str(len(download_df)) + '  - Total OK: ' + str(OK_count), end = '\r')

    if not RELOAD_PDF or not os.path.exists(file_name):

        try:
            # connect; without a timeout a single unresponsive server would block the whole run
            response = requests.get(url, headers = HEADERS, timeout = REQUEST_TIMEOUT)

            # check response and save the body under a .pdf name (the content type is not checked here)
            if response.status_code == 200:
                with open(file_name, "wb") as f:
                    f.write(response.content)
                download_df.loc[index, 'Status'] = 'OK'
                download_df.loc[index, 'Path_Original'] = os.path.join(os.getcwd(), file_name)
                OK_count += 1
            else:
                # keep the HTTP status code so failures can be broken down later
                download_df.loc[index, 'Status'] = response.status_code

        except Exception as e:
            download_df.loc[index, 'Status'] = 'ERROR'
            # store the message, not the exception object, so the column stays plain text
            download_df.loc[index, 'Error'] = str(e)

    else:
        # file already downloaded in a previous run
        download_df.loc[index, 'Status'] = 'OK'
        download_df.loc[index, 'Path_Original'] = os.path.join(os.getcwd(), file_name)
        OK_count += 1

    # save checkpoint
    download_df.to_csv(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_download.csv'), index=False, sep=';')

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

# save results
download_df.to_csv(os.path.join(RESULTS_FOLDER ,'00a_whitepaper_download_original.csv'), index=False, sep=';')

Downloading 7513 / 7513  - Total OK: 3046
Total elapsed time: 11:06:25


## Check downloaded files and convert to txt

In [17]:
# Reload the download results from disk and show how many urls ended in each status.
download_df = pd.read_csv(os.path.join(RESULTS_FOLDER, '00a_whitepaper_download_original.csv'), sep=";")
print(download_df['Status'].value_counts())

# Keep only successfully downloaded files and add the columns filled by the parsing loop.
final_df = (
    download_df
    .loc[lambda frame: frame['Status'] == 'OK']
    .reset_index(drop=True)
    .assign(
        Path_Recovered="",
        Path_txt="",
        Status_txt="",
        Length_txt=0,
        Length_txt_clean=0,
        Content_txt="",
        Metadata="",
    )
)

OK       3048
ERROR    2710
404      1227
403       165
410        91
522        86
520        73
500        23
521        17
530        12
502        11
503        10
504         8
526         7
523         6
301         5
525         4
401         2
524         2
406         2
400         2
402         1
423         1
Name: Status, dtype: int64


In [18]:
# Convert every downloaded file to text, caching the parser output in a .pkl next to the .txt
URL_ROOT='https://icomarks.com/ico/'    # stripped from the ICO url to obtain the file name
RELOAD_PKL=True                         # True: reuse an existing .pkl instead of parsing again

tot_time=0
n_files = len(final_df)
converted_dir = os.path.join(os.getcwd(), CONVERTED_FOLDER)
for index, row in final_df.iterrows():

    file_path = row['Path_Original']
    base_name = row['url'].replace(URL_ROOT, '').replace('|', '')
    file_path_txt = os.path.join(converted_dir, base_name + '.txt')
    file_path_pkl = os.path.join(converted_dir, base_name + '.pkl')

    print(f'Parsing {index + 1} / {n_files}', end='\r')

    if RELOAD_PKL and os.path.exists(file_path_pkl):
        # cached result available: read it back
        rr = joblib.load(file_path_pkl)
        txt, meta, status, eval_time = rr['txt'], rr['meta'], rr['status'], rr['eval_time']

    else:
        # parse the file and time the call
        start = timer()
        txt, meta, parsed_pdf = pdf_to_text(file_path=file_path, tesseract_path=TESSERACT_PATH, lang='eng')
        status = parsed_pdf['status']
        eval_time = datetime.timedelta(seconds=round(timer()-start)).total_seconds()

        # cache everything needed to rebuild this row
        joblib.dump({'txt': txt, 'meta': meta, 'status': status, 'eval_time': eval_time}, file_path_pkl)

    has_text = txt is not None    # False e.g. for a pdf saved as image
    final_df.loc[index, 'Status_txt'] = status
    final_df.loc[index, 'Length_txt'] = len(txt) if has_text else 0
    # length without newlines and spaces, used later to filter (almost) empty files
    final_df.loc[index, 'Length_txt_clean'] = len(txt.replace('\n','').replace(' ', '')) if has_text else 0
    final_df.at[index, 'Content_txt'] = meta['Content-Type']
    final_df.loc[index, 'Metadata'] = [meta]
    tot_time += eval_time

    # write the ASCII-transliterated text next to the cache
    if has_text:
        final_df.loc[index, 'Path_txt'] = file_path_txt
        with open(file_path_txt, 'w') as f:
            f.write(unidecode(txt))

    # checkpoint every 300 rows and on the last row
    if index % 300 == 0 or index == (n_files - 1):
        final_df.drop(columns='Metadata').to_csv(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.csv'), index=False, sep=';')
        with open(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle'), 'wb') as handle:
            pickle.dump(final_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(tot_time))))

# save results (Metadata holds dicts, so it is kept only in the pickle)
final_df.drop(columns='Metadata').to_csv(os.path.join(RESULTS_FOLDER, '00b_whitepaper_parsing.csv'), index=False, sep=';')

Parsing 3048 / 3048
Total elapsed time: 1:53:39


In [19]:
# Summary statistics of the parsing step, read back from the pickle checkpoint
with open(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle'), 'rb') as handle:
    final_df = pickle.load(handle)

# download status
print('-- Status --\n')
print(final_df['Status'].value_counts())

# distribution of raw and whitespace-free text length
print('\n\n-- Text length --\n')
length_columns = ['Length_txt', 'Length_txt_clean']
print(final_df[length_columns].describe(percentiles = [.5, .10, .15, .2, .25, .3, .4, .75, .95]))

# "Content_txt" holds either one string or a collection of values: split the two cases
cnt = final_df['Content_txt'].values
cnt_single, cnt_multiple = [], []
for value in cnt:
    if type(value) is str:
        cnt_single.append(value)
    else:
        cnt_multiple.append(value)

print('\n\n-- Pdf content --')
print(f'\n- Single values in "Content_txt" ({len(cnt_single)}):')
print(pd.DataFrame({'Content_txt': cnt_single})['Content_txt'].value_counts())
print(f'\n- Multiple values in "Content_txt" ({len(cnt_multiple)}) first column is the number of multiple values in list:')
print(pd.DataFrame({'aa': [len(value) for value in cnt_multiple]})['aa'].value_counts())

-- Status --

OK    3048
Name: Status, dtype: int64


-- Text length --

          Length_txt  Length_txt_clean
count    3048.000000       3048.000000
mean    22257.032152      17168.373360
std     35151.340586      27752.542292
min         0.000000          0.000000
10%        71.000000         33.000000
15%        83.000000         44.000000
20%       133.600000         69.000000
25%       320.750000        106.000000
30%       508.200000        286.200000
40%      2323.600000       1264.800000
50%      6021.000000       3638.500000
75%     34135.500000      26832.750000
95%     88365.600000      71564.550000
max    485969.000000     411334.000000


-- Pdf content --

- Single values in "Content_txt" (2940):
text/html; charset=UTF-8                       1184
application/pdf                                1160
text/html; charset=ISO-8859-1                   387
application/xhtml+xml; charset=UTF-8            133
application/xhtml+xml; charset=ISO-8859-1        36
application/octet-st

## Try to recover empty or too-short pdf

In [20]:
# Files whose whitespace-free text has at most this many characters go through the recovery step
LENGTH_TXT_CLEAN_THRSH = 4000


# Start from the parsing checkpoint and add the recovery columns with their default values
with open(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_parsing.pickle'), 'rb') as handle:
    final_df_recover = pickle.load(handle).assign(
        Recover_action="SKIP",
        Recover_Length_txt=-1,
        Recover_Length_txt_clean=-1,
        Recover_Path_txt="",
    )

In [21]:
URL_ROOT='https://icomarks.com/ico/'    # will be removed from url to create pdf name
PATH_TO_WHHTMLTOPDF = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe' # define path to wkhtmltopdf.exe, see https://python-bloggers.com/2022/06/convert-html-to-pdf-using-python/
# NOTE(review): the two paths below are absolute and machine-specific; adapt them before running elsewhere
CHROMEDRIVER_PATH = r"C:\Users\Alessandro Bitetto\Downloads\UniPV\ICOs\WebDriver\chromedriver"
TEMP_DOWNLOAD_FOLDER = "C:\\Users\\Alessandro Bitetto\\Downloads\\UniPV\\ICOs\\temp_download"
ID_TO_SKIP = []                 # short names that must not be recovered
RELOAD_PDF = True               # True: reuse a recovered pdf already on disk
RELOAD_PKL = True               # True: reuse a parsed .pkl already on disk
HTML_TO_PDF_MAX_WAIT = 80       # seconds allowed for a single html -> pdf conversion

# Point pdfkit configuration to wkhtmltopdf.exe
config = pdfkit.configuration(wkhtmltopdf=PATH_TO_WHHTMLTOPDF)


def download_web(url, output_path):
    """Render the web page at `url` into a pdf saved at `output_path`."""
    pdfkit.from_url(url, output_path=output_path, configuration=config)


def run_function(f, max_wait, args=()):
    """Call `f(*args)` and stop waiting after `max_wait` seconds.

    Returns 'ok' when the call finished in time and 'timeout' otherwise.
    Exceptions raised by `f` itself are propagated to the caller.
    """
    try:
        func_timeout.func_timeout(max_wait, f, args=args)
        return 'ok'
    except func_timeout.FunctionTimedOut:
        return 'timeout'


def _parse_recovered_pdf(file_path_pdf, file_path_pkl):
    """Return `(txt, meta)` for a recovered pdf.

    The pdf is parsed with pdf_to_text() and the result cached in `file_path_pkl`,
    unless RELOAD_PKL is set and the cache exists, in which case BOTH values are
    read back from the cache.
    """
    if not RELOAD_PKL or not os.path.exists(file_path_pkl):
        start_t = timer()
        txt, meta, parsed_pdf = pdf_to_text(file_path=file_path_pdf, tesseract_path=TESSERACT_PATH, lang='eng')
        status = parsed_pdf['status']
        eval_time = datetime.timedelta(seconds=round(timer()-start_t)).total_seconds()

        # save .pkl
        joblib.dump({'txt': txt, 'meta': meta, 'status': status, 'eval_time': eval_time}, file_path_pkl)
    else:
        rr = joblib.load(file_path_pkl)
        txt = rr['txt']
        meta = rr['meta']
    return txt, meta


def _clean_length(txt):
    """Number of characters in `txt` ignoring newlines and spaces (0 when `txt` is None)."""
    return len(txt.replace('\n','').replace(' ', '')) if txt is not None else 0


def _store_recovered(index, action, file_path_pdf, file_path_txt, txt, meta, len_txt_clean):
    """Write the recovered document into row `index` of final_df_recover and save its txt file."""
    final_df_recover.loc[index, 'Recover_action'] = action
    final_df_recover.loc[index, 'Path_Recovered'] = file_path_pdf
    final_df_recover.loc[index, 'Recover_Path_txt'] = file_path_txt
    final_df_recover.loc[index, 'Recover_Length_txt'] = len(txt)
    final_df_recover.loc[index, 'Recover_Length_txt_clean'] = len_txt_clean
    final_df_recover.at[index, 'Content_txt'] = meta['Content-Type']
    final_df_recover.loc[index, 'Metadata'] = [meta]

    # save txt
    with open(file_path_txt, 'w') as f:
        f.write(unidecode(txt))


start = timer()
for index, row in final_df_recover.iterrows():

    short_name=row['url'].replace(URL_ROOT, '').replace('|', '')

    print('Recovering ' + str(index + 1) + ' / ' + str(len(final_df_recover)) + ' (' + short_name + ')' +' '*30, end = '\r')

    file_path_pdf = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.pdf')
    file_path_txt = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.txt')
    file_path_pkl = os.path.join(os.getcwd(), RECOVERED_FOLDER, short_name + '.pkl')
    original_len_txt_clean = row['Length_txt_clean']
    url = row['WhitepaperUrl']

    # check if txt length is below threshold
    if row['Length_txt_clean'] <= LENGTH_TXT_CLEAN_THRSH:

        if short_name in ID_TO_SKIP:
            pass

        ##### download pdf from google.drive or dropbox

        elif any(x in url for x in ['google', 'goo.gl', 'dropbox']):

            source = 'drive' if any(x in url for x in ['google', 'goo.gl']) else 'dropbox'

            if not RELOAD_PDF or not os.path.exists(file_path_pdf):

                out = download_from_drive_dropbox(chromedriver_path=CHROMEDRIVER_PATH, download_url=url,
                                                  download_folder=TEMP_DOWNLOAD_FOLDER, temp_folder=TEMP_DOWNLOAD_FOLDER,
                                                  pdf_name=short_name + '.pdf',
                                                  move_folder=os.path.join(os.getcwd(), RECOVERED_FOLDER), source=source)
            else:
                out="ok"

            if out == "ok":

                # pdf to txt
                txt, meta = _parse_recovered_pdf(file_path_pdf, file_path_pkl)
                len_txt_clean = _clean_length(txt)

                # keep the recovered file only when it has more text than the original;
                # otherwise the row keeps its default 'SKIP' action
                if len_txt_clean > original_len_txt_clean:
                    _store_recovered(index, 'DOWNLOAD ' + source.upper() + ' - OK',
                                     file_path_pdf, file_path_txt, txt, meta, len_txt_clean)

            else:
                # `out` carries the failure reason returned by the downloader
                final_df_recover.loc[index, 'Recover_action'] = 'DOWNLOAD ' + source.upper() + ' - ' + out


        ##### try to download pdf from html page   e.g. https://www.quasa.io/white-paper

        else:

            try:
                # Convert Webpage to PDF
                if not RELOAD_PDF or not os.path.exists(file_path_pdf):
                    out = run_function(download_web, HTML_TO_PDF_MAX_WAIT, args=(url, file_path_pdf))
                    if out == 'timeout':
                        with open(file_path_pdf, 'w') as outfile:     # save empty pdf so speed up when RELOAD_PDF=True
                            outfile.write("")

                # pdf to txt (meta is read from the cache too, so a cached row never
                # inherits the metadata of the previously processed file)
                txt, meta = _parse_recovered_pdf(file_path_pdf, file_path_pkl)
                len_txt_clean = _clean_length(txt)

                # update files
                if len_txt_clean > original_len_txt_clean:
                    _store_recovered(index, "CONVERT FROM HTML",
                                     file_path_pdf, file_path_txt, txt, meta, len_txt_clean)

            # Best effort: any conversion/parsing failure marks the row as failed.
            # FunctionTimedOut is listed explicitly because it does not derive from
            # Exception; KeyboardInterrupt is deliberately NOT caught so the loop can be stopped.
            except (Exception, func_timeout.FunctionTimedOut):
                final_df_recover.loc[index, 'Recover_action'] = "CONVERT FROM HTML - FAILED"

    else:
        final_df_recover.loc[index, 'Recover_action'] = "KEEP ORIGINAL"

    # save checkpoint
    if index % 300 == 0 or index == (len(final_df_recover) - 1):
        final_df_recover.drop(columns='Metadata').to_csv(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_recover.csv'), index=False, sep=';')
        with open(os.path.join(CHECKPOINT_FOLDER,'whitepaper_recover.pickle'), 'wb') as handle:
            pickle.dump(final_df_recover, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

# save results
final_df_recover.drop(columns='Metadata').to_csv(os.path.join(RESULTS_FOLDER, '00c_whitepaper_recover.csv'), index=False, sep=';')

Recovering 3048 / 3048 (santiment)                                                                      
Total elapsed time: 0:50:42


In [22]:
# Reload the recovery checkpoint and count how many files ended in each recover action
recover_pickle_path = os.path.join(CHECKPOINT_FOLDER,'whitepaper_recover.pickle')
with open(recover_pickle_path, 'rb') as handle:
    final_df_recover = pickle.load(handle)

recover_action_counts = final_df_recover['Recover_action'].value_counts()
print('-- Recover actions --\n')
print(recover_action_counts)

-- Recover actions --

KEEP ORIGINAL                            1485
SKIP                                      603
CONVERT FROM HTML                         475
DOWNLOAD DRIVE - OK                       215
DOWNLOAD DRIVE - page not available       140
CONVERT FROM HTML - FAILED                 98
DOWNLOAD DROPBOX - OK                      15
DOWNLOAD DROPBOX - page not available      14
DOWNLOAD DRIVE - out of time                2
DOWNLOAD DROPBOX - out of time              1
Name: Recover_action, dtype: int64


In [40]:
# set final path for txt files
RECOVER_TO_KEEP = ['CONVERT FROM HTML', 'DOWNLOAD DRIVE - OK', 'DOWNLOAD DROPBOX - OK']
LENGTH_TXT_CLEAN_THRSH = 4000     # threshold for maximum non-empty characters in parsed txt file

# Rows fall in three groups: original text kept, recovered text kept, nothing usable.
keep_original = final_df_recover['Recover_action'] == 'KEEP ORIGINAL'
keep_recovered = final_df_recover['Recover_action'].isin(RECOVER_TO_KEEP)

# Path: original txt, recovered txt, or '' when no usable text is available.
final_df_recover['Final_Path_txt'] = np.where(keep_original, final_df_recover['Path_txt'], '')
final_df_recover['Final_Path_txt'] = np.where(keep_recovered, final_df_recover['Recover_Path_txt'], final_df_recover['Final_Path_txt'])

# Lengths: -1 marks "no usable text" in BOTH length columns. Using '' as filler for
# Final_Length_txt would make np.where return strings and silently turn the whole
# column into text.
final_df_recover['Final_Length_txt'] = np.where(keep_original, final_df_recover['Length_txt'], -1)
final_df_recover['Final_Length_txt'] = np.where(keep_recovered, final_df_recover['Recover_Length_txt'], final_df_recover['Final_Length_txt'])
final_df_recover['Final_Length_txt_clean'] = np.where(keep_original, final_df_recover['Length_txt_clean'], -1)
final_df_recover['Final_Length_txt_clean'] = np.where(keep_recovered, final_df_recover['Recover_Length_txt_clean'], final_df_recover['Final_Length_txt_clean'])


# Metadata holds dicts, so it is dropped from the csv and kept only in the pickle below
final_df_recover.drop(columns='Metadata').to_csv(os.path.join(RESULTS_FOLDER, '00d_whitepaper_final.csv'), index=False, sep=';')

print('Total available whitepapers:', (final_df_recover['Final_Path_txt'] != '').sum())
print(f'   - with more than {LENGTH_TXT_CLEAN_THRSH} clean characters:', (final_df_recover['Final_Length_txt_clean'] >= LENGTH_TXT_CLEAN_THRSH).sum())
with open(os.path.join(CHECKPOINT_FOLDER,'whitepaper_final.pickle'), 'wb') as handle:
    pickle.dump(final_df_recover, handle, protocol=pickle.HIGHEST_PROTOCOL)

Total available whitepapers: 2190
   - with more than 4000 clean characters: 1766


## Use NLP to extract features

In [44]:
# Read the final table back from csv and keep, for the NLP step, only the whitepapers
# whose whitespace-free text reaches the length threshold.
final_csv_df = pd.read_csv(os.path.join(RESULTS_FOLDER, '00d_whitepaper_final.csv'), sep=';')
is_long_enough = final_csv_df['Final_Length_txt_clean'] >= LENGTH_TXT_CLEAN_THRSH
nlp_df = final_csv_df.loc[is_long_enough, ['url', 'Final_Path_txt', 'Final_Length_txt_clean']]
nlp_df

Unnamed: 0,url,Final_Path_txt,Final_Length_txt_clean
0,https://icomarks.com/ico/the-mill-of-blood,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,8655
2,https://icomarks.com/ico/moonlight,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,53667
4,https://icomarks.com/ico/digithoth,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,5950
5,https://icomarks.com/ico/tourcom-blockchain,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,40533
6,https://icomarks.com/ico/migland,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,27525
...,...,...,...
3039,https://icomarks.com/ico/coinseed,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,32416
3040,https://icomarks.com/ico/cindx,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,77606
3043,https://icomarks.com/ico/ispolink,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,49948
3045,https://icomarks.com/ico/deepcloud,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,55228


In [25]:
# Display final_df_recover as the cell output for visual inspection.
final_df_recover

Unnamed: 0,url,WhitepaperUrl,Status,Error,Path_Original,Path_Recovered,Path_txt,Status_txt,Length_txt,Length_txt_clean,Content_txt,Metadata,Recover_action,Recover_Length_txt,Recover_Length_txt_clean,Recover_Path_txt,Final_Path_txt,Final_Length_txt,Final_Length_txt_clean
0,https://icomarks.com/ico/the-mill-of-blood,https://millofblood.com/white-paper.php,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,23152,8655,text/html; charset=UTF-8,"{'Content-Encoding': 'UTF-8', 'Content-Languag...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,23152,8655
1,https://icomarks.com/ico/xenchain,http://xenchain.io/whitepaper.pdf,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,316,88,text/html; charset=ISO-8859-1,"{'Content-Encoding': 'ISO-8859-1', 'Content-La...",CONVERT FROM HTML,133,95,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,133,95
2,https://icomarks.com/ico/moonlight,https://assets.moonlight.io/docs/wp/moonlight_...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,64263,53667,application/pdf,"{'Author': 'moonlight.io', 'Content-Type': 'ap...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,64263,53667
3,https://icomarks.com/ico/shipnext,https://shipnext.io/assets/files/ShipNext_Whit...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,3841,3157,text/html; charset=UTF-8,"{'Content-Encoding': 'UTF-8', 'Content-Languag...",SKIP,-1,-1,,,,
4,https://icomarks.com/ico/digithoth,https://digithoth.com/wp-content/uploads/2018/...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,22151,5950,text/html; charset=UTF-8,"{'Content-Encoding': 'UTF-8', 'Content-Languag...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,22151,5950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3043,https://icomarks.com/ico/ispolink,https://ispolink.com/Ispolink_Whitepaper_v1.pdf,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,60721,49948,application/pdf,"{'Content-Type': 'application/pdf', 'Creation-...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,60721,49948
3044,https://icomarks.com/ico/olportal,https://olportal.ai/static/media/wp_eng.pdf,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,200,0,0,application/pdf,"{'Content-Type': 'application/pdf', 'Creation-...",CONVERT FROM HTML,322,236,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,322,236
3045,https://icomarks.com/ico/deepcloud,https://www.dropbox.com/s/st6ldsd5shfdz3y/Deep...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,115,51,application/pdf,"{'Content-Type': 'application/pdf', 'Creation-...",DOWNLOAD DROPBOX - OK,66230,55228,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,66230,55228
3046,https://icomarks.com/ico/kahnchat,https://www.kahnchat.com/docs/KahnChat-Whitepa...,OK,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,200,5493,4437,text/html; charset=UTF-8,"{'Content-Encoding': 'UTF-8', 'Content-Languag...",KEEP ORIGINAL,-1,-1,,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,5493,4437
