## Level 1 Crawler algorithm

### Before every crawl, make sure to remove the already visited addresses, then ...
### 1. Extract all addresses to visit from a .gexf  
### 2. For each address
> - Create a new folder named after the MD5 hash of the address 
> - Extract the main page as a .html and save it in the folder 
> - List all the hyperlinks contained in the page
> > - For each sublink, if it is a subpage of the main address (same domain), save its text in the folder as a .txt file named with an incrementing number
> - Once all HTML pages are fetched, strip the tags to keep only the text, so that it can be parsed easily.

In [1]:
# import
import networkx as nx
import os
import requests
import pickle
import hashlib
import time
import re
from bs4 import BeautifulSoup as bs


In [2]:
# Read the diabetes graph from its GEXF file into a NetworkX graph
G = nx.read_gexf("diabetes-final-graph.gexf")

Extract the addresses

In [3]:
# Collect one address per node: the homepage when one is set, otherwise the node name.
address_tab = set()
# G.nodes(data=True) yields (node, attributes) pairs. It replaces the G.node[...]
# lookup, which was removed in NetworkX 2.4.
for node, attrs in G.nodes(data=True):
    # The string 'null' marks a node without homepage; a node that has no
    # 'homepage' attribute at all is treated the same way instead of raising KeyError.
    homepage = attrs.get('homepage', 'null')
    if homepage != 'null':
        address_tab.add(homepage)
    else:
        # no homepage: fall back to the node name
        address_tab.add(attrs['name'])

print('Number of website found on the gexf file: ',len(address_tab)) #should be 2355 if files does not change

Number of website found on the gexf file:  2355


### At this point, all the websites are in the variable `address_tab`

In [4]:
# Work out which addresses still have to be crawled.
# Every crawled site is stored in pages/<md5 of its address>, so an address is
# pending when its hash does not match any entry of pages/.

# On a first run pages/ does not exist yet and os.listdir would raise.
os.makedirs('pages', exist_ok=True)
downloaded_websites = set(os.listdir('pages/'))

# Hash each address once and test set membership, instead of comparing every
# address against every folder name.
pending_websites = {
    site for site in address_tab
    if hashlib.md5(site.encode('utf-8')).hexdigest() not in downloaded_websites
}

# NOTE: a folder only shows that the crawl of a site was started; the last
# folder found may not be fully downloaded.

print(len(downloaded_websites),' page(s) downloaded.')
print('Still',len(pending_websites),' page(s) to download out of ',len(address_tab))



1854  page(s) downloaded.
Still 501  page(s) to download out of  2355


### At this point :
> - All the websites are in `address_tab`
> - All the websites that remain to be downloaded are in `pending_websites`

In [5]:
# Seconds to wait for a server before giving up on a page. Without a timeout a
# single unresponsive site can block the whole crawl indefinitely.
REQUEST_TIMEOUT = 30


def _same_site_links(soup, website):
    """Return the set of href values in `soup` that contain the address `website`.

    The match is a plain substring test (against the address as given and its
    lower-cased form), so relative links are not followed.
    """
    anchors = soup.find_all('a')
    print(str(len(anchors)) + ' sublinks  found on this page')
    sublinks = set()
    for link in anchors:
        href = link.get('href')
        if href is None:
            # no href in this link
            continue
        if website in href or website.lower() in href:
            sublinks.add(href)
    print(str(len(sublinks)) + ' sublinks of the same domain found on this page')
    return sublinks


def _crawl_site(website, folder_path):
    """Download `website` into `folder_path`.

    Writes index.html (raw page) and index.txt (page text), then the text of
    every same-site sub-page as 1.txt, 2.txt, ... A sub-page that cannot be
    fetched is reported and skipped; a failure on the main page propagates to
    the caller.
    """
    # Addresses taken from the node name carry no scheme, so add one
    # (note that the 's' of https seems to be added automatically).
    url = website if website.startswith('http') else 'http://' + website

    p = requests.get(url, timeout=REQUEST_TIMEOUT)
    with open(folder_path + '/index.html', 'w+', encoding='utf-8') as fp:
        fp.write(p.text)
    # Parse once with an explicit parser and reuse the result for both the
    # text dump and the link extraction.
    soup = bs(p.text, 'html.parser')
    with open(folder_path + '/index.txt', 'w+', encoding='utf-8') as fp:
        fp.write(soup.get_text())

    sublinks = _same_site_links(soup, website)

    # page_cpt is the number of the next file to write; it only advances when a
    # sub-page was saved, so the files are numbered without gaps.
    page_cpt = 1
    # Sorted so that file numbering is the same from one run to the next.
    for subpage in sorted(sublinks):
        print('Treating page ',page_cpt,' out of ',len(sublinks),' please wait ...')
        try:
            sub_p = requests.get(subpage, timeout=REQUEST_TIMEOUT)
            sub_text = bs(sub_p.text, 'html.parser').get_text()
            with open(folder_path + "/" + str(page_cpt) + ".txt", 'w+', encoding='utf-8') as fp_sub:
                fp_sub.write(sub_text)
        except Exception as err:
            # the page might not exist anymore; report it and move on
            print('Skipping sub-page', subpage, ':', err)
            continue
        page_cpt += 1


for website in pending_websites:
    # Create the folder with name hashed - to avoid the special characters
    folder_name = hashlib.md5(website.encode('utf-8')).hexdigest()
    print('We create folder ', folder_name, 'for the site ', website)
    folder_path = "pages/" + folder_name
    try:
        os.mkdir(folder_path)
    except FileExistsError:
        # duplicate of folder in the gexf
        continue
    except OSError as err:
        print('Could not create folder', folder_path, ':', err)
        continue

    try:
        _crawl_site(website, folder_path)
    except Exception as err:
        # website may not exist anymore. Catching Exception (not a bare except)
        # keeps the crawl best-effort while still letting Ctrl-C interrupt it.
        print('Skipping', website, ':', err)
        continue
    




We create folder  68cede68ab8064308f4852eb846da1d6 for the site  http://www.diabetespilot.com
34 sublinks  found on this page
1 sublinks of the same domain found on this page
Treating page  1  out of  1  please wait ...
We create folder  24cc443c8cbae579a3b07f30ae8b8337 for the site  http://diabetsy.com
We create folder  1bcfb31778e9b24f5a7e88247be0d1e5 for the site  http://www.diabetesinspain.com
100 sublinks  found on this page
38 sublinks of the same domain found on this page
Treating page  1  out of  38  please wait ...
Treating page  2  out of  38  please wait ...
Treating page  3  out of  38  please wait ...
Treating page  4  out of  38  please wait ...
Treating page  5  out of  38  please wait ...
Treating page  6  out of  38  please wait ...
Treating page  7  out of  38  please wait ...
Treating page  8  out of  38  please wait ...
Treating page  9  out of  38  please wait ...
Treating page  10  out of  38  please wait ...
Treating page  11  out of  38  please wait ...
Treating