## Level 1 Crawler algorithm

### Before every crawl, make sure to remove the already visited addresses, then ...
### 1. Extract all addresses to visit from a .gexf  
### 2. For each address
> - Create a new folder with name hashed 
> - Extract the main page as a .html and save it in the folder 
> - List all the hyperlinks contained in the page
> > - For each sublinks, if it is a subpage of the main address (same domain), save it as .html in the folder with an incrementing number
> - Once all html pages are saved, clean them to get the text so as to be parsed easily, extracting tags.

In [1]:
# import
import networkx as nx
import os
import requests
import pickle
import hashlib
import time
import re
from bs4 import BeautifulSoup as bs


In [2]:
# Load .gexf
G = nx.readwrite.gexf.read_gexf("diabetes-final-graph.gexf")

Extract the addresses

In [3]:
address_tab = set()
i=0
for node in G.nodes():
    # we check if there is a homepage set
    homepage = G.node[node]['homepage']
    if homepage != 'null':
        #if yes
        address_tab.add(homepage)
    else:
        #else we get the name
        address_tab.add(G.node[node]['name'])

print('Number of website found on the gexf file: ',len(address_tab)) #should be 2355 if files does not change   
#print(address_tab)

Number of website found on the gexf file:  2355


### At this point all the websites are in the var ADDRESS_TAB

In [31]:
# We don't want to re-download an existing folder
# However, the name being hashed, we need to compare every name 
downloaded_websites = set()

directories = os.listdir('pages/')
for directory in directories:
    downloaded_websites.add(directory) 

# Now we have all the already downloaded website BUT hashed    

pending_websites = set(address_tab)    

for d_site in downloaded_websites:
    for p_site in pending_websites.copy():
        if hashlib.md5(p_site.encode('utf-8')).hexdigest() == d_site:
            #print("we remove " + p_site)
            #print("we remove " + d_site)
            pending_websites.remove(p_site)
        
# However the last found folder may not be fully downloaded



#print(pending_website)
print(len(downloaded_websites),' page(s) downloaded.')
print('Still',len(pending_websites),' page(s) to download out of ',len(address_tab))



0  page(s) downloaded.
Still 2355  page(s) to download out of  2355


### At this point :
> - All the websites are in ADDRESS_TAB
> - All the remaining website to download are in PENDING_WEBSITE

In [53]:
for website in pending_websites:
    #Create the folder with name hashed - to avoid the special characters
    print('We create folder ', hashlib.md5(website.encode('utf-8')).hexdigest(), 'for the site ', site)
    folder_name = hashlib.md5(website.encode('utf-8')).hexdigest()
    os.mkdir("pages/" + folder_name)    
    
    
    #go to the page, get the html and save it
    url =''
    #if we don't have the website page, we need to add the http adress (note that the 's' of https seems auto added)
    if website[:4]!='http':
        url+='http://'
    url+=website
    p = requests.get(url)
    with open("pages/"+ folder_name +'/index.html','w+', encoding='utf-8') as fp:
        fp.write(p.text)
            
    #with all hyperlinks, if they are subpage, go to them and save the html 
    soup = bs(p.text)
    sublinks = set()   
        
    for link in soup.find_all('a'):
        try:
            if website in link['href'] or website.lower() in link['href']:
                sublinks.add(link['href'])
        except KeyError:
            #no href in this link
            continue
    print(str(len(sublinks)) + ' sublinks of the same domain found on this page')
    
    page_cpt=1 
    
    for subpage in sublinks:
        print('Treating page ',page_cpt,' out of ',len(sublinks),' please wait ...')
        p = requests.get(subpage)
        
        #clean the page
        soup = bs(p.text, 'html.parser')
        to_clean = soup.findAll(text=True)
        print(to_clean)
        #regex = [r"\/\*(.*)\*\/","\_(.*)\;"]
        #subst = "  "
        #for rex in regex:
        #    result = to_clean
        #    result = re.sub(rex, subst, to_clean, 0, re.MULTILINE)
        #    to_clean = result
        with open("pages/"+ folder_name+"/" + str(page_cpt)+ ".txt",'w+', encoding='utf-8') as fp_sub:
            fp_sub.write(to_clean.text())     
        #only work on the 5 first pages
        if page_cpt == 1:
            break
        page_cpt+=1
        #end of the subpage scrapping
        time.sleep(2) 
    #end of the main website scrapping    
    time.sleep(2)    
    break #test only on the first one
    




We create folder  ac67df6e51ca58d669c683630a5d5c49 for the site  http://type1sweetharp.blogspot.com
33 sublinks of the same domain found on this page
Treating page  1  out of  33  please wait ...
['html', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '[if IE]><script type="text/javascript" src="https://www.blogger.com/static/v1/jsbin/864213505-ieretrofit.js"></script>\n<![endif]', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '[if IE]> <script> (function() { var html5 = ("abbr,article,aside,audio,canvas,datalist,details," + "figure,footer,header,hgroup,mark,menu,meter,nav,output," + "progress,section,time,video").split(\',\'); for (var i = 0; i < html5.length; i++) { document.createElement(html5[i]); } try { document.execCommand(\'BackgroundImageCache\', false, true); } catch(e) {} })(); </script> <![endif]', '\n', 'Welcome to Type 1 SweetHarp!: Customer Service....', '\n', "@font-face{font-family:'Coming Soon';font-style:normal;font-weight:400;src:local('Co

AttributeError: ResultSet object has no attribute 'text'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?

website = 'Asweetlife.org'
    #folder_name = website
    #os.mkdir("pages/" + folder_name)
    #pickle for hash md5
    
url =''
    #if we don't have the website page, we need to add the http adress (nte that the 's' of https seems auto added)
if website[:3]!='http':
    url+='http://'
url+=website
    
    #go to the page, get the html and save it
p = requests.get(url)
with open("pages/"+ folder_name +"/index.html",'w+', encoding='utf-8') as fp:
    fp.write(p.text)
        
    #with all hyperlinks, if they are subpage, go to them and save the html
soup = bs(p.text)
links = soup.find_all('a')
sublinks = set()    
for link in links:
    if website in link['href'] or website.lower() in link['href']:
        sublinks.add(link['href'])            
print(str(len(sublinks)) + ' sublinks of the same domain found on this page')
page_cpt=1
for subpage in sublinks:
    print('Treating page ',page_cpt,' out of ',len(sublinks),' please wait ...')
    p = requests.get(subpage)
    #clean the page
    
    #save it
    with open("pages/"+ folder_name +"/" + str(page_cpt)+ ".html",'w+', encoding='utf-8') as fp:
        fp.write(p.text)
    #only work on the 5 first pages
    if page_cpt == 5:
        break
    page_cpt+=1    
    time.sleep(0.5)