# In [None]:
import urllib.request
import re
import csv
import os
import datetime
from bs4 import BeautifulSoup


# URL of the XML sitemap that start() hands to crawl_sitemap().
sitemap = "http://localhost:8000/places/default/sitemap.xml"
# Path of the CSV file that start() opens in append mode.
csv_filename = "output.csv"


def download(url, user_agent='wswp', num_retries=2, charset='utf-8'):
    """Fetch *url* and return its body decoded to text, or None on failure.

    Args:
        url: Address to request.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many additional attempts to make when the server
            answers with a 5xx HTTP status. Other failures are not retried.
        charset: Encoding used to decode the body when the response's
            Content-Type header does not declare one.

    Returns:
        The decoded response body, or None if the request or the decoding
        failed. Every failure is reported on stdout.
    """
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as resp:
            cs = resp.headers.get_content_charset(failobj=charset)
            return resp.read().decode(cs)
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch whose contract is
        # "None on any failure". Not every exception carries ``.reason``
        # (e.g. LookupError for an unknown charset), so fall back to the
        # exception itself instead of failing inside the handler.
        print('Download error:', getattr(e, 'reason', e))

        # HTTP errors expose the status as ``.code``; 5xx responses are
        # server-side and may succeed on a later attempt.
        status = getattr(e, 'code', None)
        if num_retries > 0 and isinstance(status, int) and 500 <= status < 600:
            return download(url, user_agent, num_retries - 1, charset)

    return None


def crawl_sitemap(url):
    """Download every page listed in the sitemap at *url*.

    The sitemap is fetched with download() and each ``<loc>...</loc>``
    entry found in it is fetched in turn.

    Args:
        url: Address of the sitemap document.

    Returns:
        A list of ``{"link": ..., "html": ...}`` dicts in sitemap order,
        where ``html`` is whatever download() returned for that link (None
        when that download failed). The list is empty when the sitemap
        itself could not be downloaded.
    """
    sitemap_xml = download(url)
    if sitemap_xml is None:
        # download() has already reported the error; there is nothing to crawl.
        return []

    links = re.findall(r'<loc>(.*?)</loc>', sitemap_xml)
    return [{"link": link, "html": download(link)} for link in links]


def get_datas(html):
    """Extract the key/value rows of the first table in *html*.

    Every ``<tr>`` of the table that has at least two ``<td>`` cells yields
    one entry. The key is the text of the first cell with each ``": "``
    removed. The value is the ``src`` of the first ``<img>`` inside the
    second cell, or that cell's text when it holds no image.

    Args:
        html: Markup of a single page.

    Returns:
        A dict of the extracted pairs in row order. It is empty when *html*
        is empty or None, or when the markup contains no table.
    """
    infos = {}
    if not html:
        return infos

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.table
    if table is None:
        return infos

    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            # Rows without a label cell and a value cell (for instance rows
            # made only of <th>) carry no key/value pair.
            continue

        key = cells[0].text.replace(": ", "")
        img = cells[1].find('img')
        infos[key] = img['src'] if img is not None else cells[1].text

    return infos


def start():
    """Crawl the configured sitemap and append one CSV row per page.

    Pages come from crawl_sitemap(sitemap) and are parsed with get_datas().
    Each row holds the parsed fields plus ``ID`` (the page's position in the
    sitemap), ``link`` and ``timestamp`` (the time the row was built). Rows
    are appended to ``csv_filename``; the header row is written only when
    that file is missing or empty. The column layout is taken from the first
    successfully downloaded page.

    Nothing is written when no page could be downloaded.
    """
    # Keep each page's sitemap position as its ID, but drop pages whose
    # download failed (html is None) because they cannot be parsed.
    pages = [
        (i, page)
        for i, page in enumerate(crawl_sitemap(sitemap))
        if page['html'] is not None
    ]
    if not pages:
        print('No pages downloaded; nothing to write.')
        return

    # Work out the columns before touching the file so that a parsing error
    # does not leave an empty, header-less file behind.
    headers = ['ID', *get_datas(pages[0][1]['html']), 'link', 'timestamp']

    # An existing but empty file still needs its header row.
    needs_header = (
        not os.path.isfile(csv_filename)
        or os.path.getsize(csv_filename) == 0
    )

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for file objects passed to a writer.
    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            delimiter=',',
            lineterminator='\n',
            fieldnames=headers,
        )

        if needs_header:
            writer.writeheader()

        for i, page in pages:
            datas = get_datas(page['html'])
            datas['timestamp'] = datetime.datetime.now()
            datas['link'] = page['link']
            datas['ID'] = i

            writer.writerow(datas)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    start()
