In [94]:
import os
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import json

In [118]:
# Root logger setup: timestamp, level name and message, INFO and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

class Crawler:
    """Scrape one chart page per year and dump the collected charts to JSON.

    The first seed URL is expanded into one URL per year in ``YEARS``
    (``<seed><year>/``). For each page, the ``<li>`` items inside the element
    with class ``lista`` are turned into ``rank``/``name``/``author`` records,
    and the accumulated per-year results are written below ``out/``.
    """

    # Years appended to the seed URL; one page is crawled per year.
    YEARS = range(2021, 2023)
    # Seconds to wait on the server; without a timeout a request can hang.
    REQUEST_TIMEOUT = 30
    # Text that separates the song name from the author in a list item.
    ENTRY_SEPARATOR = ' – '

    def __init__(self, urls=None):
        """Initialise crawl state and expand the seed URL into yearly URLs.

        Args:
            urls: List whose first item is the seed (base) URL. The list is
                copied, so the caller's list is never modified. ``None`` or
                an empty list gives a crawler with nothing to visit.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/109.0',
        }
        self.decade = 50
        self.visited_urls = []
        # Copy: the queue is mutated with pop/append and must not alias the
        # caller's list (or a shared default argument).
        self.urls_to_visit = list(urls) if urls else []
        self.current_year = 0
        self.music_list = []
        self.year_list = []
        self.top_music_by_year = []
        self.add_url_to_visit()

    def add_url_to_visit(self):
        """Replace the first queued URL with one URL per year in ``YEARS``.

        Also fills ``year_list`` with the same years and moves the first of
        them into ``current_year``. Does nothing when the queue is empty.
        """
        if not self.urls_to_visit:
            return
        base_url = self.urls_to_visit.pop(0)
        for year in self.YEARS:
            self.year_list.append(year)
            self.urls_to_visit.append(f'{base_url}{year}/')
        self.current_year = self.year_list.pop(0)

    def download_url(self, url):
        """Fetch ``url`` with the crawler's headers and return the response."""
        return requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

    def get_linked_urls(self, url, html):
        """Yield the ``href`` of every ``<a>`` element found in ``html``.

        Values starting with ``/`` are resolved against ``url``; every other
        value is yielded unchanged, including ``None`` for anchors that have
        no ``href`` attribute.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def parse_list(self, parser, text):
        """Append one record per ``<li>`` of the ``lista`` element.

        Args:
            parser: Parsed document exposing ``find`` / ``find_all``.
            text: Raw page text; unused, kept for interface compatibility.

        Raises:
            ValueError: If the document has no element with class ``lista``.
        """
        container = parser.find(class_='lista')
        if container is None:
            raise ValueError("no element with class 'lista' found in page")

        # Collect locally and publish at the end, so that a failure half-way
        # through never leaves a partial chart in ``self.music_list``.
        entries = []
        for rank, item in enumerate(container.find_all('li'), start=1):
            parts = item.text.split(self.ENTRY_SEPARATOR)
            if len(parts) < 2:
                logging.warning(
                    'Entry %d has no author separator: %r', rank, item.text)
            entries.append(
                dict(rank=str(rank),
                     name=parts[0],
                     author=parts[1] if len(parts) > 1 else ''))
        self.music_list.extend(entries)

    def assembly_and_release(self):
        """Store the parsed chart under the current year and dump if due.

        The chart in ``music_list`` is appended to ``top_music_by_year``
        together with ``current_year``; ``music_list`` is then reset and the
        next year (if any) becomes current. A dump to
        ``out/<decade>.json`` happens when the stored year is a multiple of
        ten, and a dump to ``out/20.json`` happens whenever no further years
        are queued. Results are never cleared, so each file holds every year
        collected so far.
        """
        top_music = {
            'year': self.current_year,
            'music_list': self.music_list,
        }
        stored_year = self.current_year
        if self.year_list:
            self.current_year = self.year_list.pop(0)
        self.top_music_by_year.append(top_music)
        self.music_list = []
        if stored_year % 10 == 0:
            self.dump(f'out/{self.decade}.json')
        if not self.year_list:
            self.dump('out/20.json')

    def dump(self, filename):
        """Write ``top_music_by_year`` to ``filename`` as JSON.

        Before writing, ``decade`` is set to the two-digit decade of
        ``current_year`` (e.g. 2022 -> 20). Missing parent directories of
        ``filename`` are created.
        """
        self.decade = (self.current_year % 100) // 10 * 10
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as outfile:
            json.dump(self.top_music_by_year, outfile)

    def crawl(self, url):
        """Download and parse ``url``, then store the result for its year.

        If fetching or parsing fails, any partial chart is dropped and the
        year cursor still moves on, so that the following pages are not
        stored under the failed page's year. The error is re-raised.
        """
        try:
            response = self.download_url(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            self.parse_list(soup, response.text)
        except Exception:
            self.music_list = []
            if self.year_list:
                self.current_year = self.year_list.pop(0)
            raise
        self.assembly_and_release()

    def run(self):
        """Crawl every queued URL in order, logging failures and continuing."""
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info('Crawling: %s', url)
            try:
                self.crawl(url)
            except Exception:
                # Top-level boundary: one bad page must not stop the crawl.
                logging.exception('Failed to crawl: %s', url)
            finally:
                self.visited_urls.append(url)
if __name__ == '__main__':
    # Script entry point: build a crawler from the single seed URL and run it.
    crawler = Crawler(urls=['https://maistocadas.mus.br/'])
    crawler.run()

2023-02-28 13:43:37,843 INFO:Crawling: https://maistocadas.mus.br/2021/
2023-02-28 13:43:38,330 INFO:Crawling: https://maistocadas.mus.br/2022/


In [110]:
# Scratch cell: remainder of 23 divided by 10 (the cell output below shows 3).
23 % 10

3