In [4]:
import json
import math
import os
import random
import re
import statistics
import time
import urllib
import urllib.request

import PIL
import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

In [128]:
class BirdCrawler:
    """Crawl bird photos or sound recordings from the WikiAves species index.

    Attributes:
        path: directory where crawled media and the link files are stored.
            An empty string means the current working directory.
        initial_link: WikiAves index URL loaded first (the photo or the sound
            variant, depending on ``photo``).
        bird_link_list_photo: list of photo-gallery links, one per species.
        bird_link_list_sound: list of audio-gallery links, one per species.
        species_list: list of all species names found on the index page.
        num_photo: list with the number of photos per species (link text).
        num_sound: list with the number of audio files per species (link text).
        browser: Selenium driver, created by ``connect_to_internet``.
        soup: parsed index page, set by ``get_information``.
        photo: True to crawl photos, False to crawl sounds.
    """

    BASE_URL = "https://www.wikiaves.com.br/"
    # Index, inside the absolute media URL, of the character that separates
    # photo links ("f") from sound links ("s"); it is the value of the "tm"
    # query parameter in URLs such as ".../midias.php?tm=s&t=s&s=11291".
    MEDIA_KIND_INDEX = 42
    IMAGE_CLASS = "img-responsive"
    SOUND_CLASS = 'mejs-container svg wikiaves-player progression-single progression-skin progression-minimal-dark progression-audio-player mejs-audio'
    # Upper bound on scroll attempts while waiting for a gallery to load.
    MAX_SCROLL_STEPS = 1000000
    # Number of pixels the page is scrolled further down on every attempt.
    SCROLL_STEP_PX = 2000

    def __init__(self, store_path = '',
                 initial_link_photo = "https://www.wikiaves.com.br/especies.php?t=t&o=5",
                 initial_link_sound = "https://www.wikiaves.com.br/especies.php?t=t&o=4",
                 photo = True):
        """Store the configuration; no network or file access happens here.

        Args:
            store_path: directory where media and link files are written.
            initial_link_photo: index URL used when ``photo`` is true.
            initial_link_sound: index URL used when ``photo`` is false.
            photo: crawl photos when true, sounds otherwise.
        """
        self.bird_link_list_photo = []
        self.bird_link_list_sound = []
        self.species_list = []
        self.path = store_path
        self.initial_link = initial_link_photo if photo else initial_link_sound
        self.num_photo = []
        self.num_sound = []
        self.browser = None
        self.soup = None
        self.photo = photo

    def _path(self, *parts):
        """Return ``parts`` joined onto the storage directory."""
        return os.path.join(self.path, *parts)

    def connect_to_internet(self):
        """Start a headless Firefox driver and keep it in ``self.browser``."""
        # NOTE(review): the ``firefox_profile`` keyword is reported as
        # deprecated in recent Selenium releases; confirm against the
        # installed version before upgrading.
        firefox_profile = webdriver.FirefoxProfile()
        options = Options()
        options.add_argument('--headless')
        self.browser = webdriver.Firefox(firefox_profile = firefox_profile, options = options)

    def get_list_link_num(self):
        """Collect the photo/sound gallery links and their counts from ``self.soup``."""
        for especie in self.soup.find_all(class_="font-blue"):
            href = especie.get('href')
            if href is None:
                continue
            link = self.BASE_URL + href
            # Links too short to carry a media-kind marker are not gallery links.
            if len(link) <= self.MEDIA_KIND_INDEX:
                continue
            kind = link[self.MEDIA_KIND_INDEX]
            if kind == "f":
                self.bird_link_list_photo.append(link)
                self.num_photo.append(especie.text)
            elif kind == "s":
                self.bird_link_list_sound.append(link)
                self.num_sound.append(especie.text)

    def get_species(self):
        """Append every not-yet-seen species name found in ``self.soup``."""
        for especie in self.soup.find_all(class_="font-green-dark"):
            if especie.text not in self.species_list:
                self.species_list.append(especie.text)

    def get_information(self):
        """Load the index page and extract links, counts and species names."""
        self.browser.get(self.initial_link)
        html = self.browser.page_source
        self.soup = BeautifulSoup(html, "html.parser")
        self.get_list_link_num()
        self.get_species()

    def create_dir(self):
        """Create ``images/<species>`` and ``sounds/<species>`` under the storage path.

        Directories that already exist are left alone, so the method can be
        called again on later runs.

        Raises:
            OSError: if a directory cannot be created.
        """
        for media_dir in ("images", "sounds"):
            os.makedirs(self._path(media_dir), exist_ok=True)
            for especie in self.species_list:
                os.makedirs(self._path(media_dir, especie), exist_ok=True)

    def export_links_to_txt(self):
        """Write the photo and sound links, one per line, into the storage path."""
        if self.path:
            os.makedirs(self.path, exist_ok=True)
        exports = (("links_photo.txt", self.bird_link_list_photo),
                   ("links_sound.txt", self.bird_link_list_sound))
        for file_name, links in exports:
            # Both files live under self.path, where import_links_from_txt
            # looks for them.
            with open(self._path(file_name), "w") as handle:
                for link in links:
                    handle.write(link + "\n")

    def import_links_from_txt(self):
        """Read back the link files written by ``export_links_to_txt``.

        Returns:
            A tuple ``(links_photo, links_sound)`` of lists of URLs.

        Raises:
            OSError: if one of the link files cannot be read.
        """
        loaded = []
        for file_name in ("links_photo.txt", "links_sound.txt"):
            with open(self._path(file_name), "r") as handle:
                loaded.append([line for line in handle.read().split("\n") if line])
        return loaded[0], loaded[1]

    def _find_by_class(self, browser, css_class):
        """Parse the browser's current page and return the elements with ``css_class``."""
        page = BeautifulSoup(browser.page_source, 'html.parser')
        return page.find_all(class_ = css_class)

    def _crawl_one_link(self, especie, list_links, counts, css_class, refresh_every, save):
        """Open the gallery of each matching species, scroll until loaded, then save.

        Args:
            especie: requested name; a species matches when its name is
                contained in this string.
            list_links: gallery links, indexed like ``self.species_list``.
            counts: expected number of media items, indexed the same way.
            css_class: class attribute identifying one media element.
            refresh_every: re-parse the page every this many scroll steps.
            save: callable ``save(browser, especie)`` that stores the media.
        """
        for k, nome in enumerate(self.species_list):
            if nome not in especie:
                continue
            if k >= len(list_links) or k >= len(counts):
                print("No link or count available for {}".format(nome))
                continue
            print(list_links[k])
            self.browser.get(list_links[k])
            expected = int(counts[k])
            found = 0
            for j in range(self.MAX_SCROLL_STEPS):
                # Parsing the page is expensive, so only do it periodically.
                if j % refresh_every == 0:
                    found = len(self._find_by_class(self.browser, css_class))
                if found >= expected:
                    break
                if j % 100 == 0:
                    print(j)
                # Scrolling further down makes the gallery load more items.
                self.browser.execute_script("window.scrollTo(0, {})".format(self.SCROLL_STEP_PX * (j + 1)))
            save(self.browser, especie)

    def crawl_one_photo_link(self, especie):
        """Crawl and save the photos of every species whose name is contained in ``especie``."""
        list_links = self.import_links_from_txt()[0]
        self._crawl_one_link(especie, list_links, self.num_photo,
                             self.IMAGE_CLASS, 10, self.save_images)

    def crawl_one_audio_link(self, especie):
        """Crawl and save the sounds of every species whose name is contained in ``especie``."""
        list_links = self.import_links_from_txt()[1]
        self._crawl_one_link(especie, list_links, self.num_sound,
                             self.SOUND_CLASS, 100, self.save_sounds)

    def _download_all(self, elements, media_dir, especie, extension):
        """Download the ``src`` of every element into ``<path>/<media_dir>/<especie>/``.

        Files are named ``<especie><index>.<extension>``, where ``index`` is
        the position of the element on the page. Elements without a usable
        ``src`` or whose download fails are skipped, and a single summary
        line reports how many were skipped.
        """
        target_dir = self._path(media_dir, especie)
        # The requested name may differ from the species directory created by
        # create_dir (matching is by substring), so make sure it exists.
        os.makedirs(target_dir, exist_ok=True)
        skipped = 0
        for index, element in enumerate(elements):
            destination = os.path.join(target_dir, "{}{}.{}".format(especie, index, extension))
            try:
                urllib.request.urlretrieve(element['src'], destination)
            except (KeyError, ValueError, OSError):
                # KeyError: no src attribute; ValueError: malformed URL;
                # OSError: network or file-system failure (includes URLError).
                skipped += 1
        if skipped:
            print("Skipped {} of {} {} for {}".format(skipped, len(elements), media_dir, especie))

    def save_sounds(self, browser, especie):
        """Save every sound element of the browser's current page as an mp3 file."""
        sounds = self._find_by_class(browser, self.SOUND_CLASS)
        self.soup = None  # release the parsed index page
        self._download_all(sounds, "sounds", especie, "mp3")

    def save_images(self, browser, especie):
        """Save every image element of the browser's current page as a jpg file."""
        imagens = self._find_by_class(browser, self.IMAGE_CLASS)
        self.soup = None  # release the parsed index page
        self._download_all(imagens, "images", especie, "jpg")

    def crawl_lots_of_photo_links(self, especies):
        """Crawl the photos of each name in ``especies``."""
        for especie in especies:
            self.crawl_one_photo_link(especie)

    def crawl_lots_of_sound_links(self, especies):
        """Crawl the sounds of each name in ``especies``."""
        for especie in especies:
            self.crawl_one_audio_link(especie)

    def main(self, especies):
        """Run the whole pipeline: connect, index, create directories, crawl.

        Args:
            especies: iterable of species names to crawl.

        The browser is shut down even when a step fails.
        """
        print("Starting program...")
        self.connect_to_internet()
        print("Connected to internet!")
        try:
            self.get_information()
            print("All information was collected!")
            try:
                self.create_dir()
            except OSError as error:
                # Best effort: crawling creates the directories it needs.
                print("Could not create all directories: {}".format(error))
            else:
                print("All directories was created")
            self.export_links_to_txt()
            print("Exported links to txt!")
            if self.photo:
                self.crawl_lots_of_photo_links(especies)
                print("All photos was crawled")
            else:
                self.crawl_lots_of_sound_links(especies)
                print("All sounds was crawled")
        finally:
            # quit() also ends the driver process, not just the window.
            self.browser.quit()
    

In [129]:
# Build a crawler in sound mode (photo=False).
# NOTE: the storage path is an absolute, machine-specific location; adjust it
# before running this notebook elsewhere.
classe = BirdCrawler(
    store_path="/home/aninha/Documents/Birds_Project",
    photo=False,
)

In [130]:
# Run the full pipeline for the species whose name is contained in "colegial".
classe.main(especies=["colegial"])

Starting program...
Connected to internet!
All information was collected!
All directories was created
Exported links to txt!
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=11291
All sounds was crawled
