I. Crawl image URLs from https://www.freeimages.com/


In [67]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import urllib.request

import time
from threading import Thread
import os


In [68]:
class GetImagesfromPages:
    """Collect thumbnail image URLs from consecutive result pages using threads.

    Page URLs are built by appending a page index to ``url_page``. Indices
    0 through ``n_page`` (inclusive) are requested. Calling the instance runs
    the crawl and returns the accumulated list of image URLs.
    """

    def __init__(self, n_threads, n_page, url_page):
        """
        n_threads: Number of worker threads (values below 1 are treated as 1)
        n_page: Index of the last result page to retrieve
        url_page: URL prefix of the results page; the page index is appended to it
        result_urls: A list that stores the received image URLs
        """
        self.n_threads = n_threads
        self.n_page = n_page
        self.url_page = url_page
        self.result_urls = []

    def is_valid(self, url):
        """Return True when ``url`` has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def get_all_images(self, url):
        """Return the absolute URLs of every ``<img class="grid-thumb">`` on ``url``.

        Query strings are removed from the image URLs and entries without a
        usable ``src`` are skipped. Network or parsing errors propagate to
        the caller.
        """
        # Close the HTTP response as soon as the page has been parsed.
        with urllib.request.urlopen(url) as response:
            soup = BeautifulSoup(response, "html.parser")

        urls = []
        for img in soup.find_all("img", {"class": "grid-thumb"}):
            img_url = img.attrs.get("src")
            if not img_url:
                continue
            # Resolve relative paths against the page, then drop "?..." suffixes.
            img_url = urljoin(url, img_url).split("?", 1)[0]
            if self.is_valid(img_url):
                urls.append(img_url)
        return urls

    def main(self, start, end):
        """Thread target: fetch pages ``start`` .. ``end - 1`` into ``result_urls``."""
        for i in range(start, end):
            page_url = self.url_page + str(i)
            try:
                self.result_urls.extend(self.get_all_images(page_url))
            except Exception as exc:
                # Best effort: one unreachable page must not abort the others,
                # but report it instead of hiding the failure.
                print(f"cannot read {page_url}: {exc}")

    def __call__(self):
        """Crawl all pages with the worker threads and return ``result_urls``."""
        # Page indices 0..n_page inclusive are requested.
        total_pages = self.n_page + 1
        # Never use more threads than pages, and always at least one.
        workers = max(1, min(self.n_threads, total_pages))
        size, extra = divmod(total_pages, workers)

        # Build exactly `workers` contiguous ranges covering every page, so
        # each created thread is also started and joined.
        threads = []
        first = 0
        for k in range(workers):
            last = first + size + (1 if k < extra else 0)
            threads.append(Thread(target=self.main, args=(first, last)))
            first = last

        start = time.time()
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        end = time.time()
        print(f"Time handle pages = {end - start:.2f}s")
        return self.result_urls



In [69]:
def urls_to_txts(topic_names, topics, urltopic, n_page, n_threads):
    """Crawl image URLs for every keyword and save them as text files.

    topic_names: Topic labels; each one names a ``data/<topic>/urls`` directory
    topics: Keyword lists, paired by position with ``topic_names``
    urltopic: Mapping of source name to a URL template containing ``{name}``
    n_page: Page count passed on to ``GetImagesfromPages``
    n_threads: Upper bound on the number of crawler threads

    For each keyword, one file ``data/<topic>/urls/<topic>_<keyword>.txt`` is
    written with one unique image URL per line.
    """
    # Use at most n_threads, but never zero workers (n_page // 2 is 0 for n_page < 2).
    workers = max(1, min(n_threads, n_page // 2))

    for topic, names in zip(topic_names, topics):
        dir_path_urls = f"data/{topic}/urls"
        os.makedirs(dir_path_urls, exist_ok=True)

        for name in names:
            result_of_name = []
            for template in urltopic.values():
                res = GetImagesfromPages(workers, n_page,
                                         template.format(name=name))()
                result_of_name.extend(res)

            # De-duplicate across all sources while keeping first-seen order,
            # so the written file is deterministic.
            result_of_name = list(dict.fromkeys(result_of_name))

            txt_path = f"{dir_path_urls}/{topic}_{name}.txt"
            print(f"{txt_path} have {len(result_of_name)} images \n")
            with open(txt_path, "w") as f:
                f.write('\n'.join(result_of_name))

In [70]:
# Search keywords, grouped by topic.
animal = ["horse", "pig", "Alligator", "bird"]
plant = ["apple", "carrot", "flower"]
furniture = ["table", "Piano", "Bookcase", "Umbrella", "book"]
scenery = ["fireworks", "sky", "cave", "cloud"]

# URL template per image source; "{name}" is filled in with a search keyword.
urltopic = {"freeimages": "https://www.freeimages.com/search/{name}/"}

# topic_names[i] is the label for the keyword list topics[i].
_keywords_by_topic = {
    "animal": animal,
    "plant": plant,
    "furniture": furniture,
    "scenery": scenery,
}
topic_names = list(_keywords_by_topic)
topics = list(_keywords_by_topic.values())

# Crawl settings handed to urls_to_txts.
n_threads, n_page = 3, 2

In [71]:
# Crawl every keyword and write its URL list under data/<topic>/urls/.
urls_to_txts(topic_names, topics, urltopic, n_page, n_threads)

Time handle pages = 0.42s
data/animal/urls/animal_horse.txt have 120 images 

Time handle pages = 0.28s
data/animal/urls/animal_pig.txt have 120 images 

Time handle pages = 0.35s
data/animal/urls/animal_Alligator.txt have 60 images 

Time handle pages = 0.36s
data/animal/urls/animal_bird.txt have 120 images 

Time handle pages = 0.28s
data/plant/urls/plant_apple.txt have 120 images 

Time handle pages = 0.35s
data/plant/urls/plant_carrot.txt have 120 images 

Time handle pages = 0.27s
data/plant/urls/plant_flower.txt have 119 images 

Time handle pages = 0.26s
data/furniture/urls/furniture_table.txt have 120 images 

Time handle pages = 0.43s
data/furniture/urls/furniture_Piano.txt have 60 images 

Time handle pages = 0.34s
data/furniture/urls/furniture_Bookcase.txt have 60 images 

Time handle pages = 0.36s
data/furniture/urls/furniture_Umbrella.txt have 60 images 

Time handle pages = 0.35s
data/furniture/urls/furniture_book.txt have 120 images 

Time handle pages = 0.28s
data/scene

II. Download images from the saved URL .txt files

In [72]:
import urllib.request
from threading import Thread
import time
import requests
import random
import os


class DownloadImagesFromUrls():
    """Download a list of image URLs into one folder using several threads."""

    def __init__(self, nThreads, urls, destinate_folder):
        """
        nThreads: Requested number of download threads (values below 1 are treated as 1)
        urls: Sequence of image URLs; surrounding whitespace/newlines are ignored
        destinate_folder: Directory the files are written to (it is not created here)
        """
        self.nThreads = nThreads
        self.urls = urls
        self.n = len(urls)
        self.destinate_folder = destinate_folder

    # Func target
    def download_url(self, start, end):
        """Thread target: download ``urls[start:end]`` into the destination folder.

        Each file is saved as ``<random float>.jpg``. A failed download is
        reported and does not stop the remaining ones.
        """
        for i in range(start, end):
            # Lines read from a text file still carry their trailing newline.
            url = self.urls[i].strip()
            if not url:
                continue
            target = f"{self.destinate_folder}/{random.random()}.jpg"
            try:
                urllib.request.urlretrieve(url, target)
            except Exception:
                print(f"cannot access {url}")
            print('.', end=" ")

    def __call__(self):
        """Split the URLs evenly over the worker threads and wait for all of them."""
        # Never use more threads than URLs, and always at least one.
        workers = max(1, min(self.nThreads, self.n))
        size, extra = divmod(self.n, workers)

        # Build exactly `workers` contiguous ranges covering every URL, so
        # each created thread is also started and joined.
        threads = []
        first = 0
        for k in range(workers):
            last = first + size + (1 if k < extra else 0)
            threads.append(Thread(target=self.download_url, args=(first, last)))
            first = last

        start = time.time()
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        end = time.time()

        print(f"\nTime handle download urls = {end - start:.2f}s\n", )


In [73]:
def get_image_from_txts(topic_names, topics, n_threads=10):
    """Download the images listed in every ``data/<topic>/urls/*.txt`` file.

    topic_names: Topic labels; each one names a ``data/<topic>/urls`` directory
    topics: Keyword lists paired by position with ``topic_names``; only used
        to limit how many topics are processed, the keywords are not read
    n_threads: Upper bound on download threads per text file

    Images from ``<file>.txt`` are stored in ``images/<file>/``. Topics whose
    urls directory does not exist are reported and skipped.
    """
    dir_path_images = "images"

    for topic, _names in zip(topic_names, topics):
        os.makedirs(dir_path_images, exist_ok=True)

        dir_path_urls = f"data/{topic}/urls"
        if not os.path.isdir(dir_path_urls):
            print(f"{dir_path_urls} not found, skipping")
            continue

        # Sorted so the processing order does not depend on the filesystem.
        txts = sorted(name for name in os.listdir(dir_path_urls) if name.endswith(".txt"))

        for txt in txts:
            with open(f"{dir_path_urls}/{txt}", "r") as f:
                # Drop trailing newlines and blank lines so only real URLs remain.
                urls = [line.strip() for line in f if line.strip()]

            folder_image = f"{dir_path_images}/{os.path.splitext(txt)[0]}"
            os.makedirs(folder_image, exist_ok=True)
            print(folder_image)

            if not urls:
                continue

            # At most n_threads, but never zero workers for very short lists.
            workers = max(1, min(n_threads, len(urls) // 2))
            DownloadImagesFromUrls(workers, urls, folder_image)()

In [74]:
# Search keywords, grouped by topic (repeated here so this section defines its own inputs).
animal = ["horse", "pig", "Alligator", "bird"]
plant = ["apple", "carrot", "flower"]
furniture = ["table", "Piano", "Bookcase", "Umbrella", "book"]
scenery = ["fireworks", "sky", "cave", "cloud"]

# URL template per image source; not read by the download step below.
urltopic = {"freeimages": "https://www.freeimages.com/search/{name}/"}

# topic_names[i] is the label for the keyword list topics[i].
_keywords_by_topic = {
    "animal": animal,
    "plant": plant,
    "furniture": furniture,
    "scenery": scenery,
}
topic_names = list(_keywords_by_topic)
topics = list(_keywords_by_topic.values())

# Download every image listed in the data/<topic>/urls/*.txt files.
get_image_from_txts(topic_names, topics)

images/animal_Alligator
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
Time handle download urls = 16.56s

images/animal_bird
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
Time handle download urls = 23.21s

images/animal_horse
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
Time handle download urls = 24.30s

images/animal_pig
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 