In [1]:
from tqdm import tqdm
import time
import os
import hashlib
from selenium import webdriver
import selenium
# import imutils
import urllib
from IPython.display import Javascript
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import BytesIO
import PIL.Image
import ipywidgets as widgets  # Using the ipython notebook widgets
import IPython.display
from IPython.display import clear_output
import cv2
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.widgets import RectangleSelector
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook


def imdisplay(img, fmt='jpeg', width=500):
    """Encode a BGR OpenCV image and wrap it in an ``IPython.display.Image``.

    Args:
        img: Image array in OpenCV's BGR channel order.
        fmt: Format name handed to ``PIL.Image.Image.save``.
        width: Display width forwarded to ``IPython.display.Image``.

    Returns:
        An ``IPython.display.Image`` holding the encoded bytes.
    """
    rgb_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    picture = PIL.Image.fromarray(rgb_pixels)
    # Force plain RGB mode before encoding.
    picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
    buffer = BytesIO()
    picture.save(buffer, fmt)
    return IPython.display.Image(data=buffer.getvalue(), width=width)


def preventScrolling():
    """Inject JavaScript that makes notebook output areas report "never scroll"."""
    never_scroll_js = """
    IPython.OutputArea.prototype._should_scroll = function(lines) {
        return false;
    }
    """
    snippet = Javascript(never_scroll_js)
    display(snippet)


def imwidget(img, fmt='jpeg', width=500):
    """Encode a BGR OpenCV image and wrap it in an ``ipywidgets.Image``.

    Args:
        img: Image array in OpenCV's BGR channel order.
        fmt: Format name used both for encoding and as the widget's format.
        width: Width forwarded to the widget.

    Returns:
        A ``widgets.Image`` holding the encoded bytes.
    """
    rgb_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    picture = PIL.Image.fromarray(rgb_pixels)
    # Force plain RGB mode before encoding.
    picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
    # Encode into an in-memory byte stream instead of a file on disk.
    buffer = BytesIO()
    picture.save(buffer, fmt)
    encoded = buffer.getvalue()
    return widgets.Image(value=encoded, format=fmt, width=width)


def implot(img):
    """Show a BGR OpenCV image on the current matplotlib axes."""
    rgb_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_pixels)
    plt.show()


def url_to_image(url):
    """Download an image and decode it with OpenCV.

    Args:
        url: Address of the image to fetch.

    Returns:
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` on the
        downloaded bytes.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a bad URL or a
        failed request (e.g. ``ValueError``, ``urllib.error.HTTPError``).
    """
    # The file only does `import urllib`, which does not by itself load the
    # `request` submodule; import it explicitly so this function is
    # self-sufficient.
    import urllib.request

    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as resp:
        raw_bytes = resp.read()

    # View the payload as a flat uint8 buffer, the input cv2.imdecode expects.
    encoded = np.frombuffer(raw_bytes, dtype=np.uint8)
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)


def createHTMLListBox(data):
    """Render bounding boxes as an HTML list without bullet markers.

    Args:
        data: Iterable of 4-item sequences ``(x1, y1, x2, y2)``; every
            coordinate is truncated with ``int`` before being printed.

    Returns:
        A ``<ul>`` string with one ``<li>`` per box, formatted as
        ``(x1, y1), (x2, y2)``.
    """
    items = [
        f'''<li>({int(left)}, {int(top)}), ({int(right)}, {int(bottom)})</li>'''
        for left, top, right, bottom in data
    ]
    return '''<ul style="list-style-type:none;">''' + "".join(items) + "</ul>"


def popDest(event):
    """Undo the most recently added bounding box.

    Removes the last entry of ``dest_pts`` and the last patch drawn on
    ``img_dest``, then refreshes the coordinate list shown in ``dest_txt``.
    Does nothing but refresh the list when there is no box to remove.

    Args:
        event: Click argument supplied by the button; unused.
    """
    if dest_pts:
        dest_pts.pop()
        # Explicit guards replace the former bare `except: pass`, which hid
        # every error (including real bugs) raised in this handler.
        if img_dest is not None and len(img_dest.patches) > 0:
            img_dest.patches[-1].remove()
            # Request a redraw so the removal shows up on the canvas.
            img_dest.figure.canvas.draw_idle()

    dest_txt.value = createHTMLListBox(dest_pts)


def add_to_database(event):
    """Store the boxes of the current picture and advance to the next one.

    Appends a row ``{'url': <current url>, 'bbox': <current boxes>}`` to the
    global ``df``, writes ``df`` to ``peak_labels.csv``, loads the next
    picture and clears the drawn boxes and the coordinate list.

    Args:
        event: Click argument supplied by the button; unused.
    """
    global df, dest_pts, mountain_pic_srcs, src_index, img_dest
    if src_index >= len(mountain_pic_srcs):
        # Past the last picture: there is nothing to label.
        return

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement and also works on older releases.
    new_row = pd.DataFrame(
        [{'url': mountain_pic_srcs[src_index], 'bbox': dest_pts}])
    df = pd.concat([df, new_row], ignore_index=True)
    df.to_csv("peak_labels.csv")
    img_dest.figure.canvas.flush_events()
    update_UI_image()

    dest_pts = []
    # Remove the drawn boxes one by one: slice-assigning to `Axes.patches`
    # is not supported by recent matplotlib, and the resulting error used
    # to be swallowed by a bare `except`, leaving stale boxes on screen.
    # Index 0 is left in place, mirroring skip(); presumably it is the
    # RectangleSelector's own rectangle -- TODO confirm.
    for box in list(img_dest.patches)[1:]:
        box.remove()

    dest_txt.value = createHTMLListBox(dest_pts)


def skip(event=None):
    """Drop the current picture from the dataset and move to the next one.

    Removes the URL at ``src_index`` from ``mountain_pic_srcs``, rewrites
    ``mountain_url_outputs.csv`` with the remaining URLs, loads the next
    picture and clears the drawn boxes and the coordinate list.

    Args:
        event: Click argument supplied by the button; unused. Defaults to
            ``None`` so the function can also be called directly.
    """
    global dest_pts, img_dest, mountain_pic_srcs, src_index
    try:
        mountain_pic_srcs.pop(src_index)
    except IndexError:
        # No picture at the current position (e.g. already past the end).
        return
    src_index -= 1
    counter_string = f"""<h1>Total Pictures in Dataset: {len(mountain_pic_srcs)}___Current Photo Index: {src_index}</h1>"""
    url_counter.update(IPython.display.HTML(counter_string))
    with open("mountain_url_outputs.csv", "w") as output_writer:
        # Exactly one newline per entry, whether or not the stored URL
        # already ends with one; otherwise URLs would run together.
        output_writer.writelines(
            url.rstrip("\n") + "\n" for url in mountain_pic_srcs)
    update_UI_image()
    dest_pts = []
    # Keep patch 0 (as the original slice `[1:]` did) and remove the rest
    # individually; slice assignment on `Axes.patches` is not supported by
    # recent matplotlib releases.
    for box in list(img_dest.patches)[1:]:
        box.remove()
    dest_txt.value = createHTMLListBox(dest_pts)


def save_database(event):
    """Write the label table and the URL list to disk.

    Saves the global ``df`` to ``peak_labels.csv`` and the entries of
    ``mountain_pic_srcs`` to ``mountain_url_outputs.csv``, one per line.

    Args:
        event: Click argument supplied by the button; unused.
    """
    df.to_csv("peak_labels.csv")
    with open("mountain_url_outputs.csv", "w") as output_writer:
        # Exactly one newline per entry, whether or not the stored URL
        # already ends with one; otherwise URLs would run together.
        output_writer.writelines(
            url.rstrip("\n") + "\n" for url in mountain_pic_srcs)


def add_bounding_box(event):
    """Turn the last selected rectangle into a stored, drawn bounding box.

    Reads ``last_coords`` (``[x1, y1, x2, y2]`` as written by
    ``line_select_callback``), appends it to ``dest_pts``, draws a
    translucent red rectangle on ``img_dest`` and refreshes ``dest_txt``.
    Does nothing when no rectangle has been selected yet.

    Args:
        event: Click argument supplied by the button; unused.
    """
    if len(last_coords) != 4:
        # No selection so far (last_coords starts out empty); indexing it
        # would raise IndexError.
        return

    x1, y1, x2, y2 = (float(value) for value in last_coords)
    outline = plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                            fill=True, color='r', alpha=0.5)
    # Store a copy so the saved box does not alias the live selection list.
    dest_pts.append(list(last_coords))
    dest_txt.value = createHTMLListBox(dest_pts)
    img_dest.add_patch(outline)
    img_dest.figure.canvas.draw()


def setupUI(fig):
    """Connect buttons and keyboard events to their handlers.

    Args:
        fig: Figure whose canvas receives the ``key_press_event`` hook.
    """
    click_handlers = (
        (add_dest_btn, add_bounding_box),
        (pop_dest_btn, popDest),
        (confirm_btn, add_to_database),
        (save_btn, save_database),
        (skip_btn, skip),
    )
    for button, handler in click_handlers:
        button.on_click(handler)

    fig.canvas.mpl_connect('key_press_event', toggle_selector)
    preventScrolling()


def line_select_callback(eclick, erelease):
    """Record the corners of the rectangle that was just drawn.

    Args:
        eclick: Press event; its ``xdata``/``ydata`` give the first corner.
        erelease: Release event; its ``xdata``/``ydata`` give the second.

    Stores ``[x1, y1, x2, y2]`` (each truncated with ``int``) in the global
    ``last_coords``.
    """
    global last_coords
    corners = (eclick.xdata, eclick.ydata, erelease.xdata, erelease.ydata)
    last_coords = [int(value) for value in corners]


def toggle_selector(event):
    """Switch the rectangle selector off with Q/q and back on with A/a.

    The selector is read from the ``toggle_selector.RS`` function attribute.

    Args:
        event: Key event; only its ``key`` attribute is inspected.
    """
    print(' Key pressed.')
    transitions = (
        (('Q', 'q'), False, ' RectangleSelector deactivated.'),
        (('A', 'a'), True, ' RectangleSelector activated.'),
    )
    for keys, target_state, message in transitions:
        # Only act when the key matches and the selector is not already in
        # the requested state.
        if event.key in keys and bool(toggle_selector.RS.active) is not target_state:
            print(message)
            toggle_selector.RS.set_active(target_state)


def init_UI():
    """Create the labelling figure, its rectangle selector and the UI hooks.

    Creates a figure titled "Mountain Peak Labelling" with one axes (stored
    in the global ``img_dest``), attaches a ``RectangleSelector`` that
    reports selections to ``line_select_callback`` (kept on
    ``toggle_selector.RS``), and calls ``setupUI``.
    """
    global img_dest
    fig = plt.figure("Mountain Peak Labelling", constrained_layout=True)
    spec = gridspec.GridSpec(ncols=1, nrows=1, figure=fig)
    img_dest = fig.add_subplot(spec[0, 0])
    # `drawtype` is deliberately not passed: 'box' was its default value and
    # the parameter no longer exists in recent matplotlib releases, so
    # omitting it gives the same selector on old and new versions.
    toggle_selector.RS = RectangleSelector(img_dest, line_select_callback,
                                           useblit=True,
                                           # don't use middle button
                                           button=[1, 3],
                                           minspanx=5, minspany=5,
                                           spancoords='pixels',
                                           interactive=True)
    # NOTE(review): set after the figure above already exists, so this
    # presumably only affects figures created later -- confirm intent.
    plt.rcParams['figure.figsize'] = [6, 3]
    setupUI(fig)


def update_UI_image():
    """Advance to the next usable picture and show it on ``img_dest``.

    Resets ``dest_pts`` and ``last_coords``, increments ``src_index`` and
    refreshes the counter for every URL tried. URLs that cannot be
    downloaded or decoded are passed over. A picture whose height and
    width are both below 244 pixels is handed to ``skip()``, which removes
    it and loads the following one. Returns quietly once ``src_index`` runs
    past the end of ``mountain_pic_srcs``.
    """
    global img_dest, dest_pts, last_coords, mountain_pic_srcs, src_index
    # A loop (instead of the former self-recursion) keeps a long run of
    # broken URLs from exhausting the recursion limit.
    while True:
        dest_pts = []
        last_coords = []
        src_index += 1
        counter_string = f"""<h1>Total Pictures in Dataset: {len(mountain_pic_srcs)}___Current Photo Index: {src_index}</h1>"""
        url_counter.update(IPython.display.HTML(counter_string))
        try:
            img = url_to_image(mountain_pic_srcs[src_index])
        except IndexError:
            # No more pictures.
            return
        except (ValueError, OSError):
            # Malformed URL or failed download; OSError also covers
            # urllib's HTTPError/URLError. Try the next URL.
            continue
        if img is None:
            # Presumably the payload was not a decodable image -- move on.
            continue
        break

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if img.shape[0] < 244 and img.shape[1] < 244:
        skip()
        # skip() has already loaded the next picture; returning here keeps
        # the rejected small image from being drawn on top of it.
        return
    img_dest.imshow(img)
    plt.show()


def save_image(folder_path, url):
    """Download ``url`` and store it as a JPEG inside ``folder_path``.

    The file name is the first 10 hex digits of the SHA-1 of the downloaded
    bytes plus ``.jpg``. Failures are reported with ``print`` and never
    raised.

    Args:
        folder_path: Directory the image is written to.
        url: Address of the image to download.
    """
    try:
        image_content = requests.get(url).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Without the payload there is nothing to save; previously the code
        # fell through and failed on the unbound `image_content`.
        return

    try:
        # `BytesIO` and `PIL.Image` are the names this file imports; the
        # former `io.BytesIO` / `Image.open` referred to undefined names.
        image_file = BytesIO(image_content)
        image = PIL.Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, hashlib.sha1(
            image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")


def fetch_image_urls(query, max_links_to_fetch, wd: "webdriver.Remote", sleep_between_interactions, only_fullsize=False):
    """Collect image URLs from a Google Images search.

    Loads the search page for ``query``, clicks each thumbnail
    (``img.Q4LuWd``) and gathers the ``src`` of the enlarged images
    (``img.n3VNCb``) until ``max_links_to_fetch`` URLs are known or no new
    thumbnails appear.

    Args:
        query: Search term inserted into the Google Images URL.
        max_links_to_fetch: Stop once at least this many URLs are collected.
        wd: Selenium WebDriver instance used to drive the browser.
        sleep_between_interactions: Seconds to wait after scrolling and
            after each thumbnail click.
        only_fullsize: Keep only URLs containing ``.jpg`` or ``.png``.

    Returns:
        A set of image URLs; it may hold fewer than ``max_links_to_fetch``
        entries when the search runs out of results.
    """
    # Locator strategy string understood by `find_elements(by, value)`;
    # the `find_elements_by_css_selector` helper is gone from current
    # Selenium releases.
    css = "css selector"

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:

        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements(css, "img.Q4LuWd")
        number_results = len(thumbnail_results)

        if number_results <= results_start:
            # Scrolling / "load more" produced nothing new: stop instead of
            # looping forever, and hand back what was collected.
            print(
                f"Found: {len(image_urls)} image links, no further search results available.")
            break

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                print("Couldn't click on thumbnail")
                continue

            # extract image urls
            for actual_image in wd.find_elements(css, 'img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)
            if only_fullsize:
                image_urls = {
                    x for x in image_urls if ".jpg" in x or ".png" in x}
            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Every new thumbnail was visited without reaching the target.
            # The former `time.sleep(30); return` here returned None, threw
            # away the collected URLs and made the code below unreachable.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            if wd.find_elements(css, ".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = number_results

    return image_urls


def search_and_download(search_term, driver_path, output_folder=None, number_images=5, only_fullsize=False):
    """Search Google Images for ``search_term`` and optionally download hits.

    Args:
        search_term: Query passed to ``fetch_image_urls``.
        driver_path: ChromeDriver executable; only used when
            ``output_folder`` is given (otherwise ``webdriver.Chrome()`` is
            started with its defaults, as before).
        output_folder: When not ``None``, every found image is saved under
            ``<output_folder>/<search term, lower-cased, spaces -> '_'>``.
        number_images: Number of image URLs to aim for.
        only_fullsize: Forwarded to ``fetch_image_urls``.

    Returns:
        A list of the image URLs that were found.
    """
    if output_folder is None:
        with webdriver.Chrome() as wd:
            res = fetch_image_urls(search_term, number_images, wd=wd,
                                   sleep_between_interactions=3.5, only_fullsize=only_fullsize)
        # Tolerate a None result so an early exit of the fetcher does not
        # raise TypeError here.
        return list(res or [])

    target_folder = os.path.join(
        output_folder, '_'.join(search_term.lower().split(' ')))
    os.makedirs(target_folder, exist_ok=True)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        # `only_fullsize` used to be dropped silently in this branch.
        res = fetch_image_urls(search_term, number_images, wd=wd,
                               sleep_between_interactions=3.5, only_fullsize=only_fullsize)

    found_urls = list(res or [])
    for elem in found_urls:
        save_image(target_folder, elem)
    return found_urls

def clean_dataframe(current_urls: set):
    """Re-queue pictures that were stored without any bounding box.

    Rows of the global ``df`` whose ``bbox`` entry is empty are removed from
    ``df`` and their ``url`` is added to ``current_urls``.

    NOTE(review): the original looped over ``df.iteritems()`` (columns, not
    rows; removed in pandas 2.0) and tested ``len(row) <= 3``. That is
    interpreted here as "the stored bbox text is at most 3 characters long",
    i.e. ``[]`` after a CSV round trip -- confirm this matches the intent.

    Args:
        current_urls: Set of URLs still to be labelled; updated in place.

    Returns:
        The same ``current_urls`` set.
    """
    global df
    if df.empty or 'url' not in df.columns or 'bbox' not in df.columns:
        # Nothing to clean; the old code raised KeyError on an empty frame.
        return current_urls

    def _has_no_boxes(bbox):
        # bbox is text when read back from CSV, a list when built in memory.
        if isinstance(bbox, str):
            return len(bbox.strip()) <= 3
        if isinstance(bbox, (list, tuple)):
            return len(bbox) == 0
        return bool(pd.isna(bbox))

    unlabelled = df['bbox'].map(_has_no_boxes).astype(bool)
    current_urls.update(df.loc[unlabelled, 'url'])
    # Drop all matching rows in one step; dropping by position one at a
    # time shifts the positions of the rows that follow.
    df = df.loc[~unlabelled]
    return current_urls



# HTML widget that lists the bounding boxes collected for the current picture.
# It is only created here; it gets displayed together with the buttons later.
dest_txt = widgets.HTML(
    value="(x,y)",
    placeholder='(x,y)',
    description='Destination Points: ',
    layout=widgets.Layout(width="300px"),
    style={'description_width': 'initial'}
)

# All buttons share one fixed width.
button_layout = widgets.Layout(width="300px")

# Stores the currently selected rectangle as a bounding box.
add_dest_btn = widgets.Button(
    description='Add Bounding Box Coords',
    disabled=False,
    button_style='info',  # 'success', 'info', 'warning', 'danger' or ''
    # The tooltip used to read 'Reset', copied from another button.
    tooltip='Add the selected rectangle as a bounding box',
    icon='check',  # (FontAwesome names without the `fa-` prefix)
    layout=button_layout
)

# Removes the most recently stored bounding box.
pop_dest_btn = widgets.Button(
    description='Remove Last Coordinate',
    disabled=False,
    button_style='warning',  # 'success', 'info', 'warning', 'danger' or ''
    # The tooltip used to read 'Reset', copied from another button.
    tooltip='Remove the most recently added bounding box',
    icon='check',  # (FontAwesome names without the `fa-` prefix)
    layout=button_layout
)

# Saves the boxes of the current picture and loads the next one.
confirm_btn = widgets.Button(
    description='Add to Database and Move to Next Image!',
    disabled=False,
    button_style='info',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Move on to the next image',
    icon='check',  # (FontAwesome names without the `fa-` prefix)
    layout=button_layout
)

# Writes the label table and the URL list to disk.
save_btn = widgets.Button(
    description='Save to CSV',
    disabled=False,
    button_style='success',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Save Database to CSV',
    icon='check',  # (FontAwesome names without the `fa-` prefix)
    layout=button_layout
)

# Discards the current picture without labelling it.
skip_btn = widgets.Button(
    description='Skip Picture',
    disabled=False,
    button_style='warning',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Do not save into database',
    icon='check',  # (FontAwesome names without the `fa-` prefix)
    layout=button_layout
)

In [None]:
# ---- Labelling session state (shared through module-level globals) ----
img_dest = None      # axes the picture and boxes are drawn on; set by init_UI()
dest_img = None
dest_pts = []        # bounding boxes collected for the current picture
last_coords = []     # corners of the most recent rectangle selection

# Resume from an existing label file, or start with an empty table when the
# file is missing, empty or unparsable.
try:
    df = pd.read_csv("peak_labels.csv")
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    df = pd.DataFrame(columns=['url', 'bbox'])


urls = set()
with open("mountain_url_outputs.csv", "r") as reader:
    for line in reader:
        # `len(line) > 0` was always true because blank lines still carry
        # their newline; test the stripped text instead. Each kept line is
        # stored exactly as read (newline included) so entries round-trip
        # unchanged when the list is written back.
        if line.strip():
            urls.add(line)

urls = clean_dataframe(urls)

# Display handle whose content is replaced with the picture counter.
url_counter = IPython.display.display("", display_id=1)
init_UI()
display(widgets.VBox([widgets.HBox([widgets.VBox([add_dest_btn,pop_dest_btn,dest_txt])], 
                     layout=widgets.Layout(width='100%', align_items="center", padding="1rem 5rem 5rem")),
                     widgets.HBox([skip_btn,confirm_btn,save_btn])], layout=widgets.Layout(align_items="center")))


mountain_pic_srcs = list(urls)
# Start at the number of rows already labelled; update_UI_image() increments
# the index before loading a picture.
src_index = len(df)
counter_string = f"""<h1>Total Pictures in Dataset: {len(mountain_pic_srcs)}___Current Photo Index: {src_index}</h1>"""
url_counter.update(IPython.display.HTML(counter_string))
update_UI_image()




In [14]:
# Scrape image URLs for every search term and merge them into
# mountain_url_outputs.csv (one URL per line). The file is rewritten in the
# `finally` clause so URLs gathered before an error or interrupt are kept.
urls = set()
try:
    try:
        with open("mountain_url_outputs.csv", "r") as reader:
            for line in reader:
                # Strip the newline so re-read URLs compare equal to freshly
                # scraped ones and do not gain a second newline (and thus a
                # blank line) when written back below.
                known_url = line.strip()
                if known_url:
                    urls.add(known_url)
    except FileNotFoundError:
        # First run: no URL file yet.
        pass

    # Driver is for Windows Google Version 86.0.4240.198 (Official Build) (64-bit)
    # https://chromedriver.chromium.org/downloads
    # Put the path for your ChromeDriver here
    #
    # USED TERMS: ['mountain', 'mountains', 'mountain peak', 'mountain peaks',
    #              'mountain landscape', 'mountain landscapes', 'mountain ridge',
    #              'mountain terrain', 'mountain pass', 'Rocky Mountains',
    #              'Transantarctic Mountains', 'Andes', 'Great Dividing Range',
    #              'Ural Mountains', 'Himalayas', 'Alps Mountains', 'Alaska Mountains',
    #              'Sweden Mountains', 'Atlas Mountains', 'Wyoming Mountains',
    #              'Colorado Mountains', 'New Mexico Mountains', 'Utah Mountains',
    #              'Idaho Mountains', 'Washington Mountains', 'Californian Mountains',
    #              'Dolomites Mountains', 'Scotland Mountains', 'mountain highlands',
    #              'scottish highlands', 'mountain scenery', 'sunset beautiful mountain',
    #              'beautiful scenery mountain', 'mountain nature landscape', 'photography mountain landscape',
    #              'alpine photography', 'simple mountain landscape photography',
    #              'aesthetic mountain photography']
    DRIVER_PATH = '/chromedriver'
    search_terms = ['mountain', 'mountains', 'mountain peak', 'mountain peaks',
                    'mountain landscape', 'mountain landscapes', 'mountain ridge',
                    'mountain terrain', 'mountain pass', 'Rocky Mountains',
                    'Transantarctic Mountains', 'Andes', 'Great Dividing Range',
                    'Ural Mountains', 'Himalayas', 'Alps Mountains', 'Alaska Mountains',
                    'Sweden Mountains', 'Atlas Mountains', 'Wyoming Mountains',
                    'Colorado Mountains', 'New Mexico Mountains', 'Utah Mountains',
                    'Idaho Mountains', 'Washington Mountains', 'Californian Mountains',
                    'Dolomites Mountains', 'Scotland Mountains', 'mountain highlands',
                    'scottish highlands', 'mountain scenery', 'sunset beautiful mountain',
                    'beautiful scenery mountain', 'mountain nature landscape', 'photography mountain landscape',
                    'alpine photography', 'simple mountain landscape photography',
                    'aesthetic mountain photography']
    samples_per_term = 150
    for i, search_term in enumerate(search_terms):
        try:
            clear_output()
            # i / len(...) is a fraction; format it as a real percentage.
            print(f"{i / len(search_terms):.1%} Done: currently {len(urls)} photos")
            full_img_urls = search_and_download(search_term=search_term, driver_path=DRIVER_PATH,number_images=samples_per_term,only_fullsize=False)
            urls.update(full_img_urls)
        except Exception as e:
            # Best effort per term, but `Exception` (unlike a bare except)
            # lets KeyboardInterrupt stop the run and reach the save below.
            print(f"Skipping '{search_term}': {e}")
            continue

finally:
    with open("mountain_url_outputs.csv", "w") as output_writer:
        for url in urls:
            output_writer.write(url+"\n")
    clear_output()
    print("File saved")

File saved
