In [1]:
# references:
# https://github.com/Cledge-org/cledge/blob/dev/features/college_search_tool/college_picture_scraper/src/college_picture_url_scraper.ipynb

import time
import json

from pyspark.sql import *
from selenium import webdriver

In [2]:
# Variables
DRIVER_PATH = './driver/chromedriver'
DATA_PATH = '../data/college-search-data-v2.parquet'
TARGET_FILE = './college_picture_urls.json'

In [3]:
# Initialize SparkContext
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [4]:
# Load data
df = spark.read.load(DATA_PATH)

In [5]:
# Collect list of college names
colleges = {}
for row in df.select(["UNITID", "INSTNM"]).collect():
    colleges[row.UNITID] = ' '.join(c for c in row.INSTNM.replace('-', ' ').split() if c.isalnum())

In [19]:
def fetch_image_data(college_name_str, wd, sleep=2.5):
    # search limit to wikipedia and wikimedia along with creative commons restriction
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&tbs=il:cl&source=hp&q=site:wikipedia.org, commons.wikimedia.org {q}&oq=site:wikipedia.org, commons.wikimedia.org {q}&gs_l=img"
    college_name_url_str = college_name_str.lower()
    if 'campus' in college_name_url_str:
        query_str = college_name_url_str
    else:
        query_str = '{} campus'.format(college_name_url_str)
    wd.get(search_url.format(q=query_str))
    thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")

    result_obj = {
        'college_name': college_name_str,
        'img_title': None,
        'img_wiki_link': None,
        'img_link': None,
        'img_description': None
    }

    for res in thumbnail_results:
        try:
            res.click()
        except Exception:
            continue

        time.sleep(sleep)

        try:
            imgs = wd.find_elements_by_css_selector('img.n3VNCb') # finds actual image link
            for img_element in imgs:
                src = img_element.get_attribute('src')
                if src and src.startswith('http'):
                    result_obj['img_link'] = src
                    break
            if not result_obj['img_link']:
                continue
        except Exception:
            result_obj['img_link'] = None
            continue
        #print(result_obj['img_link'])
        try:
            img_title_element = wd.find_element_by_css_selector('h1.eYbsle') # get image title
            print(img_title_element)
            result_obj['img_title'] = img_title_element.text
        except Exception:
            result_obj['img_title'] = None # Problem
            continue
        print(result_obj['img_title'])
        try:
            links = wd.find_elements_by_css_selector('a.zSA7pe')
            for link_element in links:
                if 'wiki' in link_element.text.lower():
                    result_obj['img_wiki_link'] = link_element.get_attribute('href')
                    wd.execute_script('''window.open("{}","_blank");'''.format(result_obj['img_wiki_link']))
                    wd.switch_to_window(wd.window_handles[1])
                    try:
                        desc_element = wd.find_element_by_css_selector('div.description.mw-content-ltr.en')
                    except Exception:
                        try:
                            desc_element = wd.find_element_by_css_selector('td.description')
                        except Exception:
                            try:
                                desc_element = wd.find_element_by_css_selector('div.description.mw-content-ltr.en a.extiw')
                            except Exception:
                                desc_element = None
                    if desc_element:
                        result_obj['img_description'] = desc_element.text
                    else:
                        result_obj['img_description'] = None
                    break
        except Exception:
            result_obj['img_wiki_link'] = None
            result_obj['img_description'] = None
        break

    return result_obj

In [20]:
# Testing code block
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
print(fetch_image_data('University of Washington Seattle Campus', wd))
wd.quit()

  
  # Remove the CWD from sys.path while we load stuff.


<selenium.webdriver.remote.webelement.WebElement (session="56620cccc2bddec4b712708fae27cdc2", element="7189829b-465c-43ff-bf0b-478a3c080858")>
File:University of Washington .jpg - Wikimedia Commons
{'college_name': 'University of Washington Seattle Campus', 'img_title': 'File:University of Washington .jpg - Wikimedia Commons', 'img_wiki_link': None, 'img_link': 'https://upload.wikimedia.org/wikipedia/commons/b/b0/University_of_Washington_.jpg', 'img_description': None}


In [None]:
with open(TARGET_FILE) as f:
    pic_urls = json.load(f)

In [None]:
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
for UNITID, INSTNM in colleges.items():
    if UNITID in pic_urls:
        continue
    pic_urls[UNITID] = fetch_image_data(INSTNM, wd)
    with open(TARGET_FILE, 'w') as f:
        json.dump(pic_urls, f)
wd.quit()