In [11]:
import os
import time
import json, itertools, re, requests, random, datetime
from datetime import datetime as dt

from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Headless Chrome: the scraper only needs the page's network traffic, not a window.
options = Options()

options.add_argument('--headless')

# Search endpoint and the search terms that are scraped one after another.
FLICKR = 'https://flickr.com/search/'
TAGS = ['clouds', 'cloud', 'sky', 'storm', 'weather']

# Work on a copy so the class-level DesiredCapabilities.CHROME template is not
# mutated for every other driver created in this kernel.
caps = DesiredCapabilities.CHROME.copy()
# ChromeDriver reads logging preferences from the vendor-prefixed
# 'goog:loggingPrefs' capability; the former 'loggingPref' key is not a
# capability it recognises, so performance logging was never switched on.
caps['goog:loggingPrefs'] = {'performance': 'ALL'}

# JavaScript injected into the loaded page. It wraps XMLHttpRequest.open/send so
# that the URL of every request completing with status 200 is appended to a
# <div id="interceptedResponse">, where Selenium can read it back as text.
# Each record ends with '*****'. The "headers" field is filled from
# `self.headers`, which XMLHttpRequest does not define, so it reads "undefined".
xhrCallIntercept_js = """
(function(XHR) {
  "use strict";

  // Injecting twice would add a second div with the same id and wrap the
  // already-wrapped methods, so every request would be recorded repeatedly.
  if (document.getElementById("interceptedResponse")) {
    return;
  }

  var element = document.createElement('div');
  element.id = "interceptedResponse";
  element.appendChild(document.createTextNode(""));
  document.body.appendChild(element);

  var open = XHR.prototype.open;
  var send = XHR.prototype.send;

  XHR.prototype.open = function(method, url) {
    this._url = url; // want to track the url requested
    // Forward exactly the arguments the page supplied. Passing an explicit
    // `undefined` for the async flag is coerced to `false` and would turn the
    // page's asynchronous requests into synchronous ones.
    return open.apply(this, arguments);
  };

  XHR.prototype.send = function(data) {
    var self = this;
    var oldOnReadyStateChange;

    function onReadyStateChange() {
      if(self.status === 200 && self.readyState == 4 /* complete */) {
        // textContent stores the URL verbatim; innerHTML would parse it as
        // markup and could rewrite '&name' sequences or '<' characters.
        document.getElementById("interceptedResponse").textContent +=
          '{"data":' + self._url + ', "headers" :' + self.headers + ' }*****';
      }
      if(oldOnReadyStateChange) {
        // Keep the receiver and event argument the page's handler expects.
        return oldOnReadyStateChange.apply(self, arguments);
      }
    }

    if(this.addEventListener) {
      this.addEventListener("readystatechange", onReadyStateChange,
        false);
    } else {
      oldOnReadyStateChange = this.onreadystatechange;
      this.onreadystatechange = onReadyStateChange;
    }
    return send.apply(this, arguments);
  };
})(XMLHttpRequest);
"""

# Upper bound on scroll-and-wait rounds, so a page that never issues the API
# request cannot keep this cell running forever.
MAX_INTERCEPT_ATTEMPTS = 10

driver = webdriver.Chrome(options = options, desired_capabilities=caps)

try:
    url = FLICKR + "?has_geo=1&media=photos&view_all=1&text=" + TAGS[0]

    driver.get(url)
    # Install the XHR hook after the page has loaded; only requests issued from
    # now on (for example by scrolling) are recorded.
    driver.execute_script(xhrCallIntercept_js)

    print('title : "{}"'.format(driver.title))

    time.sleep(5)

except Exception as e:
    # Without a loaded page and an installed hook nothing below can succeed:
    # release the browser and surface the original error.
    print('Error: ' + str(e))
    driver.quit()
    raise

# `wait` becomes True once the hook has recorded a URL containing "api_key".
wait = False
for attempt in range(1, MAX_INTERCEPT_ATTEMPTS + 1):
    print("Getting AJAX data...")
    # trying scroll to trigger and api call
    try:
        print('attempting Scroll!')
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        print('Scrolled...')

        # waiting for the api call to be included in the DOM
        wait = WebDriverWait(driver, 15).until(EC.text_to_be_present_in_element((By.ID, "interceptedResponse"), "api_key"))

    except Exception as e:
        print("intercept failed!:" + str(e))

    if wait == True:
        break

intercepts = driver.find_elements_by_id('interceptedResponse')

if wait != True or not intercepts:
    raise RuntimeError(
        "No Flickr API call was intercepted after %d attempts" % MAX_INTERCEPT_ATTEMPTS
    )

print('ajax call intercepted!\n')

# Raw text of every recorded request; the API parameters are parsed out of it.
xhr_call = intercepts[0].text

# Names of the values to pull out of the intercepted request text, the regex
# that locates each one, and the capture group that holds the value.
var_names = ["api_key", "reqId", "api_url", "extras"]
re_expressions = [r"(api_key)=([\dabcdef]*)(&)", r"(reqId)=([\dabcdef]*)(&)", r"(https:\/\/(\w+\.?)+(\/\w+)+)(\?)", r"extras=((\w+(%2)?)+?)?&"]

groups = [2,2,1,1]

variables = [dict(zip(["var_name", "regex", 'group'], each)) for each in zip(var_names, re_expressions, groups)]

# Run each pattern once and collect the results in a dict instead of writing
# through globals(); a value is None when its pattern does not match.
extracted = {}
for each in variables:
    match = re.search(each["regex"], string=xhr_call, flags=re.MULTILINE)
    extracted[each["var_name"]] = match.group(each["group"]) if match is not None else None

# The "extras" capture group is optional, so it can be None even when the
# pattern matches; fall back to an empty string before decoding the commas.
extracted["extras"] = (extracted["extras"] or "").replace('%2C', ',')

api_key = extracted["api_key"]
reqId = extracted["reqId"]
api_url = extracted["api_url"]
extras = extracted["extras"]

print("Extracted ajax params:--------\n")
for each in var_names:
    print("%(var)s :     %(value)s" % {"var": each.ljust(10, ' '), "value" : extracted[each]})

# No request can be built without these two, so stop here with a clear message.
missing = [name for name in ("api_key", "api_url") if not extracted[name]]
if missing:
    raise ValueError("Could not extract %s from the intercepted request" % ", ".join(missing))


# Page size sent with every photo request.
photos_per_page = "500"
# Extra per-photo fields requested on top of the ones the site itself asks for.
additional_extras = "url_o,original_format,date_taken,date_upload,geo"

# Readable names for the values of the "privacy_filter" request parameter.
privacy_filter ={
    "public photos" : '1', 
    "private photos visible to friends" : '2',
    "private photos visible to family" : '3',
    "private photos visible to friends & family": '4',
    "completely private photos" : '5'
}

# "extras" is a comma-separated list. Plain concatenation fused the last
# intercepted field with the first additional one (e.g. "...geourl_o"), so the
# two lists are joined with an explicit comma. Empty or missing parts are skipped.
combined_extras = ",".join(
    part.strip(",") for part in (extras, additional_extras) if part
)

# Query parameters shared by every API request; later cells update "text",
# "page", "per_page" and the upload-date bounds in place.
params = {
    "sort" : "relevance",
    "tags" : 'clouds',
    "parse_tags" : "1",
    "content_type" : "7",
    "extras" : combined_extras,
    "per_page" : photos_per_page,
    "page" : 1,
    "lang": "en-US",
    "has_geo" :"1",
    "media" : "photos",
    "view_all" : "1",
    "text" : "clouds",
    "viewerNSID": "",
    "method" : "flickr.photos.search",
    "csrf" : "",
    "api_key" : api_key,
    "format" : "json",
    "hermes" : "1",
    "hermesClient" : "1",
    "reqId" : reqId,
    "nojsoncallback" : "1",
    "privacy_filter" : privacy_filter['public photos'],
    "geo_context": '2'
}

# Cookies collected by the browser session, replayed on the plain HTTP session.
cookies = driver.get_cookies()

#ua = UserAgent()

# The session is reused by every later request in the notebook, so it is kept
# open here rather than created in a `with` block, which would close it right
# after this first request.
s = requests.sessions.Session()
for cookie in cookies:
    s.cookies.set(cookie['name'], cookie['value'])
#s.headers['User-Agent'] = str(ua.chrome)
response = s.get(api_url, params=params)

# Directory prefix of the JSON files written by the scraping loop.
json_data_path = './json_data/'
# NOTE(review): these two names are not referenced in the visible cells —
# presumably reserved for later bookkeeping; confirm before removing.
photo_json = 'photos.json'
scraped_ids = 'scraped_photos.txt'
# Timestamp passed to change_date_range as the upper bound of the date window.
now = dt.now()
# Width of the upload-date window, in days.
days_offset = 3

def change_date_range(index, offset=3):
    """Set the upload-date window in the shared ``params`` dict.

    The window ends at ``index`` and starts ``offset`` days before it.

    Args:
        index: datetime used as ``max_upload_date``.
        offset: window width in days (default 3).
    """
    span = datetime.timedelta(days=offset)
    params.update(min_upload_date=index - span, max_upload_date=index)

# Seed params with a window ending at `now`, using the default width of 3 days.
change_date_range(now)

# Clear the tag filter; the search term travels in params['text'].
params['tags'] = ''

# Read the 'total' field the API reports for the current parameters.
total_photos = s.get(api_url, params=params).json()['photos']['total']

def find_best_date_range(total_photos, days_offset):
    """Resize the upload-date window in ``params`` until its total is close to 4000.

    The window always ends at ``params['max_upload_date']``; only
    ``params['min_upload_date']`` is moved. After each resize the total is
    re-queried through the shared session ``s`` with ``per_page`` set to 1.

    The search stops when the total lies in (3990, 4000], or when it is still
    at or below 3990 after more than 20 resizes. It also stops if the window
    can no longer be represented (date overflow) or has shrunk below one second.

    Args:
        total_photos: total reported for the current window (str or int).
        days_offset: current window width in days.

    Returns:
        The total reported for the final window.
    """
    print(f"Finding a suitable range in 20 attempts or less ...")
    repeats = 0
    #int(total_photos) <= 3990 or int(total_photos) > 4000
    while True:
        count = int(total_photos)
        if 3990 < count <= 4000:
            # Close enough: nothing would change, so do not burn more requests.
            break
        if count <= 3990 and repeats > 20:
            break

        if count > 4000:
            days_offset = days_offset * 4000 / count
            verdict, fill = "too many photos  ", '+'
        elif count > 0:
            days_offset = days_offset * 3990 / count
            verdict, fill = "not enough photos", '-'
        else:
            # An empty window gives no ratio to scale by (it used to divide by
            # zero), so simply double the width.
            days_offset = days_offset * 2
            verdict, fill = "not enough photos", '-'

        try:
            window = datetime.timedelta(days_offset)
            new_start = params['max_upload_date'] - window
        except OverflowError:
            # The window grew past what datetime can represent; keep the last
            # valid range instead of crashing.
            print(f"\ncannot widen the range any further; keeping {params['min_upload_date']} to {params['max_upload_date']}")
            break
        if window < datetime.timedelta(seconds=1):
            # Stop before the window rounds down to zero width.
            print(f"\ncannot narrow the range any further; keeping {params['min_upload_date']} to {params['max_upload_date']}")
            break

        params['min_upload_date'] = new_start
        print(f"({str(repeats)}): {verdict} ({str(total_photos).ljust(10, fill)}): new range from {params['min_upload_date']} to {params['max_upload_date']}", end = '\r')

        # Only the total is needed here, so request the smallest possible page.
        params['per_page'] = 1
        total_photos = s.get(api_url, params=params).json()['photos']['total']
        repeats += 1
    print('\n')
    print(total_photos)
    return total_photos



# Oldest upload time to scrape (Unix timestamp 1483228800 = 2017-01-01 00:00:00 UTC).
EARLIEST_UPLOAD_TIMESTAMP = 1483228800

# open() does not create missing directories; without this every dump fails
# and is only reported by the except branch below.
os.makedirs(json_data_path, exist_ok=True)

for term in TAGS:
    
    params['text'] = term
    # Every term starts from the newest window again. The window end used to be
    # reset to `now` on every pass, so the same range was scraped repeatedly and
    # the loop never moved back towards the cutoff.
    window_end = now
        
    while window_end.timestamp() >= EARLIEST_UPLOAD_TIMESTAMP:
        change_date_range(window_end, days_offset)
        print("__________________________")
        print(f"New date range: {params['min_upload_date']} to {params['max_upload_date']}______ total photos : {total_photos}")
        
        total_photos = s.get(api_url, params=params).json()['photos']['total']
        print(f"Total photos in next batch: {total_photos}")
        
        # Narrows or widens params['min_upload_date'] for this window.
        find_best_date_range(total_photos, days_offset)     
        total_photos = s.get(api_url, params=params).json()['photos']['total']
        print(f'starting json dump... (photos per request: {total_photos})')

        # 8 pages of `photos_per_page` results each.
        for page in range(1, 9):
            print(f"getting page {page}...")
            print('sleeping', end = '\r')
            # Random pause between requests.
            time.sleep(random.randrange(5000, 15000)/ 1000)

            params['page'] = page
            params['per_page'] = photos_per_page
            
            response = s.get(api_url, params=params)
            path = f"{json_data_path}{term}_{str(params['min_upload_date'].timestamp())}-{str(params['max_upload_date'].timestamp())}_{page}.json"
            try:
                # Decode before opening the file so a bad response does not
                # leave an empty file behind.
                payload = response.json()
                with open(path, 'w') as outfile:
                    json.dump(payload, outfile)
                print(f"{path} written succesfully!")
            except Exception as e:
                print(f"problem dumping json data: {str(e)}")

        # The next window ends where this one started. If the start did not
        # move, step back by the default width so the loop always progresses.
        next_end = params['min_upload_date']
        if next_end >= window_end:
            next_end = window_end - datetime.timedelta(days=days_offset)
        window_end = next_end
        params['page'] = 1
        time.sleep(random.randrange(10000, 25000)/ 1000)

    time.sleep(random.randrange(40000, 70000)/ 1000)

In [7]:
[each for each in range(1, 9)]

[1, 2, 3, 4, 5, 6, 7, 8]

In [13]:
now = datetime.datetime.now()
now.strftime('%Y-%m-%d %H:%M:%S')

'2019-12-26 08:08:45'

In [17]:
images = response.json()['photos']['photo']

In [7]:
response.json()['photos'].keys()

dict_keys(['page', 'pages', 'perpage', 'total', 'photo', 'max_allowed_results', 'max_allowed_pages'])

In [9]:
response.json()['photos']['total']

'0'

In [53]:
def get_link_from_Id(img_id, size = 'o'):
    """Build the static Flickr URL of a photo from the module-level ``images`` list.

    Args:
        img_id: value of the photo's ``'id'`` field.
        size: size suffix placed in the URL. ``'o'`` (default) builds the
            original-size URL from ``'originalsecret'`` and ``'originalformat'``;
            any other value builds ``..._{secret}_{size}.jpg``.

    Returns:
        The URL as a string.

    Raises:
        AssertionError: if no photo or more than one photo has this id, or if
            ``size == 'o'`` and the photo has no ``'originalsecret'``.
    """
    matches = [image for image in images if image['id'] == img_id]
    # An unknown id used to be reported as "More than one image ...".
    assert len(matches) > 0, f"No image has the id '{img_id}'"
    assert len(matches) == 1, (f"More than one image have the id '{img_id}'")
    photo = matches[0]
    base = f"https://farm{photo['farm']}.staticflickr.com/{photo['server']}/{img_id}"
    if size == 'o':
        assert photo.get('originalsecret') is not None, f"The image with the id '{img_id}' does not have an original size secret"
        return f"{base}_{photo['originalsecret']}_o.{photo['originalformat']}"
    return f"{base}_{photo['secret']}_{size}.jpg"

In [56]:
get_link_from_Id('3863384442', 'b')

'https://farm3.staticflickr.com/2499/3863384442_4739162827_b.jpg'

In [14]:
url

'https://flickr.com/search/?has_geo=1&media=photos&view_all=1&text=clouds'

In [60]:
random.randint(2, 6)

2

In [33]:
random.choice([img for img in images if img.get('originalsecret') == None])

NameError: name 'images' is not defined

In [66]:
before = dt.now().timestamp()

In [69]:
after =dt.now().timestamp()

In [70]:
after - before

12.318365812301636

In [197]:
pre = driver.find_elements_by_xpath('/html/body/pre')
from pprint import pprint as pp

In [194]:
data = dict(json.loads(pre[0].text))

In [199]:
pp(data['photos']['pages'])

10205


In [124]:
[each['name']+':'  + each['value'] for each in cookies if each['name'] == 's_tp']

['s_tp:7940']

In [145]:
response = s.get(api_url, params=params)

In [146]:
response.json()['photos']['photo'][0].keys()

dict_keys(['id', 'owner', 'secret', 'server', 'farm', 'title', 'ispublic', 'isfriend', 'isfamily', 'safe'])

In [140]:
driver.quit()

In [141]:
browser.quit()

In [18]:
img = response.json()['photos']['photo'][0]

In [19]:
img

{'id': '8966548467',
 'owner': '36587311@N08',
 'secret': '312f2ed006',
 'server': '3792',
 'farm': 4,
 'title': 'cloud',
 'ispublic': 1,
 'isfriend': 0,
 'isfamily': 0,
 'safe': 0}

In [20]:
link = f"https://farm{img['farm']}.staticflickr.com/{img['server']}/{img['id']}_{img['secret']}_o.png" 

In [None]:
# Original-size photo URL template: https://farm{farm-id}.staticflickr.com/{server-id}/{id}_{o-secret}_o.(jpg|gif|png)

In [21]:
link

'https://farm4.staticflickr.com/3792/8966548467_312f2ed006_o.png'

In [12]:
# Let requests build the query string: it URL-encodes every value and accepts
# non-string values (params['page'] is an int), which "=".join() rejected.
parsed_url = requests.Request('GET', api_url, params=params).prepare().url

In [46]:
browser = webdriver.Firefox()

In [47]:
browser.get(parsed_url)

In [45]:
browser.quit()

In [13]:
script = f"window.open('{parsed_url}')"

In [None]:
browser.execute_script(script)

In [154]:
ext = "date_upload, date_taken, owner_name, icon_server, original_format, last_update, geo, tags, machine_tags, o_dims, views, media, path_alias, url_sq, url_t, url_s, url_q, url_m, url_n, url_z, url_c, url_l, url_o"

In [155]:
ext = [each.strip() for each in ext.split(',')]

In [None]:
try:
    elements = driver.find_elements_by_xpath('//a[@class="overlay"]')
    num_elements = len(elements)
    print(num_elements)
except Exception as e:
    print('Error getting elements:' + str(e))

In [None]:
elem= elements[0]

In [None]:
elements[0].get_property('attributes')

In [None]:
for element in elements:
    print(element.get_attribute('href'))

In [None]:
def exif(action):
    """Click the link whose text equals ``action`` in the module-level ``driver``."""
    driver.find_element_by_link_text(action).click()

In [None]:
elem  = driver.find_element_by_xpath('//a[@id="yui_3_16_0_1_1576995420623_60055"]')

In [None]:
elem.send_keys(Keys.COMMAND + Keys.RETURN)


In [None]:
tab = driver.find_element_by_tag_name('body')
tab.send_keys(Keys.COMMAND + "t")

In [None]:
SHOW = "Show EXIF"
HIDE = "Hide EXIF"

In [None]:
exif(SHOW)

In [None]:
exif(HIDE)

In [None]:
main_window = driver.current_window_handle

In [None]:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [None]:
scrpt = ['window.open("', page2_scroll, '");']

In [None]:
driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't') 

In [None]:
driver.execute_script("".join(scrpt))

In [None]:
driver.execute_script('window.open("")')

In [None]:
new_tab = [tab for tab in driver.window_handles if tab != main_window][0]

In [None]:
driver.switch_to.window(new_tab)

In [None]:
driver.get("https://www.google.com")

In [None]:
driver.close()

In [None]:
driver.current_window_handle

In [None]:
driver.quit()