In [1]:
import json
import time
from datetime import datetime

import dateutil
import dateutil.parser
import ipyplot
import requests
from bs4 import BeautifulSoup
# Base URL of the API that the /searches and /images/search_id requests below are sent to.
api_base = 'http://api.firewallcafe.com'

In [2]:
# Fetch the first page (25 posts) of "search-result" posts from the site's
# WordPress REST API.
r = requests.get(
    'https://firewallcafe.com/wp-json/wp/v2/search-result',
    params={'per_page': 25, 'page': 1},
    timeout=30,  # without a timeout a stalled connection would hang the kernel
)
# Fail here on an HTTP error instead of later, when the body is decoded as JSON.
r.raise_for_status()

In [3]:
# Decode the response body as JSON; the following cells index `wp` as a list of post dicts.
wp = r.json()

In [4]:
# Slug of the first returned post.
wp[0]['slug']

'coronavirus-1582773676'

In [5]:
# Show the first post's full record to see which fields the API returns.
wp[0]

{'id': 282427,
 'date': '2020-02-27T03:21:16',
 'date_gmt': '2020-02-27T03:21:16',
 'guid': {'rendered': 'https://firewallcafe.com/?post_type=search-result&#038;p=282427'},
 'modified': '2020-02-26T22:21:32',
 'modified_gmt': '2020-02-27T03:21:32',
 'slug': 'coronavirus-1582773676',
 'status': 'publish',
 'type': 'search-result',
 'link': 'https://firewallcafe.com/archive/coronavirus-1582773676/',
 'title': {'rendered': 'coronavirus'},
 'content': {'rendered': '<p>Google<br />\n<div id=\'gallery-1\' class=\'gallery galleryid-282427 gallery-columns-3 gallery-size-thumbnail\'><figure class=\'gallery-item\'>\n\t\t\t<div class=\'gallery-icon landscape lazy-load\'>\n\t\t\t\t<a href=\'https://firewallcafe.com/archive/coronavirus-1582773676/google-1582773676-01/\'><img width="318" height="159" src="https://firewallcafe.com/wp-content/uploads/2020/02/26/282427/google-1582773676-01.jpg" data-src="https://firewallcafe.com/wp-content/uploads/2020/02/26/282427/google-1582773676-01.jpg" class="atta

In [6]:
# Render the images listed under the first gallery of the first post.
# NOTE(review): 'src' is presumably a list of image URLs — confirm against the API payload.
ipyplot.plot_images(wp[0]['galleries'][0]['src'], img_width=150)

In [7]:
# Raw date string of the first post, exactly as the API returned it.
wp[0]['date']

'2020-02-27T03:21:16'

In [8]:
# Parse the first post's date string into a datetime object.
dateutil.parser.parse(wp[0]['date'])

datetime.datetime(2020, 2, 27, 3, 21, 16)

Since there isn't an API endpoint to get searches by WordPress ID, let's just scoop up all the searches first.

In [9]:
# Download the search records from the API page by page, printing how long
# each page took. At most NUM_PAGES * PAGE_SIZE records are fetched; raise
# NUM_PAGES if the final page still comes back full.
NUM_PAGES = 10
PAGE_SIZE = 1000

searches = []
for i in range(NUM_PAGES):
    ts = time.perf_counter()  # monotonic clock, so elapsed time ignores system clock changes
    resp = requests.get(
        api_base + '/searches',
        params={'page': i, 'page_size': PAGE_SIZE},
        timeout=60,
    )
    # Surface HTTP errors instead of merging an error payload into the results.
    resp.raise_for_status()
    page = resp.json()
    if not isinstance(page, list):
        # Extending a list with a dict would silently append its keys.
        raise ValueError(f"expected a list of searches for page {i}, got {type(page).__name__}")
    searches.extend(page)
    print(i, round(time.perf_counter() - ts, 1), "seconds")

0 6.0 seconds
1 5.6 seconds
2 4.1 seconds
3 4.0 seconds
4 3.3 seconds
5 3.8 seconds
6 3.8 seconds
7 4.1 seconds
8 1.1 seconds
9 0.5 seconds


In [10]:
# Save the downloaded search records to all_searches.json.
with open('all_searches.json', 'w') as out_file:
    json.dump(searches, out_file)

In [11]:
# Look at one search record to see which fields the DB exposes.
searches[0]

{'search_id': 9660,
 'search_timestamp': '1614559105095',
 'search_location': 'automated_scraper',
 'search_ip_address': '192.168.0.1',
 'search_client_name': 'automated_scraper',
 'search_engine_initial': None,
 'search_engine_translation': None,
 'search_term_initial': 'clubbing',
 'search_term_initial_language_code': 'EN',
 'search_term_initial_language_confidence': '1.0',
 'search_term_initial_language_alternate_code': None,
 'search_term_translation': '泡吧',
 'search_term_translation_language_code': 'zh-CN',
 'search_term_status_banned': False,
 'search_term_status_sensitive': False,
 'search_schema_initial': None,
 'wordpress_search_term_popularity': None,
 'wordpress_copyright_takedown': None,
 'wordpress_unflattened': None,
 'wordpress_regular_post_id': None,
 'wordpress_search_result_post_id': None,
 'wordpress_search_result_post_slug': None}

In [12]:
# Distinct values of the two WordPress post-ID fields across all search
# records (None is included when some record leaves the field empty).
wp_ids = {record['wordpress_regular_post_id'] for record in searches}
wp_ids2 = {record['wordpress_search_result_post_id'] for record in searches}

In [13]:
# Post ID of the first post returned by the WordPress API.
wp[0]['id']

282427

In [14]:
# Is that post ID among the DB's 'wordpress_search_result_post_id' values?
wp[0]['id'] in wp_ids2

True

In [15]:
# Post IDs from the WordPress response that have no matching
# 'wordpress_search_result_post_id' in the DB; an empty set means all matched.
{post['id'] for post in wp}.difference(wp_ids2)

set()

Looks like every post ID returned by the WordPress API (for this first page of 25 posts) is present in the DB, where the matching field is called "wordpress_search_result_post_id".

Now, how would we check that they have the same images?

In [16]:
def get_search(wp_id):
    """Return the one record in `searches` whose
    'wordpress_search_result_post_id' equals `wp_id`.

    Raises:
        Exception: if the number of matching records is not exactly one
            (the match count and `wp_id` are attached to the exception).
    """
    matches = []
    for record in searches:
        if record['wordpress_search_result_post_id'] == wp_id:
            matches.append(record)
    if len(matches) == 1:
        return matches[0]
    raise Exception("hmm, this doesn't seem to be the right length", len(matches), wp_id)
# Look up the DB search record for the first WordPress post.
get_search(wp[0]['id'])

{'search_id': 5579,
 'search_timestamp': '1582773676000',
 'search_location': 'poughkeepsie',
 'search_ip_address': None,
 'search_client_name': 'Anonymous',
 'search_engine_initial': 'google',
 'search_engine_translation': 'baidu',
 'search_term_initial': 'coronavirus',
 'search_term_initial_language_code': 'en',
 'search_term_initial_language_confidence': '0.5703125',
 'search_term_initial_language_alternate_code': '',
 'search_term_translation': '冠状病毒',
 'search_term_translation_language_code': 'zh-CN',
 'search_term_status_banned': False,
 'search_term_status_sensitive': False,
 'search_schema_initial': 2,
 'wordpress_search_term_popularity': None,
 'wordpress_copyright_takedown': None,
 'wordpress_unflattened': None,
 'wordpress_regular_post_id': None,
 'wordpress_search_result_post_id': 282427,
 'wordpress_search_result_post_slug': 'coronavirus-1582773676'}

Next step: request the images from the DB using that search ID, and see if they match the images from the Wordpress results. We're going to limit ourselves to Google for simplicity.

In [18]:
# For the first ten WordPress posts: find the matching DB search record, check
# that the slugs agree, then show the post's first gallery next to the Google
# images the DB holds for that search so the two can be compared by eye.
# NOTE(review): galleries[0] is presumably the Google gallery — confirm.
for search in wp[:10]:
    search_db = get_search(search['id'])
    if search_db['wordpress_search_result_post_slug'] != search['slug']:
        raise Exception("we seem to have an incorrect correspondance", search_db['wordpress_search_result_post_slug'], search['slug'])
    # Use a dedicated name for this response: rebinding the global `r` here
    # would make a later re-run of `wp = r.json()` load image records instead
    # of WordPress posts.
    images_resp = requests.get(
        api_base + '/images/search_id/' + str(search_db['search_id']),
        timeout=30,
    )
    images_resp.raise_for_status()  # stop on an HTTP error rather than plotting an error payload
    j = images_resp.json()
    print("plotting Wordpress images")
    ipyplot.plot_images(search['galleries'][0]['src'], img_width=100)
    print("plotting DB images")
    # Keep only the Google results, to match the WordPress gallery shown above.
    db_imgs = [item['image_href'] for item in j if item['image_search_engine'] == 'google']
    ipyplot.plot_images(db_imgs, img_width=100)

plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images


plotting Wordpress images


plotting DB images
