In [1]:
from datetime import date

# Stamp the notebook with the date of this run so the recorded outputs below can be dated.
today = date.today()
print("last run:", today)

last run: 2021-05-13


## Testing the API

In [2]:
import json
import requests

This notebook targets a test version of the API, set up on a DigitalOcean droplet that was created from the Firewall Cafe snapshot.

In [3]:
# Root URL of the API under test. Endpoint paths are appended to it directly
# (BASE_URL + endpoint), so the trailing slash is required.
# NOTE(review): this is plain HTTP, and the POST cells further down send the
# shared `secret` in the request body -- confirm whether the host offers HTTPS.
# Alternative target by raw IP (presumably the same droplet -- TODO confirm):
# BASE_URL = 'http://159.89.80.47/'
BASE_URL = 'http://test-api.firewallcafe.com/'

### GET endpoints

In [4]:
# Vote categories that appear as path suffixes under both searches/votes/ and images/.
vote_categories = (
    'censored_searches',
    'uncensored_searches',
    'bad_translation_searches',
    'good_translation_searches',
    'nsfw_searches',
    'wtf_searches',
)

# Every GET endpoint exercised by the smoke test, in the order it is reported.
endpoints = [
    'searches',
    'searches/search_id/1',
    'searches/images',
    'searches/images/search_id/1',
    'searches/votes',
    'searches/votes/search_id/1',
    'searches/votes/vote_id/1',
    *(f'searches/votes/{category}' for category in vote_categories),
    'searches/votecounts',
    'searches/1/votecounts',
    'searches/votecounts/images',
    'searches/1/votecounts/images',
    'images',
    'images/search_id/1',
    *(f'images/{category}' for category in vote_categories),
]

In [5]:
import time

In [6]:
# Smoke-test every GET endpoint: print its latency and HTTP status, plus a few
# identifying fields from the first returned row when they are available.
check = ['search_term_initial', 'vote_name']  # fields worth echoing from the first row
for endpoint in endpoints:
    ts = time.perf_counter()
    r = requests.get(BASE_URL + endpoint)
    # Stop the clock right after the request so JSON parsing is not counted.
    elapsed = time.perf_counter() - ts
    info = []
    try:
        first = r.json()[0]
        for item in check:
            if item in first:
                # Fields can be null (None) in the DB; stringify so the join below cannot fail.
                info.append(str(first[item]))
    except (ValueError, LookupError, TypeError):
        # Best-effort only: the body was not JSON (ValueError), was empty, or was
        # not a list of row objects (LookupError / TypeError). Status is still printed.
        pass
    print(f'{endpoint:50} {round(elapsed, 1):5} seconds {r.status_code}')
    if info:
        print('\t first query info:', ', '.join(info))

searches                                             0.4 seconds 200
	 first query info: test
searches/search_id/1                                 0.2 seconds 200
	 first query info: football
searches/images                                      0.9 seconds 200
	 first query info: test
searches/images/search_id/1                          0.4 seconds 200
	 first query info: football
searches/votes                                       0.4 seconds 200
	 first query info: test, Censored
searches/votes/search_id/1                           0.2 seconds 200
searches/votes/vote_id/1                             0.8 seconds 200
	 first query info: blond jenny artist, Censored
searches/votes/censored_searches                     0.8 seconds 200
	 first query info: shen yun
searches/votes/uncensored_searches                   0.6 seconds 200
	 first query info: black jesus
searches/votes/bad_translation_searches              0.6 seconds 200
	 first query info: history of women's suffrage
searches/

It looks like all of the endpoints are working now. The ones that were failing before broke because raw image data is stored in the Postgres DB, so the query results quickly become too large.

### POST endpoints

Okay, now try posting the results of a single search.

This is a bit tricky: as far as I can tell, this endpoint isn't documented at all, so I have to read the actual functions, [createSearch](https://github.com/dphiffer/firewall-cafe/blob/d0f0ed33e511430d55c1a6e04994399afe0003c6/api/queries.js#L428) and [saveImage](https://github.com/dphiffer/firewall-cafe/blob/d0f0ed33e511430d55c1a6e04994399afe0003c6/api/queries.js#L510).

Looks like we're going to have to call `saveImage()` once for each image we want to save, which will be a lot of overhead on the API.

In [7]:
from datetime import datetime

In [8]:
# Current Unix epoch time in seconds (float).
# time.time() is used instead of datetime.utcnow().timestamp(): utcnow() returns a
# naive datetime, and .timestamp() interprets naive values as *local* time, so the
# result is shifted by the machine's UTC offset unless the local zone is UTC.
time.time()

1620984226.664925

Let's look at an example search.

In [9]:
# Fetch an existing search row to see the field names a stored search carries.
requests.get(BASE_URL + 'searches/search_id/1').json()

[{'search_id': 1,
  'search_timestamp': '1454979377000',
  'search_location': 'new_york_city',
  'search_ip_address': None,
  'search_client_name': 'Dan',
  'search_engine_initial': None,
  'search_engine_translation': None,
  'search_term_initial': 'football',
  'search_term_initial_language_code': None,
  'search_term_initial_language_confidence': None,
  'search_term_initial_language_alternate_code': None,
  'search_term_translation': '足球',
  'search_term_translation_language_code': None,
  'search_term_status_banned': False,
  'search_term_status_sensitive': False,
  'search_schema_initial': 0,
  'wordpress_search_term_popularity': 1,
  'wordpress_copyright_takedown': None,
  'wordpress_unflattened': None,
  'wordpress_regular_post_id': 223,
  'wordpress_search_result_post_id': 241408,
  'wordpress_search_result_post_slug': 'football-1454979377'}]

### Create searches

In [10]:
# Read the shared API secret from a local config file so that it is not
# embedded in the notebook source; the POST helpers send it with every request.
with open('api-config.json') as config_file:
    secret = json.load(config_file)['secret']

In [11]:
def create_search(text):
    """POST a new test search to the `createSearch` endpoint.

    Parameters
    ----------
    text : str
        Value sent as `search_term_initial`.

    Returns
    -------
    requests.Response
        The raw response. In the runs recorded in this notebook a successful
        insert answered with status 201 and a JSON list like
        `[{'search_id': ...}]`.
    """
    r = requests.post(BASE_URL + 'createSearch', data={
        # time.time() is the real Unix epoch. datetime.utcnow().timestamp() is not:
        # it treats the naive UTC value as local time and is off by the UTC offset.
        'search_timestamp': int(time.time()),
        'search_location': 'new_york_city',
        'search_ip_address': None,
        'search_client_name': 'rowan_scraper_tests',
        'search_engine_initial': None,
        'search_engine_translation': None,
        'search_term_initial': text,
        'search_term_initial_language_code': None,
        'search_term_initial_language_confidence': None,
        'search_term_initial_language_alternate_code': None,
        'search_term_translation': None,
        'search_term_translation_language_code': None,
        'search_term_status_banned': False,
        'search_term_status_sensitive': False,
        'search_schema_initial': 0,
        'secret': secret
    })
    return r

In [12]:
def query_search(search_id):
    """Check whether a search with the given id can be fetched from the API.

    Prints a one-line verdict and returns True when the endpoint answers 200
    with a non-empty JSON body, False otherwise.
    """
    url = BASE_URL + 'searches/search_id/' + str(search_id)
    response = requests.get(url)
    # TODO: endpoint should return 404 if search_id not found
    # Until then a missing id is only recognisable by its empty JSON body.
    found = response.status_code == 200 and bool(response.json())
    if not found:
        print("could not find", search_id)
        return False
    print("search", search_id, "in DB")
    return True

# Create one search, confirm it can be read back, and remember its id;
# the vote and image cells further down all operate on `last_id_used`.
r = create_search('test')
last_id_used = -1
if r.status_code != 201:
    # Fail loudly: carrying on would leave last_id_used at -1 and every later
    # cell would silently query or post against a search that does not exist.
    raise RuntimeError(f'createSearch returned HTTP {r.status_code}: {r.text[:200]}')
for insert in r.json():
    query_search(insert['search_id'])
    last_id_used = insert['search_id']
assert last_id_used != -1, 'createSearch reported success but returned no search_id'

search 5764 in DB


In [13]:
# Probe the id right after the one just created; no search has been inserted
# with it yet, so this is expected to report "could not find".
# NOTE(review): assumes the server hands out consecutive ids -- confirm.
query_search(last_id_used+1)

could not find 5765


False

In [14]:
# Insert a second search, then check that the id probed as last_id_used+1 now exists.
# NOTE(review): relies on consecutive ids and on no other client inserting in between.
r = create_search('test')
query_search(last_id_used+1)
r.json()

search 5765 in DB


[{'search_id': 5765}]

### Create votes

In [15]:
def create_vote(search_id):
    """POST a test "Censored" vote (vote_id 1) for `search_id` to `createVote`.

    Parameters
    ----------
    search_id : int
        Id of the search the vote is attached to.

    Returns
    -------
    requests.Response
        The raw response from the endpoint.
    """
    r = requests.post(BASE_URL + 'createVote', data={
        'vote_id': 1, # censored
        'search_id': search_id,
        # time.time() is the real Unix epoch. datetime.utcnow().timestamp() is not:
        # it treats the naive UTC value as local time and is off by the UTC offset.
        'vote_timestamp': int(time.time()),
        'vote_client_name': 'rowan_tests',
        'vote_ip_address': '192.168.0.1',
        'secret': secret
    })
    return r

In [16]:
def get_votes(search_id):
    """Return the decoded JSON body of `searches/votes/search_id/<search_id>`."""
    url = BASE_URL + 'searches/votes/search_id/' + str(search_id)
    return requests.get(url).json()

In [17]:
# Walk search ids 100..999 and stop at the first one that has any votes,
# leaving its rows in `votes`; ids without votes are shown as a progress counter.
for i in range(100, 1000):
    votes = get_votes(i)
    if len(votes) == 0:
        print(i, end='\r')  # '\r' overwrites the same output line on each pass
        continue
    break

102

In [18]:
# Show the rows returned for the search id at which the scan loop stopped.
votes

[{'vote_name': 'NSFW',
  'vote_serial': 40,
  'vote_id': 6,
  'search_id': 103,
  'vote_timestamp': None,
  'vote_client_name': None,
  'vote_ip_address': None,
  'search_timestamp': '1455141099000',
  'search_location': 'new_york_city',
  'search_ip_address': None,
  'search_client_name': 'Client 289',
  'search_engine_initial': None,
  'search_engine_translation': None,
  'search_term_initial': 'ceo',
  'search_term_initial_language_code': None,
  'search_term_initial_language_confidence': None,
  'search_term_initial_language_alternate_code': None,
  'search_term_translation': 'all',
  'search_term_translation_language_code': None,
  'search_term_status_banned': False,
  'search_term_status_sensitive': False,
  'search_schema_initial': 0,
  'wordpress_search_term_popularity': 1,
  'wordpress_copyright_takedown': None,
  'wordpress_unflattened': None,
  'wordpress_regular_post_id': 3814,
  'wordpress_search_result_post_id': 241338,
  'wordpress_search_result_post_slug': 'ceo-14551410

In [19]:
# Baseline before posting a vote: votes currently returned for the search id in last_id_used.
get_votes(last_id_used)

[]

In [20]:
# Post a vote for the search id in last_id_used and show the response object and its JSON body.
r = create_vote(last_id_used)
print(r, r.json())

<Response [201]> []


In [21]:
# Query the votes for last_id_used again to confirm the posted vote is now returned.
get_votes(last_id_used)

[{'vote_name': 'Censored',
  'vote_serial': 6106,
  'vote_id': 1,
  'search_id': 5764,
  'vote_timestamp': '1620984228',
  'vote_client_name': 'rowan_tests',
  'vote_ip_address': '192.168.0.1',
  'search_timestamp': '1620984226',
  'search_location': 'new_york_city',
  'search_ip_address': None,
  'search_client_name': 'rowan_scraper_tests',
  'search_engine_initial': None,
  'search_engine_translation': None,
  'search_term_initial': 'test',
  'search_term_initial_language_code': None,
  'search_term_initial_language_confidence': None,
  'search_term_initial_language_alternate_code': None,
  'search_term_translation': None,
  'search_term_translation_language_code': None,
  'search_term_status_banned': False,
  'search_term_status_sensitive': False,
  'search_schema_initial': None,
  'wordpress_search_term_popularity': None,
  'wordpress_copyright_takedown': None,
  'wordpress_unflattened': None,
  'wordpress_regular_post_id': None,
  'wordpress_search_result_post_id': None,
  'wordpr

### Create images

In [22]:
def create_image(search_id):
    """POST one test image record for `search_id` to the `saveImage` endpoint.

    Parameters
    ----------
    search_id : int
        Id of the search the image is attached to.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    Exception
        If the endpoint answers with a status code of 300 or above. The
        error body is printed first.
    """
    r = requests.post(BASE_URL + 'saveImage', data={
        'search_id': search_id,
        'image_search_engine': 'test',
        'image_href': 'https://firewallcafe.com/wp-content/themes/fwc/img/logo-firewall-generic.svg?1578614316',
        'image_href_original': 'https://firewallcafe.com/thisisatest.jpg',
        'image_rank': 0,
        'secret': secret
    })
    if r.status_code >= 300:
        # Error bodies are not guaranteed to be JSON (e.g. an HTML error page).
        # Fall back to the raw text so a decode error cannot mask the failure below.
        try:
            detail = r.json()
        except ValueError:
            detail = r.text
        print(detail)
        raise Exception("create_image failed")
    return r.json()

In [23]:
# Attach a test image to the search id in last_id_used; displays the JSON returned by saveImage.
create_image(last_id_used)

{'url': 'https://firewallcafe.com/wp-content/themes/fwc/img/logo-firewall-generic.svg?1578614316',
 'query_result': {'command': 'INSERT',
  'rowCount': 1,
  'oid': 0,
  'rows': [],
  'fields': [],
  '_types': {'_types': {'arrayParser': {},
    'builtins': {'BOOL': 16,
     'BYTEA': 17,
     'CHAR': 18,
     'INT8': 20,
     'INT2': 21,
     'INT4': 23,
     'REGPROC': 24,
     'TEXT': 25,
     'OID': 26,
     'TID': 27,
     'XID': 28,
     'CID': 29,
     'JSON': 114,
     'XML': 142,
     'PG_NODE_TREE': 194,
     'SMGR': 210,
     'PATH': 602,
     'POLYGON': 604,
     'CIDR': 650,
     'FLOAT4': 700,
     'FLOAT8': 701,
     'ABSTIME': 702,
     'RELTIME': 703,
     'TINTERVAL': 704,
     'CIRCLE': 718,
     'MACADDR8': 774,
     'MONEY': 790,
     'MACADDR': 829,
     'INET': 869,
     'ACLITEM': 1033,
     'BPCHAR': 1042,
     'VARCHAR': 1043,
     'DATE': 1082,
     'TIME': 1083,
     'TIMESTAMP': 1114,
     'TIMESTAMPTZ': 1184,
     'INTERVAL': 1186,
     'TIMETZ': 1266,
     '

In [24]:
def get_images(search_id):
    """Return the decoded JSON body of `images/search_id/<search_id>`."""
    url = BASE_URL + 'images/search_id/' + str(search_id)
    return requests.get(url).json()

In [25]:
# Read back the images stored for last_id_used to confirm the insert.
get_images(last_id_used)

[{'image_id': 216262,
  'image_search_engine': 'test',
  'image_href': 'https://firewallcafe.com/wp-content/themes/fwc/img/logo-firewall-generic.svg?1578614316',
  'image_href_original': 'https://firewallcafe.com/thisisatest.jpg',
  'image_rank': '0',
  'image_mime_type': None,
  'wordpress_attachment_post_id': None,
  'wordpress_attachment_file_path': None}]

The POST endpoints are now working as well.

Make sure that pagination is working correctly.

In [26]:
# Endpoints queried with page_size / page parameters by the pagination check.
paginated_endpoints = [
    'searches',
    'searches/images',
    'searches/votes',
    'searches/votecounts/images',
    'images',
    *(
        f'images/type/{category}'
        for category in (
            'censored_searches',
            'uncensored_searches',
            'bad_translation_searches',
            'good_translation_searches',
            'nsfw_searches',
            'wtf_searches',
        )
    ),
]

In [27]:
# Pagination check: with page_size=1, page 1 and page 100 must return different rows.
# Legend: ' ' pages differ (ok)
#         'x' both pages returned the same row (page parameter ignored)
#         '?' could not compare (a page was empty or not a list of rows)
print(BASE_URL)
for endpoint in paginated_endpoints:
    j1 = requests.get(f'{BASE_URL}{endpoint}?page_size=1&page=1').json()
    j2 = requests.get(f'{BASE_URL}{endpoint}?page_size=1&page=100').json()
    try:
        j1 = j1[0]
        j2 = j2[0]
    except (LookupError, TypeError):
        # Nothing to compare. Report it with its own marker and move on, instead
        # of falling through and comparing half-unpacked payloads.
        print('?', endpoint, j2)
        continue
    # Compare ids as a tuple. Concatenating them as strings is ambiguous
    # ('1' + '23' equals '12' + '3').
    id_keys = [key for key in ('search_id', 'image_id') if key in j1]
    if id_keys:
        identifier1 = tuple(j1[key] for key in id_keys)
        identifier2 = tuple(j2.get(key) for key in id_keys)
    else:
        # Rows without either id: fall back to comparing the whole rows.
        identifier1, identifier2 = j1, j2
    if identifier1 == identifier2:
        print('x', endpoint)
    else:
        print(' ', endpoint)

http://test-api.firewallcafe.com/
  searches
  searches/images
  searches/votes
  searches/votecounts/images
  images
  images/type/censored_searches
  images/type/uncensored_searches
  images/type/bad_translation_searches
  images/type/good_translation_searches
  images/type/nsfw_searches
  images/type/wtf_searches
