# Wikipedia - this day in history <small>(step 3 - get image data)</small>
---
**Goal:** get image data for images in wikipedia_tdih.db (if no data already in database).

**Notes about this notebook:**  
- this notebook is for the third step of this project. 
- the notebook for the first step is [TADS_wikipedia_tdih_main_step_01_get_day_data_30jan21](https://github.com/Bianca-Aguglia/TADS_wikipedia_this_day_in_history_build_dataset/tree/master/notebooks)
- the notebook for the second step is [TADS_wikipedia_tdih_main_step_02_get_link_data_07feb21](https://github.com/Bianca-Aguglia/TADS_wikipedia_this_day_in_history_build_dataset/tree/master/notebooks)

### Process flow summary: <small>(with details and checkmarks for steps done in this notebook)</small>
1. get day data
    - get day data from wikipedia API, process it, and save it to wikipedia_tdih.db
2. get link data
    - get link data from wikipedia API, process it, and save it to wikipedia_tdih.db
3. get image data
    - [ ] query wikipedia_tdih.db for images that are in wiki_image but not in wiki_image_data_log
    - [ ] get image data from wikipedia API and extract the following:
        - [ ] user (used for wiki_credit)
        - [ ] image_url ?
        - [ ] copyright license
        - [ ] license name
        - [ ] license description
        - [ ] license usage rights

In [None]:
import requests
import time
import sqlite3
import config
import numpy as np
import datetime as dt
from bs4 import BeautifulSoup

In [None]:
# project-wide settings, read from the config module
DATABASE_FILE = config.DATABASE_FILE
DOE = config.DOE
URL = config.WIKIPEDIA_URL
HEADERS = config.HEADERS

# base query parameters for the wikipedia API request
# explanation of values for 'iiprop':
#     user        -> wikipedia_user
#     url         -> wikipedia_url for full picture
#     extmetadata -> license and attribution data
PARAMS = {
    'titles': '',  # placeholder for page
    'action': 'query',
    'format': 'json',
    'prop': 'imageinfo',
    'iiprop': 'user|url|extmetadata',
}

In [None]:
def image_data_main(database_file = DATABASE_FILE, date = '', batch_size = 5, doe = DOE,
                    url = URL, params = PARAMS, headers = HEADERS):
    """
    Main function for getting and processing the image data for a group of images in wikipedia_tdih.db.
    It uses several helper functions to break down the process into simple steps:
        - get_images_to_update_from_db
        - get_image_data
        - extract_image_data
        - add_image_data_to_db
    
    Params:
        database_file: database file to read data from and write data to
        date: currently unused (intended to select images updated prior to date)
        batch_size: no. of images to request data for in a single call to wikipedia API
                    (to reduce no. of API calls)
        doe: date of entry (defaults to current day)
        url: url for wikipedia API
        params: params to use in API call
        headers: headers to use in API call
        
    Returns:
        status message (str), one of:
            'no image data needs to be retrieved / updated'
            'failed requests: 5'
            'wiki_image update complete'
    """
    # open the database that was asked for (not the module-level default)
    connection = sqlite3.connect(database_file)
    
    try:
        # 'with connection' commits on success and rolls back on an exception,
        # but it does not close the connection (hence the try / finally)
        with connection:
            cursor = connection.cursor()
        
            # select from wikipedia_tdih.db the images we need data for
            image_list = get_images_to_update_from_db(cursor)
        
            # exit if no images need to have data retrieved or updated
            if not image_list:
                return 'no image data needs to be retrieved / updated'
            
            # keep track of failed requests (stop as soon as 5 requests have failed)
            failed_request = 0
        
            # process images in batches of size batch_size
            for i in range(0, len(image_list), batch_size):
                image_batch = image_list[i:i + batch_size]
                
                # get image data for images in batch
                resp = get_image_data(image_batch, url = url, params = params, headers = headers)
                
                if resp.status_code != requests.codes.ok:
                    # update wiki_image_data_log with response status_code
                    # NOTE(review): update_table_wiki_image_data_log is not defined in the visible part
                    # of this notebook - confirm it exists and whether it also needs cursor / doe
                    update_table_wiki_image_data_log(image_batch, resp.status_code)
                    failed_request += 1
                    if failed_request == 5:
                        return f'failed requests: {failed_request}'
                    continue
                
                # extract image data and add it to db
                for image_dict in extract_image_data(resp):
                    add_image_data_to_db(image_dict, cursor, doe = doe)
                    
            return 'wiki_image update complete'
    
    finally:
        connection.close()

In [None]:
def get_images_to_update_from_db(cursor):
    """
    Fetch the images that still need wikipedia data, i.e. the images that are listed in wiki_image
    but have no entry in wiki_image_data_log.
    
    Params:
        cursor: database cursor
        
    Returns:
        list of (image_id, image_file) tuples, one per image for which wikipedia data is needed
    """
    pending_query = '''SELECT image_id, image_file FROM wiki_image WHERE image_id NOT IN (
                                    SELECT image_id FROM wiki_image_data_log )'''
    cursor.execute(pending_query)
    
    return cursor.fetchall()

In [None]:
def get_image_data(image_batch, url = URL, params = PARAMS, headers = HEADERS):
    """
    Get image data from Wikipedia API. Images are processed in batches (usually of size 5) to reduce the number
    of API calls.
    
    Params:
        image_batch: list of tuples representing images to get data for (usually a batch of size 5)
                     each tuple is of the form (image_id, image_file)
        url: url for wikipedia API
        params: params to use in API call (not modified; a copy with 'titles' filled in is sent)
        headers: headers to use in API call
        
    Returns:
        resp: requests response object
    """
    # join the image_files for images in image_batch
    titles = '|'.join(wiki_image[1] for wiki_image in image_batch)
    
    # build the request params on a copy, so the shared default dict (PARAMS) is never mutated
    request_params = {**params, 'titles': titles}
    
    # request data
    resp = requests.get(url = url, headers = headers, params = request_params)
    
    return resp

In [None]:
def extract_image_data(response):
    """
    Extract image data for a batch of images.
    
    Pages that come back without 'imageinfo' (e.g. files the API could not find) are skipped.
    Metadata fields that are not present in the response are set to np.nan.
    
    Params:
        response: requests response from Wikipedia API
        
    Returns:
        image_dict_list: list of dictionaries with data for each image in the batch.
                         e.g. see image_dict below        
    """
    resp = response.json()
    image_dict_list = [] 
    
    # wikipedia normalizes file names (e.g. 'File:François_Ier_Louvre.jpg ' to 'File:François Ier Louvre.jpg')
    # keep track of original vs. normalized files names (to match to file name in wikipedia_tdih.db)
    # 'normalized' is only present when at least one file name was actually changed
    file_names = resp['query'].get('normalized', [])
    file_names_dict = {file['to']: file['from'] for file in file_names}
    
    # image_dict key -> extmetadata key
    # each extmetadata entry, if available, is a dictionary from which the data in ['value'] is extracted
    metadata_fields = (('image_date', 'DateTime'),
                       ('image_credit', 'Credit'),
                       ('image_description', 'ImageDescription'),
                       ('image_license_name', 'LicenseShortName'),
                       ('image_usage_terms', 'UsageTerms'),
                       ('image_attrib_required', 'AttributionRequired'),
                       ('image_copyright', 'Copyrighted'),
                       ('image_restriction', 'Restrictions'),
                       ('image_license', 'License'))
    
    # image data is dictionary resp['query']['pages'] where each key has the data for one image
    for image in resp['query']['pages'].values():
        # nothing to extract for pages without image info
        image_info_list = image.get('imageinfo')
        if not image_info_list:
            continue
        
        image_info = image_info_list[0]
        extmetadata = image_info.get('extmetadata', {})
        title = image['title']
        
        # build up image_dict
        # a title that was not normalized is already the original file name
        image_dict = {'title' : file_names_dict.get(title, title),
                      'title_normalized': title,
                      'image_repository' : image['imagerepository'],
                      'user' : image_info['user'],
                      'image_url' : image_info['url']}
        
        # if available, add data from 'value' (np.nan otherwise)
        for field, metadata_key in metadata_fields:
            metadata = extmetadata.get(metadata_key, np.nan)
            image_dict[field] = metadata['value'] if isinstance(metadata, dict) else metadata

        # if image has image_description it is usually html data 
        # extract text for image_description (parser is named explicitly so results do not
        # depend on which parsers happen to be installed)
        if isinstance(image_dict['image_description'], str):
            image_dict['image_description'] = BeautifulSoup(image_dict['image_description'],
                                                            'html.parser').text
            
        # standardize the data in image_attrib_required and image_copyright (sample values are 'False', 'false', 'True', etc)
        # missing values are np.nan (not strings) and are left untouched
        for str_data in ['image_attrib_required', 'image_copyright']:
            if isinstance(image_dict[str_data], str):
                image_dict[str_data] = image_dict[str_data].strip().lower()
    
        image_dict_list.append(image_dict)
        
    return image_dict_list

In [None]:
def add_image_data_to_db(image_dict, cursor, doe = DOE):
    """
    Add image data to wikipedia_tdih.db
    
    Params:
        image_dict: dictionary of data to be added to the database
                    (as built by extract_image_data)
        cursor: database cursor
        doe: date of entry (defaults to current day)
        
    Returns:
        None
        
    Raises:
        LookupError: if the image cannot be found in wiki_image
    """
    
    # get license_id and user_id (update tables first, if license_id and user_id not present)
    license_data = (image_dict['image_license_name'], image_dict['image_usage_terms'],
                    image_dict['image_attrib_required'], image_dict['image_copyright'])
    license_id = update_table_wiki_copyright_license(license_data, cursor, doe)
    user_id = update_table_wiki_user(image_dict['user'], cursor, doe)
    
    # get image_id
    cursor.execute('SELECT image_id FROM wiki_image WHERE image_url = ?', (image_dict['image_url'], ))
    row = cursor.fetchone()
    
    # fall back to the original file name (the value the API was queried with) if the url has no match
    if row is None:
        cursor.execute('SELECT image_id FROM wiki_image WHERE image_file = ?', (image_dict['title'], ))
        row = cursor.fetchone()
        
    if row is None:
        raise LookupError(f"image not found in wiki_image: {image_dict['title']!r}")
    image_id = row[0]
    
    # update table wiki_image_info
    # (the description is stored in image_dict under 'image_description')
    image_data = (doe, image_id, license_id, user_id,
                  image_dict['image_repository'], image_dict['image_date'], image_dict['image_credit'],
                  image_dict['image_description'])
    update_table_wiki_image_info(image_data, cursor, doe)

    

In [None]:
#
def update_table_wiki_link_data_log(link_id, status_code, cursor, doe = DOE):
    """
    Insert one row into table wiki_link_data_log and report how the insert went.
    
    Params:
        link_id: id of link to be logged in wiki_link_data_log
        status_code: requests.status_code from Wikipedia API
        cursor: database_cursor
        doe: date of entry (defaults to current day)
    
    Returns:
        dictionary with status of the wiki_link_data_log update
            e.g. {'doe': doe,
                  'wiki_table_name': 'wiki_link_data_log',
                  'update_status': 'update_complete' or repr of the exception raised by the insert,
                  'update_note': np.nan}  # this is relevant to other tables (e.g. wiki_event)
    """
    try:
        cursor.execute('INSERT INTO wiki_link_data_log VALUES (null,?,?,?)',
                       (doe, link_id, status_code))
    except Exception as error:
        # a failed insert is reported through the returned status, not raised
        outcome = repr(error)
    else:
        outcome = 'update_complete'
        
    return {'doe': doe,
            'wiki_table_name': 'wiki_link_data_log',
            'update_status': outcome,
            'update_note': np.nan}

In [None]:
def update_table_wiki_image_info(image_data, cursor, doe):
    """
    Insert one row of image data into table wiki_image_info.
    
    Params:
        image_data: tuple with the 8 values for the row (the row id is filled in with null)
        cursor: database cursor
        doe: date of entry (not used by this function)
        
    Returns:
        None
    """
    insert_statement = 'INSERT INTO wiki_image_info VALUES (null,?,?,?,?,?,?,?,?)'
    cursor.execute(insert_statement, image_data)

In [None]:
def update_table_wiki_user(user_name, cursor, doe = DOE):
    """
    Update (if needed) table wiki_user
    
    Params:
        user_name: wiki_user_name
        cursor: database cursor
        doe: date of entry (defaults to current day; not used by this function)
        
    Returns:
        user_id
    """
    # query parameters must be a sequence: a bare string would be bound one character at a time
    user_data = (user_name, )
    
    # add user to wiki_user (if not already in database)
    cursor.execute('INSERT or IGNORE INTO wiki_user VALUES (null,?)', user_data)
    
    # get user_id
    cursor.execute('SELECT user_id FROM wiki_user WHERE user_name = ?', user_data)
    user_id = cursor.fetchone()[0]
    
    return user_id

In [None]:
def update_table_wiki_copyright_license(license_data, cursor, doe = DOE):
    """
    Update (if needed) table wiki_copyright_license
    
    Params:
        license_data: tuple of the form (license_name, license_description, attrib_required, copyright)
        cursor: database cursor
        doe: date of entry (defaults to current day)
        
    Returns:
        license_id: copyright_license_id of the matching (or newly inserted) license
    """
    # 'IS' is used instead of '=' so that missing values (stored as NULL) also match:
    # 'column = NULL' is never true, which would insert a duplicate license on every call
    cursor.execute('''SELECT copyright_license_id FROM wiki_copyright_license WHERE 
                        license_name IS ? AND
                        license_description IS ? AND
                        attrib_required IS ? AND
                        copyright IS ?''', license_data)
    
    row = cursor.fetchone()
    
    if row is not None:
        return row[0]
    
    # insert the new license type into wiki_copyright_license
    cursor.execute('INSERT INTO wiki_copyright_license VALUES (null, ?,?,?,?,?)', (doe, *license_data))
    license_id = cursor.lastrowid
        
    return license_id