# Grab some sample data from flickr API

In this notebook, we use the Flickr API to obtain nearly randomly selected photo and user IDs. We use this information to query EXIF data through the API. The goal is to obtain a large sample dataset to be able to conduct further investigations.

In [2]:
# Python modules
import flickrapi
import pandas as pd
import requests
import json
import time
import os
from datetime import date
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import googlemaps

# Import additional functions
from flickr_functions import df_remove_dupes

# Keys needed for API access
api_key = os.getenv('flickr_api_key')
api_secret = os.getenv('flickr_api_secret')
maps_api_key = os.getenv('maps_api_key')

# Configure maps access
gmaps = googlemaps.Client(key=maps_api_key)

# Flickr API object
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

# Data directory used to store CVS files
data_dir = './data/'

In [3]:
# Define the dataframes and lists with columns

df_photo_ids = pd.DataFrame(columns = ['id', 
                                       'owner', 
                                       'secret', 
                                       'title', 
                                       'ispublic'])

columns_df_photo_exif = ['id', 'Image Width', 'Image Height', 'Compression', 'Make', 'Model',
       'Orientation', 'Software', 'Date and Time (Modified)', 'Exposure',
       'Aperture', 'ISO Speed', 'Date and Time (Original)',
       'Date and Time (Digitized)', 'Flash', 'Focal Length', 'White Balance',
       'owner', 'secret', 'title', 'lat', 'lon', 'acc', 'country', 'admin_lvl1', 'admin_lvl2', 'city']

df_photo_exif = pd.DataFrame(columns = columns_df_photo_exif)

## Harvest image and user IDs, use IDs to query for EXIF and geo data

In [4]:
# Get some nice random words from an API

languages = ["it","de","zh","es"]

URL='https://random-word-api.herokuapp.com/word?number='

def get_words(number, length):
    response = requests.get(URL + str(number) + '&length=' + str(length) + '&lang=de').text
    return json.loads(response)

In [5]:
# Iterate through words, query API for results containing this word, add to dataframe

df_photo_ids = pd.DataFrame(columns = ['id', 
                                       'owner', 
                                       'secret', 
                                       'title', 
                                       'ispublic'])

for i in range(1):

    # start timer
    start_time = time.time()

    # count the number of added entries
    counter = 0

    for word in get_words(50, 6):

        print(word)

        try:
            get_photos = flickr.photos.search(text = word,
                            privacy_filter = 1, 
                            content_types = 0,
                            page = 1,
                            per_page = 100)
        except flickrapi.exceptions.FlickrError as ex:
            print("Error code: %s" % ex.code)
        
        for photo in get_photos.get('photos').get('photo'):
            df_photo_ids.loc[len(df_photo_ids)] = {'id': photo.get('id'), 
                                                'owner': photo.get('owner'),
                                                'secret': photo.get('secret'),
                                                'title': photo.get('title'),
                                                'ispublic': photo.get('ispublic')}
            counter += 1
        
    # end the timer and calculate duration
    end_time = time.time()
    minutes, seconds = divmod(int(end_time - start_time), 60)

    print(f'Fetched {counter} entries in {minutes} minutes and {seconds} seconds. Dataframe is now {len(df_photo_ids)} rows long.')

    # Remove duplicates
    df_photo_ids = df_remove_dupes(df_photo_ids)

    # Look for ids that are in the exif file already (and drop them)
    df_exif = pd.read_csv(data_dir + 'df_photo_exif_final.csv', index_col=[0])
    print(f'Length before: {len(df_photo_ids)}')
    df_photo_ids = df_photo_ids[~df_photo_ids.id.isin(df_exif.id)]
    print(f'Length after: {len(df_photo_ids)}')

    df_photo_ids.to_csv(data_dir + 'df_photo_ids.csv')


# Iterate through dataframe, query API for EXIF data and add to another dataframe (and csv!)

length = len(df_photo_ids)
print(f'Number of rows of dataframe: {length}')

# start timer
start_time = time.time()

# Count the number of added entries
counter = 0

for i, row in df_photo_ids.iterrows():

    # Query the API
    try:
        exif_data = flickr.photos.getExif(photo_id = row['id'], photo_secret = 'secret').get('photo').get('exif')

        # Print counter
        print(f'{datetime.now().strftime("%H:%M:%S")}: Added entry {counter}: {row["id"]}, {row["title"]} | {length - counter} remaining')
        counter += 1

        # Temporary dict
        dict_tmp = {}

        # Go through every EXIF key value pair available and add to tmp dict
        for exif in exif_data:
            key = exif.get('label')
            value = exif.get('raw').get('_content')
            dict_tmp[key] = value

        # Add ID from ID dataframe
        dict_tmp.update({'id': row['id']})
        dict_tmp.update({'owner': row['owner']})
        dict_tmp.update({'secret': row['secret']})
        dict_tmp.update({'title': row['title']})
        
        # Look for geodata and add it
        try:
            geodata = flickr.photos.geo.getLocation(photo_id = row['id'])

            lat = geodata.get('photo').get('location').get('latitude')
            lon = geodata.get('photo').get('location').get('longitude')
            acc = geodata.get('photo').get('location').get('accuracy')

            dict_tmp.update({'lat': lat})
            dict_tmp.update({'lon': lon})
            dict_tmp.update({'acc': acc})

            # Look up reverse geocoding by querying maps API

            country, admin_lvl1, admin_lvl2, city = '', '', '', ''

            # Query the API with the location data
            try:
                geo = gmaps.reverse_geocode((lat, lon))

                print(geo)

                # Get the data from the response
                for comp in geo[0].get('address_components'):
                    if 'country' in comp.get('types'):
                        country = comp.get('long_name')
                    if 'administrative_area_level_1' in comp.get('types'):
                        admin_lvl1 = comp.get('long_name')
                    if 'administrative_area_level_2' in comp.get('types'):
                        admin_lvl2 = comp.get('long_name')
                    if 'locality' in comp.get('types'):
                        city = comp.get('long_name')
                    elif 'postal_town' in comp.get('types'):
                        city = comp.get('long_name')

                dict_tmp.update({'country': country})
                dict_tmp.update({'admin_lvl1': admin_lvl1})
                dict_tmp.update({'admin_lvl2': admin_lvl2})
                dict_tmp.update({'city': city})

                print(f'Added geodata: {lat}, {lon} in {country}, {admin_lvl1}, {admin_lvl2}, {city}')
                
            except googlemaps.exceptions.ApiError as err :
                print('API key is invalid')

                dict_tmp.update({'country': 'na'})
                dict_tmp.update({'admin_lvl1': 'na'})
                dict_tmp.update({'admin_lvl2': 'na'})
                dict_tmp.update({'city': 'na'})


        except flickrapi.exceptions.FlickrError as ex:

            # Add n/a if there's no geodata
            dict_tmp.update({'lat': 'na'})
            dict_tmp.update({'lon': 'na'})
            dict_tmp.update({'acc': 'na'})
            dict_tmp.update({'country': 'na'})
            dict_tmp.update({'admin_lvl1': 'na'})
            dict_tmp.update({'admin_lvl2': 'na'})
            dict_tmp.update({'city': 'na'})

            print("!!! Geo: Error code: %s" % ex.code)

        # Add to dataframe
        df_photo_exif.loc[len(df_photo_exif)] = dict_tmp

        # Filename of csv to add data to
        filename = './data/df_photo_exif_final.csv'
        #filename = './data/df_photo_exif_final_bak.csv'
        
        # Create a temporary dataframe
        df_tmp = pd.DataFrame(columns = columns_df_photo_exif)
        df_tmp.loc[len(df_tmp)] = dict_tmp

        # If there is not enough information in dataset, do not add to csv
        #if len(df_tmp[df_tmp.count(axis='columns') >= 5]) > 0:
        df_tmp.to_csv(filename, mode='a', header=not os.path.exists(filename))
        #else:
        #    print('Not enough data, sry.')

    except flickrapi.exceptions.FlickrError as ex:
        print(f'!!! Error code: {ex.code} for id {row["id"]}')
    
    # Delete row from photo id dataframe
    df_photo_ids = df_photo_ids.drop(i)

# end the timer and calculate duration
end_time = time.time()
minutes, seconds = divmod(int(end_time - start_time), 60)

print(f'Fetched {counter} entries in {minutes} minutes and {seconds} seconds.')

Mutase
Jargon
Flavon
Rakish
Geruch
Telial
Plasma
Würden
Curare
Boccia
Saigas
Lunule
Moment
Succer
Nonnen
Bieter
Rillen
Nitier
Hilfen
Zucker
Isozym
Tempel
Rakete
Prosit
Pollen
Lizenz
grimat
Oktate
Tetrac
Enokis
Sublot
Schafe
Renten
Midleg
Kochen
Stärke
Acnode
Mieter
Hummer
Emigre
Sprite
Perlit
Torten
Mumien
weiser
Gerste
Canker
Berhyd
Impies
Raupen
Fetched 4164 entries in 0 minutes and 37 seconds. Dataframe is now 4164 rows long.
Found 6 duplicates. Dataframe is now 4158 rows long.
Length before: 4158
Length after: 4158
Number of rows of dataframe: 4158
20:09:13: Added entry 0: 16257586099, Fenugreek (IMG_3128 v2K) | 4158 remaining
!!! Geo: Error code: 2
20:09:13: Added entry 1: 4335451905, Teresa | 4157 remaining
!!! Geo: Error code: 2
20:09:14: Added entry 2: 48676898896, https://biochemistry.aureliusconferences.com/immunology-enzymology/ | 4156 remaining
!!! Geo: Error code: 2
20:09:14: Added entry 3: 14143478563, pone.0041389.g005.png | 4155 remaining
!!! Geo: Error code: 2
20:09:15

In [None]:
df_photo_ids.to_csv(data_dir + 'df_photo_ids.csv')