# Grab some sample data from flickr API

In this notebook, we use the Flickr API to obtain nearly randomly selected photo and user IDs. We use this information to query EXIF data through the API. The goal is to obtain a large sample dataset to be able to conduct further investigations.

In [None]:
# Python modules
import flickrapi
import pandas as pd
import requests
import json
import time
import os
from datetime import date
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import googlemaps

# Import additional functions
from flickr_functions import df_remove_dupes

# Keys needed for API access
api_key = os.getenv('flickr_api_key')
api_secret = os.getenv('flickr_api_secret')
maps_api_key = os.getenv('maps_api_key')

# Configure maps access
gmaps = googlemaps.Client(key=maps_api_key)

# Flickr API object
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

# Data directory used to store CVS files
data_dir = './data/'

In [None]:
# Define the dataframes and lists with columns

df_photo_ids = pd.DataFrame(columns = ['id', 
                                       'owner', 
                                       'secret', 
                                       'title', 
                                       'ispublic'])

columns_df_photo_exif = ['id', 'Image Width', 'Image Height', 'Compression', 'Make', 'Model',
       'Orientation', 'Software', 'Date and Time (Modified)', 'Exposure',
       'Aperture', 'ISO Speed', 'Date and Time (Original)',
       'Date and Time (Digitized)', 'Flash', 'Focal Length', 'White Balance',
       'owner', 'secret', 'title', 'lat', 'lon', 'acc', 'country', 'admin_lvl1', 'admin_lvl2', 'city']

df_photo_ids = pd.DataFrame(columns = columns_df_photo_exif)

## First step: Harvest image and user IDs

In [None]:
# Get some nice random words from an API

URL='https://random-word-api.herokuapp.com/word?number='

def get_words(number, length):
    response = requests.get(URL + str(number) + '&length=' + str(length)).text
    return json.loads(response)

In [None]:
# Iterate through words, query API for results containing this word, add to dataframe

df_photo_ids = pd.DataFrame(columns = ['id', 
                                       'owner', 
                                       'secret', 
                                       'title', 
                                       'ispublic'])

for i in range(1):

    # start timer
    start_time = time.time()

    # count the number of added entries
    counter = 0

    for word in get_words(1, 5):

        try:
            get_photos = flickr.photos.search(text = word,
                            privacy_filter = 1, 
                            content_types = 0,
                            page = 1,
                            per_page = 500)
        except flickrapi.exceptions.FlickrError as ex:
            print("Error code: %s" % ex.code)
        
        for photo in get_photos.get('photos').get('photo'):
            df_photo_ids.loc[len(df_photo_ids)] = {'id': photo.get('id'), 
                                                'owner': photo.get('owner'),
                                                'secret': photo.get('secret'),
                                                'title': photo.get('title'),
                                                'ispublic': photo.get('ispublic')}
            counter += 1
        
    # end the timer and calculate duration
    end_time = time.time()
    minutes, seconds = divmod(int(end_time - start_time), 60)

    print(f'Fetched {counter} entries in {minutes} minutes and {seconds} seconds. Dataframe is now {len(df_photo_ids)} rows long.')

    # Remove duplicates
    df_photo_ids = df_remove_dupes(df_photo_ids)

    # Look for ids that are in the exif file already (and drop them)
    df_exif = pd.read_csv(data_dir + 'df_photo_exif_final.csv', index_col=[0])
    print(f'Length before: {len(df_photo_ids)}')
    df_photo_ids = df_photo_ids[~df_photo_ids.id.isin(df_exif.id)]
    print(f'Length after: {len(df_photo_ids)}')

    df_photo_ids.to_csv(data_dir + 'df_photo_ids.csv')

In [None]:
'''
today = date.today().strftime("%Y%m%d")
counter = 0

for filename in os.listdir(data_dir):
    if filename.startswith(today) and filename.endswith('df_photo_ids.csv'):
        counter += 1
        
print(f'Found {counter} files for todays data')

next = i + 1

df_photo_ids.to_csv(data_dir + f'{today}_{next}-df_photo_ids.csv')

# Save as csv, just in case
#df_photo_ids.to_csv('./data/20230910_6-df_photo_ids.csv')
# Or restore it?
#df_photo_ids = pd.read_csv('./data/df_photo_ids.csv')
'''

In [None]:
'''
# Create dataframes from cvs files in data_dir

df_list = []

for filename in os.listdir(data_dir):
    if filename.endswith('df_photo_ids.csv'):
        df_tmp = pd.read_csv(data_dir + filename)
        print(f'Found file {filename}. Dataframe with {len(df_tmp)} rows created.')
        df_list.append(df_tmp)

# Concatenate dataframes to one
df_photo_ids = pd.concat(df_list)
print(f'Resulting dataframe with {len(df_photo_ids)} rows created.')

# Create a cvs file as backup
df_photo_ids.to_csv(data_dir + f'{date.today().strftime("%Y%m%d")}-df_photo_ids-final.csv')
'''

## Second step: Get EXIF data with IDs

In [102]:
# Iterate through dataframe, query API for EXIF data and add to another dataframe (and csv!)

print(f'Number of rows of dataframe: {len(df_photo_ids)}')

# start timer
start_time = time.time()

# Count the number of added entries
counter = 0

for i, row in df_photo_ids.iterrows():

    # Query the API
    try:
        exif_data = flickr.photos.getExif(photo_id = row['id'], photo_secret = 'secret').get('photo').get('exif')

        # Print counter
        print(f'{datetime.now().strftime("%H:%M:%S")}: Added entry {counter}: {row["id"]}, {row["title"]} | {len(df_photo_ids) - counter} remaining')
        counter += 1

        # Temporary dict
        dict_tmp = {}

        # Go through every EXIF key value pair available and add to tmp dict
        for exif in exif_data:
            key = exif.get('label')
            value = exif.get('raw').get('_content')
            dict_tmp[key] = value

        # Add ID from ID dataframe
        dict_tmp.update({'id': row['id']})
        dict_tmp.update({'owner': row['owner']})
        dict_tmp.update({'secret': row['secret']})
        dict_tmp.update({'title': row['title']})
        
        # Look for geodata and add it
        try:
            geodata = flickr.photos.geo.getLocation(photo_id = row['id'])

            lat = geodata.get('photo').get('location').get('latitude')
            lon = geodata.get('photo').get('location').get('longitude')
            acc = geodata.get('photo').get('location').get('accuracy')

            dict_tmp.update({'lat': lat})
            dict_tmp.update({'lon': lon})
            dict_tmp.update({'acc': acc})

            # Look up reverse geocoding by querying maps API

            country, admin_lvl1, admin_lvl2, city = '', '', '', ''

            # Query the API with the location data
            try:
                geo = gmaps.reverse_geocode((lat, lon))

                print(geo)

                # Get the data from the response
                for comp in geo[0].get('address_components'):
                    if 'country' in comp.get('types'):
                        country = comp.get('long_name')
                    if 'administrative_area_level_1' in comp.get('types'):
                        admin_lvl1 = comp.get('long_name')
                    if 'administrative_area_level_2' in comp.get('types'):
                        admin_lvl2 = comp.get('long_name')
                    if 'locality' in comp.get('types'):
                        city = comp.get('long_name')
                    elif 'postal_town' in comp.get('types'):
                        city = comp.get('long_name')

                dict_tmp.update({'country': country})
                dict_tmp.update({'admin_lvl1': admin_lvl1})
                dict_tmp.update({'admin_lvl2': admin_lvl2})
                dict_tmp.update({'city': city})

                print(f'Added geodata: {lat}, {lon} in {country}, {admin_lvl1}, {admin_lvl2}, {city}')
                
            except googlemaps.exceptions.ApiError as err :
                print('API key is invalid')

                dict_tmp.update({'country': 'na'})
                dict_tmp.update({'admin_lvl1': 'na'})
                dict_tmp.update({'admin_lvl2': 'na'})
                dict_tmp.update({'city': 'na'})


        except flickrapi.exceptions.FlickrError as ex:

            # Add n/a if there's no geodata
            dict_tmp.update({'lat': 'na'})
            dict_tmp.update({'lon': 'na'})
            dict_tmp.update({'acc': 'na'})
            dict_tmp.update({'country': 'na'})
            dict_tmp.update({'admin_lvl1': 'na'})
            dict_tmp.update({'admin_lvl2': 'na'})
            dict_tmp.update({'city': 'na'})

            print("!!! Geo: Error code: %s" % ex.code)

        # Add to dataframe
        df_photo_exif.loc[len(df_photo_exif)] = dict_tmp

        # Filename of csv to add data to
        filename = './data/df_photo_exif_final.csv'
        #filename = './data/df_photo_exif_final_bak.csv'
        
        # Create a temporary dataframe
        df_tmp = pd.DataFrame(columns = columns_df_photo_exif)
        df_tmp.loc[len(df_tmp)] = dict_tmp

        # If there is not enough information in dataset, do not add to csv
        if len(df_tmp[df_tmp.count(axis='columns') >= 10]) > 0:
            df_tmp.to_csv(filename, mode='a', header=not os.path.exists(filename))
        else:
            print('Not enough data, sry.')

    except flickrapi.exceptions.FlickrError as ex:
        print(f'!!! Error code: {ex.code} for id {row["id"]}')
    
    # Delete row from photo id dataframe
    df_photo_ids = df_photo_ids.drop(i)

# end the timer and calculate duration
end_time = time.time()
minutes, seconds = divmod(int(end_time - start_time), 60)

print(f'Fetched {counter} entries in {minutes} minutes and {seconds} seconds. Dataframe is now {len(df_photo_exif)} rows long.')

Number of rows of dataframe: 125701
10:07:32: Added entry 0: 53163392417, ...skids the turn and backs into the dock at 30 | 125701 remaining
!!! Geo: Error code: 2
!!! Error code: 2 for id 53151587248
!!! Error code: 2 for id 53140523287
!!! Error code: 2 for id 53141324464
!!! Error code: 2 for id 53141324449
!!! Error code: 2 for id 53141115566
!!! Error code: 2 for id 53141536230
!!! Error code: 2 for id 53140523212
!!! Error code: 2 for id 53140523237
!!! Error code: 2 for id 53141590843
!!! Error code: 2 for id 53141324354
!!! Error code: 2 for id 53141590858
!!! Error code: 2 for id 53141115481
!!! Error code: 2 for id 53141590813
!!! Error code: 2 for id 53141324309
!!! Error code: 2 for id 53141590778
!!! Error code: 2 for id 53141536110
!!! Error code: 2 for id 53141115326
!!! Error code: 2 for id 53140523052
!!! Error code: 2 for id 53141590678
!!! Error code: 2 for id 53141535935
!!! Error code: 2 for id 53141115221
!!! Error code: 2 for id 53141535930
!!! Error code: 2 for 

do_request: Status code 500 received, content:
    <!DOCTYPE html>
<html xmlns:cc="http://creativecommons.org/ns#" lang="en-us" class="no-js fluid html-error-500-page-view scrolling-layout ">
<head>
	<meta property="og:site_name" content="Flickr" />
	<meta property="og:updated_time" content="2023-09-17T08:35:03.091Z" />
	
	<script type="application/ld+json">
		[{
			"@context": "http://schema.org",
			"@type": "WebSite",
			"name": "Flickr",
			"url": "https://www.flickr.com",
			"potentialAction": {
				"@type": "SearchAction",
				"target": "https://www.flickr.com/search?text={search_term_string}
    structured=yes",
				"query-input": "required name=search_term_string"
			}
		},
		{
			"@context": "http://schema.org",
			"@type": "Organization",
			"url": "http://www.flickr.com",
			"logo": "https://www.flickr.com/images/opensearch-flickr-logo.png"
		},
		{
			"@context": "http://schema.org",
			"@type": "Person",
			"name": "Flickr",
			"url": "https://www.flickr.com",
			"sameAs":

!!! Error code: None for id 52999829132
!!! Error code: 2 for id 53000429591
!!! Error code: 2 for id 53000904008
!!! Error code: 2 for id 53000903993
!!! Error code: 2 for id 53000429511
!!! Error code: 2 for id 53000581104
!!! Error code: 2 for id 53000807655
!!! Error code: 2 for id 52999829017
!!! Error code: 2 for id 53000807680
!!! Error code: 2 for id 53000807565
!!! Error code: 2 for id 53000903878
!!! Error code: 2 for id 52999828897
!!! Error code: 2 for id 53000807605
!!! Error code: 2 for id 52999828927
!!! Error code: 2 for id 53000429431
!!! Error code: 2 for id 53000581009
!!! Error code: 2 for id 53000807390
!!! Error code: 2 for id 53000429201
!!! Error code: 2 for id 53000580839
!!! Error code: 2 for id 52999828727
!!! Error code: 2 for id 52999828717
!!! Error code: 2 for id 53000580804
!!! Error code: 2 for id 53000580829
!!! Error code: 2 for id 53000429101
!!! Error code: 2 for id 53000429091
!!! Error code: 2 for id 53000903588
!!! Error code: 2 for id 5299982862

In [100]:
df_photo_ids.to_csv(data_dir + 'df_photo_ids.csv')