# Grab some sample data from flickr API

In this notebook, we use the Flickr API to obtain nearly randomly selected photo and user IDs. We use this information to query EXIF data through the API. The goal is to obtain a large sample dataset to be able to conduct further investigations.

In [11]:
# Python modules
import flickrapi
import pandas as pd
import random
import requests
import json
import string
import time
import os
from datetime import date
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime

# Import additional functions
from flickr_functions import df_remove_dupes

# Keys needed for API access
api_key = os.getenv('flickr_api_key')
api_secret = os.getenv('flickr_api_secret')

# Flickr API object
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

# Data directory used to store CVS files
data_dir = './data/'

In [12]:
# Define the dataframes and lists with columns

df_photo_ids = pd.DataFrame(columns = ['id', 
                                       'owner', 
                                       'secret', 
                                       'title', 
                                       'ispublic'])

columns_df_photo_exif = ['id', 
        'Image Width', 
        'Image Height', 
        'Bits Per Sample', 
        'Compression', 
        'Photometric Interpretation', 
        'Make', 
        'Model', 
        'Orientation', 
        'Samples Per Pixel', 
        'X-Resolution', 
        'Y-Resolution', 
        'Resolution Unit', 
        'Software', 
        'Date and Time (Modified)', 
        'YCbCr Positioning', 
        'Exposure', 
        'Aperture', 
        'Exposure Program', 
        'ISO Speed', 
        'Sensitivity Type', 
        'Exif Version', 
        'Date and Time (Original)', 
        'Date and Time (Digitized)', 
        'Components Configuration', 
        'Compressed Bits Per Pixel', 
        'Exposure Bias', 
        'Max Aperture Value', 
        'Metering Mode', 
        'Light Source', 
        'Flash', 
        'Focal Length', 
        'Flashpix Version', 
        'Color Space', 
        'File Source', 
        'Scene Type', 
        'Custom Rendered', 
        'Exposure Mode', 
        'White Balance', 
        'Digital Zoom Ratio', 
        'Focal Length (35mm format)', 
        'Scene Capture Type', 
        'Gain Control', 
        'Contrast', 
        'Saturation', 
        'Sharpness', 
        'Subject Distance Range', 
        'Interop Index', 
        'Interop Version', 
        'Coded Character Set', 
        'Envelope Record Version', 
        'Application Record Version', 
        'Date Created', 
        'Time Created', 
        'Global Angle', 
        'Global Altitude', 
        'IPTCDigest', 
        'XMPToolkit', 
        'Format', 
        'Color Mode', 
        'ICCProfile Name', 
        'Legacy IPTCDigest', 
        'Creator Tool', 
        'Metadata Date', 
        'Document ID', 
        'Instance ID', 
        'Original Document ID',
        'owner',
        'secret',
        'title',
        'lat',
        'lon',
        'acc']

df_photo_ids_final = pd.DataFrame(columns = columns_df_photo_exif)

In [13]:
# Maybe for later use?

columns_exif = ['id',
                'Image Width', 
                'Image Height', 
                'Compression', 
                'Make', 
                'Model', 
                'Orientation', 
                'Software', 
                'Exposure', 
                'Aperture', 
                'ISO Speed', 
                'Date and Time (Original)', 
                'Flash', 
                'Focal Length', 
                'Focal Length (35mm format)', 
                'Lens Make', 
                'Lens Model']

## Harvest image and user IDs

In [14]:
# Get some nice random words from an API

URL='https://random-word-api.herokuapp.com/word?number='

def get_words(number, length):
    response = requests.get(URL + str(number) + '&length=' + str(length)).text
    return json.loads(response)

In [17]:
get_words(5, 5)

['peins', 'pride', 'favor', 'misty', 'saner']

In [None]:
# Iterate through words, query API for results containing this word, add to dataframe

for i in range(1):

    # start timer
    start_time = time.time()

    # count the number of added entries
    counter = 0

    for word in get_words(2, 5):

        try:
            get_photos = flickr.photos.search(text = 'word',
                            privacy_filter = 1, 
                            content_types = 0,
                            page = 1,
                            per_page = 500)
        except flickrapi.exceptions.FlickrError as ex:
            print("Error code: %s" % ex.code)
        
        for photo in get_photos.get('photos').get('photo'):
            df_photo_ids.loc[len(df_photo_ids)] = {'id': photo.get('id'), 
                                                'owner': photo.get('owner'),
                                                'secret': photo.get('secret'),
                                                'title': photo.get('title'),
                                                'ispublic': photo.get('ispublic')}
            counter += 1
        
    # end the timer and calculate duration
    end_time = time.time()
    minutes, seconds = divmod(int(end_time - start_time), 60)

    print(f'Fetched {counter} entries in {minutes} minutes and {seconds} seconds. Dataframe is now {len(df_photo_ids)} rows long.')

In [None]:
# Remove duplicates
df_photo_ids = df_remove_dupes(df_photo_ids)

In [None]:
today = date.today().strftime("%Y%m%d")
counter = 0

for filename in os.listdir(data_dir):
    if filename.startswith(today) and filename.endswith('df_photo_ids.csv'):
        counter += 1
        
print(f'Found {counter} files for todays data')

next = i + 1

df_photo_ids.to_csv(data_dir + f'{today}_{next}-df_photo_ids.csv')

# Save as csv, just in case
#df_photo_ids.to_csv('./data/20230910_6-df_photo_ids.csv')
# Or restore it?
#df_photo_ids = pd.read_csv('./data/df_photo_ids.csv')

In [None]:
# Create dataframes from cvs files in data_dir

df_list = []

for filename in os.listdir(data_dir):
    if filename.endswith('df_photo_ids.csv'):
        df_tmp = pd.read_csv(data_dir + filename)
        print(f'Found file {filename}. Dataframe with {len(df_tmp)} rows created.')
        df_list.append(df_tmp)

# Concatenate dataframes to one
df_photo_ids_final = pd.concat(df_list)
print(f'Resulting dataframe with {len(df_photo_ids_final)} rows created.')

# Create a cvs file as backup
df_photo_ids_final.to_csv(data_dir + f'{date.today().strftime("%Y%m%d")}-df_photo_ids-final.csv')

In [None]:
# Read csv file if needed
#df_photo_ids_final = pd.read_csv(f'./data/{date.today().strftime("%Y%m%d")}-df_photo_ids-final.csv')
#print(f'Imported csv file with {len(df_photo_ids_final)} rows.')

In [46]:
#df_photo_ids_final.to_csv('./data/df_photo_ids_final.bak')
df_photo_ids_final = pd.read_csv('./data/old/20230910-df_photo_ids-final.csv', index_col=[0])

## Get EXIF data with IDs

In [None]:
#df_photo_ids_final[df_photo_ids_final['id'] == 53176360476]
df_photo_ids_final = df_photo_ids_final.drop(range(27404), errors='ignore')
df_photo_ids_final

In [47]:
# Iterate through dataframe, query API for EXIF data and add to another dataframe

print(f'Number of rows of dataframe: {len(df_photo_ids_final)}')

# start timer
start_time = time.time()

# Count the number of added entries
counter = 0

for i, row in df_photo_ids_final.iterrows():

    # Query the API
    try:
        exif_data = flickr.photos.getExif(photo_id = row['id'], photo_secret = 'secret').get('photo').get('exif')

        # Print counter
        print(f'{datetime.now().strftime("%H:%M:%S")}: Added entry {counter}: {row["id"]}, {row["title"]} | {len(df_photo_ids_final) - counter} remaining')
        counter += 1

        # Temporary dict
        dict_tmp = {}

        # Go through every EXIF key value pair available and add to tmp dict
        for exif in exif_data:
            key = exif.get('label')
            value = exif.get('raw').get('_content')
            dict_tmp[key] = value

        # Add ID from ID dataframe
        dict_tmp.update({'id': row['id']})
        dict_tmp.update({'owner': row['owner']})
        dict_tmp.update({'secret': row['secret']})
        dict_tmp.update({'title': row['title']})
        
        dict_tmp.update({'lat': 'tbd'})
        dict_tmp.update({'lon': 'tbd'})
        dict_tmp.update({'acc': 'tbd'})

        # Add to dataframe
        df_photo_exif.loc[len(df_photo_exif)] = dict_tmp

        # Add to CSV file
        filename = './data/df_photo_exif_final_tmp.csv'
        df_tmp = pd.DataFrame(columns = columns_df_photo_exif)
        df_tmp.loc[len(df_tmp)] = dict_tmp
        df_tmp.to_csv(filename, mode='a', header=not os.path.exists(filename))

    except flickrapi.exceptions.FlickrError as ex:
        print(f'!!! Error code: {ex.code} for id {row["id"]}')
    
    # Delete row from photo id dataframe
    df_photo_ids_final = df_photo_ids_final.drop(i)

# end the timer and calculate duration
end_time = time.time()
minutes, seconds = divmod(int(end_time - start_time), 60)

print(f'Fetched {counter} entries in {minutes} minutes and {seconds} seconds. Dataframe is now {len(df_photo_exif)} rows long.')

Number of rows of dataframe: 360498
19:19:37: Added entry 0: 53175498298, Buy Tadarise 20mg Online | 360498 remaining
19:19:38: Added entry 1: 53175304873, Versturdalur, Jokulsargljufur N.P. (Iceland) | 360496 remaining
19:19:38: Added entry 2: 53174094612, Geologic Formations, Theodore Roosevelt NP (North Unit), McKenzie County, ND (5) | 360494 remaining
19:19:39: Added entry 3: 53174087012, Geologic Formations, Theodore Roosevelt NP (North Unit), McKenzie County, ND (9) | 360492 remaining
19:19:39: Added entry 4: 53175117870, Geologic Formations, Theodore Roosevelt NP (North Unit), McKenzie County, ND (8) | 360490 remaining
19:19:40: Added entry 5: 53175117725, Geologic Formations, Theodore Roosevelt NP (North Unit), McKenzie County, ND (7) | 360488 remaining
19:19:41: Added entry 6: 53174112289, DSC_4686 former Crown Hotel, 2 Oxide Street, Broken Hill NSW | 360486 remaining
!!! Error code: 2 for id 53172905117
19:19:42: Added entry 7: 53172870344, Rural Farm Life - Crab Orchard, Ten

KeyboardInterrupt: 

In [None]:
#df_photo_ids_final = df_photo_ids_final.drop(range(1000), errors='ignore')

In [None]:
df_photo_exif_final = pd.read_csv(data_dir + f'df_photo_exif_final.csv', index_col=[0])
print(f'Imported csv file with {len(df_photo_exif_final)} rows')

In [None]:
# Remove duplicates and save dataframe to csv
df_photo_exif_final = df_remove_dupes(df_photo_exif_final)
#df_photo_exif_final.to_csv(f'./data/df_photo_exif_final.csv')