In [9]:
# Python modules
import flickrapi
import pandas as pd
import os
import json
from dotenv import load_dotenv
load_dotenv()
import numpy as np
import subprocess
from geopy import distance
import pycountry_convert as pc
import geopandas

# Import additional functions
from flickr_functions import df_remove_dupes

# Keys needed for API access
api_key = os.getenv('flickr_api_key')
api_secret = os.getenv('flickr_api_secret')

# Flickr API object
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

# Data directory used to store CVS files
data_dir = './data/'

# Load EXIF dataframe from CVS file
df = pd.read_csv(data_dir + 'df_photo_exif_cleaned.csv', index_col=[0])

# Data types
df['id'] = df['id'].astype(int)
df['aperture'] = pd.to_numeric(df['aperture'])
df['iso_speed'] = pd.to_numeric(df['iso_speed'])
df.date_and_time = pd.to_datetime(df.date_and_time, format='%Y:%m:%d %H:%M:%S', errors='coerce')
df.lat, df.lon = df.lat.astype(float), df.lon.astype(float)
df['focal_length'] = pd.to_numeric(df['focal_length'])
df['acc'] = pd.to_numeric(df['acc'])
df['compression'] = df['compression'].astype('string')
df['make'] = df['make'].astype('string')
df['model'] = df['model'].astype('string')
df['software'] = df['software'].astype('string')
df['exposure'] = df['exposure'].astype('string')
df['flash'] = df['flash'].astype('string')
df['white_balance'] = df['white_balance'].astype('string')
df['user_id'] = df['user_id'].astype('string')
df['secret'] = df['secret'].astype('string')
df['country'] = df['country'].astype('string')
df['admin_lvl1'] = df['admin_lvl1'].astype('string')
df['admin_lvl2'] = df['admin_lvl2'].astype('string')
df['city'] = df['city'].astype('string')

# Run command function
def runcmd(cmd, verbose = False, *args, **kwargs):

    process = subprocess.Popen(
        cmd,
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE,
        text = True,
        shell = True
    )
    std_out, std_err = process.communicate()
    if verbose:
        print(std_out.strip(), std_err)
    pass

In [11]:
df['user_id'].nunique() / df['user_id'].count() * 100

14.576845864589579

In [17]:
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')
for index, row in df[df['country'] == 'Faroe Islands'].iterrows():
    sizes = flickr.photos.getSizes(photo_id = row.get('id'))
    for size in sizes.get('sizes').get('size'):
        print(size)
        #if size.get('label') == 'Original':
            #runcmd('wget ' + size.get('source'), verbose = True)

{'label': 'Square', 'width': 75, 'height': 75, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e27a8_s.jpg', 'url': 'https://www.flickr.com/photos/paul-mcclure/53151215564/sizes/sq/', 'media': 'photo'}
{'label': 'Large Square', 'width': 150, 'height': 150, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e27a8_q.jpg', 'url': 'https://www.flickr.com/photos/paul-mcclure/53151215564/sizes/q/', 'media': 'photo'}
{'label': 'Thumbnail', 'width': 100, 'height': 75, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e27a8_t.jpg', 'url': 'https://www.flickr.com/photos/paul-mcclure/53151215564/sizes/t/', 'media': 'photo'}
{'label': 'Small', 'width': 240, 'height': 180, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e27a8_m.jpg', 'url': 'https://www.flickr.com/photos/paul-mcclure/53151215564/sizes/s/', 'media': 'photo'}
{'label': 'Small 320', 'width': 320, 'height': 240, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e2

In [18]:
df.notna().sum() * 100 / len(df)

index            100.000000
id               100.000000
compression       70.906287
make              82.237413
model             82.301227
software          78.744382
exposure          80.680421
aperture          80.318081
iso_speed         80.445708
date_and_time     82.817759
flash             80.042834
focal_length      80.243313
white_balance     79.696927
user_id          100.000000
secret           100.000000
title             98.903943
lat               58.295333
lon               58.295333
acc               58.295333
country           57.371271
admin_lvl1        55.780591
admin_lvl2        45.765994
city              52.581978
dtype: float64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365127 entries, 0 to 365126
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   index          365127 non-null  int64  
 1   id             365127 non-null  int64  
 2   compression    258898 non-null  object 
 3   make           300271 non-null  object 
 4   model          300504 non-null  object 
 5   software       287517 non-null  object 
 6   exposure       294586 non-null  object 
 7   aperture       293263 non-null  float64
 8   iso_speed      293729 non-null  float64
 9   date_and_time  302390 non-null  object 
 10  flash          292258 non-null  object 
 11  focal_length   292990 non-null  float64
 12  white_balance  290995 non-null  object 
 13  user_id        365127 non-null  object 
 14  secret         365127 non-null  object 
 15  title          361125 non-null  object 
 16  lat            212852 non-null  float64
 17  lon            212852 non-null  fl

In [20]:
exif_data = flickr.photos.getExif(photo_id = 53095647615, photo_secret = 'secret').get('photo')
exif_data

{'id': '53095647615',
 'secret': 'd3c95204e2',
 'server': '65535',
 'farm': 66,
 'camera': '',
 'exif': [{'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'ImageWidth',
   'label': 'Image Width',
   'raw': {'_content': '2240'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'ImageHeight',
   'label': 'Image Height',
   'raw': {'_content': '1680'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'BitsPerSample',
   'label': 'Bits Per Sample',
   'raw': {'_content': '8 8 8'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'Compression',
   'label': 'Compression',
   'raw': {'_content': 'JPEG (old-style)'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'PhotometricInterpretation',
   'label': 'Photometric Interpretation',
   'raw': {'_content': 'RGB'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'Orientation',
   'label': 'Orientation',
   'raw': {'_content': 'Horizontal (normal)'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'SamplesPerPix

In [21]:
for x in exif_data.get('camera'):
    print(x)

In [23]:
df.groupby(df['make']).count().sort_values(by='id', ascending=False).head(25)

Unnamed: 0_level_0,index,id,compression,model,software,exposure,aperture,iso_speed,date_and_time,flash,...,user_id,secret,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Canon,95958,95958,82155,95881,75301,94388,94136,93750,94930,93385,...,95958,95958,95546,54463,54463,54463,53225,51775,42492,49332
NIKON CORPORATION,58022,58022,48945,58019,56958,57550,57433,56898,57772,57349,...,58022,58022,57340,34364,34364,34364,33842,32879,27007,31133
Apple,33068,33068,18466,33068,32774,32892,33022,32911,32834,32864,...,33068,33068,31385,24439,24439,24439,24156,23528,19887,22201
SONY,31560,31560,27165,31560,29984,31436,30935,31407,31444,31105,...,31560,31560,31400,21220,21220,21220,20996,20300,15762,19720
Panasonic,16410,16410,13786,16410,15696,15901,15880,16357,16336,16307,...,16410,16410,16353,9409,9409,9409,9326,9119,7455,8881
samsung,12434,12434,9616,12434,11605,12180,12212,12218,12271,12155,...,12434,12434,12343,8963,8963,8963,8919,8766,7427,7929
FUJIFILM,9353,9353,7819,9353,9245,9298,9141,9308,9321,9120,...,9353,9353,9151,5552,5552,5552,5502,5314,4218,5117
Google,3982,3982,3350,3982,3929,3950,3950,3953,3969,3938,...,3982,3982,3977,2990,2990,2990,2970,2843,2278,2802
OLYMPUS CORPORATION,3836,3836,3215,3836,3673,3822,3818,3829,3590,3789,...,3836,3836,3826,2774,2774,2774,2700,2614,2244,2446
NIKON,3749,3749,3165,3749,3626,3692,3692,3732,3653,3719,...,3749,3749,3737,2525,2525,2525,2495,2465,1873,2345


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365127 entries, 0 to 365126
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   index          365127 non-null  int64  
 1   id             365127 non-null  int64  
 2   compression    258898 non-null  object 
 3   make           300271 non-null  object 
 4   model          300504 non-null  object 
 5   software       287517 non-null  object 
 6   exposure       294586 non-null  object 
 7   aperture       293263 non-null  float64
 8   iso_speed      293729 non-null  float64
 9   date_and_time  302390 non-null  object 
 10  flash          292258 non-null  object 
 11  focal_length   292990 non-null  float64
 12  white_balance  290995 non-null  object 
 13  user_id        365127 non-null  object 
 14  secret         365127 non-null  object 
 15  title          361125 non-null  object 
 16  lat            212852 non-null  float64
 17  lon            212852 non-null  fl

### Filter by radius around location

In [25]:
# Paris center
loc_paris = (48.85341, 2.3488)
df_paris = pd.DataFrame(columns = df.columns.tolist())

# Amsterdam center
loc_amsterdam = (52.37308, 4.89245)
df_amsterdam = pd.DataFrame(columns = df.columns.tolist())

# Barcelona center
loc_barcelona = (41.38289, 2.17743)
df_barcelona = pd.DataFrame(columns = df.columns.tolist())

# Yosemite national park
loc_yosemite = (37.83930, -119.51646)
df_yosemite = pd.DataFrame(columns = df.columns.tolist())

# Lake Geneva (Switzerland)
loc_geneva = (46.448961, 6.503401)
df_geneva = pd.DataFrame(columns = df.columns.tolist())

In [26]:
radius = 25

# Iterate through df
for i, row in df[df['lat'].notnull()].iterrows():

    # Assign lat and lon to tuple
    img_loc = (row['lat'], row['lon'])

    # Check if distance is smaller than 25 km
    if distance.distance(loc_paris, img_loc).km <= radius:
        df_paris.loc[len(df_paris)] = row

    if distance.distance(loc_amsterdam, img_loc).km <= radius:
        df_amsterdam.loc[len(df_amsterdam)] = row

    if distance.distance(loc_barcelona, img_loc).km <= radius:
        df_barcelona.loc[len(df_barcelona)] = row

    if distance.distance(loc_yosemite, img_loc).km <= radius + 25:
        df_yosemite.loc[len(df_yosemite)] = row

    if distance.distance(loc_geneva, img_loc).km <= radius + 25:
        df_geneva.loc[len(df_geneva)] = row

In [28]:
print(f'Paris: {len(df_paris)}')
print(f'Amsterdam: {len(df_amsterdam)}')
print(f'Barcelona: {len(df_barcelona)}')
print(f'Yosemite: {len(df_yosemite)}')
print(f'Lake Geneva: {len(df_geneva)}')

Paris: 2195
Amsterdam: 816
Barcelona: 1995
Yosemite: 85
Lake Geneva: 596


In [29]:
df_paris.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Canon                          439
Apple                          406
SONY                           360
NIKON CORPORATION              320
Panasonic                      201
samsung                         96
FUJIFILM                        64
Xiaomi                          41
RICOH IMAGING COMPANY, LTD.     26
SAMSUNG                         21
NIKON                            9
OLYMPUS IMAGING CORP.            7
Phase One                        6
PENTAX Corporation               6
OLYMPUS CORPORATION              5
Google                           5
Leica Camera AG                  5
SAMSUNG TECHWIN                  4
Plustek                          4
PENTAX                           4
OnePlus                          3
UMIDIGI                          2
FUJIFILM                         2
SEIKO EPSON CORP.                2
Hipstamatic                      2
Nokia                            1
HUAWEI                           1
HONOR                            1
Sony           

In [30]:
df_amsterdam.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Canon                          257
SONY                           185
samsung                         83
NIKON CORPORATION               65
Apple                           54
FUJIFILM                        21
OLYMPUS CORPORATION             15
Panasonic                       11
OLYMPUS IMAGING CORP.            7
Google                           5
Phase One A/S                    5
Nokia                            5
OLYMPUS OPTICAL CO.,LTD          4
NIKON                            4
LG Electronics                   4
HTC                              3
motorola                         3
HUAWEI                           3
LEICA CAMERA AG                  3
Nikon                            2
Plustek                          2
RICOH IMAGING COMPANY, LTD.      2
SAMSUNG                          2
FUJI PHOTO FILM CO., LTD.        1
Xiaomi                           1
Sony Ericsson                    1
Samsung Techwin                  1
NORITSU KOKI                     1
Phase One      

In [31]:
df_barcelona.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Xiaomi                         880
Apple                          293
NIKON CORPORATION              178
Canon                          139
LENOVO                         116
FUJIFILM                        62
Panasonic                       43
HUAWEI                          42
samsung                         34
SONY                            30
OM Digital Solutions            28
EPSON                           18
LEICA CAMERA AG                 10
OLYMPUS IMAGING CORP.            7
OLYMPUS CORPORATION              5
asus                             4
PENTAX                           3
RICOH IMAGING COMPANY, LTD.      3
Google                           3
ZTE                              2
NORITSU KOKI                     1
NIKON                            1
LG Electronics                   1
PENTAX Corporation               1
KONICA MINOLTA                   1
Samsung                          1
FUJI PHOTO FILM CO., LTD.        1
bq                               1
EASTMAN KODAK C

In [32]:
df_yosemite.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Canon                    35
SONY                     11
NIKON CORPORATION         8
PENTAX                    6
Apple                     5
FUJIFILM                  4
samsung                   4
Eastman Kodak Company     1
OLYMPUS IMAGING CORP.     1
Name: id, dtype: int64

In [33]:
df_geneva.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Canon                          146
FUJIFILM                        94
SONY                            84
NIKON CORPORATION               44
OLYMPUS CORPORATION             43
samsung                         30
Apple                           21
RICOH IMAGING COMPANY, LTD.     20
Panasonic                        7
OLYMPUS IMAGING CORP.            5
OLYMPUS IMAGING CORP.            5
Leica Camera AG                  4
NIKON                            3
LEICA                            3
SAMSUNG                          2
OLYMPUS                          1
OM Digital Solutions             1
SIGMA                            1
Hasselblad                       1
Name: id, dtype: int64

### Add continent to df

In [12]:
def country_to_continent(country):
    while True:
      try:
        country_alpha2 = pc.country_name_to_country_alpha2(country)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return country_continent_name
      except:
        return np.NaN

df['continent'] = df['country'].apply(lambda x: country_to_continent(x))

In [15]:
df['continent'] = df['continent'].astype('string')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365127 entries, 0 to 365126
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   index          365127 non-null  int64         
 1   id             365127 non-null  int64         
 2   compression    258898 non-null  string        
 3   make           300271 non-null  string        
 4   model          300504 non-null  string        
 5   software       287517 non-null  string        
 6   exposure       294586 non-null  string        
 7   aperture       293263 non-null  float64       
 8   iso_speed      293729 non-null  float64       
 9   date_and_time  0 non-null       datetime64[ns]
 10  flash          292258 non-null  string        
 11  focal_length   292990 non-null  float64       
 12  white_balance  290995 non-null  string        
 13  user_id        365127 non-null  string        
 14  secret         365127 non-null  string        
 15  title

In [13]:
df.sample(10)

Unnamed: 0,index,id,compression,make,model,software,exposure,aperture,iso_speed,date_and_time,...,secret,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city,continent
232308,232308,50100495677,JPEG (old-style),Panasonic,DMC-FZ72,Ver.1.0,1/250,3.8,100.0,NaT,...,f2031d9632,Return to Castle Bromwich Hall Gardens - Lower...,52.50576,-1.793658,16.0,United Kingdom,England,West Midlands,Birmingham,Europe
218713,218713,752911819,JPEG (old-style),Canon,Canon DIGITAL IXUS 65,,1/1000,2.8,,NaT,...,7be2fcb30e,Montreux Jazz Festival,46.432651,6.913833,12.0,Switzerland,Vaud,Riviera-Pays-d'Enhaut,Montreux,Europe
59028,59028,53104128575,JPEG (old-style),Panasonic,DMC-TZ30,Ver.1.0,1/1600,3.3,100.0,NaT,...,ba62444b19,Sisyrinchium angustifolium Mill. 1768 (IRIDACE...,43.245859,-2.159221,16.0,Spain,Euskadi,Gipuzkoa,Laurgain,Europe
150806,150806,50214756943,JPEG (old-style),Canon,Canon EOS 5D Mark III,,1/40,8.0,125.0,NaT,...,df3fc9683c,Altlasten,,,,,,,,
303900,303900,53193049440,JPEG (old-style),Google,Pixel 5,HDR+ 1.0.540104767zd,1/900,1.7,54.0,NaT,...,47e2cc7e4f,Lake Hollywood,34.124691,-118.33575,16.0,United States,California,Los Angeles County,Los Angeles,North America
38499,38499,28328163003,,Apple,iPhone 6,8.1.3,1/40,2.2,32.0,NaT,...,b05a67b50b,"#1A-6, NASCAR, INDY, NHRA, Lamp Box Has 67 Au...",,,,,,,,
307104,307104,52984484897,,,,GIMP 2.10.34,,,,NaT,...,b01bf0f222,Pfaffenstein - 2023-01,50.905157,14.079236,15.0,Germany,Sachsen,,Königstein,Europe
42189,42189,53150788759,JPEG (old-style),NIKON CORPORATION,NIKON D500,CyberLink PhotoDirector 11 (Windows),1/800,4.0,640.0,NaT,...,ce2e5116a1,DSC_1003,,,,,,,,
354189,354189,25318014699,,,,,,,,NaT,...,87a302f5b5,#ortakoy mosque istanbul #instacool #instamoo...,41.0186,28.9647,16.0,Türkiye,İstanbul,Fatih,,
275146,275146,49937042428,JPEG (old-style),,,,,,,NaT,...,18cf3e5083,"Trockenheit, Sachsen-Anhalt",51.795027,11.711425,5.0,Germany,Sachsen-Anhalt,,Bernburg (Saale),Europe


In [36]:
df.groupby('continent')['id'].nunique().sort_values(ascending=False)

continent
Europe           133726
North America     45450
Asia              14026
South America      6307
Oceania            5506
Africa             2780
Name: id, dtype: int64

In [38]:
df.to_csv('./data/df_photo_exif_cleaned.csv')