In [18]:
# Python modules
import flickrapi
import pandas as pd
import os
import json
from dotenv import load_dotenv
load_dotenv()
import numpy as np
import subprocess
from geopy import distance
import pycountry_convert as pc
import geopandas

# Import additional functions
from flickr_functions import df_remove_dupes

# Keys needed for API access
api_key = os.getenv('flickr_api_key')
api_secret = os.getenv('flickr_api_secret')

# Flickr API object
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

# Data directory used to store CVS files
data_dir = './data/'

# Load EXIF dataframe from CVS file
df = pd.read_csv(data_dir + 'df_photo_exif_cleaned.csv', index_col=[0])

# Data types
df['id'] = df['id'].astype(int)
df['aperture'] = pd.to_numeric(df['aperture'])
df['iso_speed'] = pd.to_numeric(df['iso_speed'])
df.date_and_time = pd.to_datetime(df.date_and_time, format='%Y-%m-%d %H:%M:%S', errors='coerce')
df.date_and_time_modified = pd.to_datetime(df.date_and_time_modified, format='%Y-%m-%d %H:%M:%S', errors='coerce')
df.date_and_time_digitized = pd.to_datetime(df.date_and_time_digitized, format='%Y-%m-%d %H:%M:%S', errors='coerce')
df.lat, df.lon = df.lat.astype(float), df.lon.astype(float)
df['focal_length'] = pd.to_numeric(df['focal_length'])
df['acc'] = pd.to_numeric(df['acc'])
df['compression'] = df['compression'].astype('string')
df['make'] = df['make'].astype('string')
df['model'] = df['model'].astype('string')
df['software'] = df['software'].astype('string')
df['exposure'] = df['exposure'].astype('string')
df['flash'] = df['flash'].astype('string')
df['white_balance'] = df['white_balance'].astype('string')
df['user_id'] = df['user_id'].astype('string')
df['secret'] = df['secret'].astype('string')
df['country'] = df['country'].astype('string')
df['admin_lvl1'] = df['admin_lvl1'].astype('string')
df['admin_lvl2'] = df['admin_lvl2'].astype('string')
df['city'] = df['city'].astype('string')

# Run command function
def runcmd(cmd, verbose = False, *args, **kwargs):

    process = subprocess.Popen(
        cmd,
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE,
        text = True,
        shell = True
    )
    std_out, std_err = process.communicate()
    if verbose:
        print(std_out.strip(), std_err)
    pass

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365127 entries, 0 to 365126
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   index                    365127 non-null  int64         
 1   id                       365127 non-null  int64         
 2   compression              258898 non-null  string        
 3   make                     300271 non-null  string        
 4   model                    300504 non-null  string        
 5   software                 287517 non-null  string        
 6   date_and_time_modified   307267 non-null  datetime64[ns]
 7   exposure                 294586 non-null  string        
 8   aperture                 293263 non-null  float64       
 9   iso_speed                293729 non-null  float64       
 10  date_and_time            302390 non-null  datetime64[ns]
 11  date_and_time_digitized  299969 non-null  datetime64[ns]
 12  flash                

In [36]:
df['user_id'].nunique() / df['user_id'].count() * 100

14.576845864589579

In [17]:
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')
for index, row in df[df['country'] == 'Faroe Islands'].iterrows():
    sizes = flickr.photos.getSizes(photo_id = row.get('id'))
    for size in sizes.get('sizes').get('size'):
        print(size)
        #if size.get('label') == 'Original':
            #runcmd('wget ' + size.get('source'), verbose = True)

{'label': 'Square', 'width': 75, 'height': 75, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e27a8_s.jpg', 'url': 'https://www.flickr.com/photos/paul-mcclure/53151215564/sizes/sq/', 'media': 'photo'}
{'label': 'Large Square', 'width': 150, 'height': 150, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e27a8_q.jpg', 'url': 'https://www.flickr.com/photos/paul-mcclure/53151215564/sizes/q/', 'media': 'photo'}
{'label': 'Thumbnail', 'width': 100, 'height': 75, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e27a8_t.jpg', 'url': 'https://www.flickr.com/photos/paul-mcclure/53151215564/sizes/t/', 'media': 'photo'}
{'label': 'Small', 'width': 240, 'height': 180, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e27a8_m.jpg', 'url': 'https://www.flickr.com/photos/paul-mcclure/53151215564/sizes/s/', 'media': 'photo'}
{'label': 'Small 320', 'width': 320, 'height': 240, 'source': 'https://live.staticflickr.com/65535/53151215564_bd9a9e2

In [18]:
df.notna().sum() * 100 / len(df)

index            100.000000
id               100.000000
compression       70.906287
make              82.237413
model             82.301227
software          78.744382
exposure          80.680421
aperture          80.318081
iso_speed         80.445708
date_and_time     82.817759
flash             80.042834
focal_length      80.243313
white_balance     79.696927
user_id          100.000000
secret           100.000000
title             98.903943
lat               58.295333
lon               58.295333
acc               58.295333
country           57.371271
admin_lvl1        55.780591
admin_lvl2        45.765994
city              52.581978
dtype: float64

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365127 entries, 0 to 365126
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   index                    365127 non-null  int64         
 1   id                       365127 non-null  int64         
 2   compression              258898 non-null  string        
 3   make                     300271 non-null  string        
 4   model                    300504 non-null  string        
 5   software                 287517 non-null  string        
 6   date_and_time_modified   307267 non-null  object        
 7   exposure                 294586 non-null  string        
 8   aperture                 293263 non-null  float64       
 9   iso_speed                293729 non-null  float64       
 10  date_and_time            0 non-null       datetime64[ns]
 11  date_and_time_digitized  299969 non-null  object        
 12  flash                

In [20]:
exif_data = flickr.photos.getExif(photo_id = 53095647615, photo_secret = 'secret').get('photo')
exif_data

{'id': '53095647615',
 'secret': 'd3c95204e2',
 'server': '65535',
 'farm': 66,
 'camera': '',
 'exif': [{'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'ImageWidth',
   'label': 'Image Width',
   'raw': {'_content': '2240'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'ImageHeight',
   'label': 'Image Height',
   'raw': {'_content': '1680'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'BitsPerSample',
   'label': 'Bits Per Sample',
   'raw': {'_content': '8 8 8'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'Compression',
   'label': 'Compression',
   'raw': {'_content': 'JPEG (old-style)'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'PhotometricInterpretation',
   'label': 'Photometric Interpretation',
   'raw': {'_content': 'RGB'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'Orientation',
   'label': 'Orientation',
   'raw': {'_content': 'Horizontal (normal)'}},
  {'tagspace': 'IFD0',
   'tagspaceid': 0,
   'tag': 'SamplesPerPix

In [21]:
for x in exif_data.get('camera'):
    print(x)

In [39]:
df.groupby(df['make']).count().sort_values(by='id', ascending=False).head(25)['id']

make
Canon                          95958
NIKON CORPORATION              58022
Apple                          33068
SONY                           31560
Panasonic                      16410
samsung                        12434
FUJIFILM                        9353
Google                          3982
OLYMPUS CORPORATION             3836
NIKON                           3749
OLYMPUS IMAGING CORP.           3163
RICOH IMAGING COMPANY, LTD.     2971
Xiaomi                          2872
HUAWEI                          2601
DJI                             1382
SAMSUNG                         1359
motorola                        1105
OM Digital Solutions            1074
PENTAX                           950
LEICA CAMERA AG                  831
Leica Camera AG                  806
OLYMPUS IMAGING CORP.            582
OPPO                             559
Hasselblad                       553
OnePlus                          513
Name: id, dtype: int64

In [38]:
df.groupby(df['model']).count().sort_values(by='id', ascending=False).head(25)['id']

model
Canon EOS 7D Mark II     8672
Canon EOS 5D Mark IV     7216
Canon EOS R5             4782
Canon EOS 6D             4592
Canon EOS 5D Mark III    4288
Canon EOS 6D Mark II     3979
NIKON D850               3830
ILCE-7M3                 3729
NIKON D750               3640
Canon EOS 7D             3197
ILCE-7C                  3190
Canon EOS R6             2971
Canon EOS 70D            2932
NIKON D7200              2837
Canon EOS 5D Mark II     2722
NIKON D500               2643
NIKON Z 9                2575
Canon EOS 80D            2399
NIKON D7000              2331
iPhone 11                2320
NIKON D90                2309
iPhone 13 Pro            2248
NIKON Z 6_2              2230
NIKON D7500              2229
NIKON D7100              2097
Name: id, dtype: int64

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365127 entries, 0 to 365126
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   index          365127 non-null  int64  
 1   id             365127 non-null  int64  
 2   compression    258898 non-null  object 
 3   make           300271 non-null  object 
 4   model          300504 non-null  object 
 5   software       287517 non-null  object 
 6   exposure       294586 non-null  object 
 7   aperture       293263 non-null  float64
 8   iso_speed      293729 non-null  float64
 9   date_and_time  302390 non-null  object 
 10  flash          292258 non-null  object 
 11  focal_length   292990 non-null  float64
 12  white_balance  290995 non-null  object 
 13  user_id        365127 non-null  object 
 14  secret         365127 non-null  object 
 15  title          361125 non-null  object 
 16  lat            212852 non-null  float64
 17  lon            212852 non-null  fl

### Filter by radius around location

In [19]:
# Paris center
loc_paris = (48.85341, 2.3488)
df_paris = pd.DataFrame(columns = df.columns.tolist())

# Amsterdam center
loc_amsterdam = (52.37308, 4.89245)
df_amsterdam = pd.DataFrame(columns = df.columns.tolist())

# Barcelona center
loc_barcelona = (41.38289, 2.17743)
df_barcelona = pd.DataFrame(columns = df.columns.tolist())

# Yosemite national park
loc_yosemite = (37.83930, -119.51646)
df_yosemite = pd.DataFrame(columns = df.columns.tolist())

# Lake Geneva (Switzerland)
loc_geneva = (46.448961, 6.503401)
df_geneva = pd.DataFrame(columns = df.columns.tolist())

In [20]:
radius = 25

# Iterate through df
for i, row in df[df['lat'].notnull()].iterrows():

    # Assign lat and lon to tuple
    img_loc = (row['lat'], row['lon'])

    # Check if distance is smaller than 25 km
    if distance.distance(loc_paris, img_loc).km <= radius:
        df_paris.loc[len(df_paris)] = row

    if distance.distance(loc_amsterdam, img_loc).km <= radius:
        df_amsterdam.loc[len(df_amsterdam)] = row

    if distance.distance(loc_barcelona, img_loc).km <= radius:
        df_barcelona.loc[len(df_barcelona)] = row

    if distance.distance(loc_yosemite, img_loc).km <= radius + 25:
        df_yosemite.loc[len(df_yosemite)] = row

    if distance.distance(loc_geneva, img_loc).km <= radius + 25:
        df_geneva.loc[len(df_geneva)] = row

In [21]:
print(f'Paris: {len(df_paris)}')
print(f'Amsterdam: {len(df_amsterdam)}')
print(f'Barcelona: {len(df_barcelona)}')
print(f'Yosemite: {len(df_yosemite)}')
print(f'Lake Geneva: {len(df_geneva)}')

Paris: 2195
Amsterdam: 816
Barcelona: 1995
Yosemite: 85
Lake Geneva: 596


In [22]:
df_paris.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Canon                          439
Apple                          406
SONY                           360
NIKON CORPORATION              320
Panasonic                      201
samsung                         96
FUJIFILM                        64
Xiaomi                          41
RICOH IMAGING COMPANY, LTD.     26
SAMSUNG                         21
NIKON                            9
OLYMPUS IMAGING CORP.            7
Phase One                        6
PENTAX Corporation               6
OLYMPUS CORPORATION              5
Google                           5
Leica Camera AG                  5
SAMSUNG TECHWIN                  4
Plustek                          4
PENTAX                           4
OnePlus                          3
UMIDIGI                          2
FUJIFILM                         2
SEIKO EPSON CORP.                2
Hipstamatic                      2
Nokia                            1
HUAWEI                           1
HONOR                            1
Sony           

In [23]:
df_amsterdam.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Canon                          257
SONY                           185
samsung                         83
NIKON CORPORATION               65
Apple                           54
FUJIFILM                        21
OLYMPUS CORPORATION             15
Panasonic                       11
OLYMPUS IMAGING CORP.            7
Google                           5
Phase One A/S                    5
Nokia                            5
OLYMPUS OPTICAL CO.,LTD          4
NIKON                            4
LG Electronics                   4
HTC                              3
motorola                         3
HUAWEI                           3
LEICA CAMERA AG                  3
Nikon                            2
Plustek                          2
RICOH IMAGING COMPANY, LTD.      2
SAMSUNG                          2
FUJI PHOTO FILM CO., LTD.        1
Xiaomi                           1
Sony Ericsson                    1
Samsung Techwin                  1
NORITSU KOKI                     1
Phase One      

In [24]:
df_barcelona.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Xiaomi                         880
Apple                          293
NIKON CORPORATION              178
Canon                          139
LENOVO                         116
FUJIFILM                        62
Panasonic                       43
HUAWEI                          42
samsung                         34
SONY                            30
OM Digital Solutions            28
EPSON                           18
LEICA CAMERA AG                 10
OLYMPUS IMAGING CORP.            7
OLYMPUS CORPORATION              5
asus                             4
PENTAX                           3
RICOH IMAGING COMPANY, LTD.      3
Google                           3
ZTE                              2
NORITSU KOKI                     1
NIKON                            1
LG Electronics                   1
PENTAX Corporation               1
KONICA MINOLTA                   1
Samsung                          1
FUJI PHOTO FILM CO., LTD.        1
bq                               1
EASTMAN KODAK C

In [25]:
df_yosemite.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Canon                    35
SONY                     11
NIKON CORPORATION         8
PENTAX                    6
Apple                     5
FUJIFILM                  4
samsung                   4
Eastman Kodak Company     1
OLYMPUS IMAGING CORP.     1
Name: id, dtype: int64

In [26]:
df_geneva.groupby('make')['id'].nunique().sort_values(ascending=False)

make
Canon                          146
FUJIFILM                        94
SONY                            84
NIKON CORPORATION               44
OLYMPUS CORPORATION             43
samsung                         30
Apple                           21
RICOH IMAGING COMPANY, LTD.     20
Panasonic                        7
OLYMPUS IMAGING CORP.            5
OLYMPUS IMAGING CORP.            5
Leica Camera AG                  4
NIKON                            3
LEICA                            3
SAMSUNG                          2
OLYMPUS                          1
OM Digital Solutions             1
SIGMA                            1
Hasselblad                       1
Name: id, dtype: int64

### Add continent to df

In [20]:
def country_to_continent(country):
    while True:
      try:
        country_alpha2 = pc.country_name_to_country_alpha2(country)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return country_continent_name
      except:
        return np.NaN

df['continent'] = df['country'].apply(lambda x: country_to_continent(x))

In [21]:
df['continent'] = df['continent'].astype('string')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365127 entries, 0 to 365126
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   index                    365127 non-null  int64         
 1   id                       365127 non-null  int64         
 2   compression              258898 non-null  string        
 3   make                     300271 non-null  string        
 4   model                    300504 non-null  string        
 5   software                 287517 non-null  string        
 6   date_and_time_modified   307267 non-null  datetime64[ns]
 7   exposure                 294586 non-null  string        
 8   aperture                 293263 non-null  float64       
 9   iso_speed                293729 non-null  float64       
 10  date_and_time            302390 non-null  datetime64[ns]
 11  date_and_time_digitized  299969 non-null  datetime64[ns]
 12  flash                

In [23]:
df.sample(10)

Unnamed: 0,index,id,compression,make,model,software,date_and_time_modified,exposure,aperture,iso_speed,...,secret,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city,continent
104442,104442,53156254477,,,,,NaT,,,,...,c7948e2608,"UFFICIALE. Primavera Hellas Verona, dal Sassuo...",,,,,,,,
4579,4579,36230982601,,samsung,SM-G800F,G800FXXU1CQA1,2017-08-04 15:00:26,,2.4,,...,5729c4671f,Photoshoot in my garage,50.866111,5.992777,16.0,Netherlands,Limburg,Heerlen,Heerlen,Europe
288478,288478,53162483673,JPEG (old-style),Canon,Canon EOS R6m2,Adobe Photoshop Lightroom Classic 12.5 (Macint...,2023-09-03 22:25:47,1/2500,4.5,250.0,...,7b2a0a153e,AL1A5872.jpg,46.164672,6.552361,16.0,France,Auvergne-Rhône-Alpes,Haute-Savoie,Mieussy,Europe
141943,141943,51381585665,Uncompressed,,,,NaT,,,,...,26fa2ef993,‘Los Durrell’: un soplo de libertad en una isl...,,,,,,,,
361740,361740,33256707073,JPEG (old-style),,,Adobe Photoshop CC 2017 (Windows),2017-04-16 12:28:33,,,,...,3b95064fc6,LUTO Y PASIÓN,40.415913,-3.708872,16.0,Spain,Comunidad de Madrid,Madrid,Madrid,Europe
267539,267539,3017069706,JPEG (old-style),,,Adobe Photoshop Elements 2.0,2008-11-09 14:47:18,,,,...,d8240f4aa5,The Yellowjacket,39.662531,-74.849596,16.0,United States,New Jersey,Camden County,Winslow Township,North America
127454,127454,50346208596,JPEG (old-style),OLYMPUS CORPORATION,C8080WZ,ACDSee Ultimate 2020,2020-09-15 20:23:09,1/160,3.5,50.0,...,7048db0711,Myanmar - Bagan - Young Girl With Thanaka - 2d,,,,,,,,
320203,320203,52937828526,JPEG (old-style),NIKON CORPORATION,NIKON D300S,Adobe Photoshop Camera Raw 10.5 (Macintosh),2023-05-30 17:26:03,1/60,3.5,400.0,...,ab2b33116a,Brown Bag Seminars - Observatório das Ativida...,38.749574,-9.152867,16.0,Portugal,Lisboa,,Lisboa,Europe
40784,40784,53089288323,JPEG (old-style),SONY,ILCE-1,Adobe Photoshop 24.6 (Macintosh),2023-08-02 22:36:20,1/200,4.5,10000.0,...,c7a8aee459,MZA09109,,,,,,,,
234819,234819,48205962917,,Canon,Canon EOS 5D Mark IV,Adobe Photoshop Lightroom Classic 8.3 (Macintosh),2019-07-05 18:52:19,1/640,3.2,200.0,...,3a854d9052,Pasha @ Kadetten 2019,59.888294,10.52998,16.0,Norway,Viken,Bærum,Sandvika,Europe


In [24]:
df.groupby('continent')['id'].nunique().sort_values(ascending=False)

continent
Europe           133726
North America     45450
Asia              14026
South America      6307
Oceania            5506
Africa             2780
Name: id, dtype: int64

In [25]:
df.to_csv('./data/df_photo_exif_cleaned.csv')

### Playing with the cities csv

In [42]:
df = pd.read_csv(data_dir + 'geonames-all-cities-with-a-population-1000.csv', delimiter=';')

In [46]:
df.Name = df.Name.astype('string')

In [65]:
df

Unnamed: 0,Geoname ID,Name,ASCII Name,Alternate Names,Feature Class,Feature Code,Country Code,Country name EN,Country Code 2,Admin1 Code,Admin2 Code,Admin3 Code,Admin4 Code,Population,Elevation,DIgital Elevation Model,Timezone,Modification date,LABEL EN,Coordinates
0,532459,Lyudinovo,Lyudinovo,"Ljudinovo,Lyudinovo,Людиново",P,PPLA2,RU,Russian Federation,,25,,,,41392,,182,Europe/Moscow,2023-05-11,Russian Federation,"53.86639, 34.44778"
1,533793,Lubyany,Lubyany,"Lubjany,Lubyany,Лубяны",P,PPL,RU,Russian Federation,,73,,,,1899,,60,Europe/Moscow,2012-01-17,Russian Federation,"56.0378, 51.40001"
2,534639,Lobanovo,Lobanovo,"Lobanovo,Лобаново",P,PPL,RU,Russian Federation,,90,,,,3563,,132,Asia/Yekaterinburg,2019-08-13,Russian Federation,"57.8595, 56.3026"
3,535183,Linda,Linda,"Linda,Линда",P,PPL,RU,Russian Federation,,51,,,,5489,,94,Europe/Moscow,2012-01-17,Russian Federation,"56.61571, 44.09544"
4,535806,Leskolovo,Leskolovo,"Leskolovo,Lieskula,Лесколово",P,PPL,RU,Russian Federation,,42,471099,,,4039,,75,Europe/Moscow,2016-08-08,Russian Federation,"60.26341, 30.45462"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141116,3458575,Limeira,Limeira,"Limeira,Limejra,QGB,li mei la,li mey ra,limaye...",P,PPL,BR,Brazil,,27,3526902,,,289665,,582,America/Sao_Paulo,2012-08-03,Brazil,"-22.56472, -47.40167"
141117,3460584,Itaúna,Itauna,"Itauna,ita'una,yi ta wu na,Итауна,Իտաունա,ইটাউ...",P,PPL,BR,Brazil,,15,3133808,,,77400,,848,America/Sao_Paulo,2012-08-03,Brazil,"-20.07528, -44.57639"
141118,3460666,Itaporanga,Itaporanga,,P,PPL,BR,Brazil,,27,3522802,,,10202,,580,America/Sao_Paulo,2012-08-03,Brazil,"-23.70778, -49.48972"
141119,3460707,Itapicuru,Itapicuru,"Missao,Missão",P,PPL,BR,Brazil,,05,2916500,,,4994,,117,America/Bahia,2012-08-03,Brazil,"-11.31667, -38.23333"


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141121 entries, 0 to 141120
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Geoname ID               141121 non-null  int64  
 1   Name                     141120 non-null  object 
 2   ASCII Name               141119 non-null  object 
 3   Alternate Names          116800 non-null  object 
 4   Feature Class            141121 non-null  object 
 5   Feature Code             141121 non-null  object 
 6   Country Code             141076 non-null  object 
 7   Country name EN          140953 non-null  object 
 8   Country Code 2           81 non-null      object 
 9   Admin1 Code              141099 non-null  object 
 10  Admin2 Code              123142 non-null  object 
 11  Admin3 Code              69081 non-null   object 
 12  Admin4 Code              20573 non-null   object 
 13  Population               141121 non-null  int64  
 14  Elev

### Select most used camera models

In [1]:
import pandas as pd
from dotenv import dotenv_values
import sqlalchemy
import psycopg2
import numpy as np

needed_keys = ['host', 'port', 'database','user','password']
dotenv_dict = dotenv_values(".env")
sql_config = {key:dotenv_dict[key] for key in needed_keys if key in dotenv_dict}

engine = sqlalchemy.create_engine('postgresql://user:pass@host/database',
        connect_args=sql_config
        )

# Schema used for our capstone project
schema = 'capstone_jorittega'

# Table name
table_name = 'photo_exif'

def get_dataframe(sql_query):
    # Connect to the PostgreSQL database server, run query and return data
    from dotenv import dotenv_values
    # get the connection configuration dictionary using the get_sql_config function
    from sql_functions import get_sql_config
    # create a connection engine to the PostgreSQL server
    engine = sqlalchemy.create_engine('postgresql://user:pass@host/database',
                        connect_args=get_sql_config() # use dictionary with config details
                        )
        # open a conn session using 'with', execute the query, and return the results
    return pd.read_sql_query(sql=sql_query, con=engine)

In [37]:
df = get_dataframe(f"select * from {schema}.{table_name}")

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365127 entries, 0 to 365126
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   index                    365127 non-null  int64         
 1   id                       365127 non-null  int64         
 2   compression              258898 non-null  object        
 3   make                     300271 non-null  object        
 4   model                    300504 non-null  object        
 5   software                 287517 non-null  object        
 6   date_and_time_modified   307267 non-null  datetime64[ns]
 7   exposure                 294586 non-null  object        
 8   aperture                 293263 non-null  float64       
 9   iso_speed                293729 non-null  float64       
 10  date_and_time            302390 non-null  datetime64[ns]
 11  date_and_time_digitized  299969 non-null  datetime64[ns]
 12  flash           

In [102]:
df_cameras = df.groupby('model')['id'].nunique().sort_values(ascending=False).head(100)
df_cameras = df_cameras.to_frame()
df_cameras = df_cameras.reset_index()
df_cameras.columns = ['model', 'count']

In [103]:
df_cameras

Unnamed: 0,model,count
0,Canon EOS 7D Mark II,8672
1,Canon EOS 5D Mark IV,7216
2,Canon EOS R5,4782
3,Canon EOS 6D,4592
4,Canon EOS 5D Mark III,4288
...,...,...
95,Canon EOS 450D,664
96,DMC-FZ200,662
97,ILCA-77M2,661
98,Canon EOS REBEL T3i,652


In [138]:
df_makes = df[['make', 'model']].drop_duplicates()
df_makes = df_makes[df_makes['make'].notna()]

In [149]:
df_cameras_top = df_cameras.merge(df_makes, on='model', how='left')

In [153]:
df_cameras_top.to_csv('data/df_cameras_top.csv')

In [166]:
df_cameras_top = pd.read_csv('data/df_cameras_top.csv', delimiter=';')
df_cameras_top = df_cameras_top.drop('Unnamed: 0', axis=1)
df_cameras_top.semiprof = df_cameras_top.semiprof.astype(bool)

In [167]:
df_cameras_top.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   model     100 non-null    object
 1   make      100 non-null    object
 2   count     100 non-null    int64 
 3   type      100 non-null    object
 4   semiprof  100 non-null    bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 3.3+ KB


In [170]:
df_cameras_top.head(15)

Unnamed: 0,model,make,count,type,semiprof
0,Canon EOS 7D Mark II,Canon,8672,DSLR,True
1,Canon EOS 5D Mark IV,Canon,7216,DSLR,True
2,Canon EOS R5,Canon,4782,DLSM,True
3,Canon EOS 6D,Canon,4592,DSLR,True
4,Canon EOS 5D Mark III,Canon,4288,DSLR,True
5,Canon EOS 6D Mark II,Canon,3979,DSLR,True
6,NIKON D850,NIKON CORPORATION,3830,DSLR,True
7,ILCE-7M3,SONY,3729,DLSM,True
8,NIKON D750,NIKON CORPORATION,3640,DSLR,True
9,Canon EOS 7D,Canon,3197,DSLR,True


### Join both tables

In [177]:
df = get_dataframe(f"select * from {schema}.photo_exif as e left join {schema}.camera_top100 as c on e.model = c.model")

In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365127 entries, 0 to 365126
Data columns (total 31 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   index                    365127 non-null  int64         
 1   id                       365127 non-null  int64         
 2   compression              258898 non-null  object        
 3   make                     300271 non-null  object        
 4   model                    300504 non-null  object        
 5   software                 287517 non-null  object        
 6   date_and_time_modified   307267 non-null  datetime64[ns]
 7   exposure                 294586 non-null  object        
 8   aperture                 293263 non-null  float64       
 9   iso_speed                293729 non-null  float64       
 10  date_and_time            302390 non-null  datetime64[ns]
 11  date_and_time_digitized  299969 non-null  datetime64[ns]
 12  flash           

### EDA on type and pricing

#### Do semiprofs have more cameras? No.

In [229]:
df[df['semiprof'] == True].groupby('user_id').nunique()['model'].describe()

Unnamed: 0,model,model.1
count,8176.0,8176.0
mean,1.113381,1.113381
std,0.405753,0.405753
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,1.0
max,10.0,10.0


In [230]:
df[df['semiprof'] == False].groupby('user_id').nunique()['model'].describe()

Unnamed: 0,model,model.1
count,12471.0,12471.0
mean,1.115548,1.115548
std,0.399929,0.399929
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,1.0
max,7.0,7.0


#### Do semiprofs use different apertures? No.

In [236]:
df[(df['semiprof'] == True) & (df['aperture'] > 0.95)].groupby('user_id').median()['aperture'].describe()

  df[(df['semiprof'] == True) & (df['aperture'] > 0.95)].groupby('user_id').median()['aperture'].describe()


count    8077.000000
mean        7.080407
std         3.967143
min         1.000000
25%         4.000000
50%         6.300000
75%         8.150000
max        36.000000
Name: aperture, dtype: float64

In [237]:
df[(df['semiprof'] == False) & (df['aperture'] > 0.95)].groupby('user_id').median()['aperture'].describe()

  df[(df['semiprof'] == False) & (df['aperture'] > 0.95)].groupby('user_id').median()['aperture'].describe()


count    12355.000000
mean         5.682384
std          3.971234
min          1.000000
25%          2.200000
50%          5.600000
75%          8.000000
max         40.000000
Name: aperture, dtype: float64

#### Do semipros use RAW? Kind of.

In [265]:
df.loc[df['compression'] == 'Deflate', 'compression'] = 'RAW'

In [261]:
df.loc[df['compression'] == 'JPEG (old-style)', 'compression'] = 'JPEG'

In [284]:
df.groupby('compression')['compression'].count()

compression
CCIRLEW                       1
CCITT 1D                      1
JPEG                     250108
PackBits                     23
Pentax PEF Compressed         1
RAW                        8598
Unknown (0)                 164
Unknown (1536)                1
Unknown (256)                 1
Name: compression, dtype: int64

In [280]:
df[(df['compression'] == 'RAW') & (df['semiprof'] == True)]['compression'].count()

3482

In [281]:
df[(df['compression'] == 'JPEG') & (df['semiprof'] == True)]['compression'].count()

59447

In [282]:
df[(df['compression'] == 'RAW') & (df['semiprof'] == False)]['compression'].count()

1276

In [283]:
df[(df['compression'] == 'JPEG') & (df['semiprof'] == False)]['compression'].count()

74336

True 62629 5,56%
False 75612 1,69%

In [289]:
df.groupby(['semiprof', 'compression']).size()

semiprof  compression
False     JPEG           74336
          RAW             1276
          Unknown (0)       34
True      CCITT 1D           1
          JPEG           59447
          RAW             3482
          Unknown (0)        1
dtype: int64

In [302]:
df[(df['type'] == 'DLSM') | (df['type'] == 'DSLR')]

Unnamed: 0,index,id,compression,make,model,software,date_and_time_modified,exposure,aperture,iso_speed,...,country,admin_lvl1,admin_lvl2,city,continent,model.1,make.1,count,type,semiprof
57,57,52652996572,JPEG,FUJIFILM,X-T3,Digital Camera X-T3 Ver4.30,2023-01-28 08:51:05,1/1000,5.6,160.0,...,,,,,,X-T3,FUJIFILM,1437.0,DLSM,True
66,66,51003040067,JPEG,SONY,ILCE-7M3,Adobe Photoshop Lightroom Classic 8.2 (Macintosh),2021-03-03 19:48:40,1/100,2.8,800.0,...,,,,,,ILCE-7M3,SONY,3729.0,DLSM,True
69,69,50964297722,JPEG,NIKON CORPORATION,NIKON D3300,Adobe Lightroom 4.1 (Macintosh),2021-02-20 17:09:23,1/1250,9.0,1600.0,...,,,,,,NIKON D3300,NIKON CORPORATION,681.0,DSLR,False
71,714,52832976678,JPEG,Canon,Canon EOS 6D Mark II,,2023-04-20 08:29:38,1/320,4.5,5000.0,...,,,,,,Canon EOS 6D Mark II,Canon,3979.0,DSLR,True
78,77,50617285327,JPEG,SONY,ILCE-7M3,Adobe Photoshop Lightroom Classic 8.4 (Windows),2020-10-06 15:35:43,1/125,4.0,1000.0,...,,,,,,ILCE-7M3,SONY,3729.0,DLSM,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365026,365026,52834588607,JPEG,Canon,Canon EOS 60D,Microsoft Windows Photo Viewer 6.1.7600.16385,2023-02-27 00:55:45,1/160,5.6,160.0,...,Chile,Aysén,Capitán Prat,Cochrane,South America,Canon EOS 60D,Canon,1751.0,DSLR,False
365062,365062,25867815923,JPEG,Canon,Canon EOS 70D,Adobe Photoshop Lightroom 4.4 (Macintosh),2016-04-16 18:10:10,1/500,8.0,2000.0,...,United States,New Mexico,Eddy County,,North America,Canon EOS 70D,Canon,2932.0,DSLR,False
365098,365098,3318432316,JPEG,NIKON CORPORATION,NIKON D80,Adobe Photoshop Elements 7.0,2009-02-28 23:51:17,1/250,2.8,1600.0,...,United States,Delaware,Kent County,Dover,North America,NIKON D80,NIKON CORPORATION,920.0,DSLR,False
365102,365102,15399930665,JPEG,Canon,Canon EOS 7D,Adobe Photoshop CC (Windows),2014-08-17 09:27:48,1/40,4.5,200.0,...,Spain,Galicia,Lugo,Céramo,Europe,Canon EOS 7D,Canon,3197.0,DSLR,True


In [303]:
df[(df['type'] == 'DLSM') | (df['type'] == 'DSLR')].groupby(['semiprof', 'compression']).size()

semiprof  compression
False     JPEG           53669
          RAW             1205
          Unknown (0)       28
True      CCITT 1D           1
          JPEG           59447
          RAW             3482
          Unknown (0)        1
dtype: int64

In [307]:
print(f'{1205 / (53669 + 1205) * 100}')
print(f'{3482 / (59447 + 3482) * 100}')

2.1959397893355685
5.533219978070524


### Number of images per User

In [310]:
df['user_id'].value_counts().sort_values(ascending=False)

123597211@N03    5266
151639852@N07    2713
133876835@N08    1186
71296413@N02     1140
91044419@N08     1135
                 ... 
71678638@N00        1
74728804@N04        1
79819735@N00        1
53755594@N00        1
186241381@N05       1
Name: user_id, Length: 53224, dtype: int64

### Sort apertures to values that make sense

In [334]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365127 entries, 0 to 365126
Data columns (total 31 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   index                    365127 non-null  int64         
 1   id                       365127 non-null  int64         
 2   compression              258898 non-null  object        
 3   make                     300271 non-null  object        
 4   model                    300504 non-null  object        
 5   software                 287517 non-null  object        
 6   date_and_time_modified   307267 non-null  datetime64[ns]
 7   exposure                 294586 non-null  object        
 8   aperture                 293263 non-null  float64       
 9   iso_speed                293729 non-null  float64       
 10  date_and_time            302390 non-null  datetime64[ns]
 11  date_and_time_digitized  299969 non-null  datetime64[ns]
 12  flash           

In [337]:
apertures = [1, 1.2, 1.4, 1.8, 2, 2.5, 2.8, 3.2, 4, 5.6, 8, 11, 16, 22, 32]

In [339]:
df1 = df

In [369]:
for i, row in df1[df1['aperture'].notna()].iterrows():
    df1.at[i, 'aperture_est'] = min(possible_apertures, key=lambda x:abs(x - row['aperture']))

In [381]:
df1 = df1.drop(['index', 'make', 'model', 'software', 'date_and_time_modified',
       'exposure', 'iso_speed', 'date_and_time',
       'date_and_time_digitized', 'flash', 'focal_length', 'white_balance',
       'user_id', 'secret', 'title', 'lat', 'lon', 'acc', 'country',
       'admin_lvl1', 'admin_lvl2', 'city', 'continent', 'model', 'make',
       'count', 'type', 'semiprof'], axis=1)

In [382]:
df1.columns

Index(['id', 'aperture', 'aperture_est'], dtype='object')

In [413]:
df1.aperture_est.sample(10)

136537    NaN
190150    NaN
186736    1.8
92704     NaN
100759    NaN
157384    NaN
71944     5.6
189390    4.0
122100    4.0
339364    4.0
Name: aperture_est, dtype: float64

In [383]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365127 entries, 0 to 365126
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            365127 non-null  int64  
 1   aperture      293263 non-null  float64
 2   aperture_est  293263 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 8.4 MB


### Flash usage

In [399]:
df.flash.value_counts().to_frame().head(50)

Unnamed: 0,flash
"Off, Did not fire",185066
No Flash,72040
"Auto, Did not fire",14354
"On, Fired",7035
"On, Did not fire",2947
"On, Return detected",2194
No flash function,2031
"On, Return not detected",1710
"Auto, Fired",1177
"Auto, Fired, Return detected",1109
