In [2]:
# Python modules
import flickrapi
import pandas as pd
import os
import json
from dotenv import load_dotenv
load_dotenv()
import numpy as np
import subprocess
from geopy import distance
import pycountry_convert as pc
import geopandas

# Import additional functions
from flickr_functions import df_remove_dupes

# Keys needed for API access
api_key = os.getenv('flickr_api_key')
api_secret = os.getenv('flickr_api_secret')

# Flickr API object
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

# Data directory used to store CSV files
data_dir = './data/'

# Load the cleaned EXIF dataframe from its CSV file (first column becomes the index)
df = pd.read_csv(data_dir + 'df_photo_exif_cleaned.csv', index_col=[0])

# Data types, grouped by target dtype
df['id'] = df['id'].astype(int)

for numeric_col in ('aperture', 'iso_speed', 'focal_length', 'acc'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

for coord_col in ('lat', 'lon'):
    df[coord_col] = df[coord_col].astype(float)

# Timestamps that do not match the expected layout are coerced to NaT
for stamp_col in ('date_and_time', 'date_and_time_modified', 'date_and_time_digitized'):
    df[stamp_col] = pd.to_datetime(df[stamp_col], format='%Y-%m-%d %H:%M:%S', errors='coerce')

for text_col in (
    'compression', 'make', 'model', 'software', 'exposure', 'flash',
    'white_balance', 'user_id', 'secret', 'country', 'admin_lvl1',
    'admin_lvl2', 'city',
):
    df[text_col] = df[text_col].astype('string')

# Run command function
def runcmd(cmd, verbose = False, *args, **kwargs):
    """Run `cmd` through the shell and optionally echo what it printed.

    Parameters
    ----------
    cmd : str
        Command line handed to the shell (`shell=True`), so it must not be
        built from untrusted input.
    verbose : bool
        When true, print the command's stripped stdout followed by its stderr.
    *args, **kwargs
        Accepted but not used.

    Returns
    -------
    None
    """
    completed = subprocess.run(
        cmd,
        shell = True,
        text = True,
        capture_output = True,
    )
    if not verbose:
        return
    print(completed.stdout.strip(), completed.stderr)

In [None]:
df.info()

In [None]:
df['user_id'].nunique() / df['user_id'].count() * 100

In [None]:
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')
for index, row in df[df['country'] == 'Faroe Islands'].iterrows():
    sizes = flickr.photos.getSizes(photo_id = row.get('id'))
    for size in sizes.get('sizes').get('size'):
        print(size)
        #if size.get('label') == 'Original':
            #runcmd('wget ' + size.get('source'), verbose = True)

In [None]:
df.notna().sum() * 100 / len(df)

In [None]:
df.info()

In [None]:
exif_data = flickr.photos.getExif(photo_id = 53095647615, photo_secret = 'secret').get('photo')
exif_data

In [None]:
for x in exif_data.get('camera'):
    print(x)

In [None]:
df.groupby(df['make']).count().sort_values(by='id', ascending=False).head(25)['id']

In [None]:
df.groupby(df['model']).count().sort_values(by='id', ascending=False).head(25)['id']

In [None]:
df.info()

### Filter by radius around location

In [None]:
# Every per-region frame starts empty with the same column layout as df
exif_columns = df.columns.tolist()

# Paris center
loc_paris = (48.85341, 2.3488)
df_paris = pd.DataFrame(columns = exif_columns)

# Amsterdam center
loc_amsterdam = (52.37308, 4.89245)
df_amsterdam = pd.DataFrame(columns = exif_columns)

# Barcelona center
loc_barcelona = (41.38289, 2.17743)
df_barcelona = pd.DataFrame(columns = exif_columns)

# Yosemite national park
loc_yosemite = (37.83930, -119.51646)
df_yosemite = pd.DataFrame(columns = exif_columns)

# Lake Geneva (Switzerland)
loc_geneva = (46.448961, 6.503401)
df_geneva = pd.DataFrame(columns = exif_columns)

In [None]:
radius = 25

# (center, search radius in km) per region; Yosemite and Lake Geneva get an extra 25 km
regions = {
    'paris': (loc_paris, radius),
    'amsterdam': (loc_amsterdam, radius),
    'barcelona': (loc_barcelona, radius),
    'yosemite': (loc_yosemite, radius + 25),
    'geneva': (loc_geneva, radius + 25),
}

# Only rows with a latitude can be placed on the map
df_geo = df[df['lat'].notnull()]

# Collect the positions of matching rows and slice once per region at the end.
# Enlarging a frame one row at a time (`frame.loc[len(frame)] = row`) is slow on
# large data and leaves the columns as object dtype.
matches = {name: [] for name in regions}
for pos, img_loc in enumerate(zip(df_geo['lat'], df_geo['lon'])):
    for name, (center, max_km) in regions.items():
        # Keep the image if it lies within the region's radius of the center
        if distance.distance(center, img_loc).km <= max_km:
            matches[name].append(pos)

df_paris = df_geo.iloc[matches['paris']].reset_index(drop=True)
df_amsterdam = df_geo.iloc[matches['amsterdam']].reset_index(drop=True)
df_barcelona = df_geo.iloc[matches['barcelona']].reset_index(drop=True)
df_yosemite = df_geo.iloc[matches['yosemite']].reset_index(drop=True)
df_geneva = df_geo.iloc[matches['geneva']].reset_index(drop=True)

In [None]:
print(f'Paris: {len(df_paris)}')
print(f'Amsterdam: {len(df_amsterdam)}')
print(f'Barcelona: {len(df_barcelona)}')
print(f'Yosemite: {len(df_yosemite)}')
print(f'Lake Geneva: {len(df_geneva)}')

In [None]:
df_paris.groupby('make')['id'].nunique().sort_values(ascending=False)

In [None]:
df_amsterdam.groupby('make')['id'].nunique().sort_values(ascending=False)

In [None]:
df_barcelona.groupby('make')['id'].nunique().sort_values(ascending=False)

In [None]:
df_yosemite.groupby('make')['id'].nunique().sort_values(ascending=False)

In [None]:
df_geneva.groupby('make')['id'].nunique().sort_values(ascending=False)

### Add continent to df

In [None]:
def country_to_continent(country):
    """Map a country name to the name of its continent.

    Parameters
    ----------
    country : str
        Country name to look up with `pycountry_convert`.

    Returns
    -------
    str or float
        The continent name, or `np.nan` when `country` is not a string
        (None, NaN, pd.NA, ...) or the lookup fails.
    """
    # Missing values can never be resolved, so skip the lookup entirely
    if not isinstance(country, str):
        return np.nan

    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(country_continent_code)
    except Exception:
        # Best effort: any name the library cannot resolve becomes a missing value.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt through.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
        return np.nan

df['continent'] = df['country'].apply(lambda x: country_to_continent(x))

In [None]:
df['continent'] = df['continent'].astype('string')

In [None]:
df.info()

In [None]:
df.sample(10)

In [None]:
df.groupby('continent')['id'].nunique().sort_values(ascending=False)

In [None]:
df.to_csv('./data/df_photo_exif_cleaned.csv')

### Playing with the cities csv

In [None]:
df = pd.read_csv(data_dir + 'geonames-all-cities-with-a-population-1000.csv', delimiter=';')

In [None]:
df.Name = df.Name.astype('string')

In [None]:
df

In [None]:
df.info()

### Select most used camera models

In [None]:
import pandas as pd
from dotenv import dotenv_values
import sqlalchemy
import psycopg2
import numpy as np

needed_keys = ['host', 'port', 'database','user','password']
dotenv_dict = dotenv_values(".env")
sql_config = {key:dotenv_dict[key] for key in needed_keys if key in dotenv_dict}

engine = sqlalchemy.create_engine('postgresql://user:pass@host/database',
        connect_args=sql_config
        )

# Schema used for our capstone project
schema = 'capstone_jorittega'

# Table name
table_name = 'photo_exif'

def get_dataframe(sql_query):
    """Run `sql_query` against the PostgreSQL server and return the result.

    Parameters
    ----------
    sql_query : str
        SQL statement passed to `pd.read_sql_query`.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    # get the connection configuration dictionary using the get_sql_config function
    from sql_functions import get_sql_config

    # create a connection engine to the PostgreSQL server
    # NOTE(review): the URL looks like a placeholder; presumably the real settings
    # arrive through connect_args — confirm against sql_functions.get_sql_config
    engine = sqlalchemy.create_engine('postgresql://user:pass@host/database',
                        connect_args=get_sql_config() # use dictionary with config details
                        )
    try:
        # run the query and return the results
        return pd.read_sql_query(sql=sql_query, con=engine)
    finally:
        # every call builds its own engine, so release its connection pool again
        engine.dispose()

In [None]:
df = get_dataframe(f"select * from {schema}.{table_name}")

In [None]:
df.info()

In [None]:
# 100 most used camera models, ranked by the number of distinct photo ids
df_cameras = (
    df.groupby('model')['id']
    .nunique()
    .sort_values(ascending=False)
    .head(100)
    .rename('count')
    .reset_index()
)

In [None]:
df_cameras

In [None]:
df_makes = df[['make', 'model']].drop_duplicates()
df_makes = df_makes[df_makes['make'].notna()]

In [None]:
df_cameras_top = df_cameras.merge(df_makes, on='model', how='left')

In [None]:
df_cameras_top.to_csv('data/df_cameras_top.csv')

In [None]:
# NOTE(review): the cell above writes this file with the default ',' separator and
# without a 'semiprof' column, yet it is read back with ';' and 'semiprof' is expected.
# Presumably the CSV was edited outside the notebook in between — confirm before re-running.
df_cameras_top = pd.read_csv('data/df_cameras_top.csv', delimiter=';')
# Drop the unnamed index column that came along in the CSV
df_cameras_top = df_cameras_top.drop('Unnamed: 0', axis=1)
# NOTE(review): astype(bool) turns every non-empty string into True; this assumes the
# column holds real booleans or 0/1 values — TODO confirm
df_cameras_top.semiprof = df_cameras_top.semiprof.astype(bool)

In [None]:
df_cameras_top.info()

In [None]:
df_cameras_top.head(15)

### Join both tables

In [None]:
df = get_dataframe(f"select * from {schema}.photo_exif as e left join {schema}.camera_top100 as c on e.model = c.model")

In [None]:
df.info()

### EDA on type and pricing

#### Do semiprofs have more cameras? No.

In [None]:
df[df['semiprof'] == True].groupby('user_id').nunique()['model'].describe()

In [None]:
df[df['semiprof'] == False].groupby('user_id').nunique()['model'].describe()

#### Do semiprofs use different apertures? No.

In [None]:
# Median aperture per semiprof user. Select 'aperture' before aggregating:
# median() over all columns fails on the string columns in pandas >= 2.
df[(df['semiprof'] == True) & (df['aperture'] > 0.95)].groupby('user_id')['aperture'].median().describe()

In [None]:
# Median aperture per non-semiprof user. Select 'aperture' before aggregating:
# median() over all columns fails on the string columns in pandas >= 2.
df[(df['semiprof'] == False) & (df['aperture'] > 0.95)].groupby('user_id')['aperture'].median().describe()

#### Do semiprofs use RAW? Kind of.

In [None]:
df.loc[df['compression'] == 'Deflate', 'compression'] = 'RAW'

In [None]:
df.loc[df['compression'] == 'JPEG (old-style)', 'compression'] = 'JPEG'

In [None]:
df.groupby('compression')['compression'].count()

In [None]:
df[(df['compression'] == 'RAW') & (df['semiprof'] == True)]['compression'].count()

In [None]:
df[(df['compression'] == 'JPEG') & (df['semiprof'] == True)]['compression'].count()

In [None]:
df[(df['compression'] == 'RAW') & (df['semiprof'] == False)]['compression'].count()

In [None]:
df[(df['compression'] == 'JPEG') & (df['semiprof'] == False)]['compression'].count()

True 62629 5,56%
False 75612 1,69%

In [None]:
df.groupby(['semiprof', 'compression']).size()

In [None]:
df[(df['type'] == 'DLSM') | (df['type'] == 'DSLR')]

In [None]:
df[(df['type'] == 'DLSM') | (df['type'] == 'DSLR')].groupby(['semiprof', 'compression']).size()

In [None]:
print(f'{1205 / (53669 + 1205) * 100}')
print(f'{3482 / (59447 + 3482) * 100}')

### Number of images per User

In [None]:
df['user_id'].value_counts().sort_values(ascending=False)

### Sort apertures to values that make sense

In [None]:
df1.info()

In [None]:
apertures = [1, 1.2, 1.4, 1.8, 2, 2.5, 2.8, 3.2, 4, 5.6, 8, 11, 16, 22, 32]

In [None]:
# Work on an independent copy so that columns added to df1 do not silently appear in df
df1 = df.copy()

In [None]:
# Snap each recorded aperture to the closest value in `apertures`, the f-stop list
# defined in an earlier cell (`possible_apertures` is not defined in any visible cell).
aperture_stops = np.asarray(apertures, dtype=float)
has_aperture = df1['aperture'].notna()
recorded = df1.loc[has_aperture, 'aperture'].to_numpy(dtype=float)

# |recorded - stop| for every (row, stop) pair; argmin keeps the first stop on ties,
# which is the same choice min(..., key=...) makes
nearest_stop = np.abs(recorded[:, None] - aperture_stops[None, :]).argmin(axis=1)

# Rows without an aperture keep NaN
df1['aperture_est'] = np.nan
df1.loc[has_aperture, 'aperture_est'] = aperture_stops[nearest_stop]

In [None]:
df1 = df1.drop(['index', 'make', 'model', 'software', 'date_and_time_modified',
       'exposure', 'iso_speed', 'date_and_time',
       'date_and_time_digitized', 'flash', 'focal_length', 'white_balance',
       'user_id', 'secret', 'title', 'lat', 'lon', 'acc', 'country',
       'admin_lvl1', 'admin_lvl2', 'city', 'continent', 'model', 'make',
       'count', 'type', 'semiprof'], axis=1)

In [None]:
df1.columns

In [None]:
df1.aperture_est.sample(10)

In [None]:
df1.info()

### Flash usage

In [None]:
df.flash.value_counts().to_frame().head(50)

### Play around with location data

In [None]:
import geopy.distance

images_around_loc = pd.DataFrame(columns = ['id', 'count'])
images_around_loc['id'] = images_around_loc['id'].astype(int)
images_around_loc['count'] = images_around_loc['count'].astype(int)

In [18]:
df_tmp = df[df['lat'].notna()].sample(10)

In [19]:
from scipy.spatial import distance_matrix

# NOTE(review): this rebinds `radius`, which the radius filter cell further up also reads
radius = 1
pos = df_tmp[['lat','lon']]

# distance_matrix returns plain Euclidean distances between the raw lat/lon pairs, so
# `radius` is measured in degrees here, not kilometres. The matrix includes each point's
# zero distance to itself, hence the "- 1".
df_tmp['images_within_radius'] = (distance_matrix(pos, pos) <= radius).sum(axis=0) - 1

In [20]:
df_tmp

Unnamed: 0,index,id,compression,make,model,software,date_and_time_modified,exposure,aperture,iso_speed,...,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city,continent,images_within_radius
181556,181556,33208038685,,Apple,iPhone 6s,10.2.1,2017-02-18 07:39:11,1/50,2.2,25.0,...,#Plash_Flower #Sculpture,24.368,88.635246,0.0,Bangladesh,Rajshahi Division,Rajshahi District,Rajshahi,Asia,0
277162,277162,21949163449,,NIKON CORPORATION,NIKON D7100,ACDSee Pro 7,2015-10-13 14:55:28,1/400,5.6,200.0,...,Bundesplatz Fest 101,52.478945,13.328646,16.0,Germany,Berlin,,Berlin,Europe,1
291865,291865,53141587746,JPEG (old-style),Canon,Canon EOS 6D,Adobe Photoshop Lightroom 5.7 (Macintosh),2020-06-15 21:41:01,1/200,6.3,200.0,...,Woman Outside her Hut 2973,5.004527,36.459685,16.0,Ethiopia,"Southern Nations, Nationalities and Peoples",South Omo,Turmi,Africa,0
326685,326685,53142657274,JPEG (old-style),NIKON CORPORATION,NIKON Z 6_2,Adobe Photoshop Lightroom Classic 10.1 (Windows),2023-08-25 23:08:22,1/1000,3.5,3200.0,...,DSC_5907,51.048922,3.684957,16.0,Belgium,Vlaams Gewest,Oost-Vlaanderen,Gent,Europe,0
258582,258582,52688651697,JPEG (old-style),Kodak,DCS Pro 14N,Adobe Photoshop 24.1 (Windows),2023-02-14 14:18:56,1/750,6.7,200.0,...,Blue Angels 2007_a,37.810336,-122.422976,15.0,United States,California,San Francisco County,San Francisco,North America,0
324221,324221,51204017222,JPEG (old-style),NIKON CORPORATION,NIKON D7200,Adobe Photoshop Lightroom Classic 12.1 (Macint...,2023-01-09 09:29:54,1/100,11.0,5600.0,...,Meticulous Work,44.946716,-75.068098,16.0,Canada,Ontario,"Stormont, Dundas and Glengarry United Counties",,North America,0
231024,231024,53065426977,JPEG (old-style),Apple,iPhone 8,Windows Photo Editor 10.0.10011.16384,2023-07-23 18:52:16,1/147,1.8,20.0,...,Fernsehturm 08,52.520794,13.409358,16.0,Germany,Berlin,,Berlin,Europe,1
215578,215578,53139005029,JPEG (old-style),samsung,Galaxy S23 Ultra,Adobe Photoshop Lightroom Classic 12.5 (Windows),2023-08-24 11:19:34,1/1200,1.7,10.0,...,Exploring the Princess Regal,50.891791,-1.399817,16.0,United Kingdom,England,,Southampton,Europe,0
60965,60965,52548931615,JPEG (old-style),SONY,SLT-A99V,Adobe Photoshop CC 2015 (Windows),2022-12-07 12:00:06,1/10,32.0,125.0,...,DSC00259D0129lo,56.897878,-4.94419,15.0,United Kingdom,Scotland,Highland Council,Spean Bridge,Europe,0
283252,283252,52850299049,,motorola,Moto G (5) Plus,potter_n-user 8.1.0 OPS28.85-17-6-2 77e7 relea...,2023-03-25 15:24:11,1/4704,1.7,80.0,...,Emirates_01,25.484946,55.671383,16.0,United Arab Emirates,Umm Al Quawain,,Al Salamah,Asia,0


In [None]:
def look_for_near_loc(row):
    """Count the located photos in `df` that lie close to the photo in `row`.

    `row` is read by position: first the photo id, then latitude, then longitude.
    When at least one photo is close enough, `[id, count]` is appended to the
    module-level `images_around_loc` frame. No row is excluded from the search, so a
    photo taken from `df` is also compared against itself. Returns nothing.
    """
    values = row.to_list()
    origin = (values[1], values[2])

    nearby = 0
    candidates = df[df['lat'].notna()][['lat', 'lon']]
    for _, searchrow in candidates.iterrows():
        separation = geopy.distance.geodesic(origin, (searchrow['lat'], searchrow['lon']))

        # Compares the geopy distance object with a bare number
        # (presumably kilometres — TODO confirm)
        if separation <= 50:
            nearby += 1

    if nearby > 0:
        images_around_loc.loc[len(images_around_loc)] = [int(values[0]), nearby]

df[df['lat'].notna()].sample(1)[['id', 'lat', 'lon']].apply(look_for_near_loc, axis=1)

In [None]:
images_around_loc

In [None]:
base_coords = (38.782666, -109.595113)
for i, x in df[df['lat'].notna()].sample(100).iterrows():
    # Named `dist` so this loop does not rebind `distance`, the geopy module that the
    # first cell imports and the radius filter cell calls as `distance.distance(...)`
    dist = geopy.distance.geodesic(base_coords, (x['lat'], x['lon']))
    print(f'{dist}')
    

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365127 entries, 0 to 365126
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   index                    365127 non-null  int64         
 1   id                       365127 non-null  int64         
 2   compression              258898 non-null  string        
 3   make                     300271 non-null  string        
 4   model                    300504 non-null  string        
 5   software                 287517 non-null  string        
 6   date_and_time_modified   307267 non-null  datetime64[ns]
 7   exposure                 294586 non-null  string        
 8   aperture                 293263 non-null  float64       
 9   iso_speed                293729 non-null  float64       
 10  date_and_time            302390 non-null  datetime64[ns]
 11  date_and_time_digitized  299969 non-null  datetime64[ns]
 12  flash                

In [44]:
df[df['country'].notna()].count()

index                      209478
id                         209478
compression                154807
make                       187014
model                      187133
software                   178388
date_and_time_modified     189041
exposure                   183860
aperture                   183346
iso_speed                  183528
date_and_time              187558
date_and_time_digitized    185447
flash                      182315
focal_length               183181
white_balance              180982
user_id                    209478
secret                     209478
title                      207054
lat                        209478
lon                        209478
acc                        209478
country                    209478
admin_lvl1                 203667
admin_lvl2                 167101
city                       191781
continent                  207795
dtype: int64

In [111]:
# Time between capture and last modification; NaT wherever either timestamp is missing
df['time_period'] = df['date_and_time_modified'] - df['date_and_time']

In [112]:
df['time_period'].describe()

count                          294408
mean      188 days 15:42:20.644401646
std      1207 days 05:52:50.230667680
min             -29499 days +22:28:16
25%                   0 days 00:00:00
50%            0 days 11:22:01.500000
75%                  11 days 11:06:47
max               59136 days 00:06:00
Name: time_period, dtype: object

In [56]:
df.groupby('make').count().sort_values(by='id', ascending=False)

Unnamed: 0_level_0,index,id,compression,model,software,date_and_time_modified,exposure,aperture,iso_speed,date_and_time,...,secret,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city,continent
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Canon,95958,95958,82155,95881,75301,94470,94388,94136,93750,94930,...,95958,95546,54463,54463,54463,53225,51775,42492,49332,52969
NIKON CORPORATION,58022,58022,48945,58019,56958,56719,57550,57433,56898,57772,...,58022,57340,34364,34364,34364,33842,32879,27007,31133,33488
Apple,33068,33068,18466,33068,32774,32832,32892,33022,32911,32834,...,33068,31385,24439,24439,24439,24156,23528,19887,22201,24020
SONY,31560,31560,27165,31560,29984,30687,31436,30935,31407,31444,...,31560,31400,21220,21220,21220,20996,20300,15762,19720,20648
Panasonic,16410,16410,13786,16410,15696,15701,15901,15880,16357,16336,...,16410,16353,9409,9409,9409,9326,9119,7455,8881,9307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Linhof Technorama 617s III,1,1,1,0,1,1,0,0,1,0,...,1,1,1,1,1,1,1,1,1,1
"MINOLTA CO.,LTD",1,1,1,1,1,1,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
MOULTRIE,1,1,1,1,1,1,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
MSM6250,1,1,1,1,1,0,1,0,0,1,...,1,1,1,1,1,1,1,1,1,1


In [73]:
df['focal_length'].median()

28.0

In [70]:
# Aperture summary for Canon, Nikon and Sony photos with an aperture above 1
is_selected_make = df['make'].isin(['Canon', 'NIKON CORPORATION', 'SONY'])
df[is_selected_make & (df['aperture'] > 1)]['aperture'].describe().astype(int)

count    181882
mean          6
std           3
min           1
25%           4
50%           5
75%           8
max          95
Name: aperture, dtype: int64

In [67]:
# Focal length summary for Canon, Nikon and Sony photos with a focal length above 10
is_selected_make = df['make'].isin(['Canon', 'NIKON CORPORATION', 'SONY'])
df[is_selected_make & (df['focal_length'] > 10)]['focal_length'].describe().astype(int)

count     169087
mean         118
std         4022
min           11
25%           28
50%           55
75%          125
max      1653182
Name: focal_length, dtype: int64

In [None]:
# NOTE(review): the NIKON branch filters on df['exposure'] > 10 while the other two
# branches use focal_length. exposure holds strings such as '1/60', so comparing it
# with an int presumably raises a TypeError — confirm which column was intended.
df[(df['make'] == 'Canon') & (df['focal_length'] > 10) | (df['make'] == 'NIKON CORPORATION')  & (df['exposure'] > 10) | (df['make'] == 'SONY') & (df['focal_length'] > 10) ]['focal_length'].describe().astype(int)

In [98]:
df.exposure.info()

<class 'pandas.core.series.Series'>
Index: 365127 entries, 0 to 365126
Series name: exposure
Non-Null Count   Dtype 
--------------   ----- 
294586 non-null  string
dtypes: string(1)
memory usage: 5.6 MB


In [97]:
df.exposure.describe()

count     294586
unique      3160
top         1/60
freq       17437
Name: exposure, dtype: object

In [99]:
df.groupby('exposure').nunique().sort_values(by='id', ascending=False).head(10)

Unnamed: 0_level_0,index,id,compression,make,model,software,date_and_time_modified,aperture,iso_speed,date_and_time,...,secret,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city,continent
exposure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1/60,17437,17437,8,91,1277,1419,16682,82,328,16966,...,17434,12758,6690,6681,16,133,700,1398,2713,6
1/250,17212,17212,9,63,995,1184,16346,98,85,16658,...,17208,13129,7213,7209,15,150,829,1576,3316,6
1/500,17206,17206,8,60,929,1118,16095,90,82,16182,...,17205,13550,6964,7085,17,135,767,1538,3262,6
1/125,16103,16103,9,80,942,1156,15102,93,74,15589,...,16102,11681,5575,5585,16,140,730,1395,2710,6
1/200,15416,15416,9,65,857,1072,14384,91,87,14830,...,15415,11335,5860,5873,15,134,752,1386,2803,6
1/160,13697,13697,11,53,811,992,12719,82,70,13033,...,13694,10323,5031,5081,16,131,710,1313,2432,6
1/100,13576,13576,9,93,1149,1368,12873,78,289,13130,...,13575,10335,5485,5501,16,130,699,1266,2426,6
1/400,12680,12680,8,56,857,1026,12013,87,72,12205,...,12680,9946,5511,5516,17,137,712,1352,2710,6
1/320,11972,11972,8,59,862,1043,11465,82,73,11578,...,11972,9377,5404,5414,16,135,758,1351,2740,6
1/1000,11638,11638,7,54,774,980,10731,81,87,10889,...,11637,8739,4735,4745,14,125,658,1261,2443,6


In [82]:
df.groupby('focal_length').nunique().sort_values(by='id', ascending=False).head(10)

Unnamed: 0_level_0,index,id,compression,make,model,software,date_and_time_modified,exposure,aperture,iso_speed,...,secret,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city,continent
focal_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4.0,36673,36673,6,91,1090,2272,35112,2222,72,843,...,36667,23134,20080,20179,16,165,973,2118,4819,6
5.0,15228,15228,6,76,805,1151,14339,1655,69,458,...,15226,10868,7138,7156,15,117,589,1158,2195,6
18.0,11952,11952,8,37,534,893,11594,170,63,53,...,11950,9530,5213,5237,16,113,650,1214,2330,6
6.0,11218,11218,6,67,677,944,10621,1366,74,578,...,11215,7984,5916,5895,17,127,573,1106,1979,6
24.0,10980,10980,8,27,407,813,10362,124,58,62,...,10980,7733,4589,4594,13,126,624,1193,2273,6
50.0,10450,10450,7,46,395,813,9836,143,58,57,...,10448,7568,3504,3516,16,111,549,927,1689,6
70.0,8030,8030,8,26,244,649,7546,86,39,61,...,8027,5627,2230,2231,15,103,450,760,1283,6
35.0,7776,7776,8,41,396,765,7211,119,56,55,...,7774,5996,3085,3087,15,111,515,879,1658,6
200.0,6975,6975,8,22,238,572,6587,86,38,54,...,6975,4769,1606,1624,13,83,353,610,961,6
28.0,6052,6052,8,31,375,624,5557,102,51,56,...,6052,5078,2455,2559,14,99,439,735,1309,6


In [83]:
df.groupby('aperture').nunique().sort_values(by='id', ascending=False).head(10)

Unnamed: 0_level_0,index,id,compression,make,model,software,date_and_time_modified,exposure,iso_speed,date_and_time,...,secret,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city,continent
aperture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5.6,27347,27347,9,57,827,1367,25797,171,62,26319,...,27346,20604,9905,9926,17,145,956,1902,4200,6
2.8,25973,25973,9,102,1018,1242,23875,690,314,24679,...,25972,16822,6760,6881,17,131,675,1348,2519,6
1.8,24799,24799,9,60,648,1646,23653,2200,705,24055,...,24796,15522,13149,13162,16,132,750,1583,3270,6
8.0,24158,24158,10,61,760,1282,22891,180,69,22751,...,24154,17967,10915,10906,16,147,961,2027,4660,6
4.0,23824,23824,8,58,861,1191,21690,170,73,22741,...,23818,16939,6726,6765,16,135,719,1421,2909,6
6.3,14495,14495,9,39,559,1015,13675,102,57,13750,...,14493,10741,5838,5851,14,129,753,1451,2905,6
7.1,12238,12238,8,35,528,906,11638,117,56,11675,...,12236,9188,5501,5505,16,127,741,1369,2827,6
5.0,12183,12183,7,43,664,899,11473,122,85,11687,...,12181,9168,4432,4442,14,122,625,1143,2136,6
4.5,12112,12112,8,41,733,936,11321,121,59,11706,...,12111,8691,4068,4084,16,127,622,1109,1982,6
3.5,10815,10815,9,56,770,845,10283,299,154,10503,...,10814,7506,3828,3860,15,113,537,975,1718,6


In [85]:
df.groupby('white_balance').nunique().sort_values(by='id', ascending=False).head(10)

Unnamed: 0_level_0,index,id,compression,make,model,software,date_and_time_modified,exposure,aperture,iso_speed,...,secret,title,lat,lon,acc,country,admin_lvl1,admin_lvl2,city,continent
white_balance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Auto,255288,255288,12,236,3754,5948,236304,3069,134,1416,...,255230,166908,93130,93630,17,212,2052,5224,17871,6
Manual,35610,35610,9,84,977,1335,33173,313,94,97,...,35607,24353,10973,10982,17,143,942,1996,4469,6
Unknown (5),44,44,0,1,1,0,44,9,2,2,...,44,44,1,1,1,1,1,0,1,1
Unknown (4),10,10,2,2,5,5,8,2,2,5,...,10,10,5,5,2,1,2,2,1,1
Unknown (6),7,7,1,1,1,1,7,4,1,1,...,7,7,0,0,0,0,0,0,0,0
Unknown (65535),3,3,1,3,2,2,2,0,0,0,...,3,3,2,2,2,2,2,1,0,1
Unknown (288),3,3,1,1,1,1,1,2,1,1,...,3,3,1,1,1,1,1,1,1,1
Unknown (32),3,3,1,1,1,1,3,3,3,2,...,3,3,0,0,0,0,0,0,0,0
Unknown (2640),2,2,0,1,1,1,2,2,2,1,...,2,2,0,0,0,0,0,0,0,0
Unknown (256),2,2,0,1,1,1,2,2,1,1,...,2,2,1,1,1,1,1,1,1,1


In [92]:
df['title'] = df['title'].astype('string')

In [95]:
df['title'].info()

<class 'pandas.core.series.Series'>
Index: 365127 entries, 0 to 365126
Series name: title
Non-Null Count   Dtype 
--------------   ----- 
361125 non-null  string
dtypes: string(1)
memory usage: 5.6 MB


In [121]:
from collections import Counter
# 100 most frequent whitespace-separated words across all non-missing titles
words = Counter(
    word
    for title in df['title'].dropna()
    for word in title.split()
).most_common(100)

In [125]:
type(words[0])

tuple

In [128]:
df_words_top100 = pd.DataFrame(words, columns = ['word', 'count'])

In [129]:
df_words_top100

Unnamed: 0,word,count
0,-,61988
1,de,20862
2,the,18493
3,of,17531
4,in,15127
...,...,...
95,Of,858
96,House,854
97,Art,853
98,Church,849


In [130]:
df_words_top100.to_csv('./data/df_words_top100.csv')

In [118]:
df[df['country'] == 'Germany'].groupby('admin_lvl1').nunique().sort_values(by='id', ascending=False)

Unnamed: 0_level_0,index,id,compression,make,model,software,date_and_time_modified,exposure,aperture,iso_speed,...,secret,title,lat,lon,acc,country,admin_lvl2,city,continent,time_period
admin_lvl1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Nordrhein-Westfalen,6668,6668,5,47,320,379,6109,237,68,103,...,6667,2042,1812,1823,10,1,5,207,1,4396
Baden-Württemberg,3220,3220,5,41,295,345,2657,242,62,91,...,3220,2554,1758,1779,14,1,4,306,1,1600
Berlin,2573,2573,4,45,264,294,2195,239,63,71,...,2573,1629,1185,1207,12,1,0,2,1,1580
Bayern,2248,2248,7,46,290,420,1863,224,62,52,...,2248,1934,1285,1301,10,1,7,319,1,1119
Hessen,1322,1322,4,43,221,243,1088,169,54,47,...,1322,1089,753,764,9,1,4,137,1,716
Schleswig-Holstein,1236,1236,5,30,173,177,1182,114,58,64,...,1236,848,817,841,9,1,0,177,1,604
Rheinland-Pfalz,1097,1097,5,30,135,148,1005,130,48,67,...,1097,716,453,451,9,1,0,138,1,677
Niedersachsen,928,928,3,29,181,224,839,143,44,71,...,928,621,593,604,8,1,0,184,1,591
Hamburg,879,879,4,29,152,178,806,147,49,67,...,879,708,646,647,9,1,0,1,1,730
Sachsen,725,725,2,30,139,133,661,103,44,62,...,725,528,487,486,9,1,0,90,1,359
