### If the data is dirty, it has to be cleaned

In [4]:
# Python modules
import flickrapi
import pandas as pd
import os
import json
from dotenv import load_dotenv
load_dotenv()
import numpy as np
from datetime import date

# Import additional functions
from flickr_functions import df_remove_dupes

# Flickr API credentials, looked up in the process environment
# (None when the variable is not set)
api_key = os.environ.get('flickr_api_key')
api_secret = os.environ.get('flickr_api_secret')

# Flickr API client, configured with format='parsed-json'
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

# Directory where the CSV files are stored
data_dir = './data/'

In [5]:
# Read the EXIF dataframe from its CSV file; the file's first column is used
# as the row index
exif_csv_path = data_dir + 'df_photo_exif_final.csv'
df = pd.read_csv(exif_csv_path, index_col=[0])

In [6]:
def _truncate_to_leading_number(column, separator):
    """Reduce separator-containing strings in ``column`` to a leading integer.

    For every string entry that contains ``separator``, the text before the
    first separator is parsed as a float and truncated to an int (with a
    space separator, ``'12.5 mm'`` becomes ``12``). Every other entry,
    including missing values, is returned unchanged.

    Parameters
    ----------
    column : pandas.Series
        Series holding strings and possibly missing values.
    separator : str
        Literal text that marks the end of the numeric part.

    Returns
    -------
    pandas.Series
        A copy of ``column`` with the affected entries replaced.

    Raises
    ------
    ValueError
        If the text before the separator cannot be parsed as a number.
    """
    # na=False: missing entries never count as a match
    needs_repair = column.str.contains(separator, regex=False, na=False)
    repaired = column.copy()
    if needs_repair.any():
        repaired[needs_repair] = column[needs_repair].map(
            lambda text: int(float(text.split(separator)[0]))
        )
    return repaired


# Turn the loaded row index into a regular column (named 'index') so it can be
# dropped together with the other unused columns
df = df.reset_index()

# Columns that are not needed
columns_to_drop = [
    'index',
    'Image Width',
    'Image Height',
    'Orientation',
    'Date and Time (Modified)',
    'Date and Time (Digitized)',
]
df = df.drop(columns=columns_to_drop)

# Normalise column names: lower case, underscores instead of spaces
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
df = df.rename(columns={'date_and_time_(original)': 'date_and_time', 'owner': 'user_id'})

# Keep only the first row for each photo id
df = df.drop_duplicates(subset='id', keep='first')

# Replace the placeholder strings 'na' and 'tbd' with proper NaN
df = df.replace(to_replace='na', value=np.nan).replace(to_replace='tbd', value=np.nan)

# Repair values that carry extra text after the number.
# NOTE(review): the repaired values are truncated to whole numbers, so any
# fractional part before the separator is lost - confirm this is intended.
df['iso_speed'] = _truncate_to_leading_number(df['iso_speed'], ',')
df['focal_length'] = _truncate_to_leading_number(df['focal_length'], ' ')

# Non-numeric markers become missing values (np.nan is the canonical spelling)
df.loc[df['aperture'].isin(['undef', 'Centered']), 'aperture'] = np.nan
df.loc[df['focal_length'] == 'Other', 'focal_length'] = np.nan

# Data types
df['id'] = df['id'].astype(int)
# pd.to_numeric is left strict on purpose: an unexpected non-numeric value
# raises here instead of silently turning into NaN
for numeric_column in ('aperture', 'iso_speed', 'focal_length', 'acc'):
    df[numeric_column] = pd.to_numeric(df[numeric_column])
# Timestamps that do not match the format are coerced to NaT
df['date_and_time'] = pd.to_datetime(
    df['date_and_time'], format='%Y:%m:%d %H:%M:%S', errors='coerce'
)
df['lat'] = df['lat'].astype(float)
df['lon'] = df['lon'].astype(float)

string_columns = [
    'compression', 'make', 'model', 'software', 'exposure', 'flash',
    'white_balance', 'user_id', 'secret', 'country', 'admin_lvl1',
    'admin_lvl2', 'city',
]
df = df.astype({name: 'string' for name in string_columns})

# Reset index again.
# NOTE(review): without drop=True the previous row labels are kept as a new
# 'index' column, which then shows up in df.info() and in the saved CSV.
# Left unchanged because later cells/files may expect that column - confirm
# whether reset_index(drop=True) was intended.
df = df.reset_index()

In [7]:
# Print a summary of the cleaned frame: columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365127 entries, 0 to 365126
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   index          365127 non-null  int64         
 1   id             365127 non-null  int64         
 2   compression    258898 non-null  string        
 3   make           300271 non-null  string        
 4   model          300504 non-null  string        
 5   software       287517 non-null  string        
 6   exposure       294586 non-null  string        
 7   aperture       293263 non-null  float64       
 8   iso_speed      293729 non-null  float64       
 9   date_and_time  302390 non-null  datetime64[ns]
 10  flash          292258 non-null  string        
 11  focal_length   292990 non-null  float64       
 12  white_balance  290995 non-null  string        
 13  user_id        365127 non-null  string        
 14  secret         365127 non-null  string        
 15  

In [4]:
# Save the cleaned EXIF dataframe. The path is built from data_dir, like the
# input file's path, instead of repeating the directory as a second literal.
# The row index is written too (to_csv default), as the first CSV column.
df.to_csv(data_dir + 'df_photo_exif_cleaned.csv')