# Data Exploration and Processing

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

In [3]:
import sys, os
from os import path

In [4]:
# Pretty display for notebooks
%matplotlib inline

In [5]:
# Data path definitions:
# root of all data files, and the folder that receives generated statistics
data_dir = "../data"
stats_dir = "../docs/stats"

# CSV metadata files that were downloaded from Kaggle
input_csv_dir = os.path.join(data_dir, "input_csv")

# Image directories for the training, validation and test splits
train_dir, validation_dir, test_dir = (
    os.path.join(data_dir, split_name)
    for split_name in ("train", "validation", "test")
)

The data was obtained from [CVDF Google Landmarks Dataset v2](https://github.com/cvdfoundation/google-landmark)

## Data Exploration

### Exploring file : train_label_to_category.csv

[train_label_to_category.csv](https://s3.amazonaws.com/google-landmark/metadata/train_label_to_category.csv) : CSV with landmark_id,category fields:  
    `landmark_id` is an integer, `category` is a Wikimedia URL referring to the
    class definition.


In [6]:
# Load the landmark_id -> category lookup table.
# Only I/O and parsing failures are reported as a loading problem; anything
# else (e.g. an interrupt or running out of memory) is left to propagate
# instead of being mislabelled as a missing dataset.
try:
    train_label_to_category = pd.read_csv(os.path.join(input_csv_dir, "train_label_to_category.csv"))
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as load_error:
    print("File could not be loaded. Is the dataset missing?")
    print("Reason: {}: {}".format(type(load_error).__name__, load_error))
else:
    print("File has {} samples with {} features each.".format(*train_label_to_category.shape))

File has 203094 samples with 2 features each.


In [7]:
# Display the first rows of the label-to-category table
display(train_label_to_category.head())

Unnamed: 0,landmark_id,category
0,0,http://commons.wikimedia.org/wiki/Category:Hap...
1,1,http://commons.wikimedia.org/wiki/Category:Lui...
2,2,http://commons.wikimedia.org/wiki/Category:Gra...
3,3,http://commons.wikimedia.org/wiki/Category:Twe...
4,4,http://commons.wikimedia.org/wiki/Category:San...


In [8]:
# del train_label_to_category #delete dataframe to free memory

### Exploring input file : train.csv

[train.csv](https://s3.amazonaws.com/google-landmark/metadata/train.csv): CSV with id,url,landmark_id fields. `id` is a 16-character
    string, `url` is a string, `landmark_id` is an integer.
 
 Contains url to download images from the web, along with their ids.

In [9]:
# Load the image index (id, url, landmark_id).
# Only I/O and parsing failures are reported as a loading problem; anything
# else (e.g. an interrupt or running out of memory) is left to propagate
# instead of being mislabelled as a missing dataset.
try:
    train_data = pd.read_csv(os.path.join(input_csv_dir, "train.csv"))
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as load_error:
    print("File could not be loaded. Is the dataset missing?")
    print("Reason: {}: {}".format(type(load_error).__name__, load_error))
else:
    print("File has {} samples with {} features each.".format(*train_data.shape))

File has 4132914 samples with 3 features each.


In [10]:
# Display the top rows of the file
display(train_data.head())

Unnamed: 0,id,url,landmark_id
0,6e158a47eb2ca3f6,https://upload.wikimedia.org/wikipedia/commons...,142820
1,202cd79556f30760,http://upload.wikimedia.org/wikipedia/commons/...,104169
2,3ad87684c99c06e1,http://upload.wikimedia.org/wikipedia/commons/...,37914
3,e7f70e9c61e66af3,https://upload.wikimedia.org/wikipedia/commons...,102140
4,4072182eddd0100e,https://upload.wikimedia.org/wikipedia/commons...,2474


### Selecting Image Subset from Input File :

#### Displaying the top Landmarks with largest number of image samples 

Here we do a frequency count on `landmark_id`, keep the 60 most frequent landmarks, and display a slice of that ranking

In [11]:
# Count the images of every landmark and keep the 60 most frequent landmarks.
# The top-60 selection is done on the counts Series and the column is named
# explicitly: the column name produced by value_counts().to_frame() depends on
# the pandas version ('landmark_id' before 2.0, 'count' from 2.0 on), so
# selecting with DataFrame.nlargest(60, 'landmark_id') fails on newer pandas.
top_train_data = (
    train_data['landmark_id']
    .value_counts()
    .nlargest(60)
    .to_frame(name='images_count')
)

In [12]:
# Show positions 33 to 54 of the ranking (positional slice, end exclusive)
display(top_train_data.iloc[33:55])

Unnamed: 0,images_count
149980,1108
46500,1107
14915,1093
120734,1093
33992,1088
40088,1056
164773,1048
171683,1047
176018,1045
168098,1037


#### Select 10 Labels with number of samples $\approx$ 1000 :

From the table above we see that the labels ranked 40th to 50th have around 1000 samples each

In [13]:
# Hand-picked landmark ids: ten landmarks with roughly 1000 training images each
selected_labels_indices = [40088, 164773, 176018, 168098, 165900, 25093, 9070, 127516, 56827, 147897]

# Look up the category of every selected landmark, attach its image count
# (top_train_data is indexed by landmark id) and order by that count.
selected_labels_df = train_label_to_category[train_label_to_category['landmark_id'].isin(selected_labels_indices)]
selected_labels_df = selected_labels_df.join(top_train_data, on='landmark_id', how='left').sort_values(by='images_count', ascending=False)

# Number the rows 1..N in ranking order
selected_labels_df.index = np.arange(1, selected_labels_df.shape[0] + 1)

# Reduce the category URL to the bare category name. The prefix is removed as
# a literal string from the `category` column only, so the dots in the URL are
# not interpreted as regex wildcards and no other column is touched.
selected_labels_subset = selected_labels_df.assign(
    category=selected_labels_df['category'].str.replace(
        'http://commons.wikimedia.org/wiki/Category:', '', regex=False
    )
)
display(selected_labels_subset)

Unnamed: 0,landmark_id,category,images_count
1,40088,Masada,1056
2,164773,Dead_Sea,1048
3,176018,Hayravank_monastery,1045
4,168098,Golden_Gate_Bridge,1037
5,165900,Mount_Arapiles,1027
6,25093,Matka_Canyon,1017
7,9070,Feroz_Shah_Kotla,978
8,127516,Burrator,976
9,56827,Kazan,975
10,147897,Kasteel_Amerongen,961


#### Save Selected Labels to CSV

In [50]:
# Save the selected labels subset to csv (the 1-based row index is written as the first column)
selected_labels_subset.to_csv(os.path.join(stats_dir, "selected_labels_subset.csv"))

## Data Preprocessing

### Extracting subset of training data :

Here we extract the subset of `train_data` that contains only the ten landmarks defined in `selected_labels_subset`

In [14]:
# Keep only the rows whose landmark_id belongs to one of the selected landmarks
train_data_subset = train_data.loc[
    train_data['landmark_id'].isin(selected_labels_subset['landmark_id'])
]

In [15]:
# Bare last expression: the notebook renders the filtered frame (long frames are shown truncated)
train_data_subset

Unnamed: 0,id,url,landmark_id
349,9c19ee11f0902178,https://upload.wikimedia.org/wikipedia/commons...,165900
922,37b2522f3895b2f9,https://upload.wikimedia.org/wikipedia/commons...,176018
1244,bba39914defa5ca0,https://upload.wikimedia.org/wikipedia/commons...,127516
2013,7a12b3cc85fb8410,https://upload.wikimedia.org/wikipedia/commons...,164773
2236,208406f06dd0b2fc,https://upload.wikimedia.org/wikipedia/commons...,164773
...,...,...,...
4131020,41821760967e8625,https://upload.wikimedia.org/wikipedia/commons...,165900
4131367,22023ea9d00371a0,https://upload.wikimedia.org/wikipedia/commons...,168098
4131585,f0e41092bce16658,https://upload.wikimedia.org/wikipedia/commons...,165900
4131864,fc17525a42d608fa,https://upload.wikimedia.org/wikipedia/commons...,56827


Saving the subset of training data to a CSV file:

### Splitting Selected Image Subset into Train, Test and Validation Sets

In [25]:
from sklearn.model_selection import train_test_split

Splitting data into 70% training, 15% validation and 15% testing. The split is stratified :

In [17]:
# First split: 70% of the subset becomes the training set, the remaining 30%
# is held out. Only the frame itself is passed in; the labels are used solely
# for stratification, so there are no throwaway outputs to discard into `_`
# (which IPython also uses for the last cell output).
train, validation_test = train_test_split(
    train_data_subset,
    test_size=0.3,
    random_state=1,
    stratify=train_data_subset['landmark_id'],
)

# Second split: the held-out 30% is halved into validation and test,
# i.e. 15% of the subset each, again stratified by landmark.
validation, test = train_test_split(
    validation_test,
    test_size=0.5,
    random_state=1,
    stratify=validation_test['landmark_id'],
)

#### Frequency count of training, validation and test data

In [18]:
# Number of distinct values per column for each landmark in the training split
train.groupby('landmark_id').nunique()

Unnamed: 0_level_0,id,url,landmark_id
landmark_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9070,685,685,1
25093,712,712,1
40088,739,739,1
56827,682,682,1
127516,683,683,1
147897,673,673,1
164773,734,734,1
165900,719,719,1
168098,726,726,1
176018,731,731,1


In [19]:
# Number of distinct values per column for each landmark in the validation split
validation.groupby('landmark_id').nunique()

Unnamed: 0_level_0,id,url,landmark_id
landmark_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9070,146,146,1
25093,153,153,1
40088,159,159,1
56827,146,146,1
127516,147,147,1
147897,144,144,1
164773,157,157,1
165900,154,154,1
168098,155,155,1
176018,157,157,1


In [20]:
# Number of distinct values per column for each landmark in the test split
test.groupby('landmark_id').nunique()

Unnamed: 0_level_0,id,url,landmark_id
landmark_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9070,147,147,1
25093,152,152,1
40088,158,158,1
56827,147,147,1
127516,146,146,1
147897,144,144,1
164773,157,157,1
165900,154,154,1
168098,156,156,1
176018,157,157,1


### `landmark_id` Lookup and Construction of Download Paths

In [21]:
# Attach the category name of each landmark to the training rows.
# The guard keeps this cell safe to re-run: `train` is overwritten by the
# merge, so merging a second time would yield `category_x` / `category_y`
# columns and the path construction below would fail.
if 'category' not in train.columns:
    train = train.merge(selected_labels_subset, on='landmark_id')

# Download folder of each image: <train_dir>/<category>
train['path'] = train_dir + "/" + train['category'].astype(str)
train

Unnamed: 0,id,url,landmark_id,category,path
0,e8e3e507493b837e,https://upload.wikimedia.org/wikipedia/commons...,40088,Masada,../data/train/Masada
1,2c642f3336735f92,https://upload.wikimedia.org/wikipedia/commons...,40088,Masada,../data/train/Masada
2,2fc7b54c17fee6c8,https://upload.wikimedia.org/wikipedia/commons...,40088,Masada,../data/train/Masada
3,c219e4813fd16ed7,https://upload.wikimedia.org/wikipedia/commons...,40088,Masada,../data/train/Masada
4,2c3cea363ee7cd66,https://upload.wikimedia.org/wikipedia/commons...,40088,Masada,../data/train/Masada
...,...,...,...,...,...
7079,f427e30b38a778b7,https://upload.wikimedia.org/wikipedia/commons...,164773,Dead_Sea,../data/train/Dead_Sea
7080,99a65e07d6a1ec83,https://upload.wikimedia.org/wikipedia/commons...,164773,Dead_Sea,../data/train/Dead_Sea
7081,5066d07323d5264d,https://upload.wikimedia.org/wikipedia/commons...,164773,Dead_Sea,../data/train/Dead_Sea
7082,a81dfde083ce4602,https://upload.wikimedia.org/wikipedia/commons...,164773,Dead_Sea,../data/train/Dead_Sea


In [22]:
# Attach the category name of each landmark to the validation rows.
# The guard keeps this cell safe to re-run: `validation` is overwritten by the
# merge, so merging a second time would yield `category_x` / `category_y`
# columns and the path construction below would fail.
if 'category' not in validation.columns:
    validation = validation.merge(selected_labels_subset, on='landmark_id')

# Download folder of each image: <validation_dir>/<category>
validation['path'] = validation_dir + "/" + validation['category'].astype(str)
validation

Unnamed: 0,id,url,landmark_id,category,path
0,e4a769b80565d595,https://upload.wikimedia.org/wikipedia/commons...,25093,Matka_Canyon,../data/validation/Matka_Canyon
1,c1e879406e4c0c96,http://upload.wikimedia.org/wikipedia/commons/...,25093,Matka_Canyon,../data/validation/Matka_Canyon
2,dc678fe7b579924f,http://upload.wikimedia.org/wikipedia/commons/...,25093,Matka_Canyon,../data/validation/Matka_Canyon
3,814be91435675ef9,http://upload.wikimedia.org/wikipedia/commons/...,25093,Matka_Canyon,../data/validation/Matka_Canyon
4,48fc1c67c46a5691,http://upload.wikimedia.org/wikipedia/commons/...,25093,Matka_Canyon,../data/validation/Matka_Canyon
...,...,...,...,...,...
1513,b06a1a6902900046,https://upload.wikimedia.org/wikipedia/commons...,56827,Kazan,../data/validation/Kazan
1514,449c56391e0e9624,https://upload.wikimedia.org/wikipedia/commons...,56827,Kazan,../data/validation/Kazan
1515,0d858e81ff36a2d0,https://upload.wikimedia.org/wikipedia/commons...,56827,Kazan,../data/validation/Kazan
1516,0c297ae3d645185a,https://upload.wikimedia.org/wikipedia/commons...,56827,Kazan,../data/validation/Kazan


In [23]:
# Attach the category name of each landmark to the test rows.
# The guard keeps this cell safe to re-run: `test` is overwritten by the
# merge, so merging a second time would yield `category_x` / `category_y`
# columns and the path construction below would fail.
if 'category' not in test.columns:
    test = test.merge(selected_labels_subset, on='landmark_id')

# Download folder of each image: <test_dir>/<category>
test['path'] = test_dir + "/" + test['category'].astype(str)
test

Unnamed: 0,id,url,landmark_id,category,path
0,2012261ca8b015be,https://upload.wikimedia.org/wikipedia/commons...,176018,Hayravank_monastery,../data/test/Hayravank_monastery
1,a79a31c20635877e,https://upload.wikimedia.org/wikipedia/commons...,176018,Hayravank_monastery,../data/test/Hayravank_monastery
2,8538767d1668a0a8,https://upload.wikimedia.org/wikipedia/commons...,176018,Hayravank_monastery,../data/test/Hayravank_monastery
3,00a68f850d7b4d4b,https://upload.wikimedia.org/wikipedia/commons...,176018,Hayravank_monastery,../data/test/Hayravank_monastery
4,d710d27f36e5b67c,https://upload.wikimedia.org/wikipedia/commons...,176018,Hayravank_monastery,../data/test/Hayravank_monastery
...,...,...,...,...,...
1513,e8b8ae6551193999,https://upload.wikimedia.org/wikipedia/commons...,147897,Kasteel_Amerongen,../data/test/Kasteel_Amerongen
1514,b8244b4e3ab285b2,https://upload.wikimedia.org/wikipedia/commons...,147897,Kasteel_Amerongen,../data/test/Kasteel_Amerongen
1515,ddd4dafa74179605,https://upload.wikimedia.org/wikipedia/commons...,147897,Kasteel_Amerongen,../data/test/Kasteel_Amerongen
1516,a3d96e3ef3d331dc,http://upload.wikimedia.org/wikipedia/commons/...,147897,Kasteel_Amerongen,../data/test/Kasteel_Amerongen


### Saving Training, Validation and Test Image data to csv files

In [24]:
train.to_csv(os.path.join(data_dir, "index_train.csv"), index=False) #save the training split index to csv (row index omitted)

In [25]:
test.to_csv(os.path.join(data_dir, "index_test.csv"), index=False) #save the test split index to csv (row index omitted)

In [26]:
validation.to_csv(os.path.join(data_dir, "index_validation.csv"), index=False) #save the validation split index to csv (row index omitted)

Optionally delete the input `train_data` dataframe to free memory (the statement below is commented out; uncomment it to run)

In [27]:
# del train_data #delete train dataframe to free memory

### Downloading Images

Here we download images with the help of `image_downloader.py`

#### Load image_downloader.py

In [57]:
%load_ext autoreload
%autoreload 2

from image_downloader import * #import custom script to download images
from IPython.display import clear_output

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Create Subdirectory tree for landmarks training, validation and test images :

In [None]:
# Build <split_dir>/<landmark_name> for every split and selected landmark.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so the cell can
# be re-run (and can follow a partial download) without FileExistsError.
for split_dir in (train_dir, validation_dir, test_dir):
    # Create the split root even if there are no landmarks to iterate over
    os.makedirs(split_dir, exist_ok=True)

for landmark_name in selected_labels_subset['category']:
    os.makedirs(os.path.join(train_dir, landmark_name), exist_ok=True)
    os.makedirs(os.path.join(validation_dir, landmark_name), exist_ok=True)
    os.makedirs(os.path.join(test_dir, landmark_name), exist_ok=True)

#### Defining a download helper function

In [None]:
# Serial Downloader function

#def download_images(dataset):
#    i = 0
#    total_sample_count = dataset.shape[0]
#    for image_data in dataset[['id', 'url', 'path']].values.tolist():
#        i+=1
#        print("image {} of {}".format(i, total_sample_count))
#        RunDownloadImage(image_data)
#        if i % 10 == 0 :
#            clear_output()

In [58]:
def download_images(dataset):
    """Hand the download details of every row in `dataset` to `RunDownloadImage`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns 'id', 'url' and 'path'.

    Notes
    -----
    All rows are passed in a single call, as one list of ``[id, url, path]``
    lists. `RunDownloadImage` is not defined in this notebook; presumably it
    is provided by the `image_downloader` star import and fetches each url
    into its path -- TODO confirm against that module.
    """
    download_rows = dataset.loc[:, ['id', 'url', 'path']]
    RunDownloadImage(download_rows.values.tolist())

#### Downloading training images

In [60]:
# Pass the id, url and path of every training row to the downloader
download_images(train)

../data/train/Dead_Sea/d9431d5ed4f2f1d2.jpg


#### Downloading validation images

In [61]:
# Pass the id, url and path of every validation row to the downloader
download_images(validation)

../data/validation/Kazan/c46622f0dd832bbd.jpg


#### Downloading test images

In [62]:
# Pass the id, url and path of every test row to the downloader
download_images(test)

../data/test/Mount_Arapiles/35caae52597f19ea.jpg
