# Kaggle Landmark Recognition Data Exploration

In this notebook, we explore the training and test metadata for the Kaggle Landmark Recognition challenge.

In [50]:
# Import modules

import os

import pandas as pd
import numpy as np
import urllib2
from PIL import Image
from StringIO import StringIO

import plotly.graph_objs as go
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import tools
#import colorlover as cl


# Put plotly into offline notebook mode before any figure is drawn with iplot().
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

In [31]:
# Load data

# Root folder holding the raw Kaggle files.  The default is the original
# machine-specific location; set the LANDMARK_DATA_DIR environment variable to
# point the notebook at a different copy of the data without editing this cell.
data_dir = os.environ.get(
    'LANDMARK_DATA_DIR',
    'C:\\Users\\Colleen\\Documents\\Kaggle Landmark Recognition\\raw_data')

# Load training info.  The context manager closes the handle even when
# read_csv raises, which the previous open()/close() pair did not guarantee.
with open(os.path.join(data_dir, 'train.csv'), 'r') as f:
    train_info = pd.read_csv(f)

# Load testing info
with open(os.path.join(data_dir, 'test.csv'), 'r') as f:
    test_info = pd.read_csv(f)

# Load list of downloaded training images: every entry of the image folder
# with its file extension stripped.
im_dir = 'train_images'
train_im_ids = [os.path.splitext(x)[0] for x in os.listdir(os.path.join(data_dir, im_dir))]

In [17]:
# Preview the first five rows of the training metadata
train_info.head(n=5)

Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651
2,6b2bb500b6a38aa0,http://lh6.ggpht.com/-vKr5G5MEusk/SR6r6SJi6mI/...,11284
3,b399f09dee9c3c67,https://lh3.googleusercontent.com/-LOW2cjAqubA...,8429
4,19ace29d77a5be66,https://lh5.googleusercontent.com/-tnmSXwQcWL8...,6231


In [18]:
# Get class counts of all training images
# count() tallies the non-null entries of each remaining column per
# landmark_id, so train_class_cnts['id'] is the number of image ids per class.
train_class_cnts = train_info.groupby('landmark_id').count()

# One row per distinct landmark_id, so the row count is the number of classes.
# print is called with a single parenthesised argument, which behaves the same
# under Python 2 and Python 3 and matches the print(...) call used later on.
print('Total number of classes: ' + str(train_class_cnts.shape[0]))

Total number of classes: 14951


In [21]:
# Histogram of the per-class image counts over the full training set.
# The x-axis view is limited to [0, 1000]; both axes are titled so the figure
# can be read without the surrounding code.
iplot(go.Figure(data = [go.Histogram(x = train_class_cnts['id'])],
                layout = go.Layout(title = 'Distribution of Class Counts in Total Training Data',
                                   xaxis = dict(title = 'Images per class',
                                                range = [0, 1000]),
                                   yaxis = dict(title = 'Number of classes'))))

This graph shows that the class counts are heavily skewed, with few classes having more than 200 samples in the training set. To train on all classes while using only a portion of the full training set, we'll need to stratify the download so that we get up to 100 images per class (where that many are available).

In [29]:
# Get class counts of current downloaded training images
# Index the metadata by image id (keeping the 'id' column) so that rows can be
# selected by id.  set_index returns a new frame, so train_info is untouched.
temp_df = train_info.set_index('id', drop=False)

# Pick the rows of the downloaded images, in the order of train_im_ids, and
# renumber them 0..n-1.
# NOTE(review): on recent pandas, .loc with a list raises KeyError when any
# label is missing from the index — confirm every file in the image folder
# corresponds to an id in train.csv.
data_df = temp_df.loc[train_im_ids, :].reset_index(drop=True)

data_class_cnts = data_df.groupby('landmark_id').count()

# print with a single parenthesised argument works on Python 2 and Python 3.
print('Number of classes in downloaded images: ' + str(data_class_cnts.shape[0]))

Number of classes in downloaded images: 7816


In [30]:
# Histogram of the per-class image counts over the downloaded subset.
# The x-axis view is limited to [0, 1000]; both axes are titled so the figure
# can be read without the surrounding code.
iplot(go.Figure(data = [go.Histogram(x = data_class_cnts['id'])],
                layout = go.Layout(title = 'Distribution of Class Counts in Downloaded Data',
                                   xaxis = dict(title = 'Images per class',
                                                range = [0, 1000]),
                                   yaxis = dict(title = 'Number of classes'))))

In [35]:
# Get number of test samples:

# print with a single parenthesised argument works on Python 2 and Python 3.
print('Number of test samples to submit: ' + str(test_info.shape[0]))
# Duplicates = total number of rows minus the number of distinct ids.
print('Number of duplicate test samples: ' + str(test_info.shape[0] - len(np.unique(test_info['id']))))

Number of test samples to submit: 117703
Number of duplicate test samples: 0


In [39]:
# Show the URL of the second training row (the stray trailing '.' made this
# cell a syntax error).
train_info['url'].iloc[1]

'http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/AAAAAAAAAE4/cDiNGkoQX88/s1600/'

In [45]:
# Fetch one sample image (the URL shown above with its size tag changed to
# s128).  image_data is pre-set to None so that a failed download leaves a
# defined value behind instead of an unbound name.
image_data = None
try:
    response = urllib2.urlopen('http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/AAAAAAAAAE4/cDiNGkoQX88/s128/')
    try:
        image_data = response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()
except Exception as err:
    # Still best-effort, but "except Exception" lets KeyboardInterrupt and
    # SystemExit through, and the cause is reported instead of discarded.
    print('Warning: Could not download image (' + str(err) + ')')

In [51]:
# Decode the downloaded bytes from an in-memory buffer, convert the picture to
# RGB, and write it to disk as a JPEG.
pil_image = Image.open(StringIO(image_data))
pil_image_rgb = pil_image.convert(mode='RGB')
pil_image_rgb.save('test.jpg', 'JPEG', quality=90)

In [56]:
# Collect the second-to-last '/'-separated piece of every training URL.  For a
# URL that ends in '/', this is its final path segment.
endings = []
for url in train_info['url']:
    endings.append(url.split('/')[-2])

In [58]:
# Distinct values among the extracted URL endings (np.unique returns them sorted).
unique_endings = np.unique(endings)
# A bare assignment displays nothing; end the cell with the name so the array
# is shown as the cell output.
unique_endings

array(['', '1920x1280', '21490', '3x8TmrJa51I', 'S128', 'S36-C', 'd',
       'h120', 'h181', 'h301', 'h371', 'h400', 'h415', 'h450', 'h50',
       'h500', 'h520', 'imgcache', 'iw-thumbnail', 'large',
       'lh3.googleusercontent.com', 'lh4.ggpht.com',
       'lh4.googleusercontent.com', 'lh5.googleusercontent.com', 'medium',
       'mini_square', 'original', 'pLBiAq6ptGA', 'page', 'public', 'rj',
       's0', 's0-d', 's0-d-d', 's0-w90-c-d-h90-n-k', 's100', 's100-c',
       's100-c-k', 's100-c-k-no', 's100-c-o', 's100-o', 's1000',
       's1000-fcrop64%253D1%2C03010cf8ff9ee21c', 's1020', 's1024',
       's1024-d', 's1034', 's104', 's104-c', 's105-c', 's106-p', 's1065',
       's1069', 's1080', 's1085', 's110', 's1100', 's1109', 's1128',
       's114', 's1150', 's1152', 's120', 's120-c', 's1200', 's122',
       's1238', 's126-p', 's127-c', 's128', 's128-c', 's1280', 's130',
       's130-c', 's133', 's1362', 's1365', 's1371', 's14', 's140',
       's140-c', 's1400', 's1403', 's141-p', 's