Data Preprocessing Pipeline for this [competition](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/).

In [1]:
import os, sys
from subprocess import Popen, PIPE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tifffile as tiff
import cv2
import random
import lmdb
from tqdm import tqdm

In [11]:
# append the path for custom modules
sys.path.append('../pyimgsaliency/')
import pyimgsaliency

sys.path.append('../new_data_pipeline/')
from datum_pb2 import Datum

In [2]:
proc = Popen('du -sh ../data/raw/*', shell=True, stdout=PIPE, stderr=PIPE)
print proc.communicate()[0]

690M	../data/raw/test-jpg
348M	../data/raw/test-jpg-additional
31G	../data/raw/test-tif-v2
687M	../data/raw/train-jpg
20G	../data/raw/train-tif-v2
1.4M	../data/raw/train_v2.csv



In [3]:
tiff_img_train = '../data/raw/train-tif-v2/'
tiff_img_test = '../data/raw/test-tif-v2/'
lmdb_dir = '../data/processed/lmdb/traindb/'

In [4]:
df_train = pd.read_csv('../data/raw/train_v2.csv')
print df_train.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40479 entries, 0 to 40478
Data columns (total 2 columns):
image_name    40479 non-null object
tags          40479 non-null object
dtypes: object(2)
memory usage: 632.6+ KB
None


Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [6]:
tags = df_train['tags'].apply(lambda x: x.split(' '))

labels = []
for i in xrange(len(df_train)):
    labels.append(tags.values[i])

labels = set([item for sublist in labels for item in sublist])
print "{} unique labels: ".format(len(labels))
for label in labels: print label

17 unique labels: 
slash_burn
clear
blooming
primary
cloudy
conventional_mine
water
haze
cultivation
partly_cloudy
artisinal_mine
habitation
bare_ground
blow_down
agriculture
road
selective_logging


In [7]:
labelmap = {l:i for i, l in enumerate(labels)}
print "labelmap:", labelmap

labelmap: {'selective_logging': 16, 'cultivation': 8, 'clear': 1, 'habitation': 11, 'conventional_mine': 5, 'cloudy': 4, 'primary': 3, 'water': 6, 'haze': 7, 'slash_burn': 0, 'partly_cloudy': 9, 'artisinal_mine': 10, 'blooming': 2, 'bare_ground': 12, 'blow_down': 13, 'agriculture': 14, 'road': 15}


In [16]:
# write to LMDB

env = lmdb.open(lmdb_dir + 'datumdb', max_dbs=2)
labeldb = env.open_db('labeldb')
key = 0

for fname, tags in tqdm(df_train.values, miniters=500):
    key += 1
    im = tiff.imread(tiff_img_train + fname + '.tif')
    targets = np.zeros(17, dtype='uint8')
    for t in tags.split(' '):
        targets[labelmap[t]] = 1
    
    datum = Datum()
    imageDatum = datum.imgdata.add()
    imageDatum.data = im.tobytes()
    imageDatum.identifier = str(key)
    
    label = Datum()
    labelDatum = label.classs
    labelDatum.multilabel = targets.tobytes()
    labelDatum.identifier = str(key)
    
    with env.begin(write=True) as txn:
        txn.put(str(key).encode('ascii'), datum.SerializeToString())
        
    with env.begin(write=True, db=labeldb):
        txn.put(str(key).encode('ascii'), label.SerializeToString())
    
env.close()

  0%|          | 0/40479 [00:00<?, ?it/s]


Error: Attempt to operate on closed/deleted/dropped object.

In [13]:
# some basic tests
try:
    assert os.path.exists(lmdb_dir + 'datumdb')
except AssertionError:
    print "lmdb database was not created."

lmdb database was not created.


In [8]:
# create oneHot vector

oneHotOutput = []
for _, tags in tqdm(df_train.values, miniters=500):
    targets = np.zeros(17, dtype='uint8')
    for t in tags.split(' '):
        targets[labelmap[t]] = 1
    
    oneHotOutput.append(targets)

100%|██████████| 40479/40479 [00:00<00:00, 162644.71it/s]


In [11]:
oneHotOutput = np.array(oneHotOutput)
oneHot_df = pd.DataFrame(oneHotOutput)
print oneHot_df.info()
oneHot_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40479 entries, 0 to 40478
Data columns (total 17 columns):
0     40479 non-null uint8
1     40479 non-null uint8
2     40479 non-null uint8
3     40479 non-null uint8
4     40479 non-null uint8
5     40479 non-null uint8
6     40479 non-null uint8
7     40479 non-null uint8
8     40479 non-null uint8
9     40479 non-null uint8
10    40479 non-null uint8
11    40479 non-null uint8
12    40479 non-null uint8
13    40479 non-null uint8
14    40479 non-null uint8
15    40479 non-null uint8
16    40479 non-null uint8
dtypes: uint8(17)
memory usage: 672.1 KB
None


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0


In [None]:
oneHot_df.to_csv('../data')

In [5]:
df_train.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [7]:
image_binding_df = df_train['image_name']
print image_binding_df.count()
image_binding_df.head()

40479


0    train_0
1    train_1
2    train_2
3    train_3
4    train_4
Name: image_name, dtype: object

In [16]:
image_binding_df.to_csv('../data/processed/image_binding.csv', header=True, index=False)

In [9]:
proc = Popen('du -sh ../data/processed/*', shell=True, stdout=PIPE, stderr=PIPE)
print proc.communicate()[0]

692K	../data/processed/image_binding.csv
8.0K	../data/processed/lmdb
1.6M	../data/processed/oneHotFeatures.csv



In [17]:
temp_df = pd.read_csv('../data/processed/image_binding.csv')
temp_df.head()

Unnamed: 0,image_name
0,train_0
1,train_1
2,train_2
3,train_3
4,train_4
