In [1]:
# Notebook setup: load the autoreload extension and enable mode 2 (re-import
# modules before running code), and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from __future__ import print_function, division

In [3]:
import os.path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
import os
import sys
# Put the absolute path of the parent directory on the import path (once),
# presumably so the project package imported right after this can be found.
base_module_path = os.path.abspath(os.pardir)
if base_module_path not in sys.path:
    sys.path.append(base_module_path)
import ama as a

In [5]:
# Directory prefix (relative to the notebook, with trailing slash) that the
# cells below prepend to the CSV, image folder and pickle cache file names.
BASE_PATH = '../data/'

In [6]:
# Read the training label table, report its row count and preview it.
df = pd.read_csv(BASE_PATH + 'train_v2.csv')
print(len(df))
df.head()

40479


Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [7]:
# Derive the JPEG file location of every image from its name.
df['path'] = df['image_name'].map(
    lambda image_name: BASE_PATH + 'train-full-jpg/' + image_name + '.jpg'
)
df.head()

Unnamed: 0,image_name,tags,path
0,train_0,haze primary,../data/train-full-jpg/train_0.jpg
1,train_1,agriculture clear primary water,../data/train-full-jpg/train_1.jpg
2,train_2,clear primary,../data/train-full-jpg/train_2.jpg
3,train_3,clear primary,../data/train-full-jpg/train_3.jpg
4,train_4,agriculture clear habitation primary road,../data/train-full-jpg/train_4.jpg


In [8]:
# Collect every distinct label, in order of first appearance in the tag strings.
label_list = []
known_labels = set()  # mirrors label_list for constant-time membership checks
for tag_str in df['tags'].values:
    for label in tag_str.split(' '):
        if label in known_labels:
            continue
        known_labels.add(label)
        label_list.append(label)
label_list

['haze',
 'primary',
 'agriculture',
 'clear',
 'water',
 'habitation',
 'road',
 'cultivation',
 'slash_burn',
 'cloudy',
 'partly_cloudy',
 'conventional_mine',
 'bare_ground',
 'artisinal_mine',
 'blooming',
 'selective_logging',
 'blow_down']

In [9]:
# One indicator column per label: 1 when the label is one of the row's
# space-separated tags (whole-token match, not substring), otherwise 0.
for label in label_list:
    df[label] = df['tags'].apply(lambda tags: int(label in tags.split(' ')))
df.head()

Unnamed: 0,image_name,tags,path,haze,primary,agriculture,clear,water,habitation,road,cultivation,slash_burn,cloudy,partly_cloudy,conventional_mine,bare_ground,artisinal_mine,blooming,selective_logging,blow_down
0,train_0,haze primary,../data/train-full-jpg/train_0.jpg,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,train_1,agriculture clear primary water,../data/train-full-jpg/train_1.jpg,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,train_2,clear primary,../data/train-full-jpg/train_2.jpg,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,train_3,clear primary,../data/train-full-jpg/train_3.jpg,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,train_4,agriculture clear habitation primary road,../data/train-full-jpg/train_4.jpg,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0


In [10]:
def check_image(path):
    """Print an image's path and array shape, then draw it with matplotlib.

    Args:
        path: Location of the image file, passed to ``plt.imread``.
    """
    print(path)
    pixels = plt.imread(path)
    print(pixels.shape)
    plt.imshow(pixels)

In [11]:
### COMPUTE FEATURES ###
from PIL import Image, ImageStat
import scipy
import scipy.stats
import cv2

def get_features(path):
    """Build a flat list of hand-crafted image statistics for one file.

    The feature list is laid out, in order, as:
      1. PIL ``ImageStat`` per-band sum, mean, rms, var and stddev;
      2. kurtosis, then skewness, of channels 0, 1 and 2 of the PIL array;
      3. 256-bin histograms of the OpenCV grayscale image and of channels
         0, 1 and 2 of the PIL array;
      4. ``cv2.meanStdDev`` means followed by standard deviations;
      5. variance of the Laplacian (grayscale, then colour array);
      6. variance of the 5x5 Sobel response in x then y (grayscale, then
         colour array);
      7. number of grayscale pixels below 30 and above 225.

    Args:
        path: Location of the image file; it is opened once with PIL and once
            with ``cv2.imread`` in grayscale mode.

    Returns:
        A two-element list ``[path, features]``; the path is echoed back so
        the caller can match results to inputs.
    """
    pil_image = Image.open(path)
    band_stats = ImageStat.Stat(pil_image)

    features = []
    for per_band in (band_stats.sum, band_stats.mean, band_stats.rms,
                     band_stats.var, band_stats.stddev):
        features.extend(per_band)

    # Keep only the first three channels of the decoded image.
    pixels = np.array(pil_image)[:, :, :3]
    flat_channels = [pixels[:, :, c].ravel() for c in range(3)]
    features.extend([scipy.stats.kurtosis(values) for values in flat_channels])
    features.extend([scipy.stats.skew(values) for values in flat_channels])

    # Second read of the same file, this time as a single-channel image.
    gray = cv2.imread(path, 0)

    hist_args = (None, [256], [0, 256])  # no mask, 256 bins over [0, 256)
    features.extend(list(cv2.calcHist([gray], [0], *hist_args).flatten()))
    for c in range(3):
        features.extend(list(cv2.calcHist([pixels], [c], *hist_args).flatten()))

    mean, stddev = cv2.meanStdDev(pixels)
    features.extend(list(mean.flatten()))
    features.extend(list(stddev.flatten()))

    for source in (gray, pixels):
        features.append(cv2.Laplacian(source, cv2.CV_64F).var())

    for source in (gray, pixels):
        for dx, dy in ((1, 0), (0, 1)):
            features.append(cv2.Sobel(source, cv2.CV_64F, dx, dy, ksize=5).var())

    # Counts of very dark and very bright grayscale pixels.
    features.append((gray < 30).sum())
    features.append((gray > 225).sum())
    return [path, features]

In [14]:
from multiprocessing import Pool, cpu_count
def normalize_img(paths):
    """Compute the feature vector of every image in ``paths`` in parallel.

    Runs ``get_features`` on each path in a process pool sized to the number
    of CPUs, then returns the feature lists in the same order as ``paths``.

    Args:
        paths: Iterable of image file paths (iterated twice, so it must be
            re-iterable, e.g. a list or a pandas Series).

    Returns:
        A list with one feature list per entry of ``paths``, in input order.
    """
    pool = Pool(cpu_count())
    try:
        results = pool.map(get_features, paths)
    finally:
        # Always shut the workers down; without this every call leaves
        # cpu_count() idle processes behind for the life of the kernel.
        pool.close()
        pool.join()

    # get_features echoes the path back as element 0, features as element 1.
    features_by_path = {}
    for result in results:
        features_by_path[result[0]] = result[1]
    return [features_by_path[path] for path in paths]

In [15]:
import cPickle as pickle

In [16]:
# Reuse the cached feature frame when it can be loaded; otherwise compute the
# features for every image and write the cache for the next run.
# NOTE(security): pickle.load can execute arbitrary code, so only load a
# df_train.pkl that was produced by this notebook.
try:
    print('trying to unpickle...')
    # The with-block closes the cache file even when unpickling fails.
    with open(BASE_PATH + 'df_train.pkl', 'rb') as cached:
        df = pickle.load(cached)
    print('unpickle successful')
except Exception:
    # Still best-effort for any ordinary failure (missing, truncated or
    # incompatible cache), but no longer swallows KeyboardInterrupt/SystemExit
    # the way a bare ``except:`` does.
    print('failed pickle load')
    print('generating features...')
    x = normalize_img(df['path'])
    df['fdata'] = x
    print('pickling features')
    with open(BASE_PATH + 'df_train.pkl', 'wb') as output:
        pickle.dump(df, output)

trying to unpickle...
failed pickle load
generating features...
pickling features


In [17]:
# Preview the frame after the load-or-generate step; it should now include
# the 'fdata' feature column.
df.head()

Unnamed: 0,image_name,tags,path,haze,primary,agriculture,clear,water,habitation,road,...,slash_burn,cloudy,partly_cloudy,conventional_mine,bare_ground,artisinal_mine,blooming,selective_logging,blow_down,fdata
0,train_0,haze primary,../data/train-full-jpg/train_0.jpg,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[10696302.0, 9545515.0, 9921551.0, 0.0, 163.21..."
1,train_1,agriculture clear primary water,../data/train-full-jpg/train_1.jpg,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[12409906.0, 11698348.0, 12497611.0, 0.0, 189...."
2,train_2,clear primary,../data/train-full-jpg/train_2.jpg,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[13882853.0, 12908011.0, 12985172.0, 0.0, 211...."
3,train_3,clear primary,../data/train-full-jpg/train_3.jpg,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[13342388.0, 12435209.0, 13174508.0, 0.0, 203...."
4,train_4,agriculture clear habitation primary road,../data/train-full-jpg/train_4.jpg,0,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,"[12953690.0, 13745953.0, 15432086.0, 0.0, 197...."
