In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from __future__ import print_function, division

In [3]:
import os.path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
import os
import sys

# Put the parent directory of the working directory on the import path (only
# once), then import `ama` -- presumably a project package living there.
base_module_path = os.path.abspath(os.pardir)
if sys.path.count(base_module_path) == 0:
    sys.path.append(base_module_path)
import ama as a

In [5]:
BASE_PATH = '../data/'

In [6]:
# Load the training CSV, print its row count, and preview the first rows.
df = pd.read_csv(BASE_PATH + 'train_v2.csv')
print(len(df))
df.head()

40479


Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [7]:
# Build each image's JPEG path from its name, then preview the frame.
df['path'] = [BASE_PATH + 'train-full-jpg/' + name + '.jpg'
              for name in df['image_name']]
df.head()

Unnamed: 0,image_name,tags,path
0,train_0,haze primary,../data/train-full-jpg/train_0.jpg
1,train_1,agriculture clear primary water,../data/train-full-jpg/train_1.jpg
2,train_2,clear primary,../data/train-full-jpg/train_2.jpg
3,train_3,clear primary,../data/train-full-jpg/train_3.jpg
4,train_4,agriculture clear habitation primary road,../data/train-full-jpg/train_4.jpg


In [8]:
# Collect every distinct tag, in the order each one first appears in df.tags.
label_list = []
seen_labels = set()
for tag_str in df.tags.values:
    for label in tag_str.split(' '):
        if label in seen_labels:
            continue
        seen_labels.add(label)
        label_list.append(label)
label_list

['haze',
 'primary',
 'agriculture',
 'clear',
 'water',
 'habitation',
 'road',
 'cultivation',
 'slash_burn',
 'cloudy',
 'partly_cloudy',
 'conventional_mine',
 'bare_ground',
 'artisinal_mine',
 'blooming',
 'selective_logging',
 'blow_down']

In [9]:
# One 0/1 indicator column per label: 1 when the label is one of the row's
# space-separated tags, 0 otherwise.
for label in label_list:
    df[label] = [int(label in tag_str.split(' ')) for tag_str in df['tags']]
df.head()

Unnamed: 0,image_name,tags,path,haze,primary,agriculture,clear,water,habitation,road,cultivation,slash_burn,cloudy,partly_cloudy,conventional_mine,bare_ground,artisinal_mine,blooming,selective_logging,blow_down
0,train_0,haze primary,../data/train-full-jpg/train_0.jpg,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,train_1,agriculture clear primary water,../data/train-full-jpg/train_1.jpg,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,train_2,clear primary,../data/train-full-jpg/train_2.jpg,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,train_3,clear primary,../data/train-full-jpg/train_3.jpg,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,train_4,agriculture clear habitation primary road,../data/train-full-jpg/train_4.jpg,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0


In [10]:
def check_image(path):
    """Print `path`, load the image it points to, print its array shape, and draw it.

    Args:
        path: image file path handed to `plt.imread`.
    """
    print(path)
    pixels = plt.imread(path)
    print(pixels.shape)
    plt.imshow(pixels)

In [11]:
### COMPUTE FEATURES ###
from PIL import Image, ImageStat
import scipy
import scipy.stats
import cv2

def get_features(path):
    """Compute a flat list of hand-crafted statistics for one image file.

    The feature order is fixed, so vectors from different images line up:
      1. PIL per-band sum, mean, rms, var, stddev
      2. kurtosis of channels 0..2, then skew of channels 0..2
      3. 256-bin histograms: grayscale, then channels 0, 1, 2
      4. cv2.meanStdDev of the 3-channel array: means, then standard deviations
      5. variance of the Laplacian: grayscale, then 3-channel
      6. variance of the 5x5 Sobel response: grayscale d/dx, grayscale d/dy,
         3-channel d/dx, 3-channel d/dy
      7. number of grayscale pixels < 30, then number > 225

    Args:
        path: image file path; it is read once with PIL and once (as
            grayscale) with cv2. The PIL array is indexed as
            (rows, cols, channel), so at least 3 channels are assumed.

    Returns:
        A two-element list ``[path, features]``.
    """
    pil_img = Image.open(path)
    stats = ImageStat.Stat(pil_img)

    feats = []
    for band_values in (stats.sum, stats.mean, stats.rms, stats.var, stats.stddev):
        feats.extend(band_values)

    # Keep only the first three channels for everything that follows.
    rgb = np.array(pil_img)[:, :, :3]
    flat_channels = [rgb[:, :, c].ravel() for c in range(3)]
    feats.extend(scipy.stats.kurtosis(ch) for ch in flat_channels)
    feats.extend(scipy.stats.skew(ch) for ch in flat_channels)

    ### cv2 jpg ###
    gray = cv2.imread(path, 0)  # flag 0 -> load as single-channel grayscale
    feats.extend(cv2.calcHist([gray], [0], None, [256], [0, 256]).flatten())
    for c in range(3):  # PIL channel order: r, g, b
        feats.extend(cv2.calcHist([rgb], [c], None, [256], [0, 256]).flatten())

    mean, std = cv2.meanStdDev(rgb)
    feats.extend(mean.flatten())
    feats.extend(std.flatten())

    for source in (gray, rgb):
        feats.append(cv2.Laplacian(source, cv2.CV_64F).var())

    for source in (gray, rgb):
        for dx, dy in ((1, 0), (0, 1)):
            feats.append(cv2.Sobel(source, cv2.CV_64F, dx, dy, ksize=5).var())

    feats.append((gray < 30).sum())
    feats.append((gray > 225).sum())
    return [path, feats]

In [12]:
from multiprocessing import Pool, cpu_count
def normalize_img(paths):
    """Compute the `get_features` vector of every image path using a process pool.

    Args:
        paths: iterable of image file paths (e.g. a list or a pandas Series).

    Returns:
        A list with one feature vector per entry of `paths`, in the same order.
        An empty input returns an empty list without starting any worker.
    """
    paths = list(paths)
    if not paths:
        return []
    pool = Pool(cpu_count())
    try:
        # Pool.map returns results in input order, so no path -> result
        # lookup table is needed to restore the ordering.
        results = pool.map(get_features, paths)
    finally:
        # Always release the workers; the pool was previously never shut down,
        # leaving one set of worker processes behind per call.
        pool.terminate()
        pool.join()
    return [features for _, features in results]

In [13]:
import cPickle as pickle

In [14]:
# Load the cached training feature vectors, or compute and cache them.
features_cache_path = BASE_PATH + 'features_train.pkl'
try:
    print('trying to unpickle...')
    # NOTE: unpickling can execute arbitrary code -- only load cache files
    # that this notebook wrote itself.
    with open(features_cache_path, 'rb') as cached:
        x = pickle.load(cached)
    print('unpickle successful')
except (IOError, EOFError, pickle.UnpicklingError):
    # Only a missing or unreadable cache triggers regeneration; any other error
    # (including KeyboardInterrupt) now propagates instead of being swallowed.
    print('failed pickle load')
    print('generating features...')
    x = normalize_img(df['path'])
    print('pickling features')
    with open(features_cache_path, 'wb') as output:
        # Store the feature list itself (what the load branch assigns to `x`),
        # using the compact binary protocol.
        pickle.dump(x, output, pickle.HIGHEST_PROTOCOL)

# Cache files written by the earlier version of this cell hold the whole
# DataFrame; reduce those to the plain list of feature vectors.
if isinstance(x, pd.DataFrame):
    x = list(x['fdata'])
# Leave the same state behind whichever branch ran.
df['fdata'] = x

In [18]:
# Persist the training frame. The recorded run of this cell ended in a
# MemoryError; the binary HIGHEST_PROTOCOL is far more compact than the ASCII
# protocol 0 that Python 2's pickle uses by default.
with open(BASE_PATH + 'df_train.pkl', 'wb') as output:
    pickle.dump(df, output, pickle.HIGHEST_PROTOCOL)

MemoryError: 

In [15]:
len(x)

40479

In [17]:
# Attach each row's feature vector as the 'fdata' column, then preview the frame.
df['fdata'] = x; df.head()

Unnamed: 0,image_name,tags,path,haze,primary,agriculture,clear,water,habitation,road,...,slash_burn,cloudy,partly_cloudy,conventional_mine,bare_ground,artisinal_mine,blooming,selective_logging,blow_down,fdata
0,train_0,haze primary,../data/train-full-jpg/train_0.jpg,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[10696302.0, 9545515.0, 9921551.0, 0.0, 163.21..."
1,train_1,agriculture clear primary water,../data/train-full-jpg/train_1.jpg,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[12409906.0, 11698348.0, 12497611.0, 0.0, 189...."
2,train_2,clear primary,../data/train-full-jpg/train_2.jpg,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[13882853.0, 12908011.0, 12985172.0, 0.0, 211...."
3,train_3,clear primary,../data/train-full-jpg/train_3.jpg,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[13342388.0, 12435209.0, 13174508.0, 0.0, 203...."
4,train_4,agriculture clear habitation primary road,../data/train-full-jpg/train_4.jpg,0,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,"[12953690.0, 13745953.0, 15432086.0, 0.0, 197...."


In [None]:
import glob

# Index the test images: one row per file, with `image_name` taken from the
# file name minus its extension (independent of how deep BASE_PATH is).
# sorted() pins the row order, since glob returns files in arbitrary order and
# positionally aligned feature lists depend on it.
test_jpg = sorted(glob.glob(BASE_PATH + 'test-full-jpg/*'))
test = pd.DataFrame(
    [[os.path.splitext(os.path.basename(p))[0], p] for p in test_jpg],
    columns=['image_name', 'path'])
print(len(test))
test.head()

In [None]:
# Compute one feature vector per test image and store it as the 'fdata' column.
xtest = normalize_img(test['path'])
test['fdata'] = xtest

In [None]:
# Persist the test frame with the compact binary protocol; Python 2's default
# ASCII protocol 0 is much larger and slower for a frame of this kind.
with open(BASE_PATH + 'df_test.pkl', 'wb') as output:
    pickle.dump(test, output, pickle.HIGHEST_PROTOCOL)

In [32]:
ls '../data'

[0m[01;34mbackups[0m/            sample_submission_v2.csv  [01;34mtrain-full-jpg[0m/  [01;34mweights[0m/
dl.sh               [01;34mtest-jpg[0m/                 [01;34mtrain-jpg[0m/
features_train.pkl  [01;34mtest-jpg-additional[0m/      train_v2.csv
[01;34msample[0m/             test_v2_file_mapping.csv  [01;34mval-jpg[0m/


In [None]:
# Feature extraction on a subset of at most 1000 test images.
# `in_path` was never defined in this notebook; BASE_PATH is the data root used
# by every other cell.
# NOTE(review): the recorded `ls ../data` output shows no 'test-jpg-v2'
# directory -- confirm the folder name.
test_jpg = sorted(glob.glob(BASE_PATH + 'test-jpg-v2/*'))[:1000]
test = pd.DataFrame(
    [[os.path.splitext(os.path.basename(p))[0], p] for p in test_jpg],
    columns=['image_name', 'path'])
xtest = normalize_img(test['path'])
print('test...')

In [None]:
# Load the cached TEST feature vectors, or compute and cache them.
# The result goes to `xtest`, so the training features in `x` are not overwritten.
try:
    # NOTE: unpickling can execute arbitrary code -- only load cache files
    # that this notebook wrote itself.
    with open(BASE_PATH + 'features_test.pkl', 'rb') as cached:
        xtest = pickle.load(cached)
except (IOError, EOFError, pickle.UnpicklingError):
    print('failed pickle load')
    print('generating features...')
    # Previously this fallback recomputed the TRAIN features from df['path']
    # and wrote them over features_train.pkl.
    xtest = normalize_img(test['path'])
    print('pickling features')
    with open(BASE_PATH + 'features_test.pkl', 'wb') as output:
        pickle.dump(xtest, output, pickle.HIGHEST_PROTOCOL)

In [26]:
import xgboost as xgb
### TRAIN XGBOOST ###
# Wrap the feature vectors `x` and the targets `y` in the DMatrix that the
# xgb.cv / xgb.train calls further down consume.
# NOTE(review): `y` is not assigned in any cell shown above this one -- confirm
# where it comes from and which label column it holds.
dtrain = xgb.DMatrix(x, y)

(40479,)

#### Versions, changes, results
1. 'objective' : 'binary:logistic', 'eval_metric':'error@0.9', 'eta':1
    * Initial set up
    * Unsure about best objective or even eval metric
    * Result: step156, test0.022409, train0.0
2. 'eval_metric':'error@0.1'
    * just trying
    * Result: step77, test0.024509, train0.000019
3. 'eval_metric':'error@0.9', 'eta':0.03
    * make training slower
    * Result: step01, didn't train
4. 'eta':0.3
    * faster training
    * didn't bottom out test before end of run
    * Result: step349, test0.023545, train0.0
5. 'base_score':y_mean,
    * better init
    * faster training
    * Result: step349, test0.023323, train0.000006
6. 'eta':1
    * faster
    * Result: step2, test0.027177, train0.023441
7. 'max_depth':3,
    * reduce over fitting
    * Result: step

In [81]:
# Booster parameters shared by the xgb.cv and xgb.train calls below.
# The commented-out entries are regularisation knobs that are currently disabled.
params = {
    'objective' : 'binary:logistic', # binary classification; predictions are probabilities of the positive class
    'eval_metric':'error@0.9', # classification error rate using 0.9 as the probability threshold
    'eta':1, # learning rate (library default is 0.3)
    # Initial prediction score. NOTE(review): `y_mean` is not defined in the
    # cells shown here -- presumably the mean of `y`; confirm.
    'base_score':y_mean,
    'max_depth':3, # shallow trees, to reduce overfitting
    #'subsample':0.5,
    #'colsample_bytree':0.5,
    #'gamma':0,
    #'min_child_weight':6,
    #'alpha':1,
}

In [82]:
# 5-fold cross-validation over `dtrain` for at most 350 boosting rounds, with
# early stopping after 50 rounds without improvement; progress is printed every
# 10 rounds and the per-fold standard deviation is left out of the log.
cv_result = xgb.cv(params, dtrain, 
                   nfold=5, 
                   num_boost_round=350, 
                   early_stopping_rounds=50, 
                   verbose_eval=10, 
                   show_stdv=False)

[0]	train-error@0.9:0.030661	test-error@0.9:0.0318222
[10]	train-error@0.9:0.0342928	test-error@0.9:0.0357256
[20]	train-error@0.9:0.0308892	test-error@0.9:0.033082
[30]	train-error@0.9:0.0285732	test-error@0.9:0.0310808
[40]	train-error@0.9:0.0277826	test-error@0.9:0.0321432
[50]	train-error@0.9:0.0252008	test-error@0.9:0.0298704
[60]	train-error@0.9:0.0234776	test-error@0.9:0.029302
[70]	train-error@0.9:0.022446	test-error@0.9:0.0297222
[80]	train-error@0.9:0.0213898	test-error@0.9:0.0294008
[90]	train-error@0.9:0.0202902	test-error@0.9:0.029154
[100]	train-error@0.9:0.0190984	test-error@0.9:0.0288822
[110]	train-error@0.9:0.0181902	test-error@0.9:0.0287338
[120]	train-error@0.9:0.0175726	test-error@0.9:0.0287338
[130]	train-error@0.9:0.0167882	test-error@0.9:0.0286596
[140]	train-error@0.9:0.0161336	test-error@0.9:0.0287338
[150]	train-error@0.9:0.015627	test-error@0.9:0.0288326
[160]	train-error@0.9:0.0148362	test-error@0.9:0.0291292


In [83]:
cv_result.tail()

Unnamed: 0,test-error@0.9-mean,test-error@0.9-std,train-error@0.9-mean,train-error@0.9-std
109,0.028808,0.001244,0.018548,0.001959
110,0.028734,0.001134,0.01819,0.002042
111,0.028684,0.001023,0.018147,0.001966
112,0.028511,0.000919,0.018153,0.002055
113,0.028289,0.000781,0.018141,0.002071


In [46]:
# Fit the final booster on all of `dtrain`, for as many rounds as `cv_result` has rows.
# NOTE(review): the watch-list entry labelled 'eval' is `dtrain` itself, so the
# logged "eval" error is training error, not a held-out estimate.
model = xgb.train(dict(params), dtrain, num_boost_round=len(cv_result), 
                  evals=[(dtrain, 'eval')])

[0]	eval-error@0.9:0.051607
[1]	eval-error@0.9:0.032511
[2]	eval-error@0.9:0.030213
[3]	eval-error@0.9:0.028879
[4]	eval-error@0.9:0.028706
[5]	eval-error@0.9:0.027743
[6]	eval-error@0.9:0.024902
[7]	eval-error@0.9:0.025371
[8]	eval-error@0.9:0.024754
[9]	eval-error@0.9:0.022135
[10]	eval-error@0.9:0.021913
[11]	eval-error@0.9:0.021517
[12]	eval-error@0.9:0.020084
[13]	eval-error@0.9:0.019615
[14]	eval-error@0.9:0.019096
[15]	eval-error@0.9:0.018182
[16]	eval-error@0.9:0.017293
[17]	eval-error@0.9:0.016947
[18]	eval-error@0.9:0.01628
[19]	eval-error@0.9:0.015712
[20]	eval-error@0.9:0.014823
[21]	eval-error@0.9:0.014526
[22]	eval-error@0.9:0.014007
[23]	eval-error@0.9:0.013266
[24]	eval-error@0.9:0.012945
[25]	eval-error@0.9:0.012179
[26]	eval-error@0.9:0.011982
[27]	eval-error@0.9:0.011043
[28]	eval-error@0.9:0.011216
[29]	eval-error@0.9:0.01082
[30]	eval-error@0.9:0.010722
[31]	eval-error@0.9:0.0104
[32]	eval-error@0.9:0.009635
[33]	eval-error@0.9:0.009289
[34]	eval-error@0.9:0.008819

In [47]:
# Predict on `dtrain`, the same matrix the model was fitted on (in-sample predictions).
pred = model.predict(dtrain)