In [1]:
# follow the steps and modified from the github eval_*.py
# Github: https://github.com/titu1994/neural-image-assessment

import numpy as np
import argparse
from tqdm import tqdm

import tensorflow as tf
import os

from keras.models import Model
from keras.layers import Dense, Dropout
from keras.preprocessing.image import load_img, img_to_array

from utils.nasnet import NASNetMobile, preprocess_input
from utils.score_utils import mean_score, std_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5124965882682021193
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3174131302
locality {
  bus_id: 1
  links {
  }
}
incarnation: 16157427006511274458
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [3]:
target_size = (224, 224)  # NASNet requires strict size set to 224x224

In [4]:
import os
from zipfile import ZipFile
import cv2
import numpy as np
import pandas as pd
from dask import bag, threaded
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt

In [5]:
def load_img_from_zipped(zipped, filename):
    try:
        exfile = zipped.read(filename)
        arr = np.frombuffer(exfile, np.uint8)
        imz = cv2.imdecode(arr, flags=cv2.IMREAD_UNCHANGED)
        imz = cv2.resize(imz, target_size, interpolation=cv2.INTER_AREA)
    except:
        print(filename, ' is invalid')
        imz = None
        
    return imz

In [6]:
# yield image_id, img
def img_loader():
    archives = ['imgs/train_jpg.zip', 'imgs/test_jpg.zip']
    
    invalid_img_ids = ['4f029e2a00e892aa2cac27d98b52ef8b13d91471f613c8d3c38e3f29d4da0b0c', 
                       '8513a91e55670c709069b5f85e12a59095b802877715903abef16b7a6f306e58', 
                       '60d310a42e87cdf799afcd89dc1b11ae3fdc3d0233747ec7ef78d82c87002e83', 
                       'b98b291bd04c3d92165ca515e00468fd9756af9a8f1df42505deed1dcfb5d7ae']
    
    for arch in archives:
        print('Reading ', arch)
        zipped = ZipFile(arch)
        filenames = zipped.namelist()[1:]
        
        for file in tqdm(filenames):
            img_id = file.replace('.jpg', '')
            
            if img_id in invalid_img_ids:
                continue
                
            img = load_img_from_zipped(zipped, file)
            if img is None:
                continue
                
            yield img_id, img

In [56]:
for
zipped = ZipFile('imgs/test_jpg.zip')
filenames = zipped.namelist()[1:]
filenames

['data/competition_files/test_jpg/4638cc8e7ee42a312ff354e4479fa64d6346aac6cfc161ae47449f21e6519a6c.jpg',
 'data/competition_files/test_jpg/e7ad8bc1bc562f5d8c7447e4c942cebcb01676de3426a947ef2b53b1896be8f9.jpg',
 'data/competition_files/test_jpg/3e8d73bed377ca25264d3280655eea3d639b9e5cb5cca505163f22226e34212d.jpg',
 'data/competition_files/test_jpg/51368d01802656883caf4d73aa51d2d2c571f70505100143148ae36d7b88baa6.jpg',
 'data/competition_files/test_jpg/32254d27ae5794696232b6c8d92b34eb5d5131eb7f99b122e008982aff226787.jpg',
 'data/competition_files/test_jpg/166c7f0adf518cc4ffc8a63ed0cd1df208d433d71645dcb2eed79ec71b52a775.jpg',
 'data/competition_files/test_jpg/9ff2ffa19982994f04ff2eaf936e31a237148bbad1d532a437c181488204f0c9.jpg',
 'data/competition_files/test_jpg/0d72decfed89beb5179bbaeb14578a09303356786703441e288431707940bc63.jpg',
 'data/competition_files/test_jpg/c59289021f8096251521f508960563eb331dc5b73d909e1639a189cc9cb7c4c0.jpg',
 'data/competition_files/test_jpg/5b76a7cf73891898f720f

In [7]:
DEBUG = False

with tf.device('/GPU:0'):
    base_model = NASNetMobile((224, 224, 3), include_top=False, pooling='avg', weights=None)
    x = Dropout(0.75)(base_model.output)
    x = Dense(10, activation='softmax')(x)
    model = Model(base_model.input, x)
    model.load_weights('weights/nasnet_weights.h5')
    print('Weights loaded')
    
    img_ids = []
    score_list = []

    for img_id, img in img_loader():
        
        x = img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)

        scores = model.predict(x, batch_size=1, verbose=0)[0]
        img_ids.append(img_id)
        score_list.append(scores)
        
        if DEBUG:
            print(img_ids)
            print(score_list)
            plt.imshow(img)
            plt.show()
            break

Weights loaded
Reading  imgs/train_jpg_0.zip


100%|████████████████████████████████████████████████████████████████████████| 278167/278167 [2:47:17<00:00, 27.71it/s]


Reading  imgs/train_jpg_1.zip


100%|████████████████████████████████████████████████████████████████████████| 278166/278166 [2:47:35<00:00, 27.66it/s]


Reading  imgs/train_jpg_2.zip


100%|████████████████████████████████████████████████████████████████████████| 278166/278166 [2:47:55<00:00, 27.61it/s]


Reading  imgs/train_jpg_3.zip


100%|████████████████████████████████████████████████████████████████████████| 278166/278166 [2:47:49<00:00, 27.62it/s]


Reading  imgs/train_jpg_4.zip


100%|████████████████████████████████████████████████████████████████████████| 278166/278166 [2:49:36<00:00, 27.33it/s]


Reading  imgs/test_jpg.zip


100%|████████████████████████████████████████████████████████████████████████| 465829/465829 [4:48:41<00:00, 26.89it/s]


In [59]:
img_ids = [img_id.replace('data/competition_files/test_jpg/', '') for img_id in tqdm(img_ids)]


  0%|                                                                                      | 0/1856656 [00:00<?, ?it/s]
 11%|███████▋                                                            | 209329/1856656 [00:00<00:00, 2078189.78it/s]
 22%|██████████████▉                                                     | 407254/1856656 [00:00<00:00, 2021320.35it/s]
 33%|██████████████████████▎                                             | 607876/1856656 [00:00<00:00, 2011403.40it/s]
 45%|██████████████████████████████▎                                     | 826657/1856656 [00:00<00:00, 2046609.24it/s]
 56%|█████████████████████████████████████▋                             | 1045105/1856656 [00:00<00:00, 2075069.13it/s]
 68%|█████████████████████████████████████████████▎                     | 1256212/1856656 [00:00<00:00, 2078515.53it/s]
 79%|████████████████████████████████████████████████████▊              | 1463154/1856656 [00:00<00:00, 2075070.36it/s]
 89%|██████████████████████████████████

In [74]:
img_ids[-3:]

['7abf6c4aa44b9a0d04414c2e350d528e0a83c300f809bd4299226ba2f43e69d6',
 '1ce2716b4022d40c38d18dfbdfb37bbcdadd2a494dc4ce28339f68ab0c8f7cca',
 'bfc4ee7103ffa563152b94b25a29c28d6967ce8e8990072120902313963fd807']

In [14]:
score_list[:3]

[array([0.02021847, 0.05123571, 0.12512006, 0.2761127 , 0.31151772,
        0.13658957, 0.04995457, 0.02059884, 0.00557685, 0.00307551],
       dtype=float32),
 array([0.04311303, 0.08751361, 0.16680562, 0.28526697, 0.2504091 ,
        0.10246787, 0.0404527 , 0.01804111, 0.00350323, 0.00242668],
       dtype=float32),
 array([0.00881213, 0.02849251, 0.07964744, 0.21630877, 0.34663433,
        0.19870226, 0.07647053, 0.03049389, 0.00986233, 0.00457572],
       dtype=float32)]

In [73]:
import pickle

with open('nima_nasnet_img_ids.pickle', 'wb') as handle:
    pickle.dump(img_ids, handle)

with open('nima_nasnet_scores.pickle', 'wb') as handle:
    pickle.dump(score_list, handle)

In [17]:
score_list = np.array(score_list)
score_list.shape

(1856656, 10)

In [67]:
#feature engineering
res = pd.DataFrame()
res.loc[:, 'image'] = img_ids
for i in range(10):
    res.loc[:, 'raw_pred_{}'.format(i)] = score_list[:,i]

In [68]:
raw_feature_cols = ['raw_pred_{}'.format(i) for i in range(10)]
raw_feature_cols

['raw_pred_0',
 'raw_pred_1',
 'raw_pred_2',
 'raw_pred_3',
 'raw_pred_4',
 'raw_pred_5',
 'raw_pred_6',
 'raw_pred_7',
 'raw_pred_8',
 'raw_pred_9']

In [69]:
res.loc[:, 'mean'] = res[raw_feature_cols].mean(axis=1)
res.loc[:, 'med'] = res[raw_feature_cols].median(axis=1)
res.loc[:, 'std'] = res[raw_feature_cols].std(axis=1)
res.loc[:, 'max'] = res[raw_feature_cols].max(axis=1)
res.loc[:, 'min'] = res[raw_feature_cols].min(axis=1)
res.loc[:, '1_quartile'] = res[raw_feature_cols].quantile(.25, axis=1)
res.loc[:, '3_quartile'] = res[raw_feature_cols].quantile(.75, axis=1)
res.loc[:, '13_quartile_diff'] = res.loc[:, '3_quartile']-res.loc[:, '1_quartile']
res.loc[:, 'max_min_diff'] = res.loc[:, 'max'] - res.loc[:, 'min']
res.loc[:, 'non_max_mean'] = (res[raw_feature_cols].sum(axis=1)-res.loc[:, 'max'])/9
res.loc[:, 'max_non_max_mean_diff'] = res.loc[:, 'max']-res.loc[:, 'non_max_mean']

In [70]:
pd.options.display.max_columns=100
res.head(10)

Unnamed: 0,image,raw_pred_0,raw_pred_1,raw_pred_2,raw_pred_3,raw_pred_4,raw_pred_5,raw_pred_6,raw_pred_7,raw_pred_8,raw_pred_9,mean,med,std,max,min,1_quartile,3_quartile,13_quartile_diff,max_min_diff,non_max_mean,max_non_max_mean_diff
0,856e74b8c46edcf0c0e23444eab019bfda63687bb70a34...,0.020218,0.051236,0.12512,0.276113,0.311518,0.13659,0.049955,0.020599,0.005577,0.003076,0.1,0.050595,0.112342,0.311518,0.003076,0.020314,0.133722,0.113409,0.308442,0.076498,0.23502
1,122d198cf11ab32d2346bff455d6702f1ea519df957cea...,0.043113,0.087514,0.166806,0.285267,0.250409,0.102468,0.040453,0.018041,0.003503,0.002427,0.1,0.065313,0.102145,0.285267,0.002427,0.023644,0.150721,0.127077,0.28284,0.079415,0.205852
2,2809fd6afd6d3cae4dd4ad93a7f905a0db32292f4df4b3...,0.008812,0.028493,0.079647,0.216309,0.346634,0.198702,0.076471,0.030494,0.009862,0.004576,0.1,0.053482,0.115737,0.346634,0.004576,0.01452,0.168939,0.154419,0.342059,0.072596,0.274038
3,5ef4a19afe4ad593464931734ff43c1112cf94c6bdb459...,0.024279,0.057675,0.13444,0.278446,0.297135,0.13115,0.0485,0.020092,0.005347,0.002937,0.1,0.053087,0.109445,0.297135,0.002937,0.021139,0.133618,0.112479,0.294199,0.078096,0.219039
4,c37787b5cc6c3052130c6f390aa5b57462b558a204d5c4...,0.029881,0.065804,0.139384,0.271839,0.283863,0.128138,0.049918,0.021757,0.00584,0.003576,0.1,0.057861,0.104646,0.283863,0.003576,0.023788,0.136572,0.112784,0.280286,0.079571,0.204292
5,0f8ae17e177ed82363ed3dba7d277ed6227ac0c935cb52...,0.033061,0.072587,0.152462,0.285778,0.275863,0.113779,0.042014,0.017784,0.004123,0.002551,0.1,0.0573,0.10668,0.285778,0.002551,0.021603,0.142791,0.121188,0.283227,0.079358,0.20642
6,ba126be25858022d3cddf07d27288f9d35c495458ec49a...,0.009292,0.031624,0.086884,0.235003,0.352156,0.183095,0.065776,0.025425,0.007272,0.003472,0.1,0.0487,0.118527,0.352156,0.003472,0.013326,0.159043,0.145717,0.348684,0.071983,0.280173
7,4cc05cb70bcdde73e34718020f2ef4c69063af4098602b...,0.009584,0.027701,0.072714,0.191522,0.320179,0.214804,0.096026,0.042491,0.016647,0.008331,0.1,0.057603,0.106904,0.320179,0.008331,0.01941,0.167648,0.148238,0.311848,0.075536,0.244644
8,28519dc39d1da01b36c544d31154f251cd6774f55599b7...,0.012794,0.037735,0.104254,0.261892,0.335709,0.157786,0.057503,0.022838,0.006433,0.003054,0.1,0.047619,0.116676,0.335709,0.003054,0.015305,0.144403,0.129098,0.332655,0.07381,0.261899
9,6511abdb5be3ea579bc0f03c899542818231e39d9d1295...,0.031354,0.066193,0.138753,0.266936,0.281225,0.129927,0.051812,0.023241,0.006528,0.004029,0.1,0.059003,0.102807,0.281225,0.004029,0.025269,0.136547,0.111277,0.277196,0.079864,0.201362


In [71]:
res = res.drop(raw_feature_cols, axis=1)
res.head(3)

Unnamed: 0,image,mean,med,std,max,min,1_quartile,3_quartile,13_quartile_diff,max_min_diff,non_max_mean,max_non_max_mean_diff
0,856e74b8c46edcf0c0e23444eab019bfda63687bb70a34...,0.1,0.050595,0.112342,0.311518,0.003076,0.020314,0.133722,0.113409,0.308442,0.076498,0.23502
1,122d198cf11ab32d2346bff455d6702f1ea519df957cea...,0.1,0.065313,0.102145,0.285267,0.002427,0.023644,0.150721,0.127077,0.28284,0.079415,0.205852
2,2809fd6afd6d3cae4dd4ad93a7f905a0db32292f4df4b3...,0.1,0.053482,0.115737,0.346634,0.004576,0.01452,0.168939,0.154419,0.342059,0.072596,0.274038


In [72]:
for file in ['train.csv', 'test.csv']:
    if file == 'train.csv':
        df = pd.read_csv(file, usecols=['image', 'activation_date'], parse_dates=['activation_date']) \
               .sort_values('activation_date') \
               .reset_index(drop=True) \
               .drop('activation_date', axis=1)
    else:
        df = pd.read_csv(file, usecols=['image'])
        
    print('null percentage:', df.isnull().sum().max() / df.shape[0] * 100.)
    print('image in both df and res\n', df.image.isin(res.image).value_counts())
    df = df.merge(res, on='image', how='left').fillna(-1)
    
    print(df.tail(20))
    df.to_csv(file.replace(".csv", "_nasnet_nima_features.csv"), index=False)

null percentage: 7.48877229577285
image in both df and res
 True     1390827
False     112597
Name: image, dtype: int64
                                                     image  mean       med  \
1503404  2be1b7fddec6a779e5770a39d3a8322a750e4874d4f6ef...   0.1  0.068083   
1503405  682fca314e99afb85b8577a6466da8da0bc94e8764f3ce...   0.1  0.057518   
1503406  53e10cf76c59451ed9dbe2d313a0c178a7c0db8a01b96e...   0.1  0.053425   
1503407  a49b727bb8d37f7a0b1a0f100dbd6ff69b2c914ef9dab6...   0.1  0.100741   
1503408  5bdbe60d03ed200de9b27ff1816879c1abdd25300db0c4...   0.1  0.063268   
1503409  8b2aba840ba311537a936630be8d3722649006cfefe468...   0.1  0.051062   
1503410  cf6a421ff8ce70f0b0faae386b64d6607ecbc89b40cd74...   0.1  0.062044   
1503411  4c60a49ca031a0d6f37ae9aee6167bdb0a23d9dbc67d9a...   0.1  0.051695   
1503412                                                 -1  -1.0 -1.000000   
1503413  7d04440d912c9efa92f61709648afd832c4739f2ccf524...   0.1  0.052766   
1503414               

508437          0.146774      0.278304      0.079956               0.200444  
