In [1]:
import os
import numpy as np
import pandas as pd
import scipy.io as sio
import time
#from numba import jit, autojit

<h1> Loading data

In [2]:
# Working directory the notebook is run from; all data paths hang off it.
pathhome = os.getcwd()
# Layer identifier: selects the sub-directory read below and is also used
# as the suffix of the pickles written at the end of each section.
layer_name = 'prob'
# Directory holding the exported CNN codes for that layer. Built with
# os.path.join so the separator matches the host OS (the previous
# hard-coded '\\' only resolved on Windows; the result there is unchanged).
pathcnn = os.path.join(pathhome, 'outcome', 'imagenet_vgg_f', layer_name)

<h2> Loading csv files with photo ids/business ids and sample submission 

In [3]:
# Business ids and photo ids train data set
df_train_photo_ids = pd.read_csv('train_photo_to_biz_ids.csv')
# Business ids and photo ids test data set
df_test_photo_ids = pd.read_csv('test_photo_to_biz.csv')

<h2> Loading CNNcodes from images

In [4]:
def getCNNcodes(matfile, path):
    """Read one .mat batch file and return its photo ids and CNN codes.

    Parameters
    ----------
    matfile : str
        File name of the .mat file (no directory part).
    path : str
        Directory that contains ``matfile``.

    Returns
    -------
    photoids : list
        One id per record: the stored name with its last four characters
        removed (presumably a file extension such as ".jpg" - TODO confirm).
    CNNcodes : list of numpy.ndarray
        One flattened 1-D code per record, in the same order as ``photoids``.
    """
    # os.path.join picks the separator of the host OS; the previous
    # path + '\\' + matfile only resolved on Windows.
    data = sio.loadmat(os.path.join(path, matfile))
    CNNcodes = []
    photoids = []
    for d in data['data']:
        # Each record is a nested MATLAB struct as unpacked by loadmat:
        # field 0 holds the name, field 1 holds the code array.
        record = d[0][0][0]
        photoids.append(record[0][0][:-4])
        CNNcodes.append(np.array(record[1].flatten()))
    return photoids, CNNcodes

In [5]:
def duplicate_images(CNNcodes):
    """Return the CNN codes that occur exactly once in ``CNNcodes``.

    Two codes count as duplicates when their shape and raw element bytes
    are identical. Every member of a duplicated group is removed, including
    its first occurrence (this matches the previous behaviour).

    The previous implementation fingerprinted each code by its mean, so
    different codes that happened to share a mean were discarded as
    "duplicates"; comparing the full content avoids those false positives.

    Parameters
    ----------
    CNNcodes : iterable of array-like
        The codes to filter.

    Returns
    -------
    numpy.ndarray
        The surviving codes, in their original order.
    """
    codes = list(CNNcodes)

    def _fingerprint(code):
        # Hashable, content-exact key for one code.
        arr = np.asarray(code)
        return arr.shape, arr.tobytes()

    fingerprints = [_fingerprint(c) for c in codes]

    # Sets keep both membership tests O(1); the former list of duplicates
    # made the final filter quadratic.
    seen = set()
    repeated = set()
    for fp in fingerprints:
        if fp in seen:
            repeated.add(fp)
        else:
            seen.add(fp)

    return np.array([c for c, fp in zip(codes, fingerprints) if fp not in repeated])

<h3> Train data

In [6]:
# Collect photo ids and CNN codes from every .mat batch in the train folder.
# Rows are accumulated in plain lists and the frame is built once at the end;
# calling pd.concat inside the loop recopies all earlier rows on every pass.
train_dir = os.path.join(pathcnn, "train")
train_photoids, train_codes = [], []
# sorted() makes the row order independent of the OS directory listing order.
for f in sorted(os.listdir(train_dir)):
    photoids, CNNcodes = getCNNcodes(f, train_dir)
    train_photoids.extend(photoids)
    train_codes.extend(CNNcodes)

df_CNNcodes_train = pd.DataFrame({'photo_id': train_photoids, 'CNNcode': train_codes})

# DataFrame.convert_objects was removed from pandas (1.0+). Only photo_id
# needs converting: it is parsed from file names as text and must be numeric
# to merge with the ids read from the csv. errors='coerce' keeps the old
# lenient behaviour (unparseable ids become NaN instead of raising).
df_CNNcodes_train['photo_id'] = pd.to_numeric(df_CNNcodes_train['photo_id'], errors='coerce')



In [7]:
# Attach each train photo's CNN code to its business id; rows are matched on
# photo_id (default inner join, so photos missing from either side drop out).
df_train_photo_biz_codes = df_train_photo_ids.merge(df_CNNcodes_train, on='photo_id')
# The standalone code table is not used after the merge; release it.
del df_CNNcodes_train

In [8]:
# One row per business: every remaining column is collapsed into a Python
# list of that business's values, then the photo_id lists are discarded so
# only the list of CNN codes is left.
grouped = df_train_photo_biz_codes.groupby('business_id')
df = grouped.aggregate(lambda values: list(values)).drop('photo_id', axis=1)

In [9]:
%%time
df["meanCNN"] = ""
    
for i,r in df.head(100).iterrows():
    CNNcode = np.array(r['CNNcode'])
    #CNNcode = duplicate_images(CNNcodes)
    meanCNN = CNNcode.mean(axis=0)
    df.loc[i, 'meanCNN'] = meanCNN

Wall time: 269 ms


In [10]:
%%time
df["meanCNN"] = ""
    
for i,r in df.iterrows():
    CNNcode = np.array(r['CNNcode'])
    #CNNcode = duplicate_images(CNNcodes)
    meanCNN = CNNcode.mean(axis=0)
    df.loc[i, 'meanCNN'] = meanCNN
        
#df.drop('CNNcode', axis=1, inplace = True)

Wall time: 1.43 s


In [11]:
# The per-photo code lists are no longer needed once meanCNN exists;
# remove that column from df in place.
del df['CNNcode']

In [12]:
# Save the per-business train features; the file name carries the layer
# identifier so outputs for different layers do not overwrite each other.
name = ''.join(['df_train_', layer_name])
df.to_pickle(name)

<h3> Test data

In [13]:
# Collect photo ids and CNN codes from every .mat batch in the test folder.
# Rows are accumulated in plain lists and the frame is built once at the end;
# calling pd.concat inside the loop recopies all earlier rows on every pass.
test_dir = os.path.join(pathcnn, "test")
test_photoids, test_codes = [], []
# sorted() makes the row order independent of the OS directory listing order.
for f in sorted(os.listdir(test_dir)):
    photoids, CNNcodes = getCNNcodes(f, test_dir)
    test_photoids.extend(photoids)
    test_codes.extend(CNNcodes)

df_CNNcodes_test = pd.DataFrame({'photo_id': test_photoids, 'CNNcode': test_codes})

# DataFrame.convert_objects was removed from pandas (1.0+). Only photo_id
# needs converting: it is parsed from file names as text and must be numeric
# to merge with the ids read from the csv. errors='coerce' keeps the old
# lenient behaviour (unparseable ids become NaN instead of raising).
df_CNNcodes_test['photo_id'] = pd.to_numeric(df_CNNcodes_test['photo_id'], errors='coerce')



In [14]:
# Peek at the first five rows of the test code table.
df_CNNcodes_test.iloc[:5]

Unnamed: 0,CNNcode,photo_id
0,"[-0.560647, 4.05968, -0.504945, -3.20145, -1.4...",1
1,"[-3.4193, 3.92915, 0.398823, -1.37973, -0.6521...",100
2,"[0.147573, -1.07164, 1.13305, 0.686834, 1.8681...",1000
3,"[-0.0503105, 3.64368, -2.63264, -4.00568, -2.1...",10000
4,"[1.32161, 1.62675, 0.806612, -0.725322, -1.221...",100000


In [15]:
# Attach each test photo's CNN code to its business id; rows are matched on
# photo_id (default inner join, so photos missing from either side drop out).
df_test_photo_biz_codes = df_test_photo_ids.merge(df_CNNcodes_test, on='photo_id')
# The standalone code table is not used after the merge; release it.
del df_CNNcodes_test

In [16]:
# One row per business: every remaining column is collapsed into a Python
# list of that business's values, then the photo_id lists are discarded so
# only the list of CNN codes is left.
grouped = df_test_photo_biz_codes.groupby('business_id')
df = grouped.aggregate(lambda values: list(values)).drop('photo_id', axis=1)

In [17]:
# Peek at the first five businesses and their lists of CNN codes.
df.iloc[:5]

Unnamed: 0_level_0,CNNcode
business_id,Unnamed: 1_level_1
003sg,"[[-3.95446, -0.296355, -1.34097, -2.55254, -1...."
00er5,"[[-3.10838, 0.215555, 0.215646, -0.0465141, 0...."
00kad,"[[-3.61699, 1.46668, -1.30801, -2.57887, -2.32..."
00mc6,"[[-2.63088, -0.410354, -0.222674, -0.0535168, ..."
00q7x,"[[-0.454271, 1.86136, -4.61673, -2.36174, -1.2..."


In [18]:
%%time
df["meanCNN"] = ""
    
for i,r in df.head(100).iterrows():
    CNNcode = np.array(r['CNNcode'])
    #CNNcode = duplicate_images(CNNcode)
    meanCNN = CNNcode.mean(axis=0)
    df.loc[i, 'meanCNN'] = meanCNN

Wall time: 1.48 s
Parser   : 156 ms


In [19]:
%%time
df["meanCNN"] = ""
    
for i,r in df.iterrows():
    CNNcode = np.array(r['CNNcode'])
    #CNNcode = duplicate_images(CNNcodes)
    meanCNN = CNNcode.mean(axis=0)
    df.loc[i, 'meanCNN'] = meanCNN
        
df.drop('CNNcode', axis=1, inplace = True)

Wall time: 28.7 s


In [20]:
# Peek at the first five businesses and their mean CNN codes.
df.iloc[:5]

Unnamed: 0_level_0,meanCNN
business_id,Unnamed: 1_level_1
003sg,"[-0.920977, 1.1955, -0.858639, -2.26293, -0.58..."
00er5,"[-1.1308, 0.78991, -0.919203, -2.25263, -0.615..."
00kad,"[-0.890779, 0.995832, -1.30596, -2.30876, -0.9..."
00mc6,"[-0.35268, 1.01384, -0.390578, -1.05943, 0.124..."
00q7x,"[-0.813104, 1.19193, -1.09444, -1.90327, -0.64..."


In [21]:
# Save the per-business test features; the file name carries the layer
# identifier so outputs for different layers do not overwrite each other.
name = ''.join(['df_test_', layer_name])
df.to_pickle(name)