In [1]:
import os
import numpy as np
import pandas as pd
import scipy.io as sio
import time
#from numba import jit, autojit

<h1> Loading data

In [2]:
pathhome = os.getcwd()
layer_name = 'fc7'
pathcnn = pathhome+'\\outcome\\imagenet_vgg_f\\'+layer_name

<h2> Loading csv files with photo ids/business ids and sample submussion 

In [3]:
# Business ids and photo ids train data set
df_train_photo_ids = pd.read_csv('train_photo_to_biz_ids.csv')
# Business ids and photo ids test data set
df_test_photo_ids = pd.read_csv('test_photo_to_biz.csv')

<h2> Loading CNNcodes from images

In [4]:
def getCNNcodes(matfile, path):
    data = sio.loadmat(path+'\\'+matfile)
    CNNcodes = []
    photoids = []
    for d in data['data']:
        photoid = (d[0][0][0][0][0][:-4])
        photoids.append(photoid)
        CNNcode = np.array(d[0][0][0][1].flatten())
        CNNcodes.append(CNNcode)
    return photoids, CNNcodes

In [5]:
def duplicate_images(CNNcodes):    
    seen = set()
    dubs = []
    for c in CNNcodes:
        cstr = np.array(c).mean()
        if cstr not in seen:
            seen.add(cstr)
        else:
            dubs.append(cstr)
        del cstr
    CNNcodes_nodubs = [c for c in CNNcodes if np.array(c).mean() not in dubs]
    return np.array(CNNcodes_nodubs)            

<h3> Train data

In [6]:
df_CNNcodes_train =pd.DataFrame({'photo_id': [],'CNNcode': []})

# Extracting CNNcodes and photoids for train images
for f in os.listdir(pathcnn+"\\train"):
    photoids, CNNcodes = getCNNcodes(f,pathcnn+"\\train")
    df = pd.DataFrame({'photo_id': photoids,'CNNcode': CNNcodes})
    df_CNNcodes_train =pd.concat([df_CNNcodes_train, df])
    
df_CNNcodes_train = df_CNNcodes_train.convert_objects(convert_numeric = True)

In [7]:
df_train_photo_biz_codes = pd.merge(df_train_photo_ids, df_CNNcodes_train, on='photo_id')
del df_CNNcodes_train

In [8]:
grouped = df_train_photo_biz_codes.groupby('business_id')
df = grouped.aggregate(lambda x: list(x))
df.drop('photo_id', axis=1, inplace=True)

In [28]:
%%time
df["meanCNN"] = ""
    
for i,r in df.iterrows():
    CNNcode = np.array(r['CNNcode'])
    #CNNcode = duplicate_images(CNNcodes)
    meanCNN = CNNcode.mean(axis=0)
    df.loc[i, 'meanCNN'] = meanCNN
        
#df.drop('CNNcode', axis=1, inplace = True)

Wall time: 1min 30s


In [29]:
df.drop('CNNcode', axis=1, inplace = True)

In [30]:
name = 'df_train_'+layer_name
df.to_pickle(name)

<h3> Test data

In [6]:
df_CNNcodes_test =pd.DataFrame({'photo_id': [],'CNNcode': []})

# Extracting CNNcodes and photoids for test images
for f in os.listdir(pathcnn+"\\test"):
    photoids, CNNcodes = getCNNcodes(f, pathcnn+"\\test")
    df = pd.DataFrame({'photo_id': photoids,'CNNcode': CNNcodes})
    df_CNNcodes_test =pd.concat([df_CNNcodes_test, df])
    
df_CNNcodes_test = df_CNNcodes_test.convert_objects(convert_numeric = True)

In [7]:
df_CNNcodes_test.head(5)

Unnamed: 0,CNNcode,photo_id
0,"[0.0, 0.946053, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
1,"[0.0, 0.0, 6.03987, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",100
2,"[0.0, 0.0, 0.0, 2.36283, 0.0, 0.0, 0.0, 0.0, 1...",1000
3,"[0.0, 0.209569, 0.0, 5.57508, 0.0, 0.0, 1.2899...",10000
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",100000


In [8]:
df_test_photo_biz_codes = pd.merge(df_test_photo_ids, df_CNNcodes_test, on='photo_id')
del df_CNNcodes_test

In [9]:
grouped = df_test_photo_biz_codes.groupby('business_id')
df = grouped.aggregate(lambda x: list(x))
df.drop('photo_id', axis=1, inplace=True)

In [10]:
df.head(5)

Unnamed: 0_level_0,CNNcode
business_id,Unnamed: 1_level_1
003sg,"[[0.0, 5.04013, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
00er5,"[[0.0, 0.0, 6.04672, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
00kad,"[[12.2383, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
00mc6,"[[0.0, 0.445739, 0.0, 7.15255, 0.0, 6.67938, 0..."
00q7x,"[[0.0, 0.0, 0.0, 0.0, 0.0, 2.73037, 0.0, 0.0, ..."


In [19]:
%%time
df["meanCNN"] = ""
    
for i,r in df.head(1000).iterrows():
    CNNcode = np.array(r['CNNcode'])
    #CNNcode = duplicate_images(CNNcode)
    meanCNN = CNNcode.mean(axis=0)
    df.loc[i, 'meanCNN'] = meanCNN

Wall time: 13.9 s


In [20]:
%%time
df["meanCNN"] = ""
    
for i,r in df.iterrows():
    CNNcode = np.array(r['CNNcode'])
    #CNNcode = duplicate_images(CNNcodes)
    meanCNN = CNNcode.mean(axis=0)
    df.loc[i, 'meanCNN'] = meanCNN
        
df.drop('CNNcode', axis=1, inplace = True)

Wall time: 9min 8s


In [21]:
df.head(5)

Unnamed: 0_level_0,meanCNN
business_id,Unnamed: 1_level_1
003sg,"[0.650372, 1.15609, 0.517486, 0.642933, 0.4061..."
00er5,"[0.51327, 0.593613, 0.443991, 0.376009, 0.3182..."
00kad,"[0.725106, 0.720378, 0.887807, 0.874223, 0.303..."
00mc6,"[0.52072, 1.3611, 0.954392, 0.875562, 0.728178..."
00q7x,"[0.154035, 0.926991, 0.896294, 0.403459, 0.0, ..."


In [None]:
name = 'df_test_'+layer_name
df.to_pickle(name)