In [1]:
import os
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import time
%matplotlib inline
from numba import jit, autojit

<h1> Loading data

In [2]:
pathhome = os.getcwd()
pathcnn = pathhome+'\\outcome\\imagenet_vgg_f\\fc7'

<h2> Loading csv files with photo ids/business ids and sample submussion 

In [3]:
# Business ids and restraunt labels train data set
df_train_business_ids = pd.read_csv('train.csv')
# Business ids and photo ids train data set
df_train_photo_ids = pd.read_csv('train_photo_to_biz_ids.csv')
# Business ids and photo ids test data set
df_test_photo_ids = pd.read_csv('test_photo_to_biz.csv')
# Sample submission
df_sample_submission = pd.read_csv('sample_submission.csv')

<h2> Loading CNNcodes from images

In [4]:

def getCNNcodes(matfile,dtype):
    data = sio.loadmat(pathcnn+'\\'+dtype+'\\'+matfile)
    CNNcodes = []
    photoids = []
    for d in data['data']:
        photoid = (d[0][0][0][0][0][:-4])
        photoids.append(photoid)
        CNNcode = np.array(d[0][0][0][1].flatten())
        CNNcodes.append(CNNcode)
    del data
    return photoids, CNNcodes

In [5]:
def duplicate_image_ids(CNNcodes):    
    seen = set()
    dubs = []
    for c in CNNcodes:
        cstr = ''.join(map(str, c))
        if cstr not in seen:
            seen.add(cstr)
        else:
            dubs.append(cstr)
        del cstr
    CNNcodes_nodubs = [c for c in CNNcodes if ''.join(map(str, c)) not in dubs]
    return CNNcodes_nodubs            

In [67]:
@autojit
def parse_business(CNNcodes):
    #CNNcodes = duplicate_image_ids(CNNcodes)
    CNNcodes = np.array(r)
    meanCNNcode = CNNcodes.mean(axis=0)
    return meanCNNcode

<h3> Train data

In [7]:
df_CNNcodes_train =pd.DataFrame({'photo_id': [],'CNNcode': []})

# Extracting CNNcodes and photoids for train images
for f in os.listdir(pathcnn+"\\train"):
    photoids, CNNcodes = getCNNcodes(f,'train')
    df = pd.DataFrame({'photo_id': photoids,'CNNcode': CNNcodes})
    df_CNNcodes_train =pd.concat([df_CNNcodes_train, df])
df_CNNcodes_train['photo_id']= df_CNNcodes_train['photo_id'].convert_objects(convert_numeric=True)



In [8]:
df_CNNcodes_train.sort(['photo_id'], ascending=1).head(5)

  if __name__ == '__main__':


Unnamed: 0,CNNcode,photo_id
163,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
227,"[0.677176, 7.32974, 5.29063, 0.0, 0.0, 10.1959...",5
28,"[0.0, 0.0, 8.95383, 0.0, 0.0, 0.0, 0.0, 8.0476...",8
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
148,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",12


<h3> Test data

In [None]:
df_CNNcodes_test =pd.DataFrame({'photo_id': [],'CNNcode': []})

# Extracting CNNcodes and photoids for test images
for f in os.listdir(pathcnn+"\\test"):
    photoids, CNNcodes = getCNNcodes(f, 'test')
    df = pd.DataFrame({'photo_id': photoids,'CNNcode': CNNcodes})
    df_CNNcodes_test =pd.concat([df_CNNcodes_test, df])

df_CNNcodes_test['photo_id']= df_CNNcodes_test['photo_id'].convert_objects(convert_numeric=True)

In [None]:
df_CNNcodes_test.sort(['photo_id'], ascending=1).head(5)

<h1> Evaluating mean CNN code for each business

In [9]:
df_train_photo_biz_codes = pd.merge(df_train_photo_ids, df_CNNcodes_train, on='photo_id')
df_train_photo_biz_codes = df_train_photo_biz_codes.sort('business_id', ascending = True)
del df_CNNcodes_train
df_train_photo_biz_codes.head(10)

  from ipykernel import kernelapp as app


Unnamed: 0,photo_id,business_id,CNNcode
175383,338241,3,"[4.40568, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 14.551..."
155513,435438,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27021,414322,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.903..."
158003,343892,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16.4443, 0..."
158583,37806,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.401..."
216870,343176,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 5.84614, 0.0, 0.0, 0..."
51291,207105,3,"[0.0, 0.0439066, 0.0, 0.0, 0.0, 1.35651, 0.0, ..."
158002,405693,3,"[0.0, 4.23323, 0.0, 0.0, 0.0, 0.0, 0.0, 16.591..."
158001,73617,3,"[0.0, 7.27466, 0.0, 0.0, 3.54355, 0.0, 0.0, 0...."
158000,440970,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.255523, 0.0, 0.0, ..."


In [59]:
grouped = df_train_photo_biz_codes.groupby('business_id')
df = grouped.aggregate(lambda x: list(x))
df.drop('photo_id', axis=1, inplace=True)

In [34]:
df["meanCNN"] = ""

In [52]:
%%time
meanCNN = []
for i,r in df.head(10).iterrows():
    meanCNN = parse_business(r['CNNcode'])
    df.loc[i, 'meanCNN'] = meanCNN

Wall time: 88 ms


In [37]:
%%time
meanCNN = []
for i,r in df.head(500).iterrows():
    meanCNN = parse_business(r['CNNcode'])
    df.loc[i, 'meanCNN'] = meanCNN

Wall time: 894 ms


In [38]:
%%time
meanCNN = []
for i,r in df.head(600).iterrows():
    meanCNN = parse_business(r['CNNcode'])
    df.loc[i, 'meanCNN'] = meanCNN

Wall time: 1.09 s


In [39]:
%%time
meanCNN = []
for i,r in df.head(700).iterrows():
    meanCNN = parse_business(r['CNNcode'])
    df.loc[i, 'meanCNN'] = meanCNN

Wall time: 1.3 s


In [45]:
df["meanCNN"] = ""
df_array = np.array_split(df, 4)

In [46]:
%%time
meanCNN = []
for i,r in df_array[0].iterrows():
    meanCNN = parse_business(r['CNNcode'])
    df_array[0].loc[i, 'meanCNN'] = meanCNN

Wall time: 2min 18s


In [48]:
%%time
meanCNN = []
for i,r in df_array[1].iterrows():
    meanCNN = parse_business(r['CNNcode'])
    df_array[1].loc[i, 'meanCNN'] = meanCNN

Wall time: 3min 7s


In [49]:
%%time
meanCNN = []
for i,r in df_array[2].iterrows():
    meanCNN = parse_business(r['CNNcode'])
    df_array[2].loc[i, 'meanCNN'] = meanCNN

Wall time: 14min 7s


In [68]:
df.head(10).apply(parse_business, axis =1)

TypeError: ('can only concatenate list (not "str") to list', u'occurred at index 3')