In [1]:
import os
import numpy as np
import pandas as pd
import scipy.io as sio

import matplotlib.pyplot as plt
%matplotlib inline

<h1> Loading data

In [2]:
pathhome = os.getcwd()
pathcnn = pathhome+'\\outcome\\imagenet_vgg_f\\fc7'

<h2> Loading csv files with photo ids/business ids and sample submussion 

In [3]:
# Business ids and restraunt labels train data set
df_train_business_ids = pd.read_csv('train.csv')
# Business ids and photo ids train data set
df_train_photo_ids = pd.read_csv('train_photo_to_biz_ids.csv')
# Business ids and photo ids test data set
df_test_photo_ids = pd.read_csv('test_photo_to_biz.csv')
# Sample submission
df_sample_submission = pd.read_csv('sample_submission.csv')

<h2> Loading CNNcodes from images

In [4]:
def getCNNcodes(matfile,dtype):
    data = sio.loadmat(pathcnn+'\\'+dtype+'\\'+matfile)
    CNNcodes = []
    photoids = []
    for d in data['data']:
        photoid = (d[0][0][0][0][0][:-4])
        photoids.append(photoid)
        CNNcode = np.array(d[0][0][0][1].flatten())
        CNNcodes.append(CNNcode)
    del data
    return photoids, CNNcodes

In [5]:
def duplicate_image_ids(CNNcodes):    
    seen = set()
    dubs = []
    for c in CNNcodes:
        cstr = ''.join(map(str, c))
        if cstr not in seen:
            seen.add(cstr)
        else:
            dubs.append(cstr)
        del cstr
    CNNcodes_nodubs = [c for c in CNNcodes if ''.join(map(str, c)) not in dubs]
    return CNNcodes_nodubs            

In [6]:
def parse_business(CNNcodes):
    CNNcodes = duplicate_image_ids(CNNcodes)
    CNNcodes = np.array(CNNcodes)
    meanCNNcode = CNNcodes.mean(axis=0)
    return meanCNNcode

<h3> Train data

In [7]:
df_CNNcodes_train =pd.DataFrame({'photo_id': [],'CNNcode': []})
# Extracting CNNcodes and photoids for train images
for f in os.listdir(pathcnn+"\\train"):
    photoids, CNNcodes = getCNNcodes(f,'train')
    df = pd.DataFrame({'photo_id': photoids,'CNNcode': CNNcodes})
    df_CNNcodes_train =pd.concat([df_CNNcodes_train, df])
df_CNNcodes_train['photo_id']= df_CNNcodes_train['photo_id'].convert_objects(convert_numeric=True)



In [8]:
df_CNNcodes_train.sort(['photo_id'], ascending=1).head(5)
#df_CNNcodes_train.to_pickle('df_CNNcodes_train')
#print(len(df_CNNcodes_train))

  if __name__ == '__main__':


Unnamed: 0,CNNcode,photo_id
163,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
227,"[0.677176, 7.32974, 5.29063, 0.0, 0.0, 10.1959...",5
28,"[0.0, 0.0, 8.95383, 0.0, 0.0, 0.0, 0.0, 8.0476...",8
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10
148,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",12


<h3> Test data

In [10]:
df_CNNcodes_test =pd.DataFrame({'photo_id': [],'CNNcode': []})
# Extracting CNNcodes and photoids for test images
for f in os.listdir(pathcnn+"\\test"):
    photoids, CNNcodes = getCNNcodes(f, 'test')
    df = pd.DataFrame({'photo_id': photoids,'CNNcode': CNNcodes})
    df_CNNcodes_test =pd.concat([df_CNNcodes_test, df])
#print(len(df_CNNcodes_train))_test =pd.concat([df_CNNcodes_test, df])

df_CNNcodes_test['photo_id']= df_CNNcodes_test['photo_id'].convert_objects(convert_numeric=True)



In [11]:
df_CNNcodes_test.sort(['photo_id'], ascending=1).head(5)
#df_CNNcodes_test.to_pickle('df_CNNcodes_test')
#print(len(df_CNNcodes_train))

  if __name__ == '__main__':


Unnamed: 0,CNNcode,photo_id
0,"[0.0, 0.946053, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
84,"[0.509015, 0.0, 0.0, 0.68858, 0.0, 0.0, 0.0, 0...",3
213,"[0.0, 11.7086, 0.0, 0.0, 0.0, 0.0, 19.1888, 0....",4
234,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.81,...",6
141,"[12.3582, 0.0, 6.43047, 0.0, 0.0, 0.0, 0.0, 15...",7


<h1> Evaluating mean CNN code for each business

In [26]:
df_train_photo_biz_codes = pd.merge(df_train_photo_ids, df_CNNcodes_train, on='photo_id')
df_train_photo_biz_codes.head(5)

Unnamed: 0,photo_id,business_id,CNNcode
0,204149,3034,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,52779,2805,"[0.339458, 0.0, 0.0, 0.0, 0.0, 0.0, 4.33804, 0..."
2,278973,485,"[14.1688, 0.0, 8.23413, 0.0, 0.0, 0.0, 0.0, 0...."
3,195284,485,"[9.5706, 0.0, 3.62187, 0.0, 0.0, 0.0, 4.1284, ..."
4,19992,485,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [25]:
df_train_business_ids["meanCNNcode"] = np.nan
meanCNNcodes_train = []
for i, r in df_train_business_ids.iterrows():
    print i
    CNNcodes = np.array(df_train_photo_biz_codes[df_train_photo_biz_codes.business_id == r.business_id].CNNcode)
    #df_train_business_ids.loc[i, "meanCNNcode"] = parse_business(CNNcodes)
    meanCNNcodes_train.append(parse_business(CNNcodes))

0
1
2
3
4
5
6
7
8
9
10
11


KeyboardInterrupt: 