In [30]:
# Goal - convert features and labels from dataframes to np arrays ready for use in tensorflow.
import pandas as pd
import numpy as np
import pickle


def _max_pool_by_business(photo_feat_df):
    """Collapse per-photo feature rows into one row per business.

    Every column is reduced with ``max`` over all rows sharing the same
    ``business_id``.  The ``photo_id`` column is dropped afterwards: the max
    of the photo ids is meaningless and is not a feature.

    The input frame is not modified.  The result is indexed by
    ``business_id``; pandas' groupby sorts the group keys by default, so the
    index comes back in ascending business-id order.
    """
    pooled = photo_feat_df.groupby(['business_id']).agg(['max'])
    return pooled.drop('photo_id', axis=1)


# Collects every array that is handed over to the tensorflow stage.
tf_output = {}

# Directory holding the cached pandas pickle files.
# NOTE(review): unpickling can execute arbitrary code - only point this at
# files produced by our own pipeline.
base_dir = '../data/'

# Validation set: load the per-photo features, pool them per business and
# remember the business ids (index of the pooled frame) so that the labels
# can be matched to the feature rows later on.
feat_validate_df = pd.read_pickle(base_dir + 'feat_validate.pickle')
biz_feat_validate_df = _max_pool_by_business(feat_validate_df)
biz_ids_validate = biz_feat_validate_df.index

# Training set: same treatment as the validation set.
feat_train_df = pd.read_pickle(base_dir + 'feat_train.pickle')
biz_feat_train_df = _max_pool_by_business(feat_train_df)
biz_ids_train = biz_feat_train_df.index

# Test set: same treatment as the validation set.
feat_test_df = pd.read_pickle(base_dir + 'features_test.pickle')
biz_feat_test_df = _max_pool_by_business(feat_test_df)
biz_ids_test = biz_feat_test_df.index


# Clean up labels
labels_df = pd.read_pickle(base_dir + 'labels.pickle')
# Make the index int64 instead of string so sorting and matching works correctly.
labels_df.index = pd.to_numeric(labels_df.index)
# Sort all the labels based on business_id
labels_df = labels_df.sort_index()

# Assemble the output.  Label rows are selected with .loc using the business
# ids of the matching feature frame, so row i of '*_labels' always belongs to
# the same business as row i of '*_data'.  (Filtering with index.isin() only
# lines up when both sides happen to be sorted identically and no id is
# missing; a missing id would silently shift every following row.)
tf_output['validate_data'] = np.matrix(biz_feat_validate_df)
tf_output['validate_labels'] = np.matrix(labels_df.loc[biz_ids_validate], dtype='float16')
tf_output['train_data'] = np.matrix(biz_feat_train_df)
tf_output['train_labels'] = np.matrix(labels_df.loc[biz_ids_train], dtype='float16')
# The test set has no labels here; keep its business ids alongside the data.
tf_output['test_data'] = np.matrix(biz_feat_test_df)
tf_output['test_business_ids'] = biz_ids_test


# Sanity check: print the number of rows of every entry in tf_output so that
# data/label counts can be compared by eye.
# print(...) with one parenthesised argument produces identical output under
# the Python 2 print statement and the Python 3 print function.
for _caption, _key in (
    ("validate data: ", 'validate_data'),
    ("validate labels:", 'validate_labels'),
    ("train data:", 'train_data'),
    ("train labels:", 'train_labels'),
    ("test data:", 'test_data'),
    ("test business ids:", 'test_business_ids'),
):
    print(_caption + str(len(tf_output[_key])))

# Dump the whole dictionary for a quick visual inspection of the values.
print(tf_output)

# Persist everything for the tensorflow stage.  The with-block closes the
# file even if pickle.dump raises part-way through.
with open(base_dir + "tf_data.pickle", 'wb') as output_file:
    pickle.dump(tf_output, output_file)





validate data: 400
validate labels:400
train data:1600
train labels:1600
test data:1911
test business ids:1911
{'train_labels': matrix([[ 1.,  1.,  0., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        ..., 
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  0.,  0.,  0.],
        [ 1.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float16), 'validate_data': matrix([[ 3.38476562,  5.515625  ,  1.61035156, ...,  6.04296875,
          4.328125  ,  3.33984375],
        [ 3.02929688,  1.91699219,  0.13842773, ...,  1.87890625,
          0.41894531,  3.9453125 ],
        [ 3.83789062,  6.30078125,  2.68359375, ...,  1.96777344,
          3.40820312,  2.09960938],
        ..., 
        [ 3.75      ,  3.609375  ,  0.47729492, ...,  8.6953125 ,
          2.00976562,  4.046875  ],
        [ 5.3828125 ,  7.34375   ,  1.04785156, ...,  2.34179688,
          3.6953125 ,  2.08789062],
        [ 4.60546875,  0.82958