# Yelp Restaurant Photo Classification

### 1) Uncompressing the data

In [3]:
# Importing the needed modules
import numpy as np
import ds_utils
import os
import pandas as pd
import cv2
import pickle

In [4]:
# Setting up the directories for this project
# Data Dir: the path handed as `dirpath` to the functions below
# (see the commented-out calls after each definition).
# The first line is an older, disabled spelling of the same location.
#dataDir = '/c/Users/bajpa/Desktop/Github/KaggleProjects/MassiveData/yelpDS'
dataDir = 'C:/Users/bajpa/Desktop/Github/Kaggle/MassiveData/yelpDS'
# Analysis Dir: uncompress_data() switches the working directory to this
# path once it has gone through the files in the data directory.
anDir = 'C:/Users/bajpa/Desktop/Github/Kaggle/YelpRPC'

In [3]:
# Uncompressing all the data files
# Extraction function to be run only once
def uncompress_data(dirpath):
    '''
    Hand every entry of dirpath to ds_utils.uncom_tgz for extraction.

    The working directory is switched to dirpath for the duration of the
    loop, so the bare file names passed to ds_utils.uncom_tgz resolve there.
    Afterwards the working directory is set to the module-level anDir, even
    when one of the extractions raises.

    dirpath: directory holding the compressed files (absolute or relative).
    Returns None.
    '''
    os.chdir(dirpath)
    try:
        # List the directory we are now standing in instead of dirpath: a
        # relative dirpath would no longer resolve after the chdir above.
        for f in os.listdir(os.curdir):
            ds_utils.uncom_tgz(f)
    finally:
        # Always leave the data folder, also when an extraction failed.
        os.chdir(anDir)
    return None

#uncompress_data(dataDir)

### 2) Data overview

#### Looking at the extracted dataset, we find that we have 6 outputs
#### 1) test_photos: A folder containing all the test images
#### 2) train_photos: A folder containing all the train images
#### 3) sample_submission:  Demo submission file
#### 4) train: A csv file to associate characteristic labels to restaurants
#### 5) train_photo_to_biz_ids: A csv file to associate train image ids to specific restaurants
#### 6) test_photo_to_biz:  A csv file to associate test image ids to specific restaurants

In [13]:
# Analyzing the count and size of each of the above
def _photo_folder_report(folder):
    '''
    Build the summary text for one image folder below the working directory.

    folder: folder name, relative to the current working directory.
    Returns a string with the folder name, the number of non-hidden files
    directly inside it and their combined size in gb (bytes * 1e-9).
    Raises StopIteration when the folder does not exist (os.walk yields
    nothing in that case).
    '''
    root = "./" + folder
    # Only the top level of the folder is inspected (first os.walk entry).
    # The next() builtin works on Python 2.6+ and 3, unlike the .next() method.
    path, dirs, files = next(os.walk(root))
    # Excluding the hidden files that were created during extraction process
    visible = [f for f in files if f[0] != '.']
    total_size = sum(os.path.getsize(root + "/" + f) for f in visible) * 1e-9
    # One pre-formatted string so that print(...) emits the same text on
    # Python 2 and Python 3; the spacing mirrors the comma-separated print.
    return "%s: \nfiles:  %d \nsize:  %s gb" % (folder, len(visible), total_size)


def dataset_overview(dirpath):
    '''
    Print the file count and total size of the test and train image folders.

    dirpath: directory containing the test_photos and train_photos folders.
    The working directory is changed to dirpath and left there.
    Returns None.
    '''
    os.chdir(dirpath)

    for folder in ("test_photos", "train_photos"):
        print(_photo_folder_report(folder))
        print("----------")

    return None
    
#dataset_overview(dataDir)

test_photos: 
files:  237152 
size:  7.16280653 gb
----------
train_photos: 
files:  234842 
size:  7.086361687 gb
----------


In [5]:
# taking an overview of other csv files with the project

In [14]:
# CSV file list
def csv_overview(dirpath):
    '''
    Print the overview that ds_utils.csv_overview produces for each csv file.

    dirpath: directory containing train.csv, train_photo_to_biz_ids.csv,
             test_photo_to_biz.csv and sample_submission.csv.
    The working directory is changed to dirpath and left there.
    Returns None.
    '''
    os.chdir(dirpath)

    # files to be analysed
    csv_list = ['train.csv', 'train_photo_to_biz_ids.csv',
                'test_photo_to_biz.csv', 'sample_submission.csv']

    # Analyzing the files
    for filename in csv_list:
        # Single-argument print(...) yields the same output on Python 2 and 3.
        print("----------\n")
        # NOTE(review): the result is only iterated and indexed by key here;
        # presumably a dict of overview items - confirm in ds_utils.
        f_overview = ds_utils.csv_overview(filename)
        for keyname in f_overview:
            print(keyname)
            print(f_overview[keyname])

    return None

#csv_overview(dataDir)

----------

train.csv
size in kb
28
readfile
   business_id         labels
0         1000  1 2 3 4 5 6 7
1         1001        0 1 6 8
2          100    1 2 4 5 6 7
3         1006      1 2 4 5 6
4         1010          0 6 8
5          101    1 2 3 4 5 6
6         1011        2 3 5 6
7         1012      1 2 3 5 6
8         1014      1 2 4 5 6
9         1015        1 5 6 7
----------

train_photo_to_biz_ids.csv
size in kb
2698
readfile
   photo_id  business_id
0    204149         3034
1     52779         2805
2    278973          485
3    195284          485
4     19992          485
5     80748          485
6    444996         1783
7    200285           35
8     90572           35
9     27565         1313
----------

test_photo_to_biz.csv
size in kb
15193
readfile
   photo_id business_id
0    317818       003sg
1     30679       003sg
2    455084       003sg
3    371381       003sg
4     86224       003sg
5     36076       003sg
6     46999       003sg
7     74896       003sg
8    16939

#### So we have a large number of image files in both the test and training data sets, and each of them has an image id. 1) 'train.csv': business ids, each with a certain number of tags (labels) that signify the characteristics of the restaurant. 2) 'train_photo_to_biz_ids.csv': relates each training photo to a particular business id. 3) 'test_photo_to_biz.csv': business ids for each image in test_photos. 4) 'sample_submission.csv': demo submission file; we need to associate labels to each business_id.
#### The csv files are not too big and can easily be loaded into the memory

### 3) Splitting the training data into train and validation set

In [7]:
# Reading the train.csv file
def _biz_lookup(biz_rows, photos_by_biz, no_photos):
    '''
    Map each business id (as a string) to its labels and photo ids.

    biz_rows: DataFrame whose first two columns are business id and labels.
    photos_by_biz: dict from business id to the array of its photo ids.
    no_photos: empty array used as template for a business without photos.
    Returns {str(business_id): {'label': labels, 'img': photo id array}}.
    '''
    lookup = {}
    for row in biz_rows.itertuples():
        # row[0] is the DataFrame index; the csv columns start at position 1.
        biz_id, labels = row[1], row[2]
        photos = photos_by_biz.get(biz_id)
        if photos is None:
            photos = no_photos.copy()
        lookup[str(biz_id)] = {'label': labels, 'img': photos}
    return lookup


def _dump_pickle(obj, filename):
    '''Pickle obj into filename, closing the file even if dumping fails.'''
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle)


def train_valid(dirpath, train_frac=0.75, seed=None):
    '''
    Split the businesses of train.csv into a training and a validation set.

    The rows of train.csv are shuffled, the first train_frac share becomes
    the training set and the rest the validation set. For each set a
    dictionary {str(business_id): {'label': ..., 'img': photo id array}} is
    built from train_photo_to_biz_ids.csv and pickled to trainDic.pkl and
    valDic.pkl inside dirpath.

    dirpath: directory containing train.csv and train_photo_to_biz_ids.csv.
             The working directory is changed to it and left there.
    train_frac: share of businesses put in the training set (default 0.75).
    seed: optional seed for a reproducible shuffle; None keeps using the
          global numpy random state.
    Returns None.
    '''
    # moving to the data folder
    os.chdir(dirpath)

    # reading business ids
    trainBizIdsLabel = pd.read_csv("train.csv", sep=',', skipinitialspace=True,
                                   skip_blank_lines=True)

    # Shuffle the entire dataframe and then split it into training and validation set
    if seed is None:
        order = np.random.permutation(trainBizIdsLabel.index)
    else:
        order = np.random.RandomState(seed).permutation(trainBizIdsLabel.index)
    shuffled = trainBizIdsLabel.reindex(order)
    split_at = int(train_frac * shuffled.shape[0])

    trSetBizIds = shuffled.iloc[:split_at, :]
    valSetBizIds = shuffled.iloc[split_at:, :]

    # reading biz_id to image id
    trainImgBizId = pd.read_csv("train_photo_to_biz_ids.csv", sep=',',
                                skipinitialspace=True, skip_blank_lines=True)

    # Group the photo ids (first column) by business once, instead of
    # scanning the whole photo table again for every single business.
    photo_ids = trainImgBizId.iloc[:, 0]
    grouped = photo_ids.groupby(trainImgBizId['business_id'], sort=False)
    photos_by_biz = {biz: ids.values for biz, ids in grouped}
    no_photos = photo_ids.values[:0]

    # Dictionaries with business id as key; values hold labels and image ids
    trainDic = _biz_lookup(trSetBizIds, photos_by_biz, no_photos)
    valDic = _biz_lookup(valSetBizIds, photos_by_biz, no_photos)

    # Saving the relevant dictionaries for future use.
    _dump_pickle(trainDic, 'trainDic.pkl')
    _dump_pickle(valDic, 'valDic.pkl')

    return None

#train_valid(dataDir)
