In [2]:
%pylab inline
import csv
import json
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


In [4]:
# business.json is read as one JSON document per line; each parsed record is
# kept, in file order, in the `business` list.
with open('business.json') as json_file:
    business = [json.loads(line) for line in json_file]

In [34]:
#HELPER FUNCTIONS
def getValues(attribute):
    '''Split one raw attribute string into a list of [name, value] pairs.

    A flat attribute ("Name: value", exactly one colon) yields a single
    pair.  Any other string is treated as a nested attribute of the form
    "Name: {'key': value, ...}" and yields one pair per inner key, named
    "Name-'key'" (the quotes around the inner key are kept).  Values are
    returned as raw substrings; nothing is converted.
    '''
    first_colon = attribute.find(':')
    name = attribute[:first_colon]
    colon_total = attribute.count(':')

    if colon_total == 1:
        # Skip the colon and the single space that follows it.
        return [[name, attribute[first_colon + 2:]]]

    pairs = []
    # Skip ": {" so the remainder starts at the first inner key.
    remainder = attribute[first_colon + 3:]
    for _ in range(colon_total - 1):
        comma_at = remainder.find(',')
        colon_at = remainder.find(':')
        # On the last entry there is no comma, so comma_at is -1 and the
        # slice end drops the final character (the closing brace in
        # well-formed input).
        pairs.append([name + '-' + remainder[:colon_at],
                      remainder[colon_at + 2:comma_at]])
        remainder = remainder[comma_at + 2:]
    return pairs

def modifywithNAs(dictionaryvalue, attributelist):
    '''Build a business's attribute mapping, padded with 'NA' placeholders.

    dictionaryvalue -- the [name, value] pairs the business actually has.
    attributelist   -- every attribute name that should appear in the result.

    Returns a defaultdict holding the supplied pairs (a later duplicate name
    overwrites an earlier one) plus an 'NA' entry for each name in
    attributelist that the business lacks.
    '''
    filled = defaultdict(dict)
    for pair in dictionaryvalue:
        filled[pair[0]] = pair[1]
    for wanted in attributelist:
        # setdefault leaves existing values alone and only adds missing names.
        filled.setdefault(wanted, 'NA')
    return filled

In [35]:
def getAllAttributes(businessfile, city):
    '''Collect every attribute name seen among the businesses of one city.

    businessfile -- business records read from business.json (a sequence of
                    dicts with 'city' and 'attributes' entries).
    city         -- only records whose 'city' equals this value are scanned.

    Records whose 'attributes' entry is not a list are skipped.  Each raw
    attribute string is expanded with getValues, so a nested attribute
    contributes one name per inner key (e.g. "Ambience-'casual'").

    Returns a list of the distinct attribute names, in first-seen order.
    '''
    names = []
    seen_names = set()
    seen_raw = set()
    # Iterate over the argument, not the notebook-level `business` global,
    # so the function works on whatever records the caller passes in.
    for record in businessfile:
        if record[u'city'] != city:
            continue
        raw_attributes = record[u'attributes']
        if not isinstance(raw_attributes, list):
            continue
        for raw in raw_attributes:
            if raw in seen_raw:
                continue  # identical string already parsed
            seen_raw.add(raw)
            # Take the name from every parsed pair, however many pairs
            # there are, and de-duplicate by name: the same attribute shows
            # up with different values across businesses.
            for pair in getValues(raw):
                if pair[0] not in seen_names:
                    seen_names.add(pair[0])
                    names.append(pair[0])
    return names

def transformAttributes(businessfile, attributelist, city):
    '''Map each business in a city to its full attribute dictionary.

    businessfile  -- business records read from business.json (dicts with
                     'city', 'business_id' and 'attributes' entries).
    attributelist -- every attribute name each result should contain, e.g.
                     the list returned by getAllAttributes.
    city          -- only records whose 'city' equals this value are kept.

    Returns a dict keyed by business_id.  Each value is the mapping built by
    modifywithNAs: the business's own parsed attributes plus 'NA' for every
    name in attributelist it lacks.  A record whose 'attributes' entry is
    not a list gets 'NA' for every name.
    '''
    attributedict = {}
    for record in businessfile:
        if record[u'city'] != city:
            continue
        raw_attributes = record[u'attributes']
        parsed = []
        if isinstance(raw_attributes, list):
            for raw in raw_attributes:
                # getValues may expand one raw string into several pairs.
                parsed.extend(getValues(raw))
        # Pad with the caller's attributelist rather than the notebook-level
        # `allattributes` global, so the function has no hidden dependency.
        attributedict[record[u'business_id']] = modifywithNAs(parsed, attributelist)
    return attributedict

In [None]:
#TO WRITE CSV FILE
# Writes one row per (business_id, attribute, value) triple.
# NOTE: `transformed` is assigned by the transformAttributes(...) call in a
# separate cell; run that cell before this one.  The name used here before,
# `attributedict`, is only a local variable inside transformAttributes and is
# not defined at notebook level in the visible cells, so this cell raised
# NameError on a fresh kernel.
# NOTE: 'wb' is the Python 2 convention for csv.writer; on Python 3 open the
# file with mode 'w' and newline='' instead.
with open('dict6.2.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in transformed.items():
        for attribute, value2 in value.items():
            writer.writerow([key, attribute, value2])

In [36]:
# Build the list of attribute names seen for Edinburgh, then the
# per-business attribute table padded against that list.
allattributes = getAllAttributes(businessfile=business, city='Edinburgh')
transformed = transformAttributes(
    businessfile=business,
    attributelist=allattributes,
    city='Edinburgh',
)

In [42]:
# For every attribute name, gather the distinct values it takes across all
# transformed businesses (the 'NA' placeholder counts as a value).
attributevalues = defaultdict(dict)
for attribute_name in allattributes:
    attributevalues[attribute_name] = {
        transformed[business_id][attribute_name] for business_id in transformed
    }

In [43]:
# Display the mapping of attribute name -> set of observed values.
attributevalues

defaultdict(dict,
            {u'AcceptsInsurance': {u'False', 'NA', u'True'},
             u'AgesAllowed': {u'18plus', 'NA', u'allages'},
             u'Alcohol': {'NA', u'beer_and_wine', u'full_bar', u'none'},
             u"Ambience-'casual'": {u'False', 'NA', u'True'},
             u"Ambience-'classy'": {u'False', 'NA', u'True'},
             u"Ambience-'hipster'": {u'False', 'NA', u'True'},
             u"Ambience-'intimate'": {u'False', 'NA', u'True'},
             u"Ambience-'romantic'": {u'False', 'NA', u'True'},
             u"Ambience-'touristy'": {u'False', 'NA', u'True'},
             u"Ambience-'trendy'": {u'False', 'NA', u'True'},
             u"Ambience-'upscale'": {u'False', 'NA', u'True'},
             u'BYOBCorkage': {'NA', u'yes_free'},
             u"BestNights-'friday'": {u'False', 'NA', u'True'},
             u"BestNights-'monday'": {u'False', 'NA', u'True'},
             u"BestNights-'saturday'": {u'False', 'NA', u'True'},
             u"BestNights-'sunday'": {u'