In [2]:
import pandas as pd
import json

## Clean Dataset business.json

business.json has problematic columns under certain parsing workflows.  There is also a lot of extraneous information.  This seeks to clean and compress available information for easier database loading and downstream parsing.

In [5]:
data = pd.read_json('business.json', lines=True)
data.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"15655 W Roosevelt St, Ste 237",Goodyear,AZ,85338,33.455613,-112.395596,5.0,3,1,,"Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


In [6]:
sm = data.attributes[1]

In [7]:
sm.keys()

dict_keys(['RestaurantsReservations', 'GoodForMeal', 'BusinessParking', 'Caters', 'NoiseLevel', 'RestaurantsTableService', 'RestaurantsTakeOut', 'RestaurantsPriceRange2', 'OutdoorSeating', 'BikeParking', 'Ambience', 'HasTV', 'WiFi', 'GoodForKids', 'Alcohol', 'RestaurantsAttire', 'RestaurantsGoodForGroups', 'RestaurantsDelivery'])

In [8]:
import numpy as np

def get_keys(x):
    if x is not None:
        return np.array(list(x.keys()))
    else:
        return np.array([])

data["attribute_keys"] = data.attributes.apply(get_keys)

In [9]:
# Aggregate keys - These will likely need to be hardcoded as columns
unique_keys = set(np.concatenate(data.attribute_keys.to_numpy()))
unique_keys

{'AcceptsInsurance',
 'AgesAllowed',
 'Alcohol',
 'Ambience',
 'BYOB',
 'BYOBCorkage',
 'BestNights',
 'BikeParking',
 'BusinessAcceptsBitcoin',
 'BusinessAcceptsCreditCards',
 'BusinessParking',
 'ByAppointmentOnly',
 'Caters',
 'CoatCheck',
 'Corkage',
 'DietaryRestrictions',
 'DogsAllowed',
 'DriveThru',
 'GoodForDancing',
 'GoodForKids',
 'GoodForMeal',
 'HairSpecializesIn',
 'HappyHour',
 'HasTV',
 'Music',
 'NoiseLevel',
 'Open24Hours',
 'OutdoorSeating',
 'RestaurantsAttire',
 'RestaurantsCounterService',
 'RestaurantsDelivery',
 'RestaurantsGoodForGroups',
 'RestaurantsPriceRange2',
 'RestaurantsReservations',
 'RestaurantsTableService',
 'RestaurantsTakeOut',
 'Smoking',
 'WheelchairAccessible',
 'WiFi'}

In [10]:
list(unique_keys)

['BusinessAcceptsBitcoin',
 'NoiseLevel',
 'AcceptsInsurance',
 'BYOB',
 'Music',
 'RestaurantsTakeOut',
 'AgesAllowed',
 'RestaurantsAttire',
 'BusinessParking',
 'DogsAllowed',
 'BestNights',
 'RestaurantsTableService',
 'RestaurantsPriceRange2',
 'GoodForDancing',
 'RestaurantsGoodForGroups',
 'GoodForKids',
 'Ambience',
 'HairSpecializesIn',
 'BYOBCorkage',
 'CoatCheck',
 'DriveThru',
 'GoodForMeal',
 'Corkage',
 'Open24Hours',
 'WiFi',
 'Alcohol',
 'Smoking',
 'HappyHour',
 'DietaryRestrictions',
 'WheelchairAccessible',
 'BikeParking',
 'RestaurantsDelivery',
 'Caters',
 'OutdoorSeating',
 'BusinessAcceptsCreditCards',
 'HasTV',
 'ByAppointmentOnly',
 'RestaurantsReservations',
 'RestaurantsCounterService']

In [17]:
# Pickle unique keys for further use
import pickle
with open('keys.pkl', 'wb+') as file:
    pickle.dump(list(unique_keys), file)

In [11]:
# Turning Key: values into columns

subset = data.iloc[0:10]

def expand_attributes(df):
    attributes = {}
    default_na = {key: np.nan for key in unique_keys}
    if df.attributes is None:
        attributes = default_na
    else:
        for key in unique_keys:
            if key in df.attributes.keys():
                attributes[key] = df.attributes[key]
            else:
                attributes[key] = np.nan
    return attributes
        
attribute_df = pd.DataFrame.from_dict(
    dict(data.apply(expand_attributes, axis=1)),
    orient='index'
)

attribute_df.head()

Unnamed: 0,BusinessAcceptsBitcoin,NoiseLevel,AcceptsInsurance,BYOB,Music,RestaurantsTakeOut,AgesAllowed,RestaurantsAttire,BusinessParking,DogsAllowed,...,WheelchairAccessible,BikeParking,RestaurantsDelivery,Caters,OutdoorSeating,BusinessAcceptsCreditCards,HasTV,ByAppointmentOnly,RestaurantsReservations,RestaurantsCounterService
0,,,,,,,,,,,...,,,,,,,,,,
1,,u'loud',,,,True,,u'casual',"{'garage': False, 'street': False, 'validated'...",,...,,False,False,True,False,,False,,True,
2,,u'average',,,,True,,'casual',"{'garage': False, 'street': False, 'validated'...",,...,,True,False,False,False,True,True,,True,
3,,,,,,,,,,,...,,,,,,,,,,
4,False,,,,,,,,,,...,,,,,,True,,True,,


In [12]:
# Clean Individual attributes
# Remove non ascii chars from string
def clean_strings(x):
    if "u'" in str(x):
        return x.strip("u'")
    elif "'" in str(x):
        return x.strip("'")
    return x

for key in unique_keys:
    attribute_df[key] = attribute_df[key].apply(clean_strings)
    
attribute_df.head()

Unnamed: 0,BusinessAcceptsBitcoin,NoiseLevel,AcceptsInsurance,BYOB,Music,RestaurantsTakeOut,AgesAllowed,RestaurantsAttire,BusinessParking,DogsAllowed,...,WheelchairAccessible,BikeParking,RestaurantsDelivery,Caters,OutdoorSeating,BusinessAcceptsCreditCards,HasTV,ByAppointmentOnly,RestaurantsReservations,RestaurantsCounterService
0,,,,,,,,,,,...,,,,,,,,,,
1,,loud,,,,True,,casual,"{'garage': False, 'street': False, 'validated'...",,...,,False,False,True,False,,False,,True,
2,,average,,,,True,,casual,"{'garage': False, 'street': False, 'validated'...",,...,,True,False,False,False,True,True,,True,
3,,,,,,,,,,,...,,,,,,,,,,
4,False,,,,,,,,,,...,,,,,,True,,True,,


In [13]:
for key in unique_keys:
    attribute_to_check = key
    print(key, '\n', attribute_df[attribute_df[attribute_to_check].notna()].head(1)[attribute_to_check])

BusinessAcceptsBitcoin 
 4    False
Name: BusinessAcceptsBitcoin, dtype: object
NoiseLevel 
 1    loud
Name: NoiseLevel, dtype: object
AcceptsInsurance 
 9    False
Name: AcceptsInsurance, dtype: object
BYOB 
 176    False
Name: BYOB, dtype: object
Music 
 12    {'dj': False, 'background_music': False, 'no_m...
Name: Music, dtype: object
RestaurantsTakeOut 
 1    True
Name: RestaurantsTakeOut, dtype: object
AgesAllowed 
 52    allages
Name: AgesAllowed, dtype: object
RestaurantsAttire 
 1    casual
Name: RestaurantsAttire, dtype: object
BusinessParking 
 1    {'garage': False, 'street': False, 'validated'...
Name: BusinessParking, dtype: object
DogsAllowed 
 15    False
Name: DogsAllowed, dtype: object
BestNights 
 12    {'monday': False, 'tuesday': False, 'friday': ...
Name: BestNights, dtype: object
RestaurantsTableService 
 1    True
Name: RestaurantsTableService, dtype: object
RestaurantsPriceRange2 
 1    2
Name: RestaurantsPriceRange2, dtype: object
GoodForDancing 
 12    True
Na

# How to get attributes into database

There are a couple ways to approach this.  One is to continue cleaning and condense back to a uniform size attributes list.  Another would be to migrate the business table to something with all these features. I don't think the migration is worthwhile.  The raw data isn't done as such, and the possibity of new feaures downstream means another migration may be needed.

Few Steps:

1. Convert dictionary items to simple lists of True.  (Don't include false items).
2. Condense all attribute columns (from attribute_df) into single dictionary of lists (or nan).

In [14]:
sample = attribute_df.Ambience[1]

In [15]:
from ast import literal_eval
print(type(sample))
display(type(literal_eval(sample)), literal_eval(sample))

<class 'str'>


dict

{'romantic': False,
 'intimate': False,
 'classy': False,
 'hipster': False,
 'divey': False,
 'touristy': False,
 'trendy': False,
 'upscale': False,
 'casual': True}

In [16]:
# Strip Only true
def strip_true(str_dict):
    if str_dict is not None and type(str_dict) == str:
        x = literal_eval(str_dict)
        if type(x) == dict:
            true_keys = []
            for key in x.keys():
                if x[key] == True:
                    true_keys.append(key)
            return true_keys
    return str_dict

In [17]:
strip_true(sample)

['casual']

In [18]:
# Apply to transformed dataframe
dict_cols = ['Ambience', 'BestNights', 'BusinessParking',
             'GoodForMeal', 'Music', 'HairSpecializesIn', 'DietaryRestrictions' ]
for col in dict_cols:
    attribute_df[col] = attribute_df[col].apply(strip_true)

In [19]:
attribute_df.head()

Unnamed: 0,BusinessAcceptsBitcoin,NoiseLevel,AcceptsInsurance,BYOB,Music,RestaurantsTakeOut,AgesAllowed,RestaurantsAttire,BusinessParking,DogsAllowed,...,WheelchairAccessible,BikeParking,RestaurantsDelivery,Caters,OutdoorSeating,BusinessAcceptsCreditCards,HasTV,ByAppointmentOnly,RestaurantsReservations,RestaurantsCounterService
0,,,,,,,,,,,...,,,,,,,,,,
1,,loud,,,,True,,casual,[lot],,...,,False,False,True,False,,False,,True,
2,,average,,,,True,,casual,[lot],,...,,True,False,False,False,True,True,,True,
3,,,,,,,,,,,...,,,,,,,,,,
4,False,,,,,,,,,,...,,,,,,True,,True,,


In [20]:
# Attempt turning into single dict
attribute_df.iloc[1].to_dict()

{'BusinessAcceptsBitcoin': nan,
 'NoiseLevel': 'loud',
 'AcceptsInsurance': nan,
 'BYOB': nan,
 'Music': nan,
 'RestaurantsTakeOut': 'True',
 'AgesAllowed': nan,
 'RestaurantsAttire': 'casual',
 'BusinessParking': ['lot'],
 'DogsAllowed': nan,
 'BestNights': nan,
 'RestaurantsTableService': 'True',
 'RestaurantsPriceRange2': '2',
 'GoodForDancing': nan,
 'RestaurantsGoodForGroups': 'True',
 'GoodForKids': 'True',
 'Ambience': ['casual'],
 'HairSpecializesIn': nan,
 'BYOBCorkage': nan,
 'CoatCheck': nan,
 'DriveThru': nan,
 'GoodForMeal': ['lunch', 'dinner'],
 'Corkage': nan,
 'Open24Hours': nan,
 'WiFi': 'no',
 'Alcohol': 'full_bar',
 'Smoking': nan,
 'HappyHour': nan,
 'DietaryRestrictions': nan,
 'WheelchairAccessible': nan,
 'BikeParking': 'False',
 'RestaurantsDelivery': 'False',
 'Caters': 'True',
 'OutdoorSeating': 'False',
 'BusinessAcceptsCreditCards': nan,
 'HasTV': 'False',
 'ByAppointmentOnly': nan,
 'RestaurantsReservations': 'True',
 'RestaurantsCounterService': nan}

In [26]:
attribute_df.head().to_dict(orient='records')

[{'RestaurantsReservations': nan,
  'BestNights': nan,
  'Alcohol': nan,
  'RestaurantsTableService': nan,
  'GoodForMeal': nan,
  'Music': nan,
  'WheelchairAccessible': nan,
  'BikeParking': nan,
  'HairSpecializesIn': nan,
  'Caters': nan,
  'DogsAllowed': nan,
  'RestaurantsCounterService': nan,
  'BusinessAcceptsCreditCards': nan,
  'GoodForKids': 'False',
  'RestaurantsPriceRange2': nan,
  'DriveThru': nan,
  'WiFi': nan,
  'Open24Hours': nan,
  'RestaurantsTakeOut': nan,
  'Corkage': nan,
  'NoiseLevel': nan,
  'DietaryRestrictions': nan,
  'RestaurantsGoodForGroups': nan,
  'Ambience': nan,
  'BYOBCorkage': nan,
  'BusinessAcceptsBitcoin': nan,
  'AcceptsInsurance': nan,
  'AgesAllowed': nan,
  'BYOB': nan,
  'RestaurantsAttire': nan,
  'ByAppointmentOnly': nan,
  'BusinessParking': nan,
  'GoodForDancing': nan,
  'Smoking': nan,
  'HappyHour': nan,
  'HasTV': nan,
  'OutdoorSeating': nan,
  'CoatCheck': nan,
  'RestaurantsDelivery': nan},
 {'RestaurantsReservations': 'True',
 

In [21]:
# Try concatenating with main data

## Appending exploded frame
# frame = [data, attribute_df]
# new_data = pd.concat(frame, axis=1, sort=True, join='inner')
# new_data.head(1)

## Appending compressed frame
new_data = data
new_data.attributes = attribute_df.to_dict(orient='records')
new_data = new_data.drop(columns='attribute_keys')

In [22]:
# Conversion of dict type to jsonified list for database storage
new_data.attributes = list(map(lambda x: json.dumps(x), new_data.attributes))
new_data.hours = list(map(lambda x: json.dumps(x), new_data.hours))

In [28]:
display(type(new_data.attributes[1]), type(new_data.hours[1]), new_data.attributes[0], new_data.hours[1])

str

str

'{"BusinessAcceptsBitcoin": NaN, "NoiseLevel": NaN, "AcceptsInsurance": NaN, "BYOB": NaN, "Music": NaN, "RestaurantsTakeOut": NaN, "AgesAllowed": NaN, "RestaurantsAttire": NaN, "BusinessParking": NaN, "DogsAllowed": NaN, "BestNights": NaN, "RestaurantsTableService": NaN, "RestaurantsPriceRange2": NaN, "GoodForDancing": NaN, "RestaurantsGoodForGroups": NaN, "GoodForKids": "False", "Ambience": NaN, "HairSpecializesIn": NaN, "BYOBCorkage": NaN, "CoatCheck": NaN, "DriveThru": NaN, "GoodForMeal": NaN, "Corkage": NaN, "Open24Hours": NaN, "WiFi": NaN, "Alcohol": NaN, "Smoking": NaN, "HappyHour": NaN, "DietaryRestrictions": NaN, "WheelchairAccessible": NaN, "BikeParking": NaN, "RestaurantsDelivery": NaN, "Caters": NaN, "OutdoorSeating": NaN, "BusinessAcceptsCreditCards": NaN, "HasTV": NaN, "ByAppointmentOnly": NaN, "RestaurantsReservations": NaN, "RestaurantsCounterService": NaN}'

'{"Monday": "9:0-0:0", "Tuesday": "9:0-0:0", "Wednesday": "9:0-0:0", "Thursday": "9:0-0:0", "Friday": "9:0-1:0", "Saturday": "9:0-1:0", "Sunday": "9:0-0:0"}'

In [30]:
## Saving Data
import s3
bucket = s3.Bucket('yelp-data-shared-labs18')

temp_file_path = '/tmp/tempbusiness.json'
new_data.to_json(temp_file_path, orient='records')
bucket.save(temp_file_path, 'Processed/business_transformed.json')

/tmp/tempbusiness.json  283428862 / 283428862.0  (100.00%)

In [31]:
# Get file from s3 if needed
bucket.get('Processed/business_transformed.json', 'business_transformed.json')

In [3]:
pd.read_json("business_transformed.json").head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,"{'RestaurantsReservations': None, 'BestNights'...","Golf, Active Life",
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'BestNight...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'RestaurantsReservations': 'True', 'BestNight...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"15655 W Roosevelt St, Ste 237",Goodyear,AZ,85338,33.455613,-112.395596,5.0,3,1,"{'RestaurantsReservations': None, 'BestNights'...","Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'RestaurantsReservations': None, 'BestNights'...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


## Data Viz 2

Prepping csv for sql write

In [1]:
import pandas as pd
import json

df = pd.read_csv("viz2_aggregate.csv")
df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,address,city,is_open,latitude,longitude,name,review_count,...,numreviews2011,numreviews2012,numreviews2013,numreviews2014,numreviews2015,numreviews2016,numreviews2017,numreviews2018,average_stars_over_time,top_pos_neg_noun_chunks
0,0,0,1SWheh84yJXfytovILXOAQ,2818 E Camino Acequia Drive,Phoenix,0.0,33.522143,-112.018481,Arizona Biltmore Golf Club,5.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,"([5.0, 4.0, 1.0, 4.0, 1.0], ['2015-03-27', '20...","(['gorgeous facility', 'great view', 'my frien..."
1,1,1,QXAEGFB4oINsVuTFxEYKFQ,30 Eglinton Avenue W,Mississauga,1.0,43.605499,-79.652289,Emerald Chinese Restaurant,128.0,...,9.0,5.0,17.0,14.0,21.0,26.0,17.0,17.0,"([3.142857142857143, 2.6153846153846154, 2.571...","(['pleasant servers', 'its price', 'language b..."


In [2]:
# Remove csv import errors
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [3]:
# Remove redundant columns
df = df.drop(columns=['address', 'city',\
                      'is_open', 'latitude', 'longitude',\
                      'name', 'review_count', 'stars', 'state', 'postalcode'])

In [4]:
df = df.rename(columns={
    'average_stars_over_time': 'avg_stars_over_time',
    'top_pos_neg_noun_chunks': 'chunk_sentiment',
})
df.columns

Index(['business_id', 'categories', 'percentile', 'competitors',
       'bestinsector', 'star_review1.0', 'star_review2.0', 'star_review3.0',
       'star_review4.0', 'star_review5.0', 'numreviews2009', 'numreviews2010',
       'numreviews2011', 'numreviews2012', 'numreviews2013', 'numreviews2014',
       'numreviews2015', 'numreviews2016', 'numreviews2017', 'numreviews2018',
       'avg_stars_over_time', 'chunk_sentiment'],
      dtype='object')

In [5]:
count_by_star = []
for row in df.to_dict('r'):
    count_by_star.append(
        {
            '1':row['star_review1.0'],
            '2':row['star_review2.0'],
            '3':row['star_review3.0'],
            '4':row['star_review4.0'],
            '5':row['star_review5.0'],
        }
    )

In [6]:
df['count_by_star'] = count_by_star
df = df.drop(columns=['star_review1.0', 'star_review2.0', 'star_review3.0','star_review4.0','star_review5.0'])

In [7]:
review_by_year = []
for row in df.to_dict('r'):
    review_by_year.append(
        {
            '2009':row['numreviews2009'],
            '2010':row['numreviews2010'],
            '2011':row['numreviews2011'],
            '2012':row['numreviews2012'],
            '2013':row['numreviews2013'],
            '2014':row['numreviews2014'],
            '2015':row['numreviews2015'],
            '2016':row['numreviews2016'],
            '2017':row['numreviews2017'],
            '2018':row['numreviews2018'],
        }
    )

In [8]:
df['review_by_year'] = review_by_year
df = df.drop(columns=['numreviews2009', 'numreviews2010', 'numreviews2011', 'numreviews2012','numreviews2013',\
                'numreviews2014', 'numreviews2015', 'numreviews2016','numreviews2017','numreviews2018'])

In [9]:
safe_list = ['competitors', 'avg_stars_over_time', 'chunk_sentiment', \
             'bestinsector', 'count_by_star', 'review_by_year']
for column in safe_list:
    df[column] = list(map(lambda x: json.dumps(x), df[column]))
df.head()

Unnamed: 0,business_id,categories,percentile,competitors,bestinsector,avg_stars_over_time,chunk_sentiment,count_by_star,review_by_year
0,1SWheh84yJXfytovILXOAQ,"Golf, Active Life",10.271318,"""['Royal Palms Golf Course', 'Freedom Golf Cou...","""['Camelback Mountain', 'OdySea Aquarium', 'Ch...","""([5.0, 4.0, 1.0, 4.0, 1.0], ['2015-03-27', '2...","""(['gorgeous facility', 'great view', 'my frie...","{""1"": 2.0, ""2"": 0.0, ""3"": 0.0, ""4"": 2.0, ""5"": ...","{""2009"": 0.0, ""2010"": 0.0, ""2011"": 0.0, ""2012""..."
1,QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",18.817321,"""['Grandeur Palace', 'Noble Seafood', 'Century...","""['Khao San Road', 'Seven Lives Tacos Y Marisc...","""([3.142857142857143, 2.6153846153846154, 2.57...","""(['pleasant servers', 'its price', 'language ...","{""1"": 34.0, ""2"": 22.0, ""3"": 33.0, ""4"": 35.0, ""...","{""2009"": 2.0, ""2010"": 6.0, ""2011"": 9.0, ""2012""..."
2,gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",85.156069,"""['Yama Asian Fusion', 'Mizuho', 'Sky Asian Bi...","""['Tupelo Honey', \""Am\u00e9lie's French Baker...","""([3.8333333333333335, 4.222222222222222, 3.94...","""(['dinner excellent tonkotsu ramen noodles', ...","{""1"": 11.0, ""2"": 11.0, ""3"": 24.0, ""4"": 40.0, ""...","{""2009"": 5.0, ""2010"": 4.0, ""2011"": 11.0, ""2012..."
3,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",83.943662,"""['Curt Henderson Insurance', 'J M Chapman Age...","""['OneGuard Home Warranties', 'Fairway Indepen...","""([5.0, 5.0, 5.0], ['2013-01-01', '2017-12-11'...","""(['amazing staff', 'your life', 'excellent ab...","{""1"": 0.0, ""2"": 0.0, ""3"": 0.0, ""4"": 0.0, ""5"": ...","{""2009"": 0.0, ""2010"": 0.0, ""2011"": 0.0, ""2012""..."
4,HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",60.419065,"""['Rose Cleaners & Alterations', 'Elite Cleane...","""['7th Street Public Market', 'IKEA', 'Paper S...","""([1.0, 5.0, 5.0, 5.0], ['2014-06-11', '2016-0...","""(['reputable business owner', 'fair competiti...","{""1"": 1.0, ""2"": 0.0, ""3"": 0.0, ""4"": 0.0, ""5"": ...","{""2009"": 0.0, ""2010"": 0.0, ""2011"": 0.0, ""2012""..."


In [14]:
df.tolist()

AttributeError: 'DataFrame' object has no attribute 'tolist'

## Generate Jobs from Existing Data

Often enough we need to reprocess existing data.  This is a framework for filtering for the files needed and generating the necessary jobs.

Uses local jobs library

In [2]:
import jobs
import s3
bucket = jobs.get_bucket()

In [4]:
# Search for exisisting data
search_string = 'retoken'
search = bucket.find(search_string)[0:10]
search[0:10]

['Processed/clean_review_0_retoken',
 'Processed/clean_review_100_retoken',
 'Processed/clean_review_101_retoken',
 'Processed/clean_review_102_retoken',
 'Processed/clean_review_103_retoken',
 'Processed/clean_review_104_retoken',
 'Processed/clean_review_105_retoken',
 'Processed/clean_review_106_retoken',
 'Processed/clean_review_107_retoken',
 'Processed/clean_review_108_retoken']

In [7]:
for filepath in search:
    jobs.generate_job(objectpath=filepath, job_type='POST', tablename='reviews', dry_run=False)

/tmp/POST_clean_review_108_retoken_job.json  70 / 70.0  (100.00%)

In [12]:
# Check jobs available
jobs.get_jobs('')

[]

In [14]:
bucket.find('Jobs/', suffix='json')

[]

"{'test': 12}"