# Scratchpad for testing out graphlab with project data

In [2]:
import json
import numpy as np
import graphlab
import pandas as pd
import matplotlib.pyplot as plt

This non-commercial license of GraphLab Create is assigned to windsurf_dean@yahoo.com and will expire on May 26, 2017. For commercial licensing options, visit https://dato.com/buy/.


[INFO] graphlab.cython.cy_server: GraphLab Create v1.10.1 started. Logging: /tmp/graphlab_server_1467176787.log


### Load in user-item-rating data...

In [4]:
# create Pandas DF with data pulled from Postgres
df = pd.read_csv('data_analysis/user_reviews.csv', header=None,
                 names=['user_id', 'business_id', 'bus_name', 'stars', 'locale'])

In [5]:
df.head()

Unnamed: 0,user_id,business_id,bus_name,stars,locale
0,Ouus3lC3Xk3YRGx4ptd_4A,DW6SI2KxcfXBHeo9jEr4lQ,Pyramid Cafe,2,"Las Vegas, NV"
1,dz_sBsL69aWzsxksT2O_Mg,DW6SI2KxcfXBHeo9jEr4lQ,Pyramid Cafe,2,"Las Vegas, NV"
2,zyI-u0C1YOzp5v1j2wuCOQ,DW6SI2KxcfXBHeo9jEr4lQ,Pyramid Cafe,3,"Las Vegas, NV"
3,vyfsQo-estP8EfiIFMsL6g,DW6SI2KxcfXBHeo9jEr4lQ,Pyramid Cafe,3,"Las Vegas, NV"
4,lxZSVeJz6KEBW1nlA3JKJg,DW6SI2KxcfXBHeo9jEr4lQ,Pyramid Cafe,1,"Las Vegas, NV"


In [6]:
# store data in Graphlab's SFrame type
sf = graphlab.SFrame(df[['user_id', 'business_id', 'stars']])

In [7]:
sf.head()

user_id,business_id,stars
Ouus3lC3Xk3YRGx4ptd_4A,DW6SI2KxcfXBHeo9jEr4lQ,2
dz_sBsL69aWzsxksT2O_Mg,DW6SI2KxcfXBHeo9jEr4lQ,2
zyI-u0C1YOzp5v1j2wuCOQ,DW6SI2KxcfXBHeo9jEr4lQ,3
vyfsQo-estP8EfiIFMsL6g,DW6SI2KxcfXBHeo9jEr4lQ,3
lxZSVeJz6KEBW1nlA3JKJg,DW6SI2KxcfXBHeo9jEr4lQ,1
CbMCOeeHVeafcD-6-CJrPg,DW6SI2KxcfXBHeo9jEr4lQ,4
Z8DyUMojRhFbLyHBymM_rA,DW6SI2KxcfXBHeo9jEr4lQ,2
sXWvDlMQspjJ1zzr3cEM7Q,DW6SI2KxcfXBHeo9jEr4lQ,2
G8RaaiVzbyRvsBmoUx0VeA,DW6SI2KxcfXBHeo9jEr4lQ,1
f0isFKnJVZVmpdCPNd8Gfw,DW6SI2KxcfXBHeo9jEr4lQ,1


In [8]:
sf.column_names

<bound method SFrame.column_names of Columns:
	user_id	str
	business_id	str
	stars	int

Rows: 1363242

Data:
+------------------------+------------------------+-------+
|        user_id         |      business_id       | stars |
+------------------------+------------------------+-------+
| Ouus3lC3Xk3YRGx4ptd_4A | DW6SI2KxcfXBHeo9jEr4lQ |   2   |
| dz_sBsL69aWzsxksT2O_Mg | DW6SI2KxcfXBHeo9jEr4lQ |   2   |
| zyI-u0C1YOzp5v1j2wuCOQ | DW6SI2KxcfXBHeo9jEr4lQ |   3   |
| vyfsQo-estP8EfiIFMsL6g | DW6SI2KxcfXBHeo9jEr4lQ |   3   |
| lxZSVeJz6KEBW1nlA3JKJg | DW6SI2KxcfXBHeo9jEr4lQ |   1   |
| CbMCOeeHVeafcD-6-CJrPg | DW6SI2KxcfXBHeo9jEr4lQ |   4   |
| Z8DyUMojRhFbLyHBymM_rA | DW6SI2KxcfXBHeo9jEr4lQ |   2   |
| sXWvDlMQspjJ1zzr3cEM7Q | DW6SI2KxcfXBHeo9jEr4lQ |   2   |
| G8RaaiVzbyRvsBmoUx0VeA | DW6SI2KxcfXBHeo9jEr4lQ |   1   |
| f0isFKnJVZVmpdCPNd8Gfw | DW6SI2KxcfXBHeo9jEr4lQ |   1   |
+------------------------+------------------------+-------+
[1363242 rows x 3 columns]
Note: Only the head of t

### Load in item_data, for side_data_factorization...

In [9]:
pd.__version__

u'0.18.0'

In [183]:
# create Pandas DF from json business data

seen_categories = {}
fname = 'data_analysis/test_data/test100_business.json'
with open(fname) as js_file:
    js_list = [json.loads(js_line) for js_line in js_file]
    js_list_filtered = [js for js in js_list if 'Restaurants' in js['categories']]
    dfb = pd.DataFrame(js_list_filtered)

In [185]:
dfb.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,"{u'Take-out': True, u'Drive-Thru': False, u'Ou...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{u'Tuesday': {u'close': u'21:00', u'open': u'1...",40.354327,-79.900706,Mr Hoagie,[],True,4,4.5,PA,business
1,"{u'Alcohol': u'full_bar', u'Noise Level': u'av...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",rankin,"414 Hawkins Ave\nrankin, PA 15104","{u'Tuesday': {u'close': u'19:00', u'open': u'1...",40.413464,-79.880247,Emil's Lounge,[],True,20,5.0,PA,business
2,"{u'Alcohol': u'full_bar', u'Noise Level': u'lo...",KayYbHCt-RkbGcPdGOThNg,"[Bars, American (Traditional), Nightlife, Rest...",Carnegie,"141 Hawthorne St\nGreentree\nCarnegie, PA 15106","{u'Monday': {u'close': u'02:00', u'open': u'11...",40.415517,-80.067534,Alexion's Bar & Grill,[Greentree],True,21,4.0,PA,business
3,"{u'Take-out': True, u'Drive-Thru': False, u'Ou...",wJr6kSA5dchdgOdwH6dZ2w,"[Burgers, Breakfast & Brunch, American (Tradit...",Carnegie,"2100 Washington Pike\nCarnegie, PA 15106","{u'Monday': {u'close': u'02:00', u'open': u'08...",40.387512,-80.093215,Kings Family Restaurant,[],True,8,3.5,PA,business
4,"{u'Alcohol': u'full_bar', u'Noise Level': u'av...",fNGIbpazjTRdXgwRY_NIXA,"[Bars, American (Traditional), Nightlife, Loun...",Carnegie,"1201 Washington Ave\nCarnegie, PA 15106","{u'Monday': {u'close': u'23:00', u'open': u'11...",40.396469,-80.084942,Rocky's Lounge,[],True,7,4.0,PA,business


In [188]:
df_itemdata = dfb[['business_id', 'name', 'attributes', 'categories']]
df_itemdata.head()

Unnamed: 0,business_id,name,attributes,categories
0,5UmKMjUEUNdYWqANhGckJw,Mr Hoagie,"{u'Take-out': True, u'Drive-Thru': False, u'Ou...","[Fast Food, Restaurants]"
1,mVHrayjG3uZ_RLHkLj-AMg,Emil's Lounge,"{u'Alcohol': u'full_bar', u'Noise Level': u'av...","[Bars, American (New), Nightlife, Lounges, Res..."
2,KayYbHCt-RkbGcPdGOThNg,Alexion's Bar & Grill,"{u'Alcohol': u'full_bar', u'Noise Level': u'lo...","[Bars, American (Traditional), Nightlife, Rest..."
3,wJr6kSA5dchdgOdwH6dZ2w,Kings Family Restaurant,"{u'Take-out': True, u'Drive-Thru': False, u'Ou...","[Burgers, Breakfast & Brunch, American (Tradit..."
4,fNGIbpazjTRdXgwRY_NIXA,Rocky's Lounge,"{u'Alcohol': u'full_bar', u'Noise Level': u'av...","[Bars, American (Traditional), Nightlife, Loun..."


In [189]:
# pd.io.json.json_normalize()

In [11]:
print js['business_id']
print js['categories']
print js['attributes']

5UmKMjUEUNdYWqANhGckJw
[u'Fast Food', u'Restaurants']
{u'Take-out': True, u'Drive-Thru': False, u'Outdoor Seating': False, u'Caters': False, u'Noise Level': u'average', u'Parking': {u'garage': False, u'street': False, u'validated': False, u'lot': False, u'valet': False}, u'Delivery': False, u'Attire': u'casual', u'Has TV': False, u'Price Range': 1, u'Good For': {u'dessert': False, u'latenight': False, u'lunch': False, u'dinner': False, u'breakfast': False, u'brunch': False}, u'Takes Reservations': False, u'Ambience': {u'romantic': False, u'intimate': False, u'classy': False, u'hipster': False, u'divey': False, u'touristy': False, u'trendy': False, u'upscale': False, u'casual': False}, u'Waiter Service': False, u'Accepts Credit Cards': True, u'Good for Kids': True, u'Good For Groups': True, u'Alcohol': u'none'}


In [35]:
dfcat[['categories']]

Unnamed: 0,categories
0,"[Fast Food, Restaurants]"
1,[Nightlife]
2,"[Auto Repair, Automotive]"
3,"[Active Life, Mini Golf, Golf]"
4,"[Shopping, Home Services, Internet Service Pro..."
5,"[Bars, American (New), Nightlife, Lounges, Res..."
6,"[Active Life, Trainers, Fitness & Instruction]"
7,"[Bars, American (Traditional), Nightlife, Rest..."
8,"[Auto Repair, Automotive, Tires]"
9,"[Active Life, Mini Golf]"


In [None]:
# create the recommender (will train during this step)
rec = graphlab.recommender.factorization_recommender.create(
            sf,
            user_id='user_id',
            item_id='business_id',
            target='stars',
            solver='als',
            side_data_factorization=False)

In [None]:
rec = graphlab.recommender.create(
            sf,
            user_id='user_id',
            item_id='business_id',
            target ='stars')

In [None]:
# evaluate 1 datapoint (user=1, business=100) for potential rating
one_datapoint_sf = graphlab.SFrame({'user_id': [1], 'business_id': [100]})
print "stars:", rec.predict(one_datapoint_sf)[0]   # 4.879

In [None]:
rec.list_fields()

In [None]:
rec.get('coefficients')['intercept']

In [None]:
# rec['coefficients']  # equivalent to: rec.get('coefficients')
rec.get('coefficients')['user_id']

In [None]:
# rec['coefficients']  # equivalent to: rec.get('coefficients')
rec.get('coefficients')['business_id']

In [None]:
business_sf = rec['coefficients']['business_id']
print len(business_sf)                     # 1682
print len(business_sf['factors'][0])       # 8
user_sf = rec['coefficients']['user_id']
print len(user_sf)                      # 943
print len(user_sf['factors'][0])        # 8

In [None]:
import os

In [None]:
os.path.getsize()

In [None]:
import progressbar

In [None]:
progressbar.__version__

In [None]:
bar = progressbar.ProgressBar