# Scratchpad for testing out graphlab with project data

In [1]:
import json
import numpy as np
import graphlab as gl
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

[INFO] graphlab.cython.cy_server: GraphLab Create v1.10.1 started. Logging: /tmp/graphlab_server_1467331875.log


This non-commercial license of GraphLab Create is assigned to windsurf_dean@yahoo.com and will expire on May 26, 2017. For commercial licensing options, visit https://dato.com/buy/.


## Load in user-item-rating data...

In [2]:
# Load the restaurant-only review export (pulled from Postgres) into pandas.
# header=None is passed, so the column names are supplied explicitly.
review_columns = ['user_id', 'business_id', 'biz_name', 'stars', 'locale']
df = pd.read_csv('data_analysis/user_reviews.csv',
                 names=review_columns,
                 header=None)

In [None]:
# peek at the first few review rows to sanity-check the load
df.head()

In [None]:
# summary of the review frame's columns (dtypes, non-null counts)
df.info()

In [3]:
# Keep only the (user, item, rating) columns and wrap them in GraphLab's SFrame.
obs_columns = ['user_id', 'business_id', 'stars']
sf_obs = gl.SFrame(df[obs_columns])

In [None]:
# confirm the SFrame conversion kept the three observation columns
sf_obs.head()

# Quick EDA

## Group reviews by users...

In [5]:
# Count reviews per user (388k distinct users), most active reviewers first.
# The result is kept as a two-column SFrame (user_id, Count), not just the counts.
user_counts = sf_obs.groupby('user_id', [gl.aggregate.COUNT()])
reviews_by_user = user_counts.sort('Count', ascending=False)
reviews_by_user[0:10]

user_id,Count
DrWLhrK8WMZf7Jb-Oqc7ww,817
9A2-wSoBUxlMd3LwmlGrrQ,782
3gIfcQq5KxAegwCPXc83cQ,754
Iu3Jo9ROp2IWC9FwtWOaUQ,666
kGgAARL2UmvCcTRfiscjug,665
ia1nTRAQEaFWv0cwADeK7g,655
pEVf8GRshP9HUkSpizc9LA,611
glRXVWWD6x1EZKfjJawTOg,601
uZbTb-u-GVjTa2gtQfry5g,563
0bNXP9quoJEgyVZu9ipGgQ,525


In [6]:
# last rows of the descending sort: the users with the fewest reviews
reviews_by_user[-10:]

user_id,Count
Rtg76jfmQPvV_oR8-p8KTw,1
UhR_BNjo9D7TjodphHO_Xw,1
vdSENJiX3ba39bT62_K2Ww,1
8vL_O9a2kyTYLwWS76jC6w,1
i8q5Jyqo2sx77H3lfbXyAA,1
r-Nj2eaow23g2tDJuBPj3w,1
-ZpbWAg4NOa6DkXLYZTVZQ,1
gJtHSP48N_MmQPBxjJi_Nw,1
JtH5Axhtzkn6OxYL_jCgrg,1
ccLq_11c5ZG-xdGb8ClUxA,1


In [None]:
# df_cat_full.head().T[250:]

In [None]:
# Review count per user against user rank (most active first), log-scaled x axis.
# reviews_by_user is a (user_id, Count) SFrame, so plot only its numeric 'Count'
# column instead of handing the whole frame (including string ids) to matplotlib.
plt.figure(figsize=(15,10))
plt.semilogx(reviews_by_user['Count'], '.');
plt.grid()
plt.xlabel('user number')
plt.ylabel('number of reviews')
plt.title('Plot of number of reviews, by user')

In [None]:
# Same data on log-log axes.
# reviews_by_user is a (user_id, Count) SFrame, so plot only its numeric 'Count'
# column instead of handing the whole frame (including string ids) to matplotlib.
plt.loglog(reviews_by_user['Count'], '.');
plt.grid()
plt.xlabel('user number')
plt.ylabel('number of reviews')
plt.title('Log-Log plot of number of reviews, by user')

In [None]:
# Distribution of reviews-per-user, with a log-scaled count axis.
# reviews_by_user is a (user_id, Count) SFrame, so histogram only its numeric
# 'Count' column instead of the whole frame (which also holds string ids).
plt.figure(figsize=(15,10))
plt.hist(reviews_by_user['Count'], bins=100, log=True, cumulative=False);
plt.grid()
plt.xlabel('number of reviews')
plt.ylabel('count of users')
plt.title('(log) Histogram of number of reviews, by users')

## Group reviews by restaurants...

In [None]:
# Count reviews per restaurant (25k distinct restaurants), busiest first,
# keeping only the 'Count' column of the sorted result.
biz_counts = sf_obs.groupby('business_id', [gl.aggregate.COUNT()])
reviews_by_biz = biz_counts.sort('Count', ascending=False)['Count']
reviews_by_biz[0:10]

In [None]:
# Reviews per business against business rank, with a log-scaled x axis.
fig, ax = plt.subplots()
ax.semilogx(reviews_by_biz, '.')
ax.grid()
ax.set_xlabel('biz number')
ax.set_ylabel('number of reviews')
ax.set_title('Plot of number of reviews, by business')

In [None]:
# Reviews per business against business rank, on log-log axes.
fig, ax = plt.subplots()
ax.loglog(reviews_by_biz, '.')
ax.grid()
ax.set_xlabel('biz number')
ax.set_ylabel('number of reviews')
ax.set_title('Log-Log plot of number of reviews, by business')

In [None]:
# Distribution of reviews-per-business, with a log-scaled count axis.
fig, ax = plt.subplots()
ax.hist(reviews_by_biz, bins=50, log=True)
ax.grid()
ax.autoscale(tight=True)
ax.set_xlabel('number of reviews')
ax.set_ylabel('count of biz')
ax.set_title('(log) Histogram of number of reviews, by business')

## Load in business.json file as 'item_data', for side data factorization...

In [4]:
# create Pandas DF from json business data
# Two candidate input files; a later cell picks one of them as `fname`.
# fname_test: presumably a 100-record sample, judging by the file name (verify)
# fname_full: the full Yelp academic-dataset business file
fname_test = 'data_analysis/test_data/test100_business.json'
fname_full = '../data/Yelp/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json'
# alternative location for the full file:
# 'data_analysis/full_data/yelp_academic_dataset_business.json'

In [5]:
# Choose which business file to parse; swap the two lines to use the sample.
fname = fname_full # use full dataset
# fname = fname_test # use small test dataset

# Each line of the file is parsed as its own JSON document.
with open(fname) as js_file:
    js_list = [json.loads(js_line) for js_line in js_file]

# Keep only businesses whose categories include 'Restaurants', then tabulate.
js_list_filtered = [js for js in js_list if 'Restaurants' in js['categories']]
dfb = pd.DataFrame(js_list_filtered)

In [6]:
print '{} = Orig business list size'.format(len(js_list))
print '{} = Restaurant list size'.format(len(js_list_filtered))

77445 = Orig business list size
25071 = Restaurant list size


In [7]:
# first three restaurant records, to see which fields the JSON provides
dfb.head(3)

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,"{u'Take-out': True, u'Drive-Thru': False, u'Ou...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{u'Tuesday': {u'close': u'21:00', u'open': u'1...",40.354327,-79.900706,Mr Hoagie,[],True,4,4.5,PA,business
1,"{u'Alcohol': u'full_bar', u'Noise Level': u'av...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",rankin,"414 Hawkins Ave\nrankin, PA 15104","{u'Tuesday': {u'close': u'19:00', u'open': u'1...",40.413464,-79.880247,Emil's Lounge,[],True,20,5.0,PA,business
2,"{u'Alcohol': u'full_bar', u'Noise Level': u'lo...",KayYbHCt-RkbGcPdGOThNg,"[Bars, American (Traditional), Nightlife, Rest...",Carnegie,"141 Hawthorne St\nGreentree\nCarnegie, PA 15106","{u'Monday': {u'close': u'02:00', u'open': u'11...",40.415517,-80.067534,Alexion's Bar & Grill,[Greentree],True,21,4.0,PA,business


In [None]:
# quick peek at business_id and name (the two identifying columns used later
# when assembling the item side data)
dfb[['business_id', 'name']].head()

### Handle the 'attribute' field

This field contains nested JSON objects, so it has to be flattened before use.

In [None]:
# Flatten the nested 'attributes' dicts into one column per key, then prefix
# every resulting column name with 'Attributes.'.
df_att = pd.io.json.json_normalize(dfb['attributes']).add_prefix('Attributes.')
df_att.head()

In [None]:
# per-column non-null counts show how sparsely each attribute is populated
df_att.info()

Wow... there are a lot of NaN / null values. How should these be handled?

### Handle the 'categories' field

This field is a flat list of category names.

The data has already been filtered down to contain 'Restaurants' in the field.

Found hint/solution here: http://datascience.stackexchange.com/questions/8253/how-to-binary-encode-multi-valued-categorical-variable-from-pandas-dataframe

In [8]:
import collections

In [None]:
# look at the raw category lists before encoding them
dfb[['categories']].head()

In [9]:
# Turn each business's category list into a Counter, i.e. a dict-like mapping
# of category name -> number of times it appears in that business's list.
category_lists = dfb['categories']
dfb_cat_dict = category_lists.apply(collections.Counter)
dfb_cat_dict.head()

0                 {u'Restaurants': 1, u'Fast Food': 1}
1    {u'Bars': 1, u'Restaurants': 1, u'Nightlife': ...
2    {u'American (Traditional)': 1, u'Bars': 1, u'N...
3    {u'American (Traditional)': 1, u'Burgers': 1, ...
4    {u'American (Traditional)': 1, u'Bars': 1, u'N...
Name: categories, dtype: object

In [10]:
# Expand the per-business Counters into one column per category.  Categories a
# business does not have come back as missing, so zero-fill and cast to int to
# get a count matrix.
cat_records = pd.DataFrame.from_records(dfb_cat_dict)
df_cat_full = cat_records.fillna(value=0).astype(int)
df_cat_full.head()

Unnamed: 0,Active Life,Adult Entertainment,Afghan,African,Alsatian,Amateur Sports Teams,American (New),American (Traditional),Amusement Parks,Antiques,...,Uzbek,Vegan,Vegetarian,Venezuelan,Venues & Event Spaces,Vietnamese,Wine Bars,Wineries,Wok,Yoga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# full list of category column names (used to hand-pick the exclusion list)
df_cat_full.columns

In [56]:
# category EXCLUSION list (hand-picked)...
# Category columns to remove from the category matrix before it is used as item
# side data.  Every entry must match a category column name exactly.
exclude_cat = \
   ['Active Life', 'Amateur Sports Teams', 'Amusement Parks', 'Antiques', 'Apartments',
    'Appliances', 'Arcades', 'Art Galleries', 'Arts & Crafts', 'Arts & Entertainment', 'Auto Repair',
    'Automotive', 'Banks & Credit Unions', 'Beauty & Spas', 'Bed & Breakfast', 'Bikes', 'Boating',
    'Books, Mags, Music & Video', 'Bookstores', 'Bowling', 'Building Supplies', 'Butcher',
    'Candy Stores', 'Car Wash', 'Caterers', 'Chocolatiers & Shops', 'Coffee & Tea Supplies',
    'Colleges & Universities', 'Convenience Stores', 'Cooking Schools', 'Country Clubs',
    'Country Dance Halls', 'DJs', 'Dance Clubs', 'Day Spas', 'Department Stores', 'Discount Store',
    'Do-It-Yourself Food', 'Dry Cleaning & Laundry', 'Education', 'Event Planning & Services',
    'Fashion', 'Festivals', 'Financial Services', 'Fitness & Instruction', 'Flea Markets',
    'Flowers & Gifts', 'Food Delivery Services', 'Food Tours', 'Furniture Stores',
    'Gas & Service Stations', 'Gay Bars', 'Golf', 'Grocery', 'Guest Houses', 'Hardware Stores',
    'Health & Medical', 'Health Markets', 'Heating & Air Conditioning/HVAC', 'Herbs & Spices',
    'Hiking', 'Hobby Shops', 'Home & Garden', 'Home Decor', 'Home Services', 'Hotels',
    'Hotels & Travel', 'Jazz & Blues', 'Karaoke', 'Kids Activities', 'Kitchen & Bath', 'Lakes',
    'Landmarks & Historical Buildings', 'Leisure Centers', 'Local Services', 'Lounges', 'Meat Shops',
    'Music Venues', 'Musicians', 'Nightlife', 'Nutritionists', 'Organic Stores', 'Parking',
    'Party & Event Planning', 'Pasta Shops', 'Patisserie/Cake Shop', 'Performing Arts',
    'Personal Chefs', 'Pet Services', 'Pets', 'Piano Bars', 'Plumbing', 'Pool Halls',
    'Public Services & Government', 'Real Estate', 'Shopping', 'Shopping Centers', 'Soccer',
    'Social Clubs', 'Specialty Schools', 'Sporting Goods', 'Sports Clubs', 'Sports Wear',
    'Street Vendors', 'Swimming Pools', 'Tea Rooms', 'Tours', 'Toy Stores', 'Travel Services',
    'Venues & Event Spaces', 'Wine Bars', 'Yoga']

In [17]:
print '{} = number of orig categories'.format(len(df_cat_full.T))
print '{} = number of excluded categories'.format(len(exclude_cat))
print 'Exclusion categories:\n', exclude_cat

296 = number of orig categories
112 = number of excluded categories
Exclusion categories:
['Active Life', 'Amateur Sports Teams', 'Amusement Parks', 'Antiques', 'Apartments', 'Appliances', 'Arcades', 'Art Galleries', 'Arts & Crafts', 'Arts & Entertainment', 'Auto Repair', 'Automotive', 'Banks & Credit Unions', 'Beauty & Spas', 'Bed & Breakfast', 'Bikes', 'Boating', 'Books, Mags, Music & Video', 'Bookstores', 'Bowling', 'Building Supplies', 'Butcher', 'Candy Stores', 'Car Wash', 'Caterers', 'Chocolatiers & Shops', 'Coffee & Tea Supplies', 'Colleges & Universities', 'Convenience Stores', 'Cooking Schools', 'Country Clubs', 'Country Dance Halls', 'DJs', 'Dance Clubs', 'Day Spas', 'Department Stores', 'Discount Store', 'Do-It-Yourself Food', 'Dry Cleaning & Laundry', 'Education', 'Event Planning & Services', 'Fashion', 'Festivals', 'Financial Services', 'Fitness & Instruction', 'Flea Markets', 'Flowers & Gifts', 'Food Delivery Services', 'Food Tours', 'Furniture Stores', 'Gas & Service Sta

Drop the categories in the exclusion list

In [18]:
# Remove the hand-picked excluded categories from the category matrix.
# Only labels that are actually present as columns are dropped: DataFrame.drop
# raises on labels it cannot find, which breaks this cell whenever the loaded
# business file (e.g. the small test file) lacks some of the listed categories.
cols_to_drop = [cat for cat in exclude_cat if cat in df_cat_full.columns]
df_cat = df_cat_full.drop(cols_to_drop, axis=1)
df_cat.head()

Unnamed: 0,Adult Entertainment,Afghan,African,Alsatian,American (New),American (Traditional),Arabian,Argentine,Armenian,Asian Fusion,...,Trinidadian,Turkish,Ukrainian,Uzbek,Vegan,Vegetarian,Venezuelan,Vietnamese,Wineries,Wok
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Combining the 'categories' and 'attributes' into item data

In [19]:
# df_item_data = pd.concat([dfb[['business_id', 'name']] , df_cat, df_att], axis=1)

# Item side data: each business's id and name followed by its category columns.
# The attribute columns are left out for now (see the commented variant above).
biz_keys = dfb[['business_id', 'name']]
df_item_data = pd.concat([biz_keys, df_cat], axis=1) # categories for now...
df_item_data.head()

Unnamed: 0,business_id,name,Adult Entertainment,Afghan,African,Alsatian,American (New),American (Traditional),Arabian,Argentine,...,Trinidadian,Turkish,Ukrainian,Uzbek,Vegan,Vegetarian,Venezuelan,Vietnamese,Wineries,Wok
0,5UmKMjUEUNdYWqANhGckJw,Mr Hoagie,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,mVHrayjG3uZ_RLHkLj-AMg,Emil's Lounge,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,KayYbHCt-RkbGcPdGOThNg,Alexion's Bar & Grill,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,wJr6kSA5dchdgOdwH6dZ2w,Kings Family Restaurant,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,fNGIbpazjTRdXgwRY_NIXA,Rocky's Lounge,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# convert to SFrame...
# (the recommender's commented-out item_data argument refers to this SFrame)
sf_itemdata = gl.SFrame(df_item_data)

# Setup the train/test data split...

In [21]:
# (rows, columns) of the full observation set before splitting
sf_obs.shape

(1363242, 3)

In [22]:
# column names that the split and the recommender refer to below
sf_obs.column_names()

['user_id', 'business_id', 'stars']

In [23]:
# Split observations into train/test with GraphLab's recommender-aware splitter.
# max_num_users=100 limits how many users the splitter samples for the test set.
train_set, test_set = gl.recommender.util.random_split_by_user(
    sf_obs,
    user_id='user_id',
    item_id='business_id',
    max_num_users=100)

In [24]:
# preview of the training observations
train_set

user_id,business_id,stars
Ouus3lC3Xk3YRGx4ptd_4A,DW6SI2KxcfXBHeo9jEr4lQ,2
dz_sBsL69aWzsxksT2O_Mg,DW6SI2KxcfXBHeo9jEr4lQ,2
zyI-u0C1YOzp5v1j2wuCOQ,DW6SI2KxcfXBHeo9jEr4lQ,3
vyfsQo-estP8EfiIFMsL6g,DW6SI2KxcfXBHeo9jEr4lQ,3
lxZSVeJz6KEBW1nlA3JKJg,DW6SI2KxcfXBHeo9jEr4lQ,1
CbMCOeeHVeafcD-6-CJrPg,DW6SI2KxcfXBHeo9jEr4lQ,4
Z8DyUMojRhFbLyHBymM_rA,DW6SI2KxcfXBHeo9jEr4lQ,2
sXWvDlMQspjJ1zzr3cEM7Q,DW6SI2KxcfXBHeo9jEr4lQ,2
G8RaaiVzbyRvsBmoUx0VeA,DW6SI2KxcfXBHeo9jEr4lQ,1
f0isFKnJVZVmpdCPNd8Gfw,DW6SI2KxcfXBHeo9jEr4lQ,1


In [25]:
# preview of the held-out test observations
test_set

user_id,business_id,stars
9K013bOSmGBmZqAV2YDfpA,bcW-OuYklAXeEZWBgc7TaA,5
xjZgADf08bQlOQnWwW_IKg,px45x27eir8RyN6YjX-VWQ,4
hMnwV8h15C1sfgVIP0dgQw,LXrq0Fw6MCz70KMA928ldQ,2
NyFFJjymTm1N6pjHp37KiQ,PWG28q4JFOc8FiRBjnfCkA,2
tAM6TS8Mcnqf6Ar4zcxG6A,3ZQmCHGtK6JNHlxaCeOx7g,4
d7Jyyd5lKGoNSV1SoSqZxg,N50HceZf1595UtU867u4uQ,5
RnEZs_qJf2KqosVcA2eUrQ,CKf3mf7fl5sZO9xviHU7lQ,4
-ff8in_34TSOnzLIFYQdaA,KJnVuzpveyDrHARVNZaYVg,1
WJNMh_hUymOX7CDKtCOKUA,d52zg-S0o940WUCK-nNiKw,5
KsHHV9UJl2zzXJ2a04gbbQ,4uGHPY-OpJN08CabtTAvNg,5


# Run the recommender...

In [57]:
# Build the factorization recommender; create() also trains it.
rec_options = dict(user_id='user_id',
                   item_id='business_id',
                   target='stars')
# rec_options['item_data'] = sf_itemdata   # enable to train with item side data
rec = gl.recommender.factorization_recommender.create(train_set, **rec_options)

In [58]:
# persist the model trained above (item side data disabled) for later reloading
rec.save('frec_without_sides')


In [7]:
# Reload the two saved factorization models for comparison:
#   mws  - saved under 'frec_with_sides'
#   mwos - saved under 'frec_without_sides'
# NOTE(review): no visible cell saves 'frec_with_sides'; presumably it comes from
# an earlier run with the item_data argument enabled - confirm it exists on disk.
mws = gl.load_model('frec_with_sides')
mwos= gl.load_model('frec_without_sides')

In [42]:
import random

In [81]:
# find similar users (via cosine similiarity) to the given user
test_user = reviews_by_user[random.randint(0,len(reviews_by_user))]
print test_user
simuser_ws = mws.get_similar_users([test_user['user_id']], k=2000)
simuser_ns = mwos.get_similar_users([test_user['user_id']], k=2000)
print simuser_ws.tail()
print simuser_ns.tail()

{'Count': 79, 'user_id': 'UFr-WCtnNbX8O3sI7h4qeQ'}


+------------------------+------------------------+----------------+------+
|        user_id         |        similar         |     score      | rank |
+------------------------+------------------------+----------------+------+
| UFr-WCtnNbX8O3sI7h4qeQ | DvwpzNEPln8ywGd7xtryfg | 0.999362170696 | 1991 |
| UFr-WCtnNbX8O3sI7h4qeQ | cXiMY-Xb4ZdLS8tM9ksQqg | 0.999362051487 | 1992 |
| UFr-WCtnNbX8O3sI7h4qeQ | 37DAW26R2LI0p9mQdFX1xg | 0.999362051487 | 1993 |
| UFr-WCtnNbX8O3sI7h4qeQ | y7NxB1I19YIxTSeZLfk6PA | 0.999362051487 | 1994 |
| UFr-WCtnNbX8O3sI7h4qeQ | TWZ0qe64v-lXVa-TdYycKg | 0.999362051487 | 1995 |
| UFr-WCtnNbX8O3sI7h4qeQ | Xd7vmJdko39h6zY_R4K8CQ | 0.999362051487 | 1996 |
| UFr-WCtnNbX8O3sI7h4qeQ | WxSMeBeDgXx6lylvx_OjoQ | 0.999362051487 | 1997 |
| UFr-WCtnNbX8O3sI7h4qeQ | kGJ-3vLFUHv-rvK3jmLClQ | 0.999361991882 | 1998 |
| UFr-WCtnNbX8O3sI7h4qeQ | JasjEgtuuKG9WflCV4lyqA | 0.999361991882 | 1999 |
| UFr-WCtnNbX8O3sI7h4qeQ | DG9WLmbw3tev5agizBAUVQ | 0.999361991882 | 2000 |
+-----------

In [102]:
# print the per-iteration training progress recorded for the side-data model
stat = mws.training_stats
s1 = stat['progress']
s1.print_rows()

In [None]:
# rec = gl.recommender.create(
#             sf_obj,
#             user_id='user_id',
#             item_id='business_id',
#             target ='stars')

In [39]:
# evaluate 1 datapoint (user=1, business=100) for potential rating
one_datapoint_sf = gl.SFrame({'user_id': [1], 'business_id': [100]})
print "stars:", rec.predict(one_datapoint_sf)[0]   # 4.879

stars: 3.73694314521


In [40]:
# list every field that can be queried on the trained model
rec.list_fields()

['adagrad_momentum_weighting',
 'additional_iterations_if_unhealthy',
 'binary_target',
 'coefficients',
 'data_load_time',
 'init_random_sigma',
 'item_id',
 'item_side_data_column_names',
 'item_side_data_column_types',
 'linear_regularization',
 'max_iterations',
 'model_name',
 'nmf',
 'num_factors',
 'num_features',
 'num_item_side_features',
 'num_items',
 'num_observations',
 'num_tempering_iterations',
 'num_user_side_features',
 'num_users',
 'observation_data_column_names',
 'random_seed',
 'regularization',
 'regularization_type',
 'sgd_convergence_interval',
 'sgd_convergence_threshold',
 'sgd_max_trial_iterations',
 'sgd_sampling_block_size',
 'sgd_step_adjustment_interval',
 'sgd_step_size',
 'sgd_trial_sample_minimum_size',
 'sgd_trial_sample_proportion',
 'side_data_factorization',
 'solver',
 'step_size_decrease_rate',
 'target',
 'tempering_regularization_start_value',
 'track_exact_loss',
 'training_rmse',
 'training_stats',
 'training_time',
 'user_id',
 'user_side_

In [41]:
# global intercept term of the trained model
rec['coefficients']['intercept']

3.7369431452062614

In [42]:
# Per-user linear terms and factor vectors learned by the model.
# rec.get('coefficients') is the equivalent method-call spelling.
rec['coefficients']['user_id']

user_id,linear_terms,factors
Ouus3lC3Xk3YRGx4ptd_4A,0.0327613838017,"[0.0261160954833, 0.0263281986117, ..."
dz_sBsL69aWzsxksT2O_Mg,-0.0152220064774,"[-0.0241532307118, -0.0241916514933, ..."
zyI-u0C1YOzp5v1j2wuCOQ,0.0465549193323,"[0.0160138569772, 0.0160326808691, ..."
vyfsQo-estP8EfiIFMsL6g,0.0587146058679,"[-0.0107431840152, -0.0103426072747, ..."
lxZSVeJz6KEBW1nlA3JKJg,-0.11037093401,"[-0.0459412224591, -0.0458470769227, ..."
CbMCOeeHVeafcD-6-CJrPg,0.0605444423854,"[-0.0185883156955, -0.018461458385, ..."
Z8DyUMojRhFbLyHBymM_rA,-0.110937416553,"[-0.0106938546523, -0.0105813508853, ..."
sXWvDlMQspjJ1zzr3cEM7Q,-0.0254131220281,"[0.0130900871009, 0.0133146680892, ..."
G8RaaiVzbyRvsBmoUx0VeA,-0.00300458748825,"[0.0119381546974, 0.0119537906721, ..."
f0isFKnJVZVmpdCPNd8Gfw,-0.226664245129,"[-0.00979323592037, -0.00989496335387, ..."


In [43]:
# Per-business linear terms and factor vectors learned by the model.
# rec.get('coefficients') is the equivalent method-call spelling.
rec['coefficients']['business_id']

business_id,linear_terms,factors
Yf_mKctDPfMQkig-DavQJw,-0.0407451838255,"[-0.0085883019492, -0.00839034561068, ..."
Hxw3hG8Efg6WXSa5oRJhrA,-0.234818160534,"[-0.06912881881, -0.0693349763751, ..."
KkdkWeTMN8q-xpWD0ckx1Q,0.222354754806,"[-0.0495907403529, -0.0494525320828, ..."
sUwVz00qBevDgt2dhe_KXA,-0.0903140455484,"[-0.0292513202876, -0.0294271633029, ..."
xoYY0F5hrs00h285tWmZZA,-0.0753992125392,"[0.0891507640481, 0.0887101069093, ..."
m6JwiGrKrs0XCmH9ij3fmA,-0.060532130301,"[0.0517805144191, 0.0518697313964, ..."
BaoMhBMUDnMDud6F2ZgxrA,-0.119151495397,"[-0.0343846902251, -0.0343868993223, ..."
h_1xOfqdfxqNlGOG1oMwKQ,-0.197489455342,"[-0.049904268235, -0.0500828027725, ..."
E4b5OC_6mZ0V7B6Nyjncsg,0.113419637084,"[0.0469928234816, 0.0473065190017, ..."
1xu5wFd0TeBg6xNBiRtknw,0.0979582667351,"[-0.071278013289, -0.0715539827943, ..."


In [44]:
business_sf = rec['coefficients']['business_id']
print len(business_sf)                     # 1682
print len(business_sf['factors'][0])       # 8
user_sf = rec['coefficients']['user_id']
print len(user_sf)                      # 943
print len(user_sf['factors'][0])        # 8

25071
8
388600
8
