In [1]:
import numpy as np
import json
import tarfile

Loading Dataset

In [2]:
# Open the Yelp archive and list the files it contains.
# The handle is left open on purpose: later cells read the review file
# straight out of `tar`.
tar = tarfile.open('yelp_dataset.tar')
members = tar.getmembers()

members

[<TarInfo 'Dataset_User_Agreement.pdf' at 0x1e19465c118>,
 <TarInfo 'yelp_academic_dataset_business.json' at 0x1e19465c2b8>,
 <TarInfo 'yelp_academic_dataset_checkin.json' at 0x1e19465c1e8>,
 <TarInfo 'yelp_academic_dataset_review.json' at 0x1e19465c458>,
 <TarInfo 'yelp_academic_dataset_tip.json' at 0x1e19465c528>,
 <TarInfo 'yelp_academic_dataset_user.json' at 0x1e19465c5f8>]

In [3]:
# Get a file-like object for the reviews member so it can be read line by line.
review_file = tar.extractfile('yelp_academic_dataset_review.json')

In [4]:
# Inspect the object returned by extractfile().
review_file

<ExFileObject name='yelp_dataset.tar'>

In [5]:
# Peek at three records (one JSON document per line) to learn the schema.
# Reading advances `review_file`, so later reads continue after these lines.
test_data = [json.loads(review_file.readline()) for _ in range(3)]

test_data

[{'review_id': 'lWC-xP3rd6obsecCYsGZRg',
  'user_id': 'ak0TdVmGKo4pwqdJSTLwWw',
  'business_id': 'buF9druCkbuXLX526sGELQ',
  'stars': 4.0,
  'useful': 3,
  'funny': 1,
  'cool': 1,
  'text': "Apparently Prides Osteria had a rough summer as evidenced by the almost empty dining room at 6:30 on a Friday night. However new blood in the kitchen seems to have revitalized the food from other customers recent visits. Waitstaff was warm but unobtrusive. By 8 pm or so when we left the bar was full and the dining room was much more lively than it had been. Perhaps Beverly residents prefer a later seating. \n\nAfter reading the mixed reviews of late I was a little tentative over our choice but luckily there was nothing to worry about in the food department. We started with the fried dough, burrata and prosciutto which were all lovely. Then although they don't offer half portions of pasta we each ordered the entree size and split them. We chose the tagliatelle bolognese and a four cheese filled pas

Finally, extracting the review data from the tarfile (the next 50,000 records after the three previewed above)

In [6]:
# Number of review records to pull from the stream.
N_REVIEWS = 50000

# `review_file` was already advanced by the preview cell above, so loading
# continues from the current position rather than from the start of the file.
review_data = []
for _ in range(N_REVIEWS):
    line = review_file.readline()
    if not line:
        # End of file reached before N_REVIEWS records: stop here instead of
        # handing an empty string to json.loads, which raises JSONDecodeError.
        break
    review_data.append(json.loads(line))

In [8]:
# Spot-check the first loaded record.
review_data[0]

{'review_id': 'T5fAqjjFooT4V0OeZyuk1w',
 'user_id': 'SFQ1jcnGguO0LYWnbbftAA',
 'business_id': '0AzLzHfOJgL7ROwhdww2ew',
 'stars': 2.0,
 'useful': 1,
 'funny': 1,
 'cool': 1,
 'text': "I've stayed at many Marriott and Renaissance Marriott's and this was a huge disappointment! The front desk and atrium is nice..there is a starbucks on site which is nice.\n\nThe rooms are run down and old.  There is a flat screen but that is to be expected of a Renaissance.\n\nWe got this hotel via Priceline at a rate of $75/night...good deal for the price but this is not a true Renaissance.",
 'date': '2010-01-08 02:29:15'}

#### Extracting basic statistics from data
- First, extract some simple numerical features

In [9]:
# Pull the four numeric fields of every review into parallel lists
# (position i in each list belongs to review_data[i]).
ratings = [rev['stars'] for rev in review_data]
cool = [rev['cool'] for rev in review_data]
funny = [rev['funny'] for rev in review_data]
useful = [rev['useful'] for rev in review_data]


We can convert them into numpy arrays

In [10]:
# Turn each Python list into a NumPy array, keeping the same names.
ratings, cool, funny, useful = map(np.array, (ratings, cool, funny, useful))

Now it is very simple to do basic statistical operations

In [11]:
# Average star rating over the loaded reviews.
np.mean(ratings)

3.77398

In [12]:
# Variance of the `funny` votes; printed because only the cell's last
# expression is displayed automatically.
print(np.var(funny))
# Standard deviation of the star ratings (a different variable from the line above).
# Both calls use NumPy's default ddof=0, i.e. population statistics.
np.std(ratings)

1.4665266415999996


1.3772635766620709

In [13]:
# Stack the three arrays along a new leading axis (one row per feature) into a
# single ndarray, then show the first two rows, i.e. `cool` and `funny`.
np.stack([cool, funny, useful])[0:2]

array([[1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 1]])

In [14]:
# Feature table: one row per review, one column per feature (cool, funny, useful).
# Stacking along axis=1 gives that layout directly, with no transpose needed.
features = np.stack([cool, funny, useful], axis=1)

In [15]:
# First two rows of the feature table.
features[:2]

array([[1, 1, 1],
       [0, 0, 0]])

In [16]:
# Rebind `features` as an np.matrix so that `*` in the following cells means
# matrix multiplication rather than element-wise multiplication.
# NOTE(review): NumPy's documentation discourages np.matrix in favour of plain
# arrays combined with the `@` operator.

features = np.matrix(features)

In [17]:
# 3x3 Gram matrix of the feature columns. `@` is always a matrix product, so
# this does not rely on np.matrix's special meaning of `*`.
features.T @ features

matrix([[ 79192,  54652, 101747],
        [ 54652,  79048,  87193],
        [101747,  87193, 267647]])

In [18]:
# Inverse of the 3x3 Gram matrix. `@` is an explicit matrix product, so the
# expression works whether `features` is an np.matrix or a plain ndarray.
np.linalg.inv(features.T @ features)

matrix([[ 3.18670807e-05, -1.35322779e-05, -7.70589602e-06],
        [-1.35322779e-05,  2.54926690e-05, -3.16055706e-06],
        [-7.70589602e-06, -3.16055706e-06,  7.69532352e-06]])

In [19]:
# 3x3 identity matrix.
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

Basic Visualization and exploratory data analysis