In [1]:
import csv

In [2]:
# Open the sample review file in text mode. newline='' leaves newline handling
# to the csv module, which its documentation requires for files passed to
# csv.reader (otherwise newlines embedded in quoted fields can be misread).
# NOTE(review): this handle is not closed anywhere in the cells shown.
f = open("/home/fares/Downloads/sample_us.tsv", newline='')

In [3]:
# Tab-separated row parser over the open file. The built-in 'excel-tab'
# dialect is the default dialect with '\t' as the field delimiter.
reader = csv.reader(f, dialect='excel-tab')

In [4]:
# Consume the first row from the reader; it is used later as the column names
# when each remaining row is zipped into a dict.
header = next(reader)

In [5]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [6]:
# Turn every remaining row into a dict keyed by column name, converting the
# count columns to int and the flag columns to bool.
dataset = []
for row in reader:
    review = dict(zip(header, row))
    for column in ('helpful_votes', 'star_rating', 'total_votes'):
        review[column] = int(review[column])
    for column in ('verified_purchase', 'vine'):
        # Only an exact 'Y' becomes True; any other value becomes False.
        review[column] = review[column] == 'Y'
    dataset.append(review)

### Calculate the Average Rating 

In [7]:
# Collect the star rating of every review, then take the arithmetic mean.
ratings = [review['star_rating'] for review in dataset]
ratingTotal = sum(ratings)
averageRating = ratingTotal / len(ratings)

In [8]:
averageRating

4.2

### Rating distribution

In [9]:
# One zeroed counter per possible star rating, 1 through 5.
ratingCounts = dict.fromkeys(range(1, 6), 0)

In [10]:
# Tally how many reviews gave each star rating.
# The tally is rebuilt from zero in this cell so that re-running it does not
# add the same ratings on top of the counts from the previous run.
# A rating outside 1-5 raises KeyError because only those keys exist.
ratingCounts = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
for rating in ratings:
    ratingCounts[rating] += 1

In [11]:
ratingCounts

{1: 5, 2: 2, 3: 5, 4: 4, 5: 34}

### Calculate the rating distribution using `defaultdict`, so the categories do not have to be listed in advance

In [12]:
from collections import defaultdict

In [13]:
# Rebind ratingCounts to a defaultdict(int): any key seen for the first time
# starts at 0, so the possible ratings need not be spelled out.
ratingCounts = defaultdict(int)

In [14]:
# Count reviews per star rating.
# The counter is recreated in this cell so that re-running it starts from
# zero instead of doubling the existing counts.
ratingCounts = defaultdict(int)
for review in dataset:
    ratingCounts[review['star_rating']] += 1

In [15]:
ratingCounts

defaultdict(int, {5: 34, 2: 2, 1: 5, 3: 5, 4: 4})

### Verified purchases in the dataset

In [16]:
# Counter keyed by the verified_purchase flag; unseen keys start at 0.
verifiedPurchases = defaultdict(int)

In [17]:
# Count reviews by their verified_purchase flag.
# The counter is recreated in this cell so that re-running it starts from
# zero instead of doubling the existing counts.
verifiedPurchases = defaultdict(int)
for review in dataset:
    verifiedPurchases[review['verified_purchase']] += 1

In [18]:
verifiedPurchases

defaultdict(int, {True: 44, False: 6})

### Most Popular Products in the dataset

In [19]:
# Counter keyed by product_id (number of reviews per product); unseen keys
# start at 0.
popularProducts = defaultdict(int)

In [20]:
# Count how many reviews each product received.
# The counter is recreated in this cell so that re-running it starts from
# zero instead of doubling the existing counts.
popularProducts = defaultdict(int)
for review in dataset:
    popularProducts[review['product_id']] += 1

In [21]:
# Pair each product id with its review count. The count comes first so that
# sorting the tuples orders them by number of reviews.
count = [(reviewCount, productId) for productId, reviewCount in popularProducts.items()]

In [22]:
# In-place ascending sort; tuples compare by review count first, then by
# product id, so the most-reviewed products end up at the end of the list.
count.sort()

In [23]:
# Last ten entries of the ascending list, i.e. the largest (count, id) pairs;
# the most-reviewed product is the final element.
count[-10:]

[(1, 'B00UMSVHD4'),
 (1, 'B00UZOPOFW'),
 (1, 'B00V5DM3RE'),
 (1, 'B00VPXX92W'),
 (1, 'B00WAKEQLW'),
 (1, 'B00WJ1OPMW'),
 (1, 'B00XPWXYDK'),
 (1, 'B00YRA3H4U'),
 (1, 'B0101EHRSM'),
 (2, 'B00WTGGGRO')]

### Top rated Products

In [24]:
# Maps product_id to the list of star ratings it received; an unseen key
# starts with an empty list.
ratingPerProduct = defaultdict(list)

In [25]:
# Group the star ratings by product.
# The mapping is recreated in this cell so that re-running it does not append
# every rating a second time to the existing lists.
ratingPerProduct = defaultdict(list)
for review in dataset:
    ratingPerProduct[review['product_id']].append(review['star_rating'])

In [26]:
# Mean star rating for each product id.
averageRatingPerProduct = {
    productId: sum(productRatings) / len(productRatings)
    for productId, productRatings in ratingPerProduct.items()
}

In [38]:
# Minimum number of reviews a product needs in order to be ranked.
# The previous condition, len(...) > 0, was always true because every product
# in ratingPerProduct has at least one rating. With minReviews = 1 the result
# is unchanged; raise it to exclude products rated by very few reviewers.
minReviews = 1
# (average rating, product id) pairs, average first so that sorting the list
# orders products by rating.
topRatedProducts = [
    (averageRatingPerProduct[productId], productId)
    for productId in averageRatingPerProduct
    if len(ratingPerProduct[productId]) >= minReviews
]

In [39]:
# In-place ascending sort; tuples compare by average rating first, and ties
# are ordered by product id.
topRatedProducts.sort()

In [40]:
# Last ten entries of the ascending list, i.e. the highest average ratings;
# products tied on the same average appear in product-id order.
topRatedProducts[-10:]

[(5.0, 'B00IGXV9UI'),
 (5.0, 'B00JVY9J1M'),
 (5.0, 'B00KQUNNZ8'),
 (5.0, 'B00MZ6BR3Q'),
 (5.0, 'B00PXWS1CY'),
 (5.0, 'B00V5DM3RE'),
 (5.0, 'B00VPXX92W'),
 (5.0, 'B00WJ1OPMW'),
 (5.0, 'B00WTGGGRO'),
 (5.0, 'B00XPWXYDK')]