In [1]:
import gzip
path ="/home/fares/Documents/datasets/amazon_reviews_us_Gift_Card_v1_00.tsv.gz"
f = gzip.open(path,'rt')

In [2]:
import csv
reader = csv.reader(f, delimiter ='\t')

In [3]:
header = next(reader)

In [4]:
dataset = []
for line in f:
    line = line.split('\t')
    d = dict(zip(header,line))
    d['star_rating'] =  int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    for field in ['verified_purchase','vine']:
        if d[field] == 'Y':
            d[field] = True
        else:
            d[field] = False
    dataset.append(d)

In [5]:
len(dataset)

149086

In [6]:
dataset[0]

{'marketplace': 'US',
 'customer_id': '24371595',
 'review_id': 'R27ZP1F1CD0C3Y',
 'product_id': 'B004LLIL5A',
 'product_parent': '346014806',
 'product_title': 'Amazon eGift Card - Celebrate',
 'product_category': 'Gift Card',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': False,
 'verified_purchase': True,
 'review_headline': 'Five Stars',
 'review_body': 'Great birthday gift for a young adult.',
 'review_date': '2015-08-31\n'}

### remove instuance that not contains review_date

In [7]:
dataset = [d for d in dataset if 'review_date' in d]

In [8]:
len(dataset)

149086

### Filtering by date

In [9]:
for d in dataset:
    d['yearInt'] = int(d['review_date'][:4])

In [10]:
dataset[0]

{'marketplace': 'US',
 'customer_id': '24371595',
 'review_id': 'R27ZP1F1CD0C3Y',
 'product_id': 'B004LLIL5A',
 'product_parent': '346014806',
 'product_title': 'Amazon eGift Card - Celebrate',
 'product_category': 'Gift Card',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': False,
 'verified_purchase': True,
 'review_headline': 'Five Stars',
 'review_body': 'Great birthday gift for a young adult.',
 'review_date': '2015-08-31\n',
 'yearInt': 2015}

In [11]:
# this is to use it later when i filter all the data and i need to practice more exercices
dataset2 = dataset

### Discard reviews written before 2010

In [12]:
dataset = [d for d in dataset if d['yearInt'] > 2009]

In [13]:
len(dataset)

148872

### Filtering by review quality

######                   "<3"  we don't know if its true helpful yet or not    |    delete any with less than 50% helpfulness          

In [14]:
dataset = [d for d in dataset if d['total_votes']<3 or d['helpful_votes']/d['total_votes'] >= 0.5]

In [15]:
len(dataset)

148578

### Filtering by user activity

In [16]:
from collections import defaultdict

In [17]:
nbReviewPerUser = defaultdict(int)

In [18]:
for d in dataset:
    nbReviewPerUser[d['customer_id']] += 1

In [19]:
dataset = [d for d in dataset if nbReviewPerUser[d['customer_id']] > 1 ]

In [20]:
len(dataset)

11289

### Filtering By Review Length

In [21]:
dataset = [d for d in dataset if len(d['review_body'].split()) >= 10]

In [22]:
len(dataset)

7098

### Filtering product that have few reviews

In [23]:
nbReviewPerProduct = defaultdict(int)
for d in dataset:
    nbReviewPerProduct[d['product_id']] +=1

In [24]:
dataset = [d for d in dataset if nbReviewPerProduct[d['product_id']] <3 ]

In [25]:
len(dataset)

621

### Filtering users who have only given '5-star' rating

In [26]:
starRatingsOfUser = defaultdict(list)
for d in dataset:
    starRatingsOfUser[d['customer_id']].append(d['star_rating']) 

In [27]:
dataset=[d for d in dataset if len(starRatingsOfUser[d['customer_id']])>starRatingsOfUser[d['customer_id']].count(5)]

In [28]:
len(dataset)

101

### Filtering reviews that aren't part of vine program

In [29]:
dataset = [d for d in dataset if d['vine'] == True]

In [30]:
len(dataset)

0

### Filtering users who haven't written a review from 2014

In [31]:
lastReviewOfEveryUser = defaultdict(int)
for d in dataset2:
    if  lastReviewOfEveryUser[d['customer_id']] < d['yearInt']:
        lastReviewOfEveryUser[d['customer_id']] = d['yearInt']

In [32]:
len(dataset2)

149086

In [33]:
dataset2 = [d for d in dataset2 if lastReviewOfEveryUser[d['customer_id']]>=2015]

In [34]:
len(dataset2)

45345