# Purpose

### Author: Julius Remigio

Split reviews.csv by category, which can be found in the metadata file.

This follows the pattern used in:
https://github.com/DSE-capstone-sharknado/models-legacy
That is, split reviews.csv into per-category files (men, women, phone, etc.), as the C++ file in that repository does.

In [9]:
import pandas as pd
import gzip
import shutil

In [69]:
# Categories for which a separate review file is written.
categories = [
    "Women",
    "Men",
    "Girls",
    "Boys",
    "Baby",
]

In [10]:
# Reading the metadata file
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Each line is passed to ``eval``, so the file is expected to contain one
    Python expression (e.g. a dict literal) per line.

    Args:
        path: Path of the gzip-compressed file to read.

    Yields:
        The object produced by evaluating each line, in file order.
    """
    # The context manager closes the file once the generator is exhausted
    # or closed; previously the handle was never closed.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes whatever the line contains. Only run
            # this on trusted files; ast.literal_eval is a safer choice when
            # every line is a plain literal.
            yield eval(l)

def getDF(path):
    """Build a DataFrame from the records yielded by ``parse(path)``.

    Args:
        path: Path handed through to ``parse``.

    Returns:
        A DataFrame with one row per record, indexed by the record's
        0-based position in the file.
    """
    # Key each record by its position so the position becomes the row label.
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

### Review Columns
    reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
    asin - ID of the product, e.g. 0000013714
    unixReviewTime - time of the review (unix time)
   


In [34]:
# Create a dataframe from reviews.csv; names= supplies the column names.
# The ID columns are read as strings so pandas cannot infer a numeric dtype
# and strip the leading zeros of ASINs such as 0000013714. This keeps asin
# usable as a join key against string ASINs (assumed to be how the metadata
# stores them -- confirm).
reviews = pd.read_csv(
    'reviews.csv',
    names=['reviewerID', 'asin', 'unixReviewTime'],
    dtype={'reviewerID': str, 'asin': str},
)
reviews.head()

Unnamed: 0,reviewerID,asin,unixReviewTime
0,A1KLRMWW2FWPL4,31887,1297468800
1,A2G5TCU2WDFZ65,31887,1358553600
2,A1RLQXYNCMWRWN,31887,1357257600
3,A8U3FAMSJVHS5,31887,1398556800
4,A3GEOILWLK86XM,31887,1394841600


In [46]:
# Read the metadata file; this takes a few minutes.
df_meta = getDF('meta_Clothing_Shoes_and_Jewelry.json.gz')

# We only care about the asin and categories columns. They are selected by
# name because the positional .ix indexer used here before was removed in
# pandas 1.0, and selecting by name does not depend on column order.
df_meta = df_meta[['asin', 'categories']]

# categories is a list of lists; show the value for the first product
df_meta[:1].categories[0]

[['Clothing, Shoes & Jewelry', 'Girls'],
 ['Clothing, Shoes & Jewelry',
  'Novelty, Costumes & More',
  'Costumes & Accessories',
  'More Accessories',
  'Kids & Baby']]

In [52]:
# Preview the first rows of df_meta.
df_meta.head()

Unnamed: 0,asin,categories
0,37214,"[[Clothing, Shoes & Jewelry, Girls], [Clothing..."
1,31887,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ..."
2,123456479,"[[Clothing, Shoes & Jewelry, Novelty, Costumes..."
3,456844570,"[[Clothing, Shoes & Jewelry, Women, Accessorie..."
4,456808574,"[[Clothing, Shoes & Jewelry, Women, Accessorie..."


In [76]:
# List the distinct second-level names of each product's first category
# path, keeping only names that start with W, M, G or B, in sorted order.
sorted(
    name
    for name in df_meta.categories.map(lambda paths: paths[0][1]).unique()
    if name[0] in 'WMGB'
)

['B',
 'Baby',
 'Baby & Child Care',
 'Baby & Toddler Toys',
 'Bath',
 'Bath & Body',
 'Bathing & Skin Care',
 'Beading & Jewelry-Making',
 'Bedding',
 'Beverages',
 'Blues',
 'Boating & Water Sports',
 'Boot Shop',
 'Boutique Designer Jewelry',
 "Boy's Athletic Watches",
 'Boys',
 'Breitling Watches',
 'Broadway & Vocalists',
 'Building Toys',
 'G',
 'GPS & Navigation',
 'Game Time',
 'Games',
 'Gardening & Lawn Care',
 'Gear',
 'Gemstones',
 'Geography',
 'Gifts',
 'Girls',
 'Glycine',
 'Golf',
 'Grown-Up Toys',
 'M',
 'Made in USA',
 'Makeup',
 'Material Handling Products',
 'Medical Supplies & Equipment',
 'Men',
 "Men's Athletic Watches",
 'Michael Kors Watches',
 'Michele Watches',
 'Motorcycle & ATV Casual Footwear',
 'Motorcycle & Powersports',
 'Movado',
 'Movies',
 'W',
 'Wedding Party Gifts',
 'Winter Promo',
 'Wolf Designs',
 'Women',
 "Women's Athletic Watches",
 "Women's Luxury Brands"]

In [35]:
# Inner-join reviews with the metadata on asin so that every review row
# carries its product's categories and can be split by category.
combined = reviews.merge(df_meta, on='asin', how='inner')
combined.head()

Unnamed: 0,reviewerID,asin,unixReviewTime,categories
0,A1KLRMWW2FWPL4,31887,1297468800,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ..."
1,A2G5TCU2WDFZ65,31887,1358553600,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ..."
2,A1RLQXYNCMWRWN,31887,1357257600,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ..."
3,A8U3FAMSJVHS5,31887,1398556800,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ..."
4,A3GEOILWLK86XM,31887,1394841600,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ..."


In [75]:
# Create one gzip-compressed CSV file for each category of reviews.
# The second-level name of each row's first category path does not depend
# on the category being written, so it is computed once, not per iteration.
second_level = combined.categories.apply(lambda x: x[0][1])
for c in categories:
    gz = 'review_{0}.csv'.format(c) + '.gz'
    # Plain boolean-mask selection: the trailing .ix[:,:] used here before
    # selected everything (a no-op) and .ix was removed in pandas 1.0.
    combined[second_level == c].to_csv(gz, compression='gzip', index=False)