# Purpose

### Author: Julius Remigio

Split reviews.csv by category, which can be found in the metadata file.

use pattern from here:
https://github.com/DSE-capstone-sharknado/models-legacy
reviews.csv into men, women, phone etc like that cpp file does

In [1]:
import pandas as pd
import gzip
import shutil

In [2]:
# categories we are interested in per original c++ code
categories = ["Women", "Men", "Girls", "Boys", "Baby"]

In [3]:
# Reading the metadata file
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

### Review Columns
    reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
    asin - ID of the product, e.g. 0000013714
    unixReviewTime - time of the review (unix time)
   


In [4]:
# create dataframe from reviews.csv
reviews = pd.read_csv('reviews.csv', names=['reviewerID', 'asin', 'unixReviewTime'])
reviews.head()

Unnamed: 0,reviewerID,asin,unixReviewTime
0,A1KLRMWW2FWPL4,31887,1297468800
1,A2G5TCU2WDFZ65,31887,1358553600
2,A1RLQXYNCMWRWN,31887,1357257600
3,A8U3FAMSJVHS5,31887,1398556800
4,A3GEOILWLK86XM,31887,1394841600


In [5]:
# read the data file, it takes a few minutes
df_meta = getDF('meta_Clothing_Shoes_and_Jewelry.json.gz')

In [6]:
print(df_meta.columns)
print(df_meta.shape)

Index([u'asin', u'categories', u'title', u'price', u'salesRank', u'imUrl',
       u'brand', u'related', u'description'],
      dtype='object')
(1503384, 9)


In [7]:
df_meta[(df_meta.description.isnull()==False) & (df_meta.description != '')]

Unnamed: 0,asin,categories,title,price,salesRank,imUrl,brand,related,description
1,0000031887,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ...",Ballet Dress-Up Fairy Tutu,6.79,{u'Sports &amp; Outdoors': 8547},http://ecx.images-amazon.com/images/I/314qZjYe...,Boutique Cutie,"{u'also_bought': [u'0000031852', u'0000031895'...",This adorable basic ballerina tutu is perfect ...
2,0123456479,"[[Clothing, Shoes & Jewelry, Novelty, Costumes...",SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,64.98,{u'Kitchen & Dining': 16987},http://ecx.images-amazon.com/images/I/413tGhqo...,,"{u'also_bought': [u'B000BMTCK6', u'B0006JCGUM'...",Elegance par excellence. Hand-crafted of the f...
10,0641674791,"[[Clothing, Shoes & Jewelry, Women, Accessorie...",Red Crocodile Embossed Leather Travel Wallet,,,http://ecx.images-amazon.com/images/I/31wW2pPm...,,,ISBN: 0641674791\nISBN-13: 9780641674792\nManu...
11,0641997078,"[[Clothing, Shoes & Jewelry, Luggage & Travel ...",Vera Bradley Tote Blue Rhapsody,47.99,{u'Home &amp; Kitchen': 656086},http://ecx.images-amazon.com/images/I/51WqmUxY...,,"{u'also_viewed': [u'B00A8OV6R4', u'B00A8P5M96'...","Vera Bradley Tote\n1 veryberry paisley,3 symph..."
13,0681358173,"[[Clothing, Shoes & Jewelry, Women, Petite, Fa...",Bear Ear Hoodie Jacket &amp; Poms on drawstrin...,,{u'Toys & Games': 600198},http://ecx.images-amazon.com/images/I/51JCLkQk...,,,Adorable fleece teddy bear ear sweater for adu...
14,0742403920,"[[Clothing, Shoes & Jewelry, Novelty, Costumes...",Rectangle Blank Puzzle (12-pack),3.60,{u'Toys & Games': 16840},http://ecx.images-amazon.com/images/I/51BgWs7D...,Instructional Fair,"{u'also_bought': [u'B00598K6LQ', u'B00598K1IO'...","Designed to be easily decorated with crayons, ..."
15,0756029929,"[[Clothing, Shoes & Jewelry, Novelty, Costumes...",Spanish Third-year Pin Set of 10,11.45,{u'Toys & Games': 918374},http://ecx.images-amazon.com/images/I/51AqSOl7...,,,"Spanish Third-year Pin, 1 inch in diameter. S..."
16,0756029104,"[[Clothing, Shoes & Jewelry, Novelty, Costumes...",Viva Espanol Pins Set of 10,11.45,{u'Home &amp; Kitchen': 2651070},http://ecx.images-amazon.com/images/I/51By%2BZ...,,,"Viva Espaol pin, 1 x 1 inch. Set of 10."
17,0765599864,"[[Clothing, Shoes & Jewelry, Novelty, Costumes...",Eagle / Hawk / Parrot Nose Bird Beak Costume Mask,,{u'Toys & Games': 907289},http://ecx.images-amazon.com/images/I/41N8QNv6...,,,"This Bird Beak Nose Costume Mask is about 3"", ..."
18,0839933363,"[[Clothing, Shoes & Jewelry, Novelty, Costumes...",Death Note Anime Manga: Cross Logo necklace,,{u'Toys & Games': 1350779},http://ecx.images-amazon.com/images/I/51f0HkHs...,,,This necklace from the popular manga and anime...


### metadata file analysis

In [53]:
print df_meta.shape
print(df_meta.brand.describe())
print 'Percent NaN:', df_meta[(df_meta.brand.isnull()) | (df_meta.brand == '')].brand.shape[0]/float(df_meta.shape[0])
print '\n'
print(df_meta.price.describe())
print 'Percent NaN:', df_meta[(df_meta.price.isnull()) | (df_meta.price == -1)].price.shape[0]/float(df_meta.shape[0])
print '\n'
print(df_meta.description.describe())
print 'Percent NaN:', df_meta[(df_meta.description.isnull()) | (df_meta.description == '')].description.shape[0]/float(df_meta.shape[0])


(1503384, 5)
count                      98132
unique                      8501
top       Rubie&#39;s Costume Co
freq                        4079
Name: brand, dtype: object
Percent NaN: 0.935208170368


count    574882.000000
mean         46.285506
std          73.692372
min           0.010000
25%          12.990000
50%          23.360000
75%          48.990000
max         999.990000
Name: price, dtype: float64
Percent NaN: 0.617608009664


count     85442
unique    60407
top            
freq        651
Name: description, dtype: object
Percent NaN: 0.94359990528


In [43]:
# we only care about brand and categories columns
df_meta = df_meta[[u'asin', u'categories',u'brand', u'price', 'description']]

# categories is a list of lists 
df_meta[:1].categories[0]

[['Clothing, Shoes & Jewelry', 'Girls'],
 ['Clothing, Shoes & Jewelry',
  'Novelty, Costumes & More',
  'Costumes & Accessories',
  'More Accessories',
  'Kids & Baby']]

In [44]:
df_meta.head()

Unnamed: 0,asin,categories,brand,price,description
0,37214,"[[Clothing, Shoes & Jewelry, Girls], [Clothing...",Big Dreams,6.99,
1,31887,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ...",Boutique Cutie,6.79,This adorable basic ballerina tutu is perfect ...
2,123456479,"[[Clothing, Shoes & Jewelry, Novelty, Costumes...",,64.98,Elegance par excellence. Hand-crafted of the f...
3,456844570,"[[Clothing, Shoes & Jewelry, Women, Accessorie...",,,
4,456808574,"[[Clothing, Shoes & Jewelry, Women, Accessorie...",,,


In [45]:
# get second level of each assigned category tree
allcats = df_meta.categories.apply(lambda x: [c[1] for c in x if c[0]=='Clothing, Shoes & Jewelry']).tolist()

In [46]:
# get distinct categories 
sorted([c for c in set([item for sublist in allcats for item in sublist]) if c[0] in 'WMGBwmgb'])

['B',
 'Baby',
 'Boot Shop',
 'Boutique Designer Jewelry',
 "Boy's Athletic Watches",
 'Boys',
 'Breda',
 'Breitling Watches',
 'Burberry Watches',
 'G',
 'GUESS? Watches',
 'Game Time',
 'Gemstones',
 'Gifts',
 "Girl's Athletic Watches",
 'Girls',
 'Glycine',
 'Gucci Watches',
 'M',
 'Made in USA',
 'Men',
 "Men's Athletic Watches",
 "Men's Designer Jewelry",
 'Michael Kors Watches',
 'Michele Watches',
 'Momentum',
 'Motorcycle & ATV Casual Footwear',
 'Movado',
 'W',
 "WF Inheritance Test Women's CK Custom Store",
 'Watch Gifts',
 'Wedding Party Gifts',
 'Wenger Swiss Military',
 'Winter Event',
 'Winter Promo',
 'Wolf Designs',
 'Women',
 "Women's Athletic Watches",
 "Women's Luxury Brands"]

In [47]:
# join metadata and reviews so we can split the reviews by category
combined = pd.merge(reviews, df_meta, left_on='asin', right_on='asin')
combined.head()

Unnamed: 0,reviewerID,asin,unixReviewTime,categories,brand,price,description
0,A1KLRMWW2FWPL4,31887,1297468800,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ...",Boutique Cutie,6.79,This adorable basic ballerina tutu is perfect ...
1,A2G5TCU2WDFZ65,31887,1358553600,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ...",Boutique Cutie,6.79,This adorable basic ballerina tutu is perfect ...
2,A1RLQXYNCMWRWN,31887,1357257600,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ...",Boutique Cutie,6.79,This adorable basic ballerina tutu is perfect ...
3,A8U3FAMSJVHS5,31887,1398556800,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ...",Boutique Cutie,6.79,This adorable basic ballerina tutu is perfect ...
4,A3GEOILWLK86XM,31887,1394841600,"[[Clothing, Shoes & Jewelry, Girls, Clothing, ...",Boutique Cutie,6.79,This adorable basic ballerina tutu is perfect ...


In [48]:
# create a file for each category of reviews.
for cat in sorted(categories):
    csv = 'review_{0}.csv'.format(cat)
    csvc = 'review_{0}c.csv'.format(cat)
    
    # create files without the categories column
    combined[combined.categories.apply(lambda x: any(cat in c for c in x))][[c for c in combined.columns if c not in ['description','categories']]].to_csv(csv + '.gz', compression='gzip',index=False)
    print cat, combined[combined.categories.apply(lambda x: any(cat in c for c in x))].shape
    # create files with the categories column
    combined[combined.categories.apply(lambda x: any(cat in c for c in x))].to_csv(csvc + '.gz', compression='gzip',index=False)

Baby (3961, 7)
Boys (6672, 7)
Girls (8085, 7)
Men (71542, 7)
Women (180612, 7)


### Review Women's segmented file

We are primarily interesetd in Women's clothing

In [50]:
women = pd.read_csv('review_Womenc.csv.gz', compression='gzip')
women.head()

Unnamed: 0,reviewerID,asin,unixReviewTime,categories,brand,price,description
0,A4KU0XJNBH674,B000051SEN,1372723200,"[['Electronics', 'GPS & Navigation', 'Sports &...",Suunto,159.99,
1,A3HY330W94JPUQ,B000051SEN,1354492800,"[['Electronics', 'GPS & Navigation', 'Sports &...",Suunto,159.99,
2,A1CHOKV10NEI8X,B000051SEN,1143936000,"[['Electronics', 'GPS & Navigation', 'Sports &...",Suunto,159.99,
3,AOUBIY0S651IU,B000051SEN,1399248000,"[['Electronics', 'GPS & Navigation', 'Sports &...",Suunto,159.99,
4,A1GPGBHBI6T2HJ,B000051SEN,1175558400,"[['Electronics', 'GPS & Navigation', 'Sports &...",Suunto,159.99,


#### Summary Stats for needed features

In [57]:
print 'Row Count:',women.shape
print 'brand %NaN:', women[(women.brand.isnull()) | (women.brand == '')].brand.shape[0]/float(women.shape[0])
print 'price %NaN:', women[(women.price.isnull()) | (women.price == -1)].price.shape[0]/float(women.shape[0])
print 'description %NaN:', women[(women.description.isnull()) | (women.description == '')].description.shape[0]/float(women.shape[0])


Row Count: (180612, 7)
brand %NaN: 0.872998471862
price %NaN: 0.605142515447
description %NaN: 0.97888290922
