# Import Libraries

In [2]:
import json
import itertools
import pandas as pd
import numpy as np
import random

# Declare the input filename and the number of records to read

In [3]:
# Input settings for the load step.
num_objects = 100_000  # upper bound on the number of lines read from the file
filename = "meta_Electronics.json"  # path of the metadata file to open

# Read the first 100k records and save them in a DataFrame

In [4]:
# Read at most `num_objects` lines from the file and parse each line as one
# JSON object.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise use a platform-dependent default that can mis-decode (or fail on)
# non-ASCII product text.
with open(filename, encoding="utf-8") as f:
    # use itertools to limit the number of JSON objects read in; islice stops
    # after num_objects lines, so the remainder of the file is never read
    limited_objects = itertools.islice(f, num_objects)

    # use json.loads to parse each JSON object and store in a list
    data = [json.loads(obj) for obj in limited_objects]

# create a pandas dataframe from the list of JSON objects
df = pd.DataFrame(data)

In [5]:
# Preview the first five parsed records.
df.head(n=5)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Electronics, Camera &amp; Photo, Video Survei...",,[The following camera brands and models have b...,,Genuine Geovision 1 Channel 3rd Party NVR IP S...,[],,GeoVision,"[Genuine Geovision 1 Channel NVR IP Software, ...","[>#3,092 in Tools &amp; Home Improvement &gt; ...",[],Camera &amp; Photo,,"January 28, 2014",$65.00,11300000,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
1,"[Electronics, Camera &amp; Photo]",,[This second edition of the Handbook of Astron...,,"Books ""Handbook of Astronomical Image Processi...",[0999470906],,33 Books Co.,[Detailed chapters cover these fundamental top...,"[>#55,933 in Camera &amp; Photo (See Top 100 i...","[0943396670, 1138055360, 0999470906]",Camera &amp; Photo,,"June 17, 2003",,43396828,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,"[Electronics, eBook Readers &amp; Accessories,...",,[A zesty tale. (Publishers Weekly)<br /><br />...,,One Hot Summer,"[0425167798, 039914157X]",,Visit Amazon's Carolina Garcia Aguilera Page,[],"3,105,177 in Books (",[],Books,,,$11.49,60009810,[],[],
3,"[Electronics, eBook Readers & Accessories, eBo...",,[],,Hurray for Hattie Rabbit: Story and pictures (...,"[0060219521, 0060219580, 0060219394]",,Visit Amazon's Dick Gackenbach Page,[],"2,024,298 in Books (","[0060219521, 0060219475, 0060219394]",Books,,,.a-section.a-spacing-mini{margin-bottom:6px!im...,60219602,[],[],
4,"[Electronics, eBook Readers & Accessories, eBo...",,[&#8220;sex.lies.murder.fame. is brillllli&#82...,,sex.lies.murder.fame.: A Novel,[],,Visit Amazon's Lolita Files Page,[],"3,778,828 in Books (",[],Books,,,$13.95,60786817,[],[],


In [6]:
# Keep only the three columns used downstream. The .copy() gives an independent
# frame, so later edits to new_products_df never touch df.
kept_columns = ['asin', 'title', 'price']
new_products_df = df[kept_columns].copy()

# Non-null count per column.
new_products_df.count()

asin     100000
title    100000
price    100000
dtype: int64

In [7]:
# Preview the first five rows of the reduced frame.
new_products_df.head(n=5)

Unnamed: 0,asin,title,price
0,11300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,$65.00
1,43396828,"Books ""Handbook of Astronomical Image Processi...",
2,60009810,One Hot Summer,$11.49
3,60219602,Hurray for Hattie Rabbit: Story and pictures (...,.a-section.a-spacing-mini{margin-bottom:6px!im...
4,60786817,sex.lies.murder.fame.: A Novel,$13.95


# Data Cleaning

## Replace any empty strings in the asin, title, and price columns with np.nan objects

In [8]:
# Treat empty strings as missing values in asin, title and price.
# The result is assigned back to the frame instead of calling
# Series.replace(..., inplace=True) on a column selection: that chained
# in-place form acts on an intermediate object, triggers a pandas warning, and
# does not modify the frame at all when Copy-on-Write is enabled.
text_columns = ['asin', 'title', 'price']
new_products_df[text_columns] = new_products_df[text_columns].replace('', np.nan)

# Non-null count per column after the replacement.
new_products_df.count()

asin     100000
title     99998
price     32760
dtype: int64

## Dropping rows that contain NaN values

In [9]:
# Number of missing asin values.
new_products_df['asin'].isna().sum()

0

In [10]:
# Number of missing title values.
new_products_df['title'].isna().sum()

2

In [11]:
# Number of missing price values.
new_products_df['price'].isna().sum()

67240

In [12]:
# Remove, in place, every row that is missing any of the required columns.
# A single call over the combined subset drops the same rows as one call per
# column.
required_columns = ['asin', 'title', 'price']
new_products_df.dropna(subset=required_columns, inplace=True)

In [13]:
# Non-null count per column after dropping incomplete rows.
new_products_df.notna().sum()

asin     32758
title    32758
price    32758
dtype: int64

In [14]:
# Save the current frame as CSV, leaving the row index out of the file.
new_products_df.to_csv(path_or_buf='product_details_sample.csv', index=False)

## Dropping rows whose price contains a given partial string

In [15]:
# Drop rows whose price contains the letter "a" or a "<" character. These are
# scraped markup rather than prices (e.g. ".a-section.a-spacing-mini{...").
has_markup_price = new_products_df.price.str.contains("a|<")

# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not raise SettingWithCopyWarning.
new_products_df = new_products_df[~has_markup_price].copy()

In [16]:
# Display the frame to inspect the rows that remain after the price filter.
new_products_df

Unnamed: 0,asin,title,price
0,0011300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,$65.00
2,0060009810,One Hot Summer,$11.49
4,0060786817,sex.lies.murder.fame.: A Novel,$13.95
6,0091912407,Girl with a One-track Mind: Confessions of the...,$4.76
8,0132492776,Wireless Bluetooth Headphones Earbuds with Mic...,$7.99
...,...,...,...
99980,B000NDLOHM,"Wireless Display Dongle Receiver,ONTOTL HDMI 1...",$2.00
99986,B000NDQ2OW,Micro Connectors G05-302SL VGA HD15 Female To ...,$3.99
99988,B000NDQ2AG,2GB Memorystick Pro Duo Gaming Edition MSDP2GB...,$7.99
99992,B000NDQ38W,APC UPS Battery Replacement for APC Smart-UPS ...,$3.80


# Creating the review_count column (randomly generated values)

In [27]:
# review_count is synthetic: one uniformly random integer in [0, 200] per row.
# A dedicated, seeded generator makes the column (and the CSV files written
# from this frame) identical on every run, without touching the global
# `random` state.
review_count_rng = random.Random(42)
new_products_df['review_count'] = [
    review_count_rng.randint(0, 200) for _ in range(len(new_products_df))
]

In [28]:
# Display the frame again, now including the review_count column.
new_products_df

Unnamed: 0,asin,title,price,review_count
0,0011300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,$65.00,120
2,0060009810,One Hot Summer,$11.49,6
4,0060786817,sex.lies.murder.fame.: A Novel,$13.95,7
6,0091912407,Girl with a One-track Mind: Confessions of the...,$4.76,120
8,0132492776,Wireless Bluetooth Headphones Earbuds with Mic...,$7.99,50
...,...,...,...,...
99980,B000NDLOHM,"Wireless Display Dongle Receiver,ONTOTL HDMI 1...",$2.00,184
99986,B000NDQ2OW,Micro Connectors G05-302SL VGA HD15 Female To ...,$3.99,43
99988,B000NDQ2AG,2GB Memorystick Pro Duo Gaming Edition MSDP2GB...,$7.99,174
99992,B000NDQ38W,APC UPS Battery Replacement for APC Smart-UPS ...,$3.80,193


In [29]:
# Total of review_count over all rows.
new_products_df.review_count.sum()

3011384

In [30]:
# Take the first 10000 rows by position (index labels are not consecutive at
# this point, so the selection is positional rather than label-based).
new_sliced_products_df = new_products_df.iloc[:10000]
new_sliced_products_df

Unnamed: 0,asin,title,price,review_count
0,0011300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,$65.00,120
2,0060009810,One Hot Summer,$11.49,6
4,0060786817,sex.lies.murder.fame.: A Novel,$13.95,7
6,0091912407,Girl with a One-track Mind: Confessions of the...,$4.76,120
8,0132492776,Wireless Bluetooth Headphones Earbuds with Mic...,$7.99,50
...,...,...,...,...
31797,B0002XN73C,reEVOlution iSkin eVot Silicone Case for 40 GB...,$6.04,127
31809,B0002XNSVI,Hitachi Deskstar 0A30229 400GB SATA/150 7200RP...,$9.99,75
31812,B0002XQJFA,Canon EOS 20D DSLR Camera with EF-S 18-55mm f/...,$69.35,162
31813,B0002XQKEK,Opteka 10 Peice Deluxe Lens/Digital Camera Cle...,$4.95,109


In [31]:
# Total of review_count over the sliced rows.
new_sliced_products_df.review_count.sum()

998598

In [32]:
# Save the complete (unsliced) frame as CSV, leaving the row index out.
new_products_df.to_csv(path_or_buf='product_details_without_slice.csv', index=False)

In [33]:
# Save the sliced frame as CSV, leaving the row index out.
new_sliced_products_df.to_csv(path_or_buf='product_details.csv', index=False)