# Import Libraries

In [1]:
import json
import itertools
import pandas as pd
import numpy as np

# Declare the filename, chunk size, maximum number of objects to read, read counter, and output file index

In [2]:
# Path of the input file; it is read line by line, one JSON object per line.
filename = "Electronics.json"

# Number of JSON objects per batch, and the overall cap on objects read.
chunk_size, max_objects = 100_000, 2_100_000

# Running count of objects read so far, and the 1-based index of the next
# output CSV file.
objects_read, i = 0, 1

# Read 100k records at a time, clean the data, and save each chunk to its own CSV file

In [3]:
# Stream the input file in batches of at most chunk_size JSON objects, clean
# each batch, and write it to its own numbered CSV file. Stops at end of file
# or once max_objects objects have been read, whichever comes first.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open(filename, encoding="utf-8") as f:
    while objects_read < max_objects:
        # Never read past max_objects, even when it is not a multiple of chunk_size.
        batch_limit = min(chunk_size, max_objects - objects_read)

        # Parse one JSON object per line. The batch is materialised into a list
        # first: an islice object is always truthy, so it cannot be used to
        # detect that the file is exhausted.
        data = [json.loads(obj) for obj in itertools.islice(f, batch_limit)]

        # An empty batch means there are no more JSON objects to read.
        if not data:
            break

        df = pd.DataFrame(data)

        # Keep only the columns needed for review analysis. The explicit copy
        # makes the assignments below modify this frame and not a view of df.
        new_review_df = df[['asin', 'overall', 'reviewText']].copy()

        # Ratings that cannot be converted to a number become NaN.
        new_review_df['overall'] = pd.to_numeric(new_review_df['overall'], errors='coerce')

        # Treat empty strings as missing values. The result is assigned back
        # because a chained `df[col].replace(..., inplace=True)` is not
        # guaranteed to update the parent frame.
        text_columns = ['asin', 'reviewText']
        new_review_df[text_columns] = new_review_df[text_columns].replace('', np.nan)

        # Drop rows where any of the three fields is missing.
        new_review_df = new_review_df.dropna(subset=['asin', 'overall', 'reviewText'])

        # Drop reviews containing "<" or "&nbsp;" (the pattern is a regex
        # alternation). na=False keeps the mask strictly boolean even if a
        # cell is not a string.
        has_markup = new_review_df['reviewText'].str.contains("<|&nbsp;", na=False)
        new_review_df = new_review_df[~has_markup]

        # Save the cleaned batch to its own CSV file.
        new_review_df.to_csv(f"product_reviews_for_review_analysis_{i}.csv", index=False)

        print("Processed", i, "chunk of", chunk_size, "JSON objects")

        # Advance the output file index and the count of objects read.
        i += 1
        objects_read += len(data)

Processed 1 chunk of 100000 JSON objects
Processed 2 chunk of 100000 JSON objects
Processed 3 chunk of 100000 JSON objects
Processed 4 chunk of 100000 JSON objects
Processed 5 chunk of 100000 JSON objects
Processed 6 chunk of 100000 JSON objects
Processed 7 chunk of 100000 JSON objects
Processed 8 chunk of 100000 JSON objects
Processed 9 chunk of 100000 JSON objects
Processed 10 chunk of 100000 JSON objects
Processed 11 chunk of 100000 JSON objects
Processed 12 chunk of 100000 JSON objects
Processed 13 chunk of 100000 JSON objects
Processed 14 chunk of 100000 JSON objects
Processed 15 chunk of 100000 JSON objects
Processed 16 chunk of 100000 JSON objects
Processed 17 chunk of 100000 JSON objects
Processed 18 chunk of 100000 JSON objects
Processed 19 chunk of 100000 JSON objects
Processed 20 chunk of 100000 JSON objects
Processed 21 chunk of 100000 JSON objects
