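## Convert customer reviews to Parquet format

Loads the gzip-compressed, tab-separated customer reviews file, casts column dtypes, removes rows whose sentiment is `InternalServerException`, writes the result to a Parquet file, and reads it back to verify.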

In [None]:
## Install pyarrow package

!pip install pyarrow

In [1]:
import pandas as pd
import pyarrow as pa
import numpy as np
import pyarrow.parquet as pq

In [14]:
# Input: compressed, tab-separated customer reviews. Output: Parquet file.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# the backslash form (".\data\...") only resolves on Windows.
file_name = "./data/customer_reviews_with_sentiment_compressed.txt.gz"
parquet_file_name = "./data/customer_reviews_with_sentiment.parquet"

# NOTE: on_bad_lines="skip" silently drops rows that fail to parse, so the
# loaded row count can be lower than the number of lines in the file.
df = pd.read_csv(file_name, sep='\t', on_bad_lines="skip")

In [15]:
# Preview the first five rows of the loaded reviews.
df.head(n=5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,sentiment
0,US,16199106,R203HPW78Z7N4K,B0067WNSZY,633038551,"FGGF3032MW Gallery Series 30"" Wide Freestandin...",Major Appliances,5,0,0,N,Y,"If you need a new stove, this is a winner.",What a great stove. What a wonderful replacem...,2015-08-31,POSITIVE
1,US,16374060,R2EAIGVLEALSP3,B002QSXK60,811766671,Best Hand Clothes Wringer,Major Appliances,5,1,1,N,Y,Five Stars,worked great,2015-08-31,POSITIVE
2,US,15322085,R1K1CD73HHLILA,B00EC452R6,345562728,Supco SET184 Thermal Cutoff Kit,Major Appliances,5,0,0,N,Y,Fast Shipping,Part exactly what I needed. Saved by purchasi...,2015-08-31,POSITIVE
3,US,32004835,R2KZBMOFRMYOPO,B00MVVIF2G,563052763,Midea WHS-160RB1 Compact Single Reversible Doo...,Major Appliances,5,1,1,N,Y,Five Stars,Love my refrigerator! ! Keeps everything cold...,2015-08-31,POSITIVE
4,US,25414497,R6BIZOZY6UD01,B00IY7BNUW,874236579,Avalon Bay Portable Ice Maker,Major Appliances,5,0,0,N,Y,Five Stars,No more running to the store for ice! Works p...,2015-08-31,POSITIVE


In [16]:
# Cast identifier columns to strings and the rating/vote columns to integers,
# all in a single astype call.
# 'int' is platform-dependent with older NumPy (int32 on Windows, int64 on
# Linux/macOS), which would make the Parquet schema depend on the machine that
# ran the notebook. Spell out int32 so the schema is the same everywhere.
df = df.astype({
    'customer_id': 'str',
    'product_parent': 'str',
    'star_rating': 'int32',
    'helpful_votes': 'int32',
    'total_votes': 'int32',
})

In [17]:
# Show the dtype of every column to confirm the casts took effect.
df.dtypes

marketplace          object
customer_id          object
review_id            object
product_id           object
product_parent       object
product_title        object
product_category     object
star_rating           int32
helpful_votes         int32
total_votes           int32
vine                 object
verified_purchase    object
review_headline      object
review_body          object
review_date          object
sentiment            object
dtype: object

In [18]:
# List the distinct values found in the 'sentiment' column.
sentiment_labels = df['sentiment'].unique()
sentiment_labels

array(['POSITIVE', 'NEGATIVE', 'MIXED', 'NEUTRAL',
       'InternalServerException'], dtype=object)

In [20]:
# Keep only the rows whose 'sentiment' is not the 'InternalServerException'
# label (presumably a failed sentiment lookup rather than a real class -- confirm).
is_server_error = df['sentiment'] == 'InternalServerException'
df = df.loc[~is_server_error]

In [21]:
# (rows, columns) remaining after the 'InternalServerException' rows were removed.
df.shape

(96833, 16)

In [22]:
# Convert the DataFrame to an Arrow table and write it to the Parquet file.
# preserve_index=False: once rows have been filtered out, the index is no
# longer a default RangeIndex, and pyarrow would otherwise store it in the file
# as an extra '__index_level_0__' column that other Parquet readers see as data.
table = pa.Table.from_pandas(df, preserve_index=False)
pq.write_table(table, parquet_file_name)

In [23]:
# Read the Parquet file back and verify its shape, columns, dtypes and sentiment labels

In [24]:
# Reload the data from the Parquet file; from here on `df` refers to the
# copy read back from disk, not the frame that was written.
df = pd.read_parquet(path=parquet_file_name)

In [25]:
# (rows, columns) of the frame read back from the Parquet file.
df.shape

(96833, 16)

In [26]:
# Column labels of the frame read back from the Parquet file.
df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'sentiment'],
      dtype='object')

In [27]:
# Column dtypes of the frame read back from the Parquet file.
df.dtypes

marketplace          object
customer_id          object
review_id            object
product_id           object
product_parent       object
product_title        object
product_category     object
star_rating           int32
helpful_votes         int32
total_votes           int32
vine                 object
verified_purchase    object
review_headline      object
review_body          object
review_date          object
sentiment            object
dtype: object

In [28]:
# Distinct sentiment labels in the current frame. Assert, rather than only
# eyeball, that the 'InternalServerException' label is not among them.
sentiment_labels = df['sentiment'].unique()
assert 'InternalServerException' not in sentiment_labels, \
    "unexpected 'InternalServerException' rows in the sentiment column"
sentiment_labels

array(['POSITIVE', 'NEGATIVE', 'MIXED', 'NEUTRAL'], dtype=object)