**Handling the jsonl files**

In [1]:
from bigdata_a3_utils import load_compressed_dataset
import pandas as pd
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the review dataset from .tar.gz
review_dataset = load_compressed_dataset(r"C:\Users\annik\Downloads\amazon_reviews_data\raw_review_Handmade_Products.tar.gz")
df_reviews = review_dataset["full"].to_pandas()
print("Review dataframe created")

# Load the meta dataset from .jsonl
meta_file_path = r"C:\Users\annik\Downloads\review\meta_Handmade_Products.jsonl"
with open(meta_file_path, 'r', encoding='utf-8') as f:
    meta_data = [json.loads(line) for line in f]  # Read each line as a JSON object

# Convert the meta data to a DataFrame
df_meta = pd.DataFrame(meta_data)
print("Meta dataframe created")

Extracting C:\Users\annik\Downloads\amazon_reviews_data\raw_review_Handmade_Products.tar.gz to C:\Users\annik\Downloads\amazon_reviews_data\temp_9fad51ac22e54a7db4635f19e95c86a3...
Loading dataset from C:\Users\annik\Downloads\amazon_reviews_data\temp_9fad51ac22e54a7db4635f19e95c86a3\raw_review_Handmade_Products...
Cleaning up temporary directory: C:\Users\annik\Downloads\amazon_reviews_data\temp_9fad51ac22e54a7db4635f19e95c86a3
Review dataframe created
Meta dataframe created


In [3]:
# Merge datasets
merged_df = df_reviews.merge(df_meta, how="left", on="parent_asin")
print("Dataframe successfully merged!")

Dataframe successfully merged!


In [4]:
# Filter reviews and clean data
merged_df = merged_df[merged_df['rating'].between(1, 5)]
merged_df = merged_df[merged_df['text'].notna() & (merged_df['text'].str.strip() != "")]
merged_df['details'] = merged_df['details'].fillna("Unknown")
merged_df['store'] = merged_df['store'].fillna("Unknown")

print("Remaining rows:", len(merged_df))
print("Missing brands from details:", (merged_df['details'] == 'Unknown').sum())
print("Missing brands from store:", (merged_df['store'] == 'Unknown').sum())

Remaining rows: 662043
Missing brands from details: 0
Missing brands from store: 3299


In [5]:
# Drop duplicates
before = len(merged_df)
merged_df = merged_df.drop_duplicates(subset=['user_id', 'asin', 'text'], keep='first')
after = len(merged_df)
print(f"Removed {before - after} duplicate reviews")

Removed 7345 duplicate reviews


In [7]:
import re  

# Function to compute token count in review text
def review_length(text):
    if isinstance(text, str):
        tokens = re.findall(r'\b\w+\b', text)
        return len(tokens)
    else:
        return 0  
    
# Apply to create the 'review_length' column
merged_df['review_length'] = merged_df['text'].apply(review_length)
print("Review length column created!")

Review length column created!


In [8]:
# Convert the 'timestamp' column to datetime
merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp'], errors='coerce')
print("Converting timestamp column to datetime")

# Extract the year
merged_df['year'] = merged_df['timestamp'].dt.year
merged_df[['text', 'review_length', 'timestamp', 'year']].head()
print("Year column extracted and created!")

Converting timestamp column to datetime
Year column extracted and created!


In [9]:
# Handling complex columns (set to NaN temporarily)
print("Handling complex columns... converting values to NaN")
complex_cols = ['images_x', 'images_y', 'videos', 'features', 'description', 'categories']

for col in complex_cols:
    merged_df[col] = np.nan

Handling complex columns... converting values to NaN


In [10]:
# Save cleaned dataframe as Parquet
print("Saving cleaned category...")
merged_df.to_parquet("Handmade_Products_Merged.parquet", index=False, engine='fastparquet')

Saving cleaned category...


In [11]:
import pandas as pd

# Read the saved Parquet file into a DataFrame
df_handmade_products = pd.read_parquet("Handmade_Products_Merged.parquet", engine='fastparquet')

# Print the first few rows of the DataFrame
print(df_handmade_products.head())

   rating                                     title_x  \
0     5.0                            Beautiful colors   
1     5.0  You simply must order order more than one!   
2     5.0                                       Great   
3     5.0                  Well made and so beautiful   
4     5.0            Smells just like the real thing!   

                                                text  images_x        asin  \
0  I bought one for myself and one for my grandda...       NaN  B08GPJ1MSN   
1  I’ve ordered three bows so far. Have not been ...       NaN  B084TWHS7W   
2  As pictured. Used a frame from the dollar stor...       NaN  B07V3NRQC4   
3  This is beyond beautiful.  So shiny, the size ...       NaN  B071ZMDK26   
4  Oh wow what a pleasant surprise! This smells g...       NaN  B01MPVZ4YP   

  parent_asin                       user_id                     timestamp  \
0  B08GPJ1MSN  AF7OANMNHQJC3PD4HRPX2FATECPA 1970-01-01 00:27:01.607495111   
1  B084TWHS7W  AGMJ3EMDVL6OWBJF7CA