In [None]:
#%pip install fastparquet # easier on memory when converting large dataframe to parquet 

## Data Cleaning

In [None]:
from bigdata_a3_utils import load_compressed_dataset
import re
import pandas as pd
import numpy as np

# Locations of the compressed review and metadata archives for this category.
review_archive_path = r"C:\Users\zakar\Downloads\amazon_reviews_data\raw_review_Baby_Products.tar.gz"
meta_archive_path = r"C:\Users\zakar\Downloads\amazon_reviews_data\raw_meta_Baby_Products.tar.gz"

# Load the review archive and index the first record of its "full" split.
# The value is discarded: this is not the last expression of the cell.
review_dataset = load_compressed_dataset(review_archive_path)
review_dataset["full"][0]

# Same for the product metadata archive.
meta_dataset = load_compressed_dataset(meta_archive_path)
meta_dataset["full"][0]

# Convert both "full" splits with to_pandas().
df_reviews = review_dataset["full"].to_pandas()
print("Review dataframe created")
df_meta = meta_dataset["full"].to_pandas()
print("Meta dataframe created")

# Left join on parent_asin: every review row is kept and the metadata
# columns are attached where a matching parent_asin exists.
merged_df = pd.merge(df_reviews, df_meta, how="left", on="parent_asin")
print("Dataframe successfully merged!")



# Keep only rows whose rating lies in [1, 5] (bounds inclusive).
rating_in_range = merged_df['rating'].between(1, 5)
merged_df = merged_df[rating_in_range]

# Keep only rows whose review text is present and not blank after stripping.
text_present = merged_df['text'].notna()
text_not_blank = merged_df['text'].str.strip() != ""
merged_df = merged_df[text_present & text_not_blank]

# Replace missing 'details' / 'store' values with the sentinel "Unknown".
for sentinel_col in ('details', 'store'):
    merged_df[sentinel_col] = merged_df[sentinel_col].fillna("Unknown")

print("Remaining rows:", len(merged_df))
print("Missing brands from details:", merged_df['details'].eq('Unknown').sum())
print("Missing brands from store:", merged_df['store'].eq('Unknown').sum())



# Remove repeated reviews: among rows sharing the same user_id, asin and text,
# only the first occurrence is kept.
before = len(merged_df)
is_repeat = merged_df.duplicated(subset=['user_id', 'asin', 'text'], keep='first')
merged_df = merged_df[~is_repeat]
after = len(merged_df)
print(f"Removed {before - after} duplicate reviews")



# Token counter used to build the 'review_length' column
def review_length(text):
    """Return the number of word tokens in ``text``.

    A token is a run of word characters (letters, digits, underscore)
    bounded by word boundaries. Any value that is not a ``str`` (for
    example ``None`` or a float NaN) is counted as 0 tokens.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'\b\w+\b', text))

# Add a 'review_length' column holding the token count of each review text
token_counts = merged_df['text'].map(review_length)
merged_df['review_length'] = token_counts
print("Review length column created!")



# Convert the 'timestamp' column to datetime.
# pd.to_datetime interprets integer epoch values as *nanoseconds* unless a unit
# is given, which would place millisecond epochs within the first hours of
# 1970 and make the derived 'year' column wrong. The raw timestamps are assumed
# to be epoch milliseconds (TODO confirm against the raw data), so numeric
# input is parsed with unit='ms'. Non-numeric input (strings, or a column that
# is already datetime because this cell was re-run) is parsed as before.
if pd.api.types.is_numeric_dtype(merged_df['timestamp']):
    merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp'], unit='ms', errors='coerce')
else:
    merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp'], errors='coerce')
print("Converting timestamp column to datetime")

# Extract the year; values that could not be parsed (NaT) yield a missing year
merged_df['year'] = merged_df['timestamp'].dt.year
# Not the last expression of the cell, so nothing is displayed; it still
# raises if any of the four columns is missing.
merged_df[['text', 'review_length', 'timestamp', 'year']].head()
print("Year column extracted and created!")



print("Handling complex columns... converting values to NaN")
# Columns whose contents are blanked out before the frame is written
complex_cols = [
    'images_x',
    'images_y',
    'videos',
    'features',
    'description',
    'categories',
]

# Overwrite every listed column with NaN (the columns themselves are kept)
for complex_col in complex_cols:
    merged_df[complex_col] = np.nan



print("Saving cleaned category...")
# Write the cleaned, merged frame to parquet with the fastparquet engine,
# without storing the index
merged_df.to_parquet("Baby_Products_Merged.parquet", engine='fastparquet', index=False)

Troubleshooting for some complex columns. We later decided these columns were not needed for future tasks, so we set them to NaN and will drop them after merging the full dataset (some categories were already cleaned and saved with these columns, hence we can't drop them yet).

In [2]:
import json

# Convert every column that holds at least one Python list to JSON text.
for col in merged_df.columns:
    has_list_values = merged_df[col].map(lambda value: isinstance(value, list)).any()
    if not has_list_values:
        continue
    print(f"Converting {col} from list to string...")
    # json.dumps is applied to every value of the column, not only the lists.
    merged_df[col] = merged_df[col].map(json.dumps)


In [3]:
# Display the column labels of merged_df.
merged_df.columns

Index(['rating', 'title_x', 'text', 'images_x', 'asin', 'parent_asin',
       'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
       'main_category', 'title_y', 'average_rating', 'rating_number',
       'features', 'description', 'price', 'images_y', 'videos', 'store',
       'categories', 'details', 'bought_together', 'subtitle', 'author',
       'review_length', 'year'],
      dtype='object')

In [4]:
# Display the dtype of every column of merged_df.
merged_df.dtypes

rating                      float64
title_x                      object
text                         object
images_x                     object
asin                         object
parent_asin                  object
user_id                      object
timestamp            datetime64[ns]
helpful_vote                  int64
verified_purchase              bool
main_category                object
title_y                      object
average_rating              float64
rating_number                 int64
features                     object
description                  object
price                        object
images_y                     object
videos                       object
store                        object
categories                   object
details                      object
bought_together              object
subtitle                     object
author                       object
review_length                 int64
year                          int32
dtype: object

In [6]:
# Collect the names of columns that contain at least one list or dict value.
problematic_cols = []
for column_name in merged_df.columns:
    holds_container = merged_df[column_name].apply(
        lambda x: isinstance(x, (list, dict))
    ).any()
    if holds_container:
        problematic_cols.append(column_name)

print("Problematic columns:", problematic_cols)


Problematic columns: ['images_y', 'videos']


In [9]:
import json
import numpy as np

def deep_convert(obj):
    """Recursively replace NumPy arrays/scalars with plain Python objects.

    The result is intended to be passed to ``json.dumps``.

    Parameters
    ----------
    obj : any
        A value that may contain ``np.ndarray`` or NumPy scalar objects at any
        depth inside lists, tuples, dicts or object-dtype arrays.

    Returns
    -------
    any
        ``obj`` with every ``np.ndarray`` turned into a (nested) list and every
        NumPy scalar turned into the matching Python scalar. Lists, tuples and
        dicts are rebuilt with converted members; any other value is returned
        unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() leaves the elements of object-dtype arrays untouched, so its
        # result is converted again to reach ndarrays nested inside the
        # dicts/lists the array holds. (A 0-d array yields a scalar here.)
        return deep_convert(obj.tolist())
    if isinstance(obj, np.generic):
        # NumPy scalars such as np.int64 are not JSON serializable as-is.
        return obj.item()
    if isinstance(obj, list):
        return [deep_convert(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(deep_convert(item) for item in obj)
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

def safe_serialize(x):
    """Return ``x`` as a JSON string, or ``None`` if it cannot be serialized.

    ``x`` is first passed through ``deep_convert``. Any exception raised
    while converting or dumping is reported with ``print`` and swallowed, so a
    single bad value does not abort a column-wide ``apply``.
    """
    try:
        payload = json.dumps(deep_convert(x))
    except Exception as e:
        print(f"Serialization failed for: {x} with error: {e}")
        return None
    else:
        return payload

# Run safe_serialize over every value of each column listed in problematic_cols
for col in problematic_cols:
    print(f"Safely serializing column: {col}")
    serialized_values = merged_df[col].map(safe_serialize)
    merged_df[col] = serialized_values


Safely serializing column: images_y
Safely serializing column: videos


In [11]:
# Replace each 'images_x' value with its JSON text (None where safe_serialize fails)
merged_df['images_x'] = merged_df['images_x'].map(safe_serialize)

In [13]:
# Replace each 'features' value with its JSON text (None where safe_serialize fails)
merged_df['features'] = merged_df['features'].map(safe_serialize)

In [14]:
def is_complex(val):
    """Return True when ``val`` is a list, a dict or a NumPy array."""
    container_types = (list, dict, np.ndarray)
    return isinstance(val, container_types)

# Collect the columns in which at least one value is a list, dict or ndarray
complex_cols = []
for column_name in merged_df.columns:
    if merged_df[column_name].apply(is_complex).any():
        complex_cols.append(column_name)
print("Complex columns:", complex_cols)

Complex columns: ['description', 'categories']


In [16]:
# Columns whose contents are blanked out before the frame is written
complex_cols = [
    'images_x',
    'images_y',
    'videos',
    'features',
    'description',
    'categories',
]

# Overwrite every listed column with NaN (the columns themselves are kept)
for complex_col in complex_cols:
    merged_df[complex_col] = np.nan

# Write the frame to parquet with the fastparquet engine, without the index.
# NOTE(review): the target file is named "Appliances" while the first cell of
# this notebook loads the Baby_Products archives -- confirm merged_df holds the
# intended category before re-running, or this overwrites the Appliances file.
merged_df.to_parquet("Appliances_Merged.parquet", engine='fastparquet', index=False)
