# Data Preprocessing:
# Please double-check for errors or misinterpretations. ~Kailash

In [2]:
# The star import stays first so the explicit imports below win any name clash.
from bigdata_a3_utils import *

import gc
import json
import os
import re
import time
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

In [3]:
# Number of categories assigned to each team member, in order.
subset_sizes = [11, 11, 12]

# Cumulative offsets into VALID_CATEGORIES: boundaries[k] is where subset k
# starts and boundaries[k + 1] is where it ends (exclusive).
boundaries = [sum(subset_sizes[:k]) for k in range(len(subset_sizes) + 1)]

# Slice the category list into consecutive, non-overlapping chunks.
subsets = [VALID_CATEGORIES[lo:hi] for lo, hi in zip(boundaries, boundaries[1:])]

# One named subset per team member.
kailash_subset, saeed_subset, johnny_subset = subsets

# Show the first subset as a quick sanity check.
print(f"Subset list: {kailash_subset}")

Subset list: ['All_Beauty', 'Amazon_Fashion', 'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive', 'Baby_Products', 'Beauty_and_Personal_Care', 'Books', 'CDs_and_Vinyl', 'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry']


Define the base data directory and the output directory for the cleaned pickle files below.

In [4]:
# Root folder that holds the raw_review_*.tar.gz / raw_meta_*.tar.gz archives.
# Defaults to the original location; set the COMP3610A3_DIR environment
# variable to run the notebook from a different machine or drive.
base_path = Path(os.environ.get("COMP3610A3_DIR", "D:\\COMP3610A3"))

# Cleaned per-category pickles are stored in a sub-folder of the data root,
# so the two paths can never drift apart.
pickle_path = base_path / "dataframes"

In [5]:
def load_review_data(base_path, category):
    """Load the reviews of one category from its compressed archive.

    Args:
        base_path: Directory containing the archive; must support the ``/``
            path operator (e.g. ``pathlib.Path``).
        category: Category name used to build the archive file name
            ``raw_review_{category}.tar.gz``.

    Returns:
        The "full" entry of the loaded dataset converted with ``.to_pandas()``.
    """
    archive = base_path / f"raw_review_{category}.tar.gz"

    print(f"Loading review data for {category}...")
    # load_compressed_dataset comes from the project utilities (star import).
    return load_compressed_dataset(archive)["full"].to_pandas()

In [6]:
def load_metadata(base_path, category):
    """Load the product metadata of one category from its compressed archive.

    Args:
        base_path: Directory containing the archive; must support the ``/``
            path operator (e.g. ``pathlib.Path``).
        category: Category name used to build the archive file name
            ``raw_meta_{category}.tar.gz``.

    Returns:
        The "full" entry of the loaded dataset converted with ``.to_pandas()``.
    """
    archive = base_path / f"raw_meta_{category}.tar.gz"

    print(f"Loading metadata for {category}...")
    # load_compressed_dataset comes from the project utilities (star import).
    return load_compressed_dataset(archive)["full"].to_pandas()

In [7]:
def merge_dataframes(reviews_df, meta_df):
    """Inner-join reviews with product metadata on ``parent_asin``.

    Only rows whose ``parent_asin`` appears in both frames are kept. Non-key
    columns that exist in both inputs are disambiguated with the suffixes
    ``_reviews`` (left) and ``_meta`` (right).

    Args:
        reviews_df: Review rows; must contain a ``parent_asin`` column.
        meta_df: Metadata rows; must contain a ``parent_asin`` column.

    Returns:
        The merged DataFrame.
    """
    join_options = {
        'on': 'parent_asin',
        'how': 'inner',
        'suffixes': ('_reviews', '_meta'),
    }
    return reviews_df.merge(meta_df, **join_options)

In [8]:
# Fallback for free-text details: matches spellings such as "Brand: Acme",
# "Brand=Acme" or "'Brand': 'Acme'" and captures the value after the separator.
_BRAND_PATTERN = re.compile(r"""\bBrand["']?\s*[:=]\s*["']?([^"',;{}\n]+)""")


def _is_missing(value):
    """Return True for None/NaN/NA, blank strings, other falsy scalars and empty containers."""
    if pd.api.types.is_scalar(value):
        if pd.isna(value):
            return True
        if isinstance(value, str):
            return not value.strip()
        return not value
    try:
        return len(value) == 0
    except TypeError:
        return False


def _brand_from_details(details):
    """Return the value stored under the 'Brand' key of ``details``, or None if absent."""
    if isinstance(details, str):
        try:
            decoded = json.loads(details)
        except ValueError:
            decoded = None
        if isinstance(decoded, dict):
            details = decoded
        else:
            # Not a JSON object: look for a "Brand: value" style pair in the text.
            found = _BRAND_PATTERN.search(details)
            if found is None:
                return None
            return found.group(1).strip() or None
    if isinstance(details, dict):
        brand = details.get('Brand')
        if not _is_missing(brand):
            return brand.strip() if isinstance(brand, str) else str(brand)
    return None


def extract_brand(row):
    """Pick a brand name for one merged review/metadata row.

    The ``store`` value is used when it is present. Otherwise the brand is
    read from the ``details`` field, and ``'Unknown'`` is returned when
    neither source yields a value.

    NOTE(review): assumes ``details`` is a dict or a JSON-encoded object
    string with a ``'Brand'`` key -- confirm against the dataset schema.
    Plain text is handled on a best-effort basis via a regex.

    Args:
        row: Any mapping-like object indexable by ``'store'`` and
            ``'details'`` (a DataFrame row or a plain dict).

    Returns:
        The store value unchanged, the extracted brand string, or ``'Unknown'``.
    """
    store = row['store']
    # NaN is truthy, so a plain truthiness test would return NaN as the brand.
    if not _is_missing(store):
        return store

    brand = _brand_from_details(row['details'])
    return brand if brand is not None else 'Unknown'

# Preprocessing creates pickle files for temporary storage

In [9]:
# Create the output directory (including missing parents) if none exists
os.makedirs(pickle_path, exist_ok=True)

# Main preprocessing loop: one cleaned pickle per category. Categories whose
# pickle already exists are skipped, so the cell can be re-run after an
# interruption and resumes where it stopped.
for category in johnny_subset:
    pickle_file_name = os.path.join(pickle_path, f'cleaned_data_{category}.pkl')

    if os.path.exists(pickle_file_name):
        print(f"Pickle file already exists for {category}. Skipping preprocessing...")
        continue

    reviews_df = load_review_data(base_path, category)
    metadata_df = load_metadata(base_path, category)

    df = merge_dataframes(reviews_df, metadata_df)
    print("Merged meta and reviews")

    # The raw frames are not needed after the merge; release them right away
    # instead of holding them in memory for the rest of the pipeline.
    del reviews_df, metadata_df
    gc.collect()

    # between() is False for NaN, so rows with a missing rating are dropped too.
    df = df[df['rating'].between(1, 5)]
    print("Filtered ratings")

    # Missing text must be excluded explicitly: NaN passes through .str.strip()
    # and is truthy, so a bare astype(bool) would keep it and the word count
    # below would then fail on a non-string value.
    stripped_text = df['text'].str.strip()
    df = df[stripped_text.notna() & stripped_text.ne('')]
    del stripped_text
    print("Filtered empty reviews")

    # De-duplicate before deriving columns so no work is spent on rows that
    # are about to be discarded (the subset does not involve derived columns).
    df = df.drop_duplicates(subset=['user_id', 'text', 'asin'], keep='first')
    print("Dropped duplicates")

    # Feed extract_brand plain dicts built from the two columns it reads;
    # this avoids materialising a full Series for every row via apply(axis=1).
    brand = [
        extract_brand({'store': store, 'details': details})
        for store, details in zip(df['store'], df['details'])
    ]
    print("Extracted brands")

    # Whitespace-delimited word count of each review.
    review_length = df['text'].str.split().str.len()
    print("Calculated review length")

    # Timestamps are interpreted as epoch milliseconds; unparseable values become NaT.
    year = pd.to_datetime(df['timestamp'], unit='ms', errors='coerce').dt.year
    print("Extracted year")

    # assign() returns a new frame, which avoids SettingWithCopyWarning from
    # writing into a filtered slice.
    df = df.assign(brand=brand, review_length=review_length, year=year)
    del brand, review_length, year

    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated pickle that the skip check would accept.
    tmp_file_name = pickle_file_name + '.tmp'
    df.to_pickle(tmp_file_name)
    os.replace(tmp_file_name, pickle_file_name)

    del df
    gc.collect()


Pickle file already exists for Movies_and_TV. Skipping preprocessing...
Pickle file already exists for Musical_Instruments. Skipping preprocessing...
Pickle file already exists for Office_Products. Skipping preprocessing...
Pickle file already exists for Patio_Lawn_and_Garden. Skipping preprocessing...
Loading review data for Pet_Supplies...
Extracting D:\COMP3610A3\raw_review_Pet_Supplies.tar.gz to D:\COMP3610A3\temp_91ef180841d64e7cbed41695de26afa4...
Loading dataset from D:\COMP3610A3\temp_91ef180841d64e7cbed41695de26afa4\raw_review_Pet_Supplies...
Cleaning up temporary directory: D:\COMP3610A3\temp_91ef180841d64e7cbed41695de26afa4
Loading metadata for Pet_Supplies...
Extracting D:\COMP3610A3\raw_meta_Pet_Supplies.tar.gz to D:\COMP3610A3\temp_ab0ef047728b4af8b8002ba998c15457...
Loading dataset from D:\COMP3610A3\temp_ab0ef047728b4af8b8002ba998c15457\raw_meta_Pet_Supplies...
Cleaning up temporary directory: D:\COMP3610A3\temp_ab0ef047728b4af8b8002ba998c15457
Merged meta and reviews
F

KeyboardInterrupt: 