# Data Preprocessing:

In [25]:
from bigdata_a3_utils import *
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from pathlib import Path
import os
import time
import gc
import re

In [26]:
subset_sizes = [11, 11, 12]  # Define the sizes of each subset

# Create subsets
subsets = []
start_index = 0
for size in subset_sizes:
    subsets.append(VALID_CATEGORIES[start_index:start_index + size])
    start_index += size

# Unpack the subsets into individual variables
kailash_subset, saeed_subset, johnny_subset = subsets

print(f"Subset list: {kailash_subset}")
# Output the subsets

Subset list: ['All_Beauty', 'Amazon_Fashion', 'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive', 'Baby_Products', 'Beauty_and_Personal_Care', 'Books', 'CDs_and_Vinyl', 'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry']


Define paths below

In [None]:
base_path = Path("C:\dataset")
pickle_path = Path("C:\pickles")

  base_path = Path("C:\dataset")
  pickle_path = Path("C:\pickles")


In [28]:
def load_review_data(base_path, category):
    review_path = base_path / f"raw_review_{category}.tar.gz"

    print(f"Loading review data for {category}...")
    review_dataset = load_compressed_dataset(review_path)    
    reviews_df = review_dataset["full"].to_pandas()

    return reviews_df

In [29]:
def load_metadata(base_path, category):
    metadata_folder_path = base_path / f"raw_meta_{category}.tar.gz"

    print(f"Loading metadata for {category}...")
    metadata = load_compressed_dataset(metadata_folder_path)
    metadata_df = metadata["full"].to_pandas()

    return metadata_df

In [30]:
def merge_dataframes(reviews_df, meta_df):
    merged_df = pd.merge(
        reviews_df,
        meta_df,
        left_on='parent_asin',
        right_on='parent_asin',
        how='inner',
        suffixes=('_reviews', '_meta')
    )

    return merged_df

In [31]:
def extract_brand(row):
    # Check if store exists and is not empty after stripping
    if row.get('store') and row['store'].strip():
        return row['store'].strip()  # Return store if it has a non-empty value
    
    # Try to extract brand from details
    elif row.get('details'):
        # Use a regex pattern similar to your original function
        match = re.search(r"Brand[:\s\-]*([A-Za-z0-9&\s]+)", row['details'], re.IGNORECASE)
        if match:
            return match.group(1).strip()  # Return the brand found in details
    
    # Default fallback
    return 'Unknown'

# Preprocessing creates pickle files for temporary storage

In [None]:
# Create directory if none exists
if not os.path.exists(pickle_path):
    os.makedirs(pickle_path)

# Main preprocessing loop
for category in VALID_CATEGORIES:
    pickle_file_name = os.path.join(pickle_path, f'cleaned_data_{category}.pkl')
    
    if os.path.exists(pickle_file_name):
        print(f"Pickle file already exists for {category}. Skipping preprocessing...")
        continue
    
    reviews_df = load_review_data(base_path, category)
    metadata_df = load_metadata(base_path, category)
    
    df = merge_dataframes(reviews_df, metadata_df)
    print("Merged meta and reviews")

    # del reviews_df
    # del metadata_df   ## wont it need 
                        ##to remove the memory used to hold review and meta?
    # gc.collect()

    df = df[(df['rating'].notnull()) & (df['rating'].between(1, 5))]
    print("Filtered ratings")

    df = df[df['text'].str.strip().astype(bool)]
    print("Filtered empty reviews")

    df.loc[:, 'brand'] = df.apply(extract_brand, axis=1)
    print("Extracted brands")

    df = df.drop_duplicates(subset=['user_id', 'text', 'asin'], keep='first')
    print("Dropped duplicates")

    df.loc[:, 'review_length'] = df['text'].apply(lambda x: len(x.split()))
    print("Calculated review length")

    df.loc[:, 'year'] = pd.to_datetime(df['timestamp'], unit='ms', errors='coerce').dt.year
    print("Extracted year")

    pickle_file_name = os.path.join(pickle_path, f'cleaned_data_{category}.pkl')
    df.to_pickle(pickle_file_name)
    
    del reviews_df
    del metadata_df
    gc.collect()


Loading review data for All_Beauty...
Extracting C:\dataset\raw_review_All_Beauty.tar.gz to C:\dataset\temp_abfd0f0ae08f42c3bc4584a7af3b68f0...
Loading dataset from C:\dataset\temp_abfd0f0ae08f42c3bc4584a7af3b68f0\raw_review_All_Beauty...
Cleaning up temporary directory: C:\dataset\temp_abfd0f0ae08f42c3bc4584a7af3b68f0
Loading metadata for All_Beauty...
Extracting C:\dataset\raw_meta_All_Beauty.tar.gz to C:\dataset\temp_3343a7c2f6ff47fa92a1891cfede8adc...
Loading dataset from C:\dataset\temp_3343a7c2f6ff47fa92a1891cfede8adc\raw_meta_All_Beauty...
Cleaning up temporary directory: C:\dataset\temp_3343a7c2f6ff47fa92a1891cfede8adc
Merged meta and reviews
Filtered ratings
Filtered empty reviews
Extracted brands
Dropped duplicates
Calculated review length
Extracted year
