# Data Preprocessing:

In [1]:
from bigdata_a3_utils import *
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from pathlib import Path

In [8]:
df_list = []

In [9]:
subset_sizes = [11, 11, 12]  # Define the sizes of each subset

# Create subsets
subsets = []
start_index = 0
for size in subset_sizes:
    subsets.append(VALID_CATEGORIES[start_index:start_index + size])
    start_index += size

# Unpack the subsets into individual variables
kailash_subset, tbd_subset1, tbd_subset2 = subsets

print(f"Subset list: {kailash_subset}")
# Output the subsets

Subset list: ['All_Beauty', 'Amazon_Fashion', 'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive', 'Baby_Products', 'Beauty_and_Personal_Care', 'Books', 'CDs_and_Vinyl', 'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry']


In [10]:
def load_all_data(category):
    base_path = Path("C:/BigDataA3/A3_dataset")
    review_path = base_path / f"raw_review_{category}.tar.gz"
    meta_folder_path = base_path / f"raw_meta_{category}" / "full"

    print(f"Loading review data for {category}...")
    review_dataset = load_compressed_dataset(review_path)

    print(f"Loading metadata for {category}...")
    meta_dataset = load_from_disk(str(meta_folder_path))

    reviews_df = review_dataset["full"].to_pandas()
    meta_df = meta_dataset.to_pandas()

    return reviews_df, meta_df

In [11]:
def merge_dataframes(reviews_df, meta_df):
    merged_df = pd.merge(
    reviews_df,
    meta_df,
    left_on='parent_asin',
    right_on='parent_asin',
    how='inner',
    suffixes=('_reviews', '_meta')
    )

    return merged_df

In [12]:
def extract_brand(row):
    if row['store']:
        return row['store']  # Return store if it has a value
    elif row['details']:
        # Extract brand from details (assuming brand is mentioned in a specific format)
        # Here we just check if the string contains 'Brand' and extract it
        for word in row['details'].split():
            if 'Brand' in word:
                return word  # Return the brand found in details
    return 'Unknown'

# MEMBERS READ
- replace the kailash_subset varible with the subset you chose. Also update the code and whatsapp chat to reflect that

In [None]:
for category in kailash_subset:
    reviews_df, metadata_df = load_all_data(category)
    # print(f"Review dataset shape: {reviews_df.shape}")
    # print(f"Review dataset columns:\n{reviews_df.columns}")
    # print(f"Metadata dataset shape: {metadata_df.shape}")
    # print(f"Metadata dataset columns:\n{metadata_df.columns}")
    merged_df = merge_dataframes(reviews_df, metadata_df)
    cleaned_rating = merged_df[(merged_df['rating'].notnull()) & 
                              (merged_df['rating'].between(1, 5))]
    cleaned_empty_review = cleaned_rating[cleaned_rating['text'].str.strip().astype(bool)]
    cleaned_empty_review.loc[:, 'brand'] = cleaned_empty_review.apply(extract_brand, axis=1)

    cleaned_brands = cleaned_empty_review

    cleaned_dup = cleaned_brands.drop_duplicates(subset=['user_id', 'text'], keep='first')
    cleaned_dup.loc[:, 'review_length'] = cleaned_dup['text'].apply(lambda x: len(x.split()))
    cleaned_dup.loc[:, 'year'] = pd.to_datetime(cleaned_dup['timestamp'], unit='ms', errors='coerce').dt.year
    
    cleaned_df = cleaned_dup
    df_list.append(cleaned_df)
    

Loading review data for All_Beauty...
Extracting C:\BigDataA3\A3_dataset\raw_review_All_Beauty.tar.gz to C:\BigDataA3\A3_dataset\temp_a46cdf4b74e24cb2ba5de8aa3dbce771...
Loading dataset from C:\BigDataA3\A3_dataset\temp_a46cdf4b74e24cb2ba5de8aa3dbce771\raw_review_All_Beauty...
Cleaning up temporary directory: C:\BigDataA3\A3_dataset\temp_a46cdf4b74e24cb2ba5de8aa3dbce771
Loading metadata for All_Beauty...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_empty_review.loc[:, 'brand'] = cleaned_empty_review.apply(extract_brand, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dup.loc[:, 'review_length'] = cleaned_dup['text'].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dup.loc[:, 'year']

Loading review data for Amazon_Fashion...
Extracting C:\BigDataA3\A3_dataset\raw_review_Amazon_Fashion.tar.gz to C:\BigDataA3\A3_dataset\temp_6aba93d2e9d245538eb4be632b8424f1...
Loading dataset from C:\BigDataA3\A3_dataset\temp_6aba93d2e9d245538eb4be632b8424f1\raw_review_Amazon_Fashion...
Cleaning up temporary directory: C:\BigDataA3\A3_dataset\temp_6aba93d2e9d245538eb4be632b8424f1
Loading metadata for Amazon_Fashion...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_empty_review.loc[:, 'brand'] = cleaned_empty_review.apply(extract_brand, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dup.loc[:, 'review_length'] = cleaned_dup['text'].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dup.loc[:, 'year']