# Data Preprocessing:
# Please double check for errors or misinterpretation ~Kailash

In [1]:
# Kept first so the explicit imports below take precedence over any names
# the wildcard import brings in.
from bigdata_a3_utils import *

import json
import os
import time
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

In [None]:
# Number of categories assigned to each subset, in order.
subset_sizes = [11, 11, 12]

# Running offsets into VALID_CATEGORIES; for the sizes above: [0, 11, 22, 34].
offsets = [sum(subset_sizes[:count]) for count in range(len(subset_sizes) + 1)]

# Consecutive, non-overlapping slices between neighbouring offsets.
subsets = [VALID_CATEGORIES[lo:hi] for lo, hi in zip(offsets, offsets[1:])]

# One name per slice; the unpacking requires exactly three subsets.
kailash_subset, tbd_subset1, tbd_subset2 = subsets

# Show the first subset only.
print(f"Subset list: {kailash_subset}")

In [None]:
def load_review_data(category):
    """Load the raw review archive for one category.

    Args:
        category: Category name; used to build the archive file name
            ``raw_review_{category}.tar.gz``.

    Returns:
        The ``"full"`` split of the loaded dataset, converted with
        ``to_pandas()``.
    """
    # Raw string: "\B" is not a valid escape sequence and produces a warning
    # on current Python versions. The resulting path is unchanged.
    # NOTE(review): hard-coded, machine-specific location on drive D: while
    # the metadata loader reads from C:/BigDataA3/A3_dataset - confirm both.
    base_path = Path(r"D:\BigData")
    review_path = base_path / f"raw_review_{category}.tar.gz"

    print(f"Loading review data for {category}...")
    review_dataset = load_compressed_dataset(review_path)

    return review_dataset["full"].to_pandas()

In [None]:
def load_metadata(category):
    """Load the raw metadata archive for *category*.

    The archive ``raw_meta_{category}.tar.gz`` is read from the hard-coded
    dataset folder, and the ``"full"`` split is returned after conversion
    with ``to_pandas()``.
    """
    archive_path = Path("C:/BigDataA3/A3_dataset") / f"raw_meta_{category}.tar.gz"

    print(f"Loading metadata for {category}...")

    # Only the "full" split of the loaded dataset is used.
    return load_compressed_dataset(archive_path)["full"].to_pandas()

In [None]:
def merge_dataframes(reviews_df, meta_df):
    """Inner-join reviews with metadata on the ``parent_asin`` column.

    Non-key columns that exist in both inputs are disambiguated with the
    suffixes ``_reviews`` (left) and ``_meta`` (right).
    """
    join_options = {
        'left_on': 'parent_asin',
        'right_on': 'parent_asin',
        'how': 'inner',
        'suffixes': ('_reviews', '_meta'),
    }
    return pd.merge(reviews_df, meta_df, **join_options)

In [None]:
def _has_value(value):
    """Return True unless *value* is None, NaN, or otherwise falsy."""
    if value is None:
        return False
    # NaN is the only float that differs from itself. pandas uses it for
    # missing cells, and a plain truthiness test would treat it as present.
    if isinstance(value, float) and value != value:
        return False
    return bool(value)


def _brand_from_mapping(details):
    """Return the first non-blank value stored under a 'Brand' key, or None."""
    # Exact key first, then any key containing 'Brand' (the same substring
    # test the word scan uses), so keys such as 'Brand Name' also match.
    candidates = [details.get('Brand')]
    candidates.extend(
        value for key, value in details.items()
        if isinstance(key, str) and 'Brand' in key
    )
    for candidate in candidates:
        if _has_value(candidate):
            text = str(candidate).strip()
            if text:
                return text
    return None


def extract_brand(row):
    """Derive a brand name for one row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``'store'`` and ``'details'``.

    Returns:
        ``row['store']`` when it holds a value; otherwise the brand found in
        ``row['details']``; otherwise ``'Unknown'``.
    """
    store = row['store']
    if _has_value(store):
        return store  # The store name takes precedence over details.

    details = row['details']
    if not _has_value(details):
        return 'Unknown'

    # NOTE(review): details is handled both as a JSON-encoded string and as
    # an already-parsed dict - confirm which form the dataset delivers.
    if isinstance(details, str):
        try:
            parsed = json.loads(details)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None
        if not isinstance(parsed, dict):
            # Not a JSON object: keep the previous heuristic and return the
            # first whitespace-separated token that contains 'Brand'.
            for word in details.split():
                if 'Brand' in word:
                    return word
            return 'Unknown'
        details = parsed

    if isinstance(details, dict):
        brand = _brand_from_mapping(details)
        if brand:
            return brand
    return 'Unknown'

# Preprocessing writes one pickle file per category (cleaned_data_<category>.pkl) as temporary storage

In [None]:

# Folder that receives one cleaned pickle file per category.
pickle_directory = Path("C:/BigDataA3/dataframes")
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
pickle_directory.mkdir(parents=True, exist_ok=True)

for category in VALID_CATEGORIES:
    reviews_df = load_review_data(category)
    metadata_df = load_metadata(category)

    df = merge_dataframes(reviews_df, metadata_df)
    print("Merged")

    # Keep rows with a rating present and inside the 1-5 range.
    df = df[(df['rating'].notnull()) & (df['rating'].between(1, 5))]
    print("Filtered ratings")

    # Missing text can come back from .str.strip() as NaN, and NaN is truthy
    # under astype(bool). Require notna() explicitly so such rows are dropped
    # here instead of breaking the word count below.
    stripped_text = df['text'].str.strip()
    df = df[stripped_text.notna() & stripped_text.astype(bool)]
    print("Filtered empty reviews")

    # Deduplicate before the row-wise brand extraction: the kept rows are the
    # same (brand is not part of the key) and the slow apply sees fewer rows.
    # copy() gives an independent frame, so the column assignments below do
    # not write to a slice of the merged frame.
    df = df.drop_duplicates(subset=['user_id', 'text', 'asin'], keep='first').copy()
    print("Dropped duplicates")

    df['brand'] = df.apply(extract_brand, axis=1)
    print("Extracted brands")

    # Whitespace-separated word count per review.
    df['review_length'] = df['text'].str.split().str.len()
    print("Calculated review length")

    # Timestamps are interpreted as milliseconds since the epoch; values that
    # cannot be converted become NaT and therefore a missing year.
    df['year'] = pd.to_datetime(df['timestamp'], unit='ms', errors='coerce').dt.year
    print("Extracted year")

    pickle_file_name = os.path.join(pickle_directory, f'cleaned_data_{category}.pkl')
    df.to_pickle(pickle_file_name)
    