# COMP 3610 – A3

- Zidane Timothy, Maia Neptune, Christophe Gittens

In [1]:
# %pip install pyspark
# %pip install findspark
# %pip install -q gdown
# %pip install pandas
# %pip install matplotlib
# %pip install seaborn
# %pip install pyarrow
# %pip install setuptools

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# import `DenseVector`
from pyspark.ml.linalg import DenseVector

# import `StandardScaler`
from pyspark.ml.feature import StandardScaler


# sudo apt install python3-distutils 

In [3]:
import findspark
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil

import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np


# findspark.init()

In [4]:
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json

In [5]:
# spark = SparkSession.builder\
# .appName("Amazon_Reviews")\
# .getOrCreate()

In [6]:
review_schema = StructType([
    StructField("rating", FloatType(), True),
    StructField("title", StringType(), True),
    StructField("images", ArrayType(StringType()), True),
    StructField("asin", StringType(), True),
    StructField("parent_asin", FloatType(), True),
    StructField("user_id", ArrayType(StringType()), True),
    StructField("timestamp", IntegerType(), True),
    StructField("verified_purchase", BooleanType(), True),
    StructField("helpful_vote", StringType(), True),
])

# String types in arrays may need to be sequence but couldn't find the actual sequence dytpe syntax
meta_schema = StructType([
    StructField("main_category", StringType(), True),
    StructField("title", StringType(), True),
    StructField("average_rating", FloatType(), True),
    StructField("rating_number", IntegerType(), True),
    StructField("features", ArrayType(StringType()), True),
    StructField("description", ArrayType(StringType()), True),
    StructField("price", FloatType(), True),
    StructField("images", ArrayType(StringType()), True),
    StructField("videos", ArrayType(StringType()), True),
    StructField("store", StringType(), True),
    StructField("categories", ArrayType(StringType()), True),
    StructField("details", MapType(StringType(), IntegerType()), True),
    StructField("parent_asin", FloatType(), True),
    StructField("user_id", ArrayType(StringType()), True),
    StructField("bought_together", ArrayType(StringType()), True),
    # StructField("timestamp", IntegerType(), True),
    # StructField("verified_purchase", BooleanType(), True),
    # StructField("helpful_vote", StringType(), True),
])


In [7]:
# tar_folder = 'root/Data'
# output_folder = 'root/output_folder'
# os.makedirs(output_folder, exist_ok=True)

In [8]:
def extract_tar_bz2(tar_path, extract_dir):
    if not os.path.exists(tar_path):
        print(f"Error: File {tar_path} does not exist.")
        return
    if not tar_path.endswith(".tar.bz2"):
        print(f"Error: File {tar_path} is not a .tar.bz2 file.")
        return

    try:
        with tarfile.open(tar_path, "r:bz2") as tar:
            print(f"Extracting {tar_path} to {extract_dir}")
            tar.extractall(path=extract_dir)
    except Exception as e:
        print(f"Error during extraction: {e}")

In [9]:
import os
import shutil
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

def preprocess_category(review_tar_path, meta_tar_path, output_folder, category,batch_size=1000):
    temp_path = "Data/temp_extract" # change as needed
    os.makedirs(output_folder, exist_ok=True)

    print("Extracting tar files...")
    extract_tar_bz2(review_tar_path, temp_path)
    extract_tar_bz2(meta_tar_path, temp_path)

    arrow_files = list(Path(temp_path).rglob("*.arrow"))
    print(f"Found {len(arrow_files)} Arrow files")

    for arrow_file in arrow_files:
        try:
            is_meta = "meta" in str(arrow_file).lower()
            folder_name = "meta" if is_meta else "reviews"

            pkl_output_path = os.path.join(output_folder, f"{folder_name}_pkl")
            os.makedirs(pkl_output_path, exist_ok=True)

            # print(f"Streaming {arrow_file.name} → {parquet_output_path}")
            dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)

            batch = []
            seen_keys = set()
            batch_num = 0

            for i, row in enumerate(dataset):
                if not row:
                    continue

                if not is_meta:
                    key = (row.get("user_id"), row.get("asin"), row.get("text"))
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)

                batch.append(row)

                if len(batch) >= batch_size:
                    table = pa.Table.from_pylist(batch)
                    # pq.write_to_dataset(table, root_path=parquet_output_path)

                    # convert to pandas and save as .pkl batch
                    df = pd.DataFrame(batch)
                    df.to_pickle(os.path.join(pkl_output_path, f"{category}_batch_{batch_num}.pkl"))
                    print(f"Saved batch {batch_num} ({len(batch)} rows) to .pkl")
                    batch = []
                    batch_num += 1

            # Final batch
            if batch:
                table = pa.Table.from_pylist(batch)
                # pq.write_to_dataset(table, root_path=parquet_output_path)

                df = pd.DataFrame(batch)
                df.to_pickle(os.path.join(pkl_output_path, f"{category}_batch_{batch_num}.pkl"))
                print(f"Saved final batch {batch_num} ({len(batch)} rows)")

        except Exception as e:
            print(f"Error processing {arrow_file.name}: {e}")

    shutil.rmtree(temp_path)
    print("All done, temp folder removed.")


Calling fn to preprocess for a category

Run for one

In [10]:
# preprocess_category(
#     r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_meta_All_Beauty.tar.bz2",
#     r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_review_All_Beauty.tar.bz2",
#     "output_folder", category="All_Beauty"
# )

Meta and Review parsing

In [11]:
def convert_to_meta_df(folder):
    df_m = []

    for fname in sorted(os.listdir(folder)):
        if fname.endswith(".pkl"):
            try:
                file_path = os.path.join(folder, fname)
                meta_df = pd.read_pickle(file_path)
                print(f"{fname} loaded: shape = {meta_df.shape}")
                df_m.append(meta_df)
            except Exception as e:
                print(f"Error in {fname}:", e)

    if df_m:
        meta_df = pd.concat(df_m, ignore_index=True)
        print("All .pkl files loaded. Final shape:", meta_df.shape)
        
    print("Removed meta pkl folder")
    return meta_df
 

In [12]:
def convert_to_rev_df(folder):
    df_r = []
    for fname in sorted(os.listdir(folder)):
        if fname.endswith(".pkl"):
            try:
                file_path = os.path.join(folder, fname)
                review_df = pd.read_pickle(file_path)
                print(f"{fname} loaded: shape = {review_df.shape}")
                df_r.append(review_df)
            except Exception as e:
                print(f"Error in {fname}:", e)

    if df_r:
        review_df = pd.concat(df_r, ignore_index=True)
        print("All .pkl files loaded. Final shape:", review_df.shape)
        
    print("Removed reviews pkl folder")
    return review_df


Merging

In [13]:
# merged_df = pd.merge(
#     review_df,
#     meta_df,
#     on="parent_asin",
#     how="inner"
# )
# merged_df

Clean data

Dealing with the brand

In [14]:
def extract_brand(details, store):
    try:
        if isinstance(details, dict) and "brand" in details and details["brand"]:
            return details["brand"]
    except Exception:
        pass
    if isinstance(store, str) and store.strip():
        return store
    return "Unknown"

In [15]:
def clean_data(category):
    output_dir = r"D:/UWI/Year 3/Sem 2/COMP3610-Big-Data/Assignments/Assignment#3/A3/datasets/output_folder/cleaned"
    os.makedirs(output_dir, exist_ok=True)
    
    print("Merging review and meta...")
    merged_df = pd.merge(review_df, meta_df, on="parent_asin", how="inner")
    print("Merged")

    print("Filtering invalid ratings...")
    merged_df = merged_df[merged_df["rating"].between(1.0, 5.0, inclusive="both")]

    print("Dropping empty review text...")
    merged = merged_df[merged_df["text"].notna() & (merged_df["text"].str.strip() != "")]

    print("Extracting brand from metadata...")
    merged["brand"] = merged.apply(lambda row: extract_brand(row.get("details"), row.get("store")), axis=1)

    print("Removing duplicate reviews...")
    merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)

    print("Computing review length...")
    merged["review_length"] = merged["text"].str.split().apply(len)

    print("Extracting year from timestamp...")
    merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    output_file = os.path.join(output_dir, f"{category}_cleaned_merged.pkl.bz2")
    merged.to_pickle(output_file, compression="bz2")

    print(" All cleaning steps completed.")
    
    test = merged
    return test

In [16]:
# cleaned = clean_data()

Run for all

In [17]:
categories = ["All_Beauty", "Amazon_Fashion", "Appliances", "Arts_Crafts_and_Sewing"]

In [18]:
from pathlib import Path

for category in categories:
    base_path = r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets"
    meta_path = os.path.join(base_path, f"raw_meta_{category}.tar.bz2")
    review_path = os.path.join(base_path, f"raw_review_{category}.tar.bz2")

    # review pkled folder
    rev_pkl  = r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\Code\output_folder\reviews_pkl" # Make sure this is the folder with review .pkl batches
    meta_pkl = r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\Code\output_folder\meta_pkl"  # Make sure this is the folder with meta .pkl batches

    preprocess_category(meta_path, review_path, "output_folder", category)
    review_df = convert_to_rev_df(rev_pkl)
    meta_df = convert_to_meta_df(meta_pkl)
    cleaned = clean_data(category)
    print(cleaned)
    del cleaned
    del meta_df
    del review_df

    # remove the review and meta pkl files that aren't compressed
    if os.path.exists(rev_pkl):
        shutil.rmtree(rev_pkl)
    else:
        print(f"{rev_pkl} path does not exist")
    
    if os.path.exists(meta_pkl):
        shutil.rmtree(meta_pkl)
    else:
        print(f"{meta_pkl} path does not exist")


Extracting tar files...
Extracting D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_meta_All_Beauty.tar.bz2 to Data/temp_extract
Extracting D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_review_All_Beauty.tar.bz2 to Data/temp_extract
Found 2 Arrow files
Saved batch 0 (1000 rows) to .pkl
Saved batch 1 (1000 rows) to .pkl
Saved batch 2 (1000 rows) to .pkl
Saved batch 3 (1000 rows) to .pkl
Saved batch 4 (1000 rows) to .pkl
Saved batch 5 (1000 rows) to .pkl
Saved batch 6 (1000 rows) to .pkl
Saved batch 7 (1000 rows) to .pkl
Saved batch 8 (1000 rows) to .pkl
Saved batch 9 (1000 rows) to .pkl
Saved batch 10 (1000 rows) to .pkl
Saved batch 11 (1000 rows) to .pkl
Saved batch 12 (1000 rows) to .pkl
Saved batch 13 (1000 rows) to .pkl
Saved batch 14 (1000 rows) to .pkl
Saved batch 15 (1000 rows) to .pkl
Saved batch 16 (1000 rows) to .pkl
Saved batch 17 (1000 rows) to .pkl
Saved batch 18 (1000 rows) to .pkl
Saved batch 19 (1000 rows) to

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["brand"] = merged.apply(lambda row: extract_brand(row.get("details"), row.get("store")), axis=1)


Removing duplicate reviews...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)


Computing review length...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["review_length"] = merged["text"].str.split().apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year


Extracting year from timestamp...
 All cleaning steps completed.
        rating                                            title_x  \
0          5.0          Such a lovely scent but not overpowering.   
1          1.0                                      Not worth it.   
2          5.0                                               love   
3          5.0                           Mermaid hair in a bottle   
4          2.0  It makes my curly/wavy hair way too smooth and...   
...        ...                                                ...   
694247     5.0                                      Very natural!   
694248     5.0      A great combo of sharp, smooth, and longevity   
694249     1.0                        Made of chemicals -Returned   
694250     1.0                                           One Star   
694251     3.0                   Good variety, but no directions.   

                                                     text  \
0       This spray is really nice. It smells 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["brand"] = merged.apply(lambda row: extract_brand(row.get("details"), row.get("store")), axis=1)


Removing duplicate reviews...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)


Computing review length...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["review_length"] = merged["text"].str.split().apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year


Extracting year from timestamp...
 All cleaning steps completed.
        rating                                            title_x  \
0          4.0                                         Satisfied!   
1          1.0      These do not stay up in my shoes, despite ...   
2          1.0           This are the worst socks I’ve ever tried   
3          5.0                                        comfortable   
4          2.0  I was disappointed to discover that these don'...   
...        ...                                                ...   
384330     5.0                                          Love it!!   
384331     1.0                                        Bad quality   
384332     1.0  So I ordered this for the shape of Maryland an...   
384333     4.0      Size is as expected, but order one or two up.   
384334     5.0                                         Five Stars   

                                                     text images_x  \
0       The socks are seamless so th

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["brand"] = merged.apply(lambda row: extract_brand(row.get("details"), row.get("store")), axis=1)


Removing duplicate reviews...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)


Computing review length...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["review_length"] = merged["text"].str.split().apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year


Extracting year from timestamp...
 All cleaning steps completed.
         rating                                   title_x  \
0           4.0             Stylish For the Coffee Lover!   
1           5.0                                     Yesss   
2           5.0                                  Love it!   
3           5.0  Make sure to get the right model number.   
4           5.0                          Good replacement   
...         ...                                       ...   
1058051     5.0                                Five Stars   
1058052     5.0                                Five Stars   
1058053     5.0                         Brains & Beauty!!   
1058054     4.0                             Fit & Worked!   
1058055     5.0                               Exceptional   

                                                      text  \
0        Adorable, stylish and perfect for frothing you...   
1                                                I love it   
2              I

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:\\UWI\\Year 3\\Sem 2\\COMP3610-Big-Data\\Assignments\\Assignment#3\\A3\\Code\\output_folder\\reviews_pkl'