# COMP 3610 – A3

- Zidane Timothy, Maia Neptune, Christophe Gittens

In [60]:
# %pip install pyspark
# %pip install findspark
# %pip install -q gdown
# %pip install pandas
# %pip install matplotlib
# %pip install seaborn
# %pip install pyarrow
# %pip install setuptools

In [61]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# import `DenseVector`
from pyspark.ml.linalg import DenseVector

# import `StandardScaler`
from pyspark.ml.feature import StandardScaler


# sudo apt install python3-distutils 

In [62]:
import findspark
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil

import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np


# findspark.init()

In [63]:
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json

In [64]:
# spark = SparkSession.builder\
# .appName("Amazon_Reviews")\
# .getOrCreate()

In [65]:
review_schema = StructType([
    StructField("rating", FloatType(), True),
    StructField("title", StringType(), True),
    StructField("images", ArrayType(StringType()), True),
    StructField("asin", StringType(), True),
    StructField("parent_asin", FloatType(), True),
    StructField("user_id", ArrayType(StringType()), True),
    StructField("timestamp", IntegerType(), True),
    StructField("verified_purchase", BooleanType(), True),
    StructField("helpful_vote", StringType(), True),
])

# String types in arrays may need to be sequence but couldn't find the actual sequence dytpe syntax
meta_schema = StructType([
    StructField("main_category", StringType(), True),
    StructField("title", StringType(), True),
    StructField("average_rating", FloatType(), True),
    StructField("rating_number", IntegerType(), True),
    StructField("features", ArrayType(StringType()), True),
    StructField("description", ArrayType(StringType()), True),
    StructField("price", FloatType(), True),
    StructField("images", ArrayType(StringType()), True),
    StructField("videos", ArrayType(StringType()), True),
    StructField("store", StringType(), True),
    StructField("categories", ArrayType(StringType()), True),
    StructField("details", MapType(StringType(), IntegerType()), True),
    StructField("parent_asin", FloatType(), True),
    StructField("user_id", ArrayType(StringType()), True),
    StructField("bought_together", ArrayType(StringType()), True),
    # StructField("timestamp", IntegerType(), True),
    # StructField("verified_purchase", BooleanType(), True),
    # StructField("helpful_vote", StringType(), True),
])


In [66]:
# tar_folder = 'root/Data'
# output_folder = 'root/output_folder'
# os.makedirs(output_folder, exist_ok=True)

In [67]:
def extract_tar_bz2(tar_path, extract_dir):
    if not os.path.exists(tar_path):
        print(f"Error: File {tar_path} does not exist.")
        return
    if not tar_path.endswith(".tar.bz2"):
        print(f"Error: File {tar_path} is not a .tar.bz2 file.")
        return

    try:
        with tarfile.open(tar_path, "r:bz2") as tar:
            print(f"Extracting {tar_path} to {extract_dir}")
            tar.extractall(path=extract_dir)
    except Exception as e:
        print(f"Error during extraction: {e}")

In [None]:
import os
import shutil
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

def preprocess_category(review_tar_path, meta_tar_path, output_folder, category,batch_size=1000):
    temp_path = r"/root/Data" # change as needed
    os.makedirs(output_folder, exist_ok=True)

    print("Extracting tar files...")
    extract_tar_bz2(review_tar_path, temp_path)
    extract_tar_bz2(meta_tar_path, temp_path)

    arrow_files = list(Path(temp_path).rglob("*.arrow"))
    print(f"Found {len(arrow_files)} Arrow files")

    for arrow_file in arrow_files:
        try:
            is_meta = "meta" in str(arrow_file).lower()
            folder_name = "meta" if is_meta else "reviews"

            pkl_output_path = os.path.join(output_folder, f"{folder_name}_pkl")
            os.makedirs(pkl_output_path, exist_ok=True)

            # print(f"Streaming {arrow_file.name} → {parquet_output_path}")
            dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)

            batch = []
            seen_keys = set()
            batch_num = 0

            for i, row in enumerate(dataset):
                if not row:
                    continue

                if not is_meta:
                    key = (row.get("user_id"), row.get("asin"), row.get("text"))
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)

                batch.append(row)

                if len(batch) >= batch_size:
                    table = pa.Table.from_pylist(batch)
                    # pq.write_to_dataset(table, root_path=parquet_output_path)

                    # convert to pandas and save as .pkl batch
                    df = pd.DataFrame(batch)
                    df.to_pickle(os.path.join(pkl_output_path, f"{category}_batch_{batch_num}.pkl"))
                    print(f"Saved batch {batch_num} ({len(batch)} rows) to .pkl")
                    batch = []
                    batch_num += 1

            # Final batch
            if batch:
                table = pa.Table.from_pylist(batch)
                # pq.write_to_dataset(table, root_path=parquet_output_path)

                df = pd.DataFrame(batch)
                df.to_pickle(os.path.join(pkl_output_path, f"{category}_batch_{batch_num}.pkl"))
                print(f"Saved final batch {batch_num} ({len(batch)} rows)")

        except Exception as e:
            print(f"Error processing {arrow_file.name}: {e}")

    shutil.rmtree(temp_path)
    print("All done, temp folder removed.")


Calling fn to preprocess for a category

Run for one

In [69]:
# preprocess_category(
#     r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_meta_All_Beauty.tar.bz2",
#     r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_review_All_Beauty.tar.bz2",
#     "output_folder", category="All_Beauty"
# )

Meta and Review parsing

In [70]:
# def convert_to_meta_df(folder):
#     df_m = []

#     for fname in sorted(os.listdir(folder)):
#         if fname.endswith(".pkl"):
#             try:
#                 file_path = os.path.join(folder, fname)
#                 meta_df = pd.read_pickle(file_path)
#                 print(f"{fname} loaded: shape = {meta_df.shape}")
#                 df_m.append(meta_df)
#             except Exception as e:
#                 print(f"Error in {fname}:", e)

#     if df_m:
#         meta_df = pd.concat(df_m, ignore_index=True)
#         print("All .pkl files loaded. Final shape:", meta_df.shape)
        
#     print("Removed meta pkl folder")
#     return meta_df
 

In [71]:
def convert_to_df(folder, category):
    df_r = []
    for fname in sorted(os.listdir(folder)):
        if fname.endswith(".pkl") and category.lower() in fname.lower():
            try:
                file_path = os.path.join(folder, fname)
                review_df = pd.read_pickle(file_path)
                print(f"{fname} loaded: shape = {review_df.shape}")
                df_r.append(review_df)
            except Exception as e:
                print(f"Error in {fname}:", e)

    if df_r:
        review_df = pd.concat(df_r, ignore_index=True)
        print("All .pkl files loaded. Final shape:", review_df.shape)
        
    print("Removed reviews pkl folder")
    return review_df


Merging

In [72]:
# merged_df = pd.merge(
#     review_df,
#     meta_df,
#     on="parent_asin",
#     how="inner"
# )
# merged_df

Clean data

Dealing with the brand

In [73]:
def extract_brand(details, store):
    try:
        if isinstance(details, dict) and "brand" in details and details["brand"]:
            return details["brand"]
    except Exception:
        pass
    if isinstance(store, str) and store.strip():
        return store
    return "Unknown"

In [74]:
def clean_data(category, review_df, meta_df):
    output_dir = r"C:\Users\maian\Downloads\cleaned files"
    os.makedirs(output_dir, exist_ok=True)
    
    print("Merging review and meta...")
    merged_df = pd.merge(review_df, meta_df, on="parent_asin", how="inner")
    print("Merged")

    print("Filtering invalid ratings...")
    merged_df = merged_df[merged_df["rating"].between(1.0, 5.0, inclusive="both")]

    print("Dropping empty review text...")
    merged = merged_df[merged_df["text"].notna() & (merged_df["text"].str.strip() != "")]

    print("Extracting brand from metadata...")
    merged["brand"] = merged.apply(lambda row: extract_brand(row.get("details"), row.get("store")), axis=1)

    print("Removing duplicate reviews...")
    merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)

    print("Computing review length...")
    merged["review_length"] = merged["text"].str.split().apply(len)

    print("Extracting year from timestamp...")
    merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    output_file = os.path.join(output_dir, f"{category}_cleaned_merged.pkl.bz2")
    merged.to_pickle(output_file, compression="bz2")

    print(" All cleaning steps completed.")
    
    test = merged
    return test

In [75]:
# cleaned = clean_data()

Run for all

In [76]:
categories = ['Home_and_Kitchen', 'Industrial_and_Scientific','Kindle_Store',
              'Magazine_Subscriptions', 'Movies_and_TV', 'Unknown' ]

In [77]:
from pathlib import Path

for category in categories:
    meta_path = r"/root/Data/output_folder musical-video_games/meta_pkl"
    review_path = r"/root/Data/output_folder musical-video_games/reviews_pkl"

    # review pkled folder
    # rev_pkl  = r"/root/Data/output_folder musical-video_games/reviews_pkl" # Make sure this is the folder with review .pkl batches
    # meta_pkl = r"/root/Data/output_folder musical-video_games/meta_pkl"  # Make sure this is the folder with meta .pkl batches

    preprocess_category(meta_path, review_path, "output_folder", category)
    # review_df = convert_to_df(review_path, category)
    # meta_df = convert_to_df(meta_path, category)
    # cleaned = clean_data(category, review_df, meta_df)
    # print(cleaned)
    # del cleaned
    # del meta_df
    # del review_df

    # remove the review and meta pkl files that aren't compressed
    # if os.path.exists(rev_pkl):
    #     shutil.rmtree(rev_pkl)
    # else:
    #     print(f"{rev_pkl} path does not exist")
    
    # if os.path.exists(meta_pkl):
    #     shutil.rmtree(meta_pkl)
    # else:
    #     print(f"{meta_pkl} path does not exist")


Extracting tar files...
Error: File /root/Data/output_folder musical-video_games/meta_pkl is not a .tar.bz2 file.
Error: File /root/Data/output_folder musical-video_games/reviews_pkl is not a .tar.bz2 file.
Found 0 Arrow files
All done, temp folder removed.
Extracting tar files...
Error: File /root/Data/output_folder musical-video_games/meta_pkl is not a .tar.bz2 file.
Error: File /root/Data/output_folder musical-video_games/reviews_pkl is not a .tar.bz2 file.
Found 0 Arrow files


FileNotFoundError: [Errno 2] No such file or directory: '/root/Code/Data/temp_folder'

In [None]:
import pandas as pd

df = pd.read_pickle(r"C:\Users\maian\Downloads\Musical_Instruments_cleaned_merged.pkl.bz2", compression="bz2")


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\maian\\Downloads\\Musical_Instruments_cleaned_merged.pkl.bz2'

In [None]:
df

Unnamed: 0,rating,title_x,text,images_x,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,videos,store,categories,details,bought_together,subtitle,author,brand,review_length,year
0,5.0,UP GRADE FOR SOUND SYSTEM,I need XLR Mic cables to replace the XLR to 1/...,[],B000VJJQUU,B08DBDD6CM,AEBDA7OU4G7SO4CDX6PQ37YLMWYA,1361732421000,2,True,...,"{'title': ['Boom Mic Stand Review', 'Nice XLR ...",OnStage,"[Musical Instruments, Live Sound & Stage, Stag...","{""Brand"": ""OnStage"", ""Connector Type"": ""XLR"", ...",,,,OnStage,35,2013
1,5.0,Cables are as advertised!,Cables are as advertised!,[],B08PH8PWJ3,B08F2RDKVX,AEYES74WNLLWQ5PP3EGSROXKBCVQ,1624984116370,0,True,...,"{'title': ['1/4 Inch TRS to XLR Male Cable', '...",KINXIYU,"[Musical Instruments, Live Sound & Stage, Stag...","{""Brand"": ""KINXIYU"", ""Connector Type"": ""XLR"", ...",,,,KINXIYU,4,2021
2,5.0,Very Good!,My Grandson loves these sticks.,[],B0002F741Q,B09TXP3FSK,AHR5TLHJCNNZRSKPA3ZBAXMKOTGQ,1461637110000,0,True,...,"{'title': ['What I Think About These Sticks', ...",Vic Firth,"[Musical Instruments, Instrument Accessories, ...","{""Item Weight"": ""3.52 ounces"", ""Product Dimens...",,,,Vic Firth,5,2016
3,5.0,Rock-Solid Tuner at a Great Price,"I've used this tuner for both bass and guitar,...",[],B00O4L3F9E,B09PWCF6T9,AEVV7GP4OZ44C5E4SEDK5RHQFJGA,1513621523396,0,True,...,{'title': ['One of the best displays and super...,KLIQ Music Gear,"[Musical Instruments, Instrument Accessories, ...","{""Item Weight"": ""3.38 ounces"", ""Package Dimens...",,,,KLIQ Music Gear,33,2017
4,5.0,Five Stars,Good value.,[],B01EWU4DZQ,B077D5Y6WC,AGLHEAM4SJJT4LSSOLLDJRVTWZLQ,1498246355374,0,True,...,"{'title': ['Why choose us', 'Mountain Ark Stai...",MFL.,"[Musical Instruments, Live Sound & Stage, Ligh...","{""Manufacturer"": ""MFL."", ""Part Number"": ""MF-SC...",,,,MFL.,2,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691725,1.0,Not seeing any links for 2 months Free Classes.,Not seeing any links for 2 months Free Classes...,[],B08Y5MSQNP,B0BZLXRTSV,AHHV2KQ7IRYEC7U5EPDOEPOP7WEQ,1655516378523,0,True,...,{'title': ['Pyle Beginner Guitar Kit - Feature...,Pyle,"[Musical Instruments, Guitars, Acoustic Guitar...","{""Item Weight"": ""5.04 pounds"", ""Package Dimens...",,,,Pyle,18,2022
691726,5.0,Easy to use,We use this for multiple purposes...from pacin...,[],B01IP8C04M,B088WMVDZB,AFCVJYCZPFY5MWVYVA656FL3FIFQ,1537903964790,1,True,...,"{'title': ['Easy to use!', 'Digital Metronome ...",Ueteto,"[Musical Instruments, Instrument Accessories, ...","{""Item Weight"": ""6.7 ounces"", ""Best Sellers Ra...",,,,Ueteto,22,2018
691727,4.0,Nice guitar for the price.,So the packaging on my original order was not ...,"[{'attachment_type': 'IMAGE', 'large_image_url...",B00GV0K3GY,B07RYNLZST,AEPJMLMTHCFG6F7B3QA4UYP6YHUQ,1602125824240,2,True,...,"{'title': [], 'url': [], 'user_id': []}",Glen Burton,"[Musical Instruments, Instrument Accessories, ...","{""Item Weight"": ""1 pounds"", ""Product Dimension...",,,,Glen Burton,114,2020
691728,5.0,A must have for your acoustic guitar!,So I just ordered this over the weekend and re...,[],B01N179JSG,B01N179JSG,AEPJMLMTHCFG6F7B3QA4UYP6YHUQ,1592849308314,2,True,...,"{'title': ['Watch Video Review', 'Fishman Soun...",Guild,"[Musical Instruments, Instrument Accessories, ...","{""Item Weight"": ""2.89 ounces"", ""Product Dimens...",,,,Guild,92,2020
