# COMP 3610 – A3

- Zidane Timothy, Maia Neptune, Christophe Gittens

In [1]:
# %pip install pyspark
# %pip install findspark
# %pip install -q gdown
# %pip install pandas
# %pip install matplotlib
# %pip install seaborn
# %pip install pyarrow
# %pip install setuptools

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# import `DenseVector`
from pyspark.ml.linalg import DenseVector

# import `StandardScaler`
from pyspark.ml.feature import StandardScaler


# sudo apt install python3-distutils 

In [3]:
import findspark
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil

import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np


# findspark.init()

In [4]:
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# spark = SparkSession.builder\
# .appName("Amazon_Reviews")\
# .getOrCreate()

In [6]:
review_schema = StructType([
    StructField("rating", FloatType(), True),
    StructField("title", StringType(), True),
    StructField("images", ArrayType(StringType()), True),
    StructField("asin", StringType(), True),
    StructField("parent_asin", FloatType(), True),
    StructField("user_id", ArrayType(StringType()), True),
    StructField("timestamp", IntegerType(), True),
    StructField("verified_purchase", BooleanType(), True),
    StructField("helpful_vote", StringType(), True),
])

# String types in arrays may need to be sequence but couldn't find the actual sequence dytpe syntax
meta_schema = StructType([
    StructField("main_category", StringType(), True),
    StructField("title", StringType(), True),
    StructField("average_rating", FloatType(), True),
    StructField("rating_number", IntegerType(), True),
    StructField("features", ArrayType(StringType()), True),
    StructField("description", ArrayType(StringType()), True),
    StructField("price", FloatType(), True),
    StructField("images", ArrayType(StringType()), True),
    StructField("videos", ArrayType(StringType()), True),
    StructField("store", StringType(), True),
    StructField("categories", ArrayType(StringType()), True),
    StructField("details", MapType(StringType(), IntegerType()), True),
    StructField("parent_asin", FloatType(), True),
    StructField("user_id", ArrayType(StringType()), True),
    StructField("bought_together", ArrayType(StringType()), True),
    # StructField("timestamp", IntegerType(), True),
    # StructField("verified_purchase", BooleanType(), True),
    # StructField("helpful_vote", StringType(), True),
])


In [7]:
tar_folder = 'root/Data'
output_folder = 'root/output_folder'
os.makedirs(output_folder, exist_ok=True)

In [8]:
def extract_tar_bz2(tar_path, extract_dir):
    if not os.path.exists(tar_path):
        print(f"Error: File {tar_path} does not exist.")
        return
    if not tar_path.endswith(".tar.bz2"):
        print(f"Error: File {tar_path} is not a .tar.bz2 file.")
        return

    try:
        with tarfile.open(tar_path, "r:bz2") as tar:
            print(f"Extracting {tar_path} to {extract_dir}")
            tar.extractall(path=extract_dir)
    except Exception as e:
        print(f"Error during extraction: {e}")

In [9]:
# import os
# import shutil
# from datasets import load_dataset
# from pathlib import Path
# import pyarrow as pa
# import pyarrow.parquet as pq

# def preprocess_category(review_tar_path, meta_tar_path, output_folder, batch_size=1000):
#     temp_path = "root/Data/temp_extract"
#     if os.path.exists(temp_path):
#         shutil.rmtree(temp_path)
#     os.makedirs(temp_path, exist_ok=True)
#     os.makedirs(output_folder, exist_ok=True)

#     print("Extracting tar files...")
#     extract_tar_bz2(review_tar_path, temp_path)
#     extract_tar_bz2(meta_tar_path, temp_path)

#     arrow_files = list(Path(temp_path).rglob("*.arrow"))
#     print(f"Found {len(arrow_files)} Arrow files")

#     for arrow_file in arrow_files:
#         try:
#             is_meta = "meta" in str(arrow_file).lower()
#             output_path = os.path.join(output_folder, "meta.parquet" if is_meta else "reviews.parquet")
#             os.makedirs(output_path, exist_ok=True)

#             print(f"Streaming {arrow_file.name} → {output_path}")
#             dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)

#             # once the dataset has been loaded we can clean it HERE one time

#             batch = []
#             for i, row in enumerate(dataset):
#                 batch.append(row)
#                 if len(batch) >= batch_size:
#                     table = pa.Table.from_pylist(batch)
#                     pq.write_to_dataset(table, root_path=output_path)
#                     print(f"Wrote batch of {len(batch)} rows to {output_path}")
#                     batch = []

#             if batch:
#                 table = pa.Table.from_pylist(batch)
#                 pq.write_to_dataset(table, root_path=output_path) 
#                 print(f"Wrote final batch of {len(batch)} rows to {output_path}")

#         except Exception as e:
#             print(f"Error processing {arrow_file.name}: {e}")



In [10]:
def clean_review_row(row):
    # Drop if rating is invalid
    if "rating" not in row or not (1 <= row["rating"] <= 5):
        return None

    # Drop if review text is empty
    if not row.get("text") or not str(row["text"]).strip():
        return None

    # Brand extraction fallback
    details = row.get("details", {})
    brand = None
    if isinstance(details, dict):
        brand = details.get("brand")
    if not brand:
        brand = row.get("store")
    row["brand"] = brand or "Unknown"

    # Add review_length
    row["review_length"] = len(str(row.get("text", "")).split())

    # Add year from timestamp
    ts = row.get("timestamp")
    try:
        row["year"] = datetime.utcfromtimestamp(ts).year if ts else None
    except:
        row["year"] = None

    # Cast helpful_vote to int if needed
    if "helpful_vote" in row:
        try:
            row["helpful_vote"] = int(row["helpful_vote"])
        except:
            row["helpful_vote"] = 0

    return row

In [11]:
def clean_meta_row(row):
    # normalize nested fields to string
    complex_fields = ["features", "description", "images", "videos", "categories", "details", "bought_together"]
    for field in complex_fields:
        if field in row and isinstance(row[field], (dict, list)):
            try:
                row[field] = json.dumps(row[field])
            except:
                row[field] = None
    return row

In [12]:
# import os
# import shutil
# from datasets import load_dataset
# from pathlib import Path
# import pyarrow as pa
# import pyarrow.parquet as pq

# def preprocess_category(review_tar_path, meta_tar_path, output_folder, batch_size=1000):
#     temp_path = "root/Data/temp_extract"
#     if os.path.exists(temp_path):
#         shutil.rmtree(temp_path)
#     os.makedirs(temp_path, exist_ok=True)
#     os.makedirs(output_folder, exist_ok=True)

#     print("Extracting tar files...")
#     extract_tar_bz2(review_tar_path, temp_path)
#     extract_tar_bz2(meta_tar_path, temp_path)

#     arrow_files = list(Path(temp_path).rglob("*.arrow"))
#     print(f"Found {len(arrow_files)} Arrow files")

#     for arrow_file in arrow_files:
#             try:
#                 is_meta = "meta" in str(arrow_file).lower()
#                 output_path = os.path.join(output_folder, "meta.parquet" if is_meta else "reviews.parquet")
#                 os.makedirs(output_path, exist_ok=True)

#                 print(f"Streaming {arrow_file.name} → {output_path}")
#                 dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)

#                 batch = []
#                 seen_keys = set()  # For deduplication

#                 for i, row in enumerate(dataset):
#                     # Clean row depending on type
#                     #row = clean_meta_row(row) if is_meta else clean_review_row(row)

#                     if not row:
#                         continue

#                     # Deduplicate reviews on (user_id, asin, text)
#                     if not is_meta:
#                         key = (row.get("user_id"), row.get("asin"), row.get("text"))
#                         if key in seen_keys:
#                             continue
#                         seen_keys.add(key)

#                     batch.append(row)

#                     if len(batch) >= batch_size:
#                         table = pa.Table.from_pylist(batch)
#                         pq.write_to_dataset(table, root_path=output_path)
#                         print(f"Wrote batch of {len(batch)} rows to {output_path}")
#                         batch = []

#                 if batch:
#                     table = pa.Table.from_pylist(batch)
#                     pq.write_to_dataset(table, root_path=output_path)
#                     print(f"Wrote final batch of {len(batch)} rows to {output_path}")

#             except Exception as e:
#                 print(f"Error processing {arrow_file.name}: {e}")

#     shutil.rmtree(temp_path)

In [13]:
import os
import shutil
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

def preprocess_category(review_tar_path, meta_tar_path, output_folder, batch_size=1000):
    temp_path = "root/Data/temp_extract"
    if os.path.exists(temp_path):
        shutil.rmtree(temp_path)
    os.makedirs(temp_path, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)

    print("Extracting tar files...")
    extract_tar_bz2(review_tar_path, temp_path)
    extract_tar_bz2(meta_tar_path, temp_path)

    arrow_files = list(Path(temp_path).rglob("*.arrow"))
    print(f"Found {len(arrow_files)} Arrow files")

    for arrow_file in arrow_files:
        try:
            is_meta = "meta" in str(arrow_file).lower()
            folder_name = "meta" if is_meta else "reviews"

            parquet_output_path = os.path.join(output_folder, f"{folder_name}.parquet")
            pkl_output_path = os.path.join(output_folder, f"{folder_name}_pkl")
            os.makedirs(parquet_output_path, exist_ok=True)
            os.makedirs(pkl_output_path, exist_ok=True)

            print(f"Streaming {arrow_file.name} → {parquet_output_path}")
            dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)

            batch = []
            seen_keys = set()
            batch_num = 0

            for i, row in enumerate(dataset):
                if not row:
                    continue

                if not is_meta:
                    key = (row.get("user_id"), row.get("asin"), row.get("text"))
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)

                batch.append(row)

                if len(batch) >= batch_size:
                    table = pa.Table.from_pylist(batch)
                    pq.write_to_dataset(table, root_path=parquet_output_path)

                    # convert to pandas and save as .pkl batch
                    df = pd.DataFrame(batch)
                    df.to_pickle(os.path.join(pkl_output_path, f"batch_{batch_num}.pkl"))
                    print(f"Saved batch {batch_num} ({len(batch)} rows) to .parquet and .pkl")
                    batch = []
                    batch_num += 1

            # Final batch
            if batch:
                table = pa.Table.from_pylist(batch)
                pq.write_to_dataset(table, root_path=parquet_output_path)

                df = pd.DataFrame(batch)
                df.to_pickle(os.path.join(pkl_output_path, f"batch_{batch_num}.pkl"))
                print(f"Saved final batch {batch_num} ({len(batch)} rows)")

        except Exception as e:
            print(f"Error processing {arrow_file.name}: {e}")

    shutil.rmtree(temp_path)
    print("All done, temp folder removed.")


Calling fn to preprocess for a category

Amazon

In [None]:
preprocess_category(
    "/root/Data/raw_meta_Amazon_Fashion.tar.bz2",
    "/root/Data/raw_review_Amazon_Fashion.tar.bz2",
    "/root/Data/output_folder"
)

Extracting tar files...
Extracting /root/Data/raw_meta_Amazon_Fashion.tar.bz2 to root/Data/temp_extract


  tar.extractall(path=extract_dir)


Appliances

In [None]:
preprocess_category(
    "/root/Data/raw_meta_Appliances.tar.bz2",
    "/root/Data/raw_review_Appliances.tar.bz2",
    "/root/output_folder"
)

Extracting tar files...
Error: File root/Data/raw_meta_Appliances.tar.bz2 does not exist.
Error: File root/Datraw_review_Appliances.tar.bz2 does not exist.
Found 0 Arrow files
All done, temp folder removed.


Load the parquets

In [None]:
# os.environ["SPARK_LOCAL_IP"] = "10.17.0.5"

# reviews = pd.read_parquet(r"root/output_folder/reviews.parquet", engine="pyarrow")
# meta = pd.read_parquet(r"root/output_folder/meta.parquet")

# merged = pd.merge(reviews, meta, on="parent_asin", how="inner")
# merged.to_parquet(r"root/output_folder/merged_cleaned.parquet")

Testing - For Amazon

In [None]:
folder = "/root/output_folder/meta"  # Make sure this is the folder with .pkl batches
df_m = []

for fname in sorted(os.listdir(folder)):
    if fname.endswith(".pkl"):
        try:
            file_path = os.path.join(folder, fname)
            meta_df = pd.read_pickle(file_path)
            print(f"{fname} loaded: shape = {meta_df.shape}")
            df_m.append(meta_df)
        except Exception as e:
            print(f"Error in {fname}:", e)

if df_m:
    full_meta_Amazon_df = pd.concat(df_m, ignore_index=True)
    print("All .pkl files loaded. Final shape:", full_meta_Amazon_df.shape)
 

In [None]:
folder = "/root/output_folder/reviews"  # Make sure this is the folder with .pkl batches
df_r = []

for fname in sorted(os.listdir(folder)):
    if fname.endswith(".pkl"):
        try:
            file_path = os.path.join(folder, fname)
            review_df = pd.read_pickle(file_path)
            print(f"{fname} loaded: shape = {review_df.shape}")
            df_r.append(review_df)
        except Exception as e:
            print(f"Error in {fname}:", e)

if df_r:
    full_review_Amazon_df = pd.concat(df_r, ignore_index=True)
    print("All .pkl files loaded. Final shape:", full_review_Amazon_df.shape)


batch_177.pkl loaded: shape = (1000, 13)
batch_178.pkl loaded: shape = (1000, 13)
batch_179.pkl loaded: shape = (1000, 13)
batch_18.pkl loaded: shape = (1000, 13)
batch_180.pkl loaded: shape = (1000, 13)
batch_181.pkl loaded: shape = (1000, 13)
batch_182.pkl loaded: shape = (1000, 13)
batch_183.pkl loaded: shape = (1000, 13)
batch_184.pkl loaded: shape = (1000, 13)
batch_185.pkl loaded: shape = (1000, 13)
batch_186.pkl loaded: shape = (1000, 13)
batch_187.pkl loaded: shape = (1000, 13)
batch_188.pkl loaded: shape = (1000, 13)
batch_189.pkl loaded: shape = (1000, 13)
batch_19.pkl loaded: shape = (1000, 13)
batch_190.pkl loaded: shape = (1000, 13)
batch_191.pkl loaded: shape = (1000, 13)
batch_192.pkl loaded: shape = (1000, 13)
batch_193.pkl loaded: shape = (1000, 13)
batch_194.pkl loaded: shape = (1000, 13)
batch_195.pkl loaded: shape = (1000, 13)
batch_196.pkl loaded: shape = (1000, 13)
batch_197.pkl loaded: shape = (1000, 13)
batch_198.pkl loaded: shape = (1000, 13)
batch_199.pkl load

Merging

In [None]:
amazon_meta_review_df = pd.merge(
    full_review_Amazon_df,
    full_meta_Amazon_df,
    on="parent_asin",
    how="inner"
)
amazon_meta_review_df 

Clean data

In [None]:
# #  1. Load the reviews and metadata
# print(" Loading Parquet files...")
# reviews = pd.read_parquet("root/output_folder/reviews.parquet")
# meta = pd.read_parquet("root/output_folder/meta.parquet")
# print(f" Reviews shape: {reviews.shape}")
# print(f" Metadata shape: {meta.shape}")

# #  2. Merge on 'parent_asin'
# print(" Merging on parent_asin...")
# merged = pd.merge(reviews, meta, on="parent_asin", how="inner", suffixes=("_review", "_meta"))
# print(f" Merged shape: {merged.shape}")

# #  3a. Drop rows with invalid or missing ratings
# print(" Filtering invalid ratings...")
# merged = merged[merged["rating"].between(1.0, 5.0, inclusive="both")]

# #  3b. Drop rows with empty or missing review text
# print(" Dropping empty review text...")
# merged = merged[merged["text"].notna() & (merged["text"].str.strip() != "")]

Dealing with the brand

In [None]:
# def extract_brand(details, store):
#     try:
#         if isinstance(details, dict) and "brand" in details and details["brand"]:
#             return details["brand"]
#     except Exception:
#         pass
#     if isinstance(store, str) and store.strip():
#         return store
#     return "Unknown"

In [None]:
# #dealing with brand  data
# # 3c. Extract brand (from details or store) or set to "Unknown"
# print(" Extracting brand from metadata...")
# merged["brand"] = merged.apply(lambda row: extract_brand(row.get("details"), row.get("store")), axis=1)

# #  4. Remove duplicates: (user id, asin, text)
# print(" Removing duplicate reviews...")
# merged.drop_duplicates(subset=["user id", "asin", "text"], keep="first", inplace=True)

# #  5a. Derived column: review_length (token count)
# print(" Computing review length...")
# merged["review_length"] = merged["text"].str.split().apply(len)

# #  5b. Derived column: year (from timestamp)
# print(" Extracting year from timestamp...")
# merged["year"] = pd.to_datetime(merged["timestamp"], unit="s", errors="coerce").dt.year

# #  6. Save cleaned data
# output_path = "root/output_folder/cleaned_merged.parquet"
# print(f" Saving cleaned dataset to: {output_path}")
# merged.to_parquet(output_path, index=False)

# print(" All cleaning steps completed.")
# print(f" Final dataset shape: {merged.shape}")

Testing

In [None]:
# ds_meta = Dataset.from_file("/root/Code/root/Data/temp_extract/raw_meta_Gift_Cards/full/data-00000-of-00001.arrow")
# ds_review = Dataset.from_file("/root/Code/root/Data/temp_extract/raw_review_Gift_Cards/full/data-00000-of-00001.arrow")
# ds_review

Dead Code (?)

In [None]:
    # Combine review files
    # review_files = [f for f in arrow_files if "meta" not in str(f)]
    # combined_review_file = f"{temp_path}/combined_reviews.arrow"
    # print(combined_review_file)
    # combine_arrow_files(review_files, combined_review_file)

    # # Combine meta files
    # meta_files = [f for f in arrow_files if "meta" in str(f)]
    # combined_meta_file = f"{temp_path}/combined_meta.arrow"
    # print(combined_meta_file)
    # combine_arrow_files(meta_files, combined_meta_file)

In [None]:
# from pyspark.sql.functions import col, size, split, year, from_unixtime, when, lit
# from datasets import Dataset
# import pyarrow.json as pajson
# import pyarrow.dataset as ds
# import pyarrow as pa
# import pyarrow.parquet as pq
# import json


# def preprocess_category(review_tar_path, meta_tar_path, output_folder):
#     temp_path = "root/Data/temp_extract"
#     if os.path.exists(temp_path):
#         shutil.rmtree(temp_path)
#     os.makedirs(temp_path, exist_ok=True)

#     print("attempting to call extract function...")
#     extract_tar_bz2(review_tar_path, temp_path)
#     extract_tar_bz2(meta_tar_path, temp_path)

#     # finding the json files and reading
#     print("Finding Arrow files...")
#     arrow_files = list(Path(temp_path).rglob("*.arrow"))
#     print(f"Found Arrow files: {arrow_files}")

#     # print the length of the arrow files
#     print(len(arrow_files))
#     review_file = []
#     meta_file = []

#     # review_file.append([f for f in arrow_files if "meta" not in str(f)][0])
#     # meta_file = [f for f in arrow_files if "meta" in str(f)][0]

#     # print(review_file)
#     test_df = pd.DataFrame()

#     for arrow_file in arrow_files:
#         try:
#             table = Dataset.from_file(str(arrow_file))
#             test_df = table.to_pandas()
#             print("Successful!")
#     #         test_df = spark.createDataFrame(table.to_pandas())
#     #         if "meta" in str(arrow_file):
#     #             metadata_frames.append(df)
#     #         else:
#     #             review_frames.append(df)
#         except Exception as e:
#             print(f"Error processing file {arrow_file}: {e}")

#     print(test_df)
    

#     # Combine all metadata and review dataframes
#     # metadata_df = metadata_frames[0]
#     # for frame in metadata_frames[1:]:
#     #     metadata_df = metadata_df.union(frame)

#     # review_df = review_frames[0]
#     # for frame in review_frames[1:]:
#     #     review_df = review_df.union(frame)



#     # # Load the combined JSON files into Spark DataFrames
#     # reviews_df = spark.read.json(combined_review_file)
#     # meta_df = spark.read.json(combined_meta_file)

#     # reviews_df.show()
#     # meta_df.show()
#     # Load with pyarrow and convert to Spark DataFrame
#     # reviews_df = spark.createDataFrame(pajson.read_json(str(review_file)).to_pandas())  # Use pyarrow to read JSON
#     # meta_df = spark.createDataFrame(pajson.read_json(str(meta_file)).to_pandas())  # Use pyarrow to read JSON
#     # reviews_df = spark.read.schema(review_schema).json(str(review_file))  # Use pyarrow to read JSON
#     # meta_df = spark.read.schema(meta_schema).json(str(meta_file))  # Use pyarrow to read JSON

#     # Assuming 'asin' is present in both DataFrames, we join on it
#     # df = reviews_df.join(meta_df, on='parent_asin', how='inner') 

#     # # # load with Spark
#     # reviews_df = spark.read.json(str(review_file))
#     # meta_df = spark.read.json(str(meta_file))

#     # # print(reviews_df)
#     # reviews_df.show()

#     # # merge on 'parent_asin'
#     # df = reviews_df.join(meta_df, on='parent_asin', how='inner')

#     # # drop rows with invalid ratings or empty text
#     # df = df.filter((col("rating").between(1, 5)) &
#     #                (col("text").isNotNull()) &
#     #                (col("text") != ""))

#     # # brand logic
#     # df = df.withColumn(
#     #     "brand",
#     #     when(col("details.brand").isNotNull(), col("details.brand"))
#     #     .when(col("store").isNotNull(), col("store"))
#     #     .otherwise(lit("Unknown"))
#     # )

#     # # derived columns
#     # df = df.withColumn("review_length", size(split(col("text"), " ")))
#     # df = df.withColumn("year", year(from_unixtime(col("timestamp"))))

#     # # drop duplicates (based on user_id, asin, text)
#     # df = df.dropDuplicates(["user_id", "asin", "text"])

#     # # save as Parquet
#     # category = Path(review_tar_path).stem.replace("raw_review_", "")
#     # output_file = os.path.join(output_folder, f"cleaned_{category}.parquet")
#     # df.write.mode("overwrite").parquet(output_file)

#     # shutil.rmtree(temp_path)


In [None]:
# import json 
# def combine_json_files(json_files, output_file):
#     combined_data = []
#     for file in json_files:
#         with open(file, 'r', encoding='utf-8', errors='replace') as f:
#             try:
#                 data = json.load(f)
#                 if isinstance(data, list):  # If the JSON is an array
#                     combined_data.extend(data)
#                 else:  # If the JSON is an object
#                     combined_data.append(data)
#             except json.JSONDecodeError as e:
#                 print(f"Error decoding JSON from file {file}: {e}")
    
#     # Save the combined data to a new JSON file
#     with open(output_file, 'w', encoding='utf-8') as f:
#         json.dump(combined_data, f, indent=4)
#     print(f"Combined JSON saved to {output_file}")