# Colab Install Datasets

In [None]:
# Pin `datasets` below 3.0.0 (the install log shows 2.21.0 replacing the preinstalled 4.0.0).
# NOTE(review): presumably required because the cells below call load_dataset(...,
# trust_remote_code=True), i.e. script-based loading — confirm against the datasets changelog.
!pip install "datasets<3.0.0"

Collecting datasets<3.0.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets<3.0.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled fsspec-2025.3.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
[31mERROR: pip's d

In [None]:
# Mount Google Drive at /content/drive; later cells read and write parquet files
# under /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# EDA

## Returned Feature

In [None]:
from datasets import load_dataset
import re

# Quick feasibility check: stream the first reviews of one category and estimate
# what share look like a "return" (rating <= 3 AND at least one return keyword).

# Keywords that signal a return.
# NOTE(review): matching below is plain substring search (`word in text`), so
# "fit" already covers "didn't fit" and also hits words such as "outfit" or
# "benefit". This inflates the count; tighten the list if precision matters.
RETURN_KEYWORDS = ["returned", "returning", "sent back", "send back", "too small", "too large", "fit", "didn't fit"]

# Number of reviews to scan. Used for both the loop cut-off and the log message
# so the two can no longer drift apart.
SAMPLE_SCAN_LIMIT = 100_000

# Counter
counts = {"return_signal": 0, "total_checked": 0}

# Load stream (no download)
dataset_stream = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Clothing_Shoes_and_Jewelry",
    split="full",
    streaming=True,
    trust_remote_code=True
)

print(f"Scanning for return signals (stopping at {SAMPLE_SCAN_LIMIT:,})...")

for review in dataset_stream:
    counts["total_checked"] += 1

    # 1. Check Rating
    if review['rating'] <= 3.0:
        # Guard against a missing/empty text field, as the later cells do
        text = (review['text'] or "").lower()

        # 2. Check for Keywords
        if any(word in text for word in RETURN_KEYWORDS):
            counts["return_signal"] += 1

    # Stop once the sample is large enough to estimate a ratio
    if counts["total_checked"] >= SAMPLE_SCAN_LIMIT:
        break

# Avoid ZeroDivisionError if the stream yielded nothing
ratio = (counts["return_signal"] / counts["total_checked"]) * 100 if counts["total_checked"] > 0 else 0.0
print(f"Results: Found {counts['return_signal']} potential returns out of {counts['total_checked']}.")
print(f"Estimated Positive Class Ratio: {ratio:.2f}%")

if counts["return_signal"] < 100:
    print("WARNING: Signal is very weak. Consider 2018 dataset or broader keywords.")
else:
    print("SUCCESS: Enough data exists to build a balanced dataset!")

Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Scanning for return signals (stopping at 10k)...
Results: Found 6088 potential returns out of 100000.
Estimated Positive Class Ratio: 6.09%
SUCCESS: Enough data exists to build a balanced dataset!


## "Returned" Word Choice by Category

In [None]:
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm

# Per-category keyword report: for each category, scan up to SCAN_LIMIT reviews,
# count low-rated (<= 3 stars) reviews containing at least one keyword, and show
# how often each individual keyword was hit.

# --- CONFIGURATION ---
categories_to_test = {
    # === GROUP A: HIGH VISUAL-SUBJECTIVITY (The "Discrepancy" Group) ===
    "raw_review_Amazon_Fashion": [
        "returned", "returning", "sent back", "too small", "too large",
        "didn't fit", "doesn't fit", "ill fitting", "poor quality",
        "looks different", "color was off", "fabric", "material"
    ],
    "raw_review_Clothing_Shoes_and_Jewelry": [
        "returned", "returning", "sent back", "too small", "too large",
        "didn't fit", "doesn't fit", "ill fitting", "poor quality",
        "looks different", "color was off", "fabric", "material"
    ],
    "raw_review_Beauty_and_Personal_Care": [
        "returned", "returning", "sent back", "smell", "scent",
        "color", "consistency", "fake", "rash", "reaction",
        "texture", "looks different", "dry", "bottle broken"
    ],
    "raw_review_Home_and_Kitchen": [
        "returned", "returning", "sent back", "broken", "damaged",
        "poor quality", "cheap plastic", "missing parts", "leaked",
        "color", "smaller than expected", "wobble", "unstable"
    ],

    # === GROUP B: LOW VISUAL-SUBJECTIVITY (The "Functional" Group) ===
    "raw_review_Electronics": [
        "returned", "returning", "sent back", "stopped working", "died",
        "defective", "broken", "glitch", "compatible", "connection",
        "battery life", "plug", "refurbished", "damaged", "screen"
    ],
    "raw_review_Cell_Phones_and_Accessories": [
        "returned", "returning", "sent back", "case", "protector",
        "bubbles", "cracked", "peeled", "fit", "buttons",
        "charge", "cord", "yellowing"
    ],
    "raw_review_Tools_and_Home_Improvement": [
        "returned", "returning", "sent back", "broke", "snapped",
        "bent", "torque", "power", "battery", "flimsy",
        "missing piece", "leaked", "rust"
    ],
    "raw_review_Automotive": [
        "returned", "returning", "sent back", "fit", "install",
        "leak", "light", "bright", "dim", "code",
        "model", "wrong part", "adhesive"
    ],
    "raw_review_Sports_and_Outdoors": [
        "returned", "returning", "sent back", "uncomfortable", "grip",
        "size", "tight", "broke", "snapped", "deflated",
        "leaked", "poor quality", "slippery"
    ]
}
# Maximum number of reviews scanned per category
SCAN_LIMIT = 1000000

print("Starting Comparative Keyword Analysis...\n")

for category_name, keywords in categories_to_test.items():
    print(f"=== Analyzing: {category_name} ===")

    # Counters
    counts = {"return_signal": 0, "total_checked": 0}

    # Per-keyword hit counts, initialised to 0 for every keyword
    word_stats = {word: 0 for word in keywords}

    try:
        dataset_stream = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            category_name,
            split="full",
            streaming=True,
            trust_remote_code=True
        )

        # islice caps the scan at SCAN_LIMIT. Breaking out of the tqdm loop by
        # hand skipped the bar's final update, leaving it at 999999/1000000.
        limited_stream = islice(dataset_stream, SCAN_LIMIT)

        for review in tqdm(limited_stream, total=SCAN_LIMIT, desc=f"Scanning {category_name}"):
            counts["total_checked"] += 1

            # 1. Check Rating
            if review['rating'] <= 3.0:
                text = review['text']
                if text:
                    text = text.lower()

                    # 2. Substring-match every keyword; a review may hit several
                    matched_words = [word for word in keywords if word in text]
                    for word in matched_words:
                        word_stats[word] += 1

                    # A review counts once no matter how many keywords it hit
                    if matched_words:
                        counts["return_signal"] += 1

        # --- PRINT RESULTS ---
        # Guard the division so an empty stream reports 0% instead of raising
        ratio = (counts["return_signal"] / counts["total_checked"]) * 100 if counts["total_checked"] > 0 else 0
        print(f"\n   Total Return Signals: {counts['return_signal']} ({ratio:.2f}%)")
        print("   Keyword Breakdown:")

        # Sort words by frequency (highest to lowest) for better readability
        sorted_words = sorted(word_stats.items(), key=lambda item: item[1], reverse=True)

        for word, count in sorted_words:
            # ASCII bar: share of flagged reviews containing this keyword, scaled to 20 chars
            bar = "#" * int((count / counts["return_signal"] * 20)) if counts["return_signal"] > 0 else ""
            print(f"      - {word:<20}: {count} {bar}")

    except Exception as e:
        # Best-effort: report the failure and continue with the next category
        print(f"   Error: {e}")

    print("\n" + "="*40 + "\n")

Starting Comparative Keyword Analysis...

=== Analyzing: raw_review_Amazon_Fashion ===


Scanning raw_review_Amazon_Fashion: 100%|█████████▉| 999999/1000000 [00:21<00:00, 45609.40it/s]



   Total Return Signals: 75636 (7.56%)
   Keyword Breakdown:
      - material            : 28821 #######
      - fabric              : 16083 ####
      - too small           : 14680 ###
      - returned            : 11885 ###
      - returning           : 7209 #
      - poor quality        : 3332 
      - too large           : 2538 
      - didn't fit          : 1350 
      - doesn't fit         : 917 
      - sent back           : 609 
      - ill fitting         : 174 
      - looks different     : 71 
      - color was off       : 41 


=== Analyzing: raw_review_Clothing_Shoes_and_Jewelry ===


Scanning raw_review_Clothing_Shoes_and_Jewelry: 100%|█████████▉| 999999/1000000 [00:22<00:00, 45098.42it/s]



   Total Return Signals: 57771 (5.78%)
   Keyword Breakdown:
      - material            : 20159 ######
      - fabric              : 13365 ####
      - returned            : 10708 ###
      - too small           : 10703 ###
      - returning           : 5412 #
      - too large           : 2345 
      - poor quality        : 1932 
      - didn't fit          : 983 
      - sent back           : 593 
      - doesn't fit         : 549 
      - ill fitting         : 128 
      - looks different     : 49 
      - color was off       : 29 


=== Analyzing: raw_review_Beauty_and_Personal_Care ===


Scanning raw_review_Beauty_and_Personal_Care: 100%|█████████▉| 999999/1000000 [00:22<00:00, 44108.62it/s]



   Total Return Signals: 74795 (7.48%)
   Keyword Breakdown:
      - smell               : 26355 #######
      - color               : 20109 #####
      - dry                 : 17273 ####
      - scent               : 14517 ###
      - returned            : 3888 #
      - texture             : 3062 
      - rash                : 3050 
      - returning           : 2424 
      - consistency         : 1871 
      - fake                : 1858 
      - reaction            : 1457 
      - sent back           : 165 
      - looks different     : 26 
      - bottle broken       : 7 


=== Analyzing: raw_review_Home_and_Kitchen ===


Scanning raw_review_Home_and_Kitchen: 100%|█████████▉| 999999/1000000 [00:21<00:00, 45762.53it/s]



   Total Return Signals: 29750 (2.97%)
   Keyword Breakdown:
      - color               : 10123 ######
      - returned            : 6679 ####
      - broken              : 4915 ###
      - returning           : 3965 ##
      - damaged             : 2134 #
      - poor quality        : 1665 #
      - leaked              : 759 
      - cheap plastic       : 723 
      - wobble              : 540 
      - smaller than expected: 409 
      - sent back           : 343 
      - unstable            : 264 
      - missing parts       : 173 


=== Analyzing: raw_review_Electronics ===


Scanning raw_review_Electronics: 100%|█████████▉| 999999/1000000 [00:19<00:00, 51853.63it/s]



   Total Return Signals: 60455 (6.05%)
   Keyword Breakdown:
      - plug                : 13873 ####
      - screen              : 13107 ####
      - returned            : 9472 ###
      - connection          : 7745 ##
      - stopped working     : 5711 #
      - returning           : 5517 #
      - battery life        : 3459 #
      - broken              : 3281 #
      - defective           : 3188 #
      - died                : 3085 #
      - compatible          : 2236 
      - damaged             : 1277 
      - refurbished         : 1032 
      - glitch              : 895 
      - sent back           : 498 


=== Analyzing: raw_review_Cell_Phones_and_Accessories ===


Scanning raw_review_Cell_Phones_and_Accessories: 100%|█████████▉| 999999/1000000 [00:22<00:00, 45324.88it/s]



   Total Return Signals: 125324 (12.53%)
   Keyword Breakdown:
      - case                : 60858 #########
      - fit                 : 30891 ####
      - protector           : 23382 ###
      - charge              : 22392 ###
      - cracked             : 9242 #
      - returned            : 6668 #
      - buttons             : 6131 
      - bubbles             : 5522 
      - cord                : 4829 
      - returning           : 4768 
      - peeled              : 793 
      - sent back           : 330 
      - yellowing           : 209 


=== Analyzing: raw_review_Tools_and_Home_Improvement ===


Scanning raw_review_Tools_and_Home_Improvement: 100%|█████████▉| 999999/1000000 [00:40<00:00, 24461.67it/s]



   Total Return Signals: 36626 (3.66%)
   Keyword Breakdown:
      - broke               : 8943 ####
      - battery             : 6543 ###
      - returned            : 6264 ###
      - power               : 5316 ##
      - rust                : 4238 ##
      - returning           : 3416 #
      - flimsy              : 3040 #
      - bent                : 1647 
      - leaked              : 808 
      - snapped             : 685 
      - torque              : 592 
      - sent back           : 334 
      - missing piece       : 80 


=== Analyzing: raw_review_Automotive ===


Scanning raw_review_Automotive: 100%|█████████▉| 999999/1000000 [00:21<00:00, 46737.78it/s]



   Total Return Signals: 71335 (7.13%)
   Keyword Breakdown:
      - fit                 : 34409 #########
      - install             : 19164 #####
      - light               : 15763 ####
      - returned            : 5377 #
      - bright              : 4574 #
      - leak                : 4109 #
      - returning           : 2780 
      - model               : 2365 
      - adhesive            : 1575 
      - dim                 : 1379 
      - code                : 1227 
      - sent back           : 364 
      - wrong part          : 289 


=== Analyzing: raw_review_Sports_and_Outdoors ===


Scanning raw_review_Sports_and_Outdoors: 100%|█████████▉| 999999/1000000 [00:22<00:00, 43829.71it/s]


   Total Return Signals: 45196 (4.52%)
   Keyword Breakdown:
      - size                : 12548 #####
      - broke               : 11128 ####
      - tight               : 7886 ###
      - returned            : 6348 ##
      - returning           : 3502 #
      - grip                : 3137 #
      - uncomfortable       : 2541 #
      - poor quality        : 1514 
      - snapped             : 1299 
      - leaked              : 1197 
      - slippery            : 708 
      - deflated            : 477 
      - sent back           : 302 







Keywords seem very subjective; need to get Gemini to verify them.

## Positive Keywords Analysis

In [None]:
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm

# Positive-signal report: for each category, scan up to SCAN_LIMIT reviews, count
# 5-star reviews containing at least one "matches expectations" keyword, and show
# how often each individual keyword was hit.

# --- CONFIGURATION: POSITIVE (Low Discrepancy) KEYWORDS ---
# Phrases shared by every category; category-specific phrases are appended below
base_positive = [
    "exactly like", "just like the picture", "true to picture",
    "true to color", "better than the photo", "exactly what i was looking for",
    "looks just like", "colors are vivid"
]
categories_to_test = {
    # === GROUP A: HIGH VISUAL-SUBJECTIVITY ===
    "raw_review_Amazon_Fashion": base_positive + [
        "perfect fit", "fits perfectly", "true to size", "soft",
        "comfortable", "great material", "high quality"
    ],
    "raw_review_Clothing_Shoes_and_Jewelry": base_positive + [
        "perfect fit", "fits perfectly", "true to size", "soft",
        "comfortable", "great material", "high quality"
    ],
    "raw_review_Beauty_and_Personal_Care": base_positive + [
        "authentic", "smells great", "soft", "smooth",
        "perfect shade", "genuine", "long lasting"
    ],
    "raw_review_Home_and_Kitchen": base_positive + [
        "sturdy", "solid", "beautiful", "easy to assemble",
        "well made", "high quality", "looks great"
    ],

    # === GROUP B: LOW VISUAL-SUBJECTIVITY ===
    "raw_review_Electronics": base_positive + [
        "works perfectly", "easy to set up", "fast", "great quality",
        "sound is great", "picture is clear", "good condition"
    ],
    "raw_review_Cell_Phones_and_Accessories": base_positive + [
        "fits perfectly", "great case", "protection", "crystal clear",
        "easy to install", "durable"
    ],
    "raw_review_Tools_and_Home_Improvement": base_positive + [
        "powerful", "sturdy", "well built", "heavy duty",
        "works great", "sharp", "reliable"
    ],
    "raw_review_Automotive": base_positive + [
        "perfect fit", "easy to install", "bright", "works great",
        "exact match", "good quality"
    ],
    "raw_review_Sports_and_Outdoors": base_positive + [
        "comfortable", "sturdy", "great grip", "fun",
        "durable", "holds up well"
    ]
}

# Maximum number of reviews scanned per category
SCAN_LIMIT = 1000000

print("Starting Positive Signal Analysis (The 'Perfect Match' Report)...\n")

for category_name, pos_keywords in categories_to_test.items():
    print(f"=== Analyzing: {category_name} ===")

    # Counters
    counts = {"pos_signal": 0, "total_checked": 0}

    # Per-keyword hit counts
    pos_stats = {word: 0 for word in pos_keywords}

    try:
        # Load stream
        dataset_stream = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            category_name,
            split="full",
            streaming=True,
            trust_remote_code=True
        )

        # islice caps the scan at SCAN_LIMIT. Breaking out of the tqdm loop by
        # hand skipped the bar's final update, leaving it at 999999/1000000.
        limited_stream = islice(dataset_stream, SCAN_LIMIT)

        for review in tqdm(limited_stream, total=SCAN_LIMIT, desc="Scanning"):
            counts["total_checked"] += 1

            # Logic: 5-Star Rating AND Positive Keyword found
            if review['rating'] == 5.0:
                text = review['text']
                if text:
                    text = text.lower()

                    # Substring-match every keyword; a review may hit several
                    matched_words = [word for word in pos_keywords if word in text]
                    for word in matched_words:
                        pos_stats[word] += 1

                    # A review counts once no matter how many keywords it hit
                    if matched_words:
                        counts["pos_signal"] += 1

        # --- PRINT RESULTS ---
        ratio = (counts["pos_signal"] / counts["total_checked"]) * 100 if counts["total_checked"] > 0 else 0

        print(f"\n   [POSITIVE] Keep Signals Found: {counts['pos_signal']} ({ratio:.2f}%)")
        print("   Keyword Breakdown:")

        # Sort high to low
        sorted_stats = sorted(pos_stats.items(), key=lambda x: x[1], reverse=True)

        # Largest count is the bar-scaling denominator. Fall back to 1 when the
        # list is empty OR no keyword matched, so the division below cannot
        # raise ZeroDivisionError.
        top_count = sorted_stats[0][1] if sorted_stats else 0
        max_val = top_count if top_count > 0 else 1

        for word, count in sorted_stats:
            # Scale ASCII bar to max 20 characters
            bar_len = int((count / max_val * 20))
            bar = "#" * bar_len
            print(f"      - {word:<20}: {count} {bar}")

    except Exception as e:
        # Best-effort: report the failure and continue with the next category
        print(f"   Error: {e}")

    print("\n" + "="*40 + "\n")

Starting Positive Signal Analysis (The 'Perfect Match' Report)...

=== Analyzing: raw_review_Amazon_Fashion ===


Scanning: 100%|█████████▉| 999999/1000000 [00:35<00:00, 27972.32it/s]



   [POSITIVE] Keep Signals Found: 109015 (10.90%)
   Keyword Breakdown:
      - comfortable         : 59256 ####################
      - soft                : 35180 ###########
      - true to size        : 7694 ##
      - fits perfectly      : 6352 ##
      - perfect fit         : 5546 #
      - high quality        : 4348 #
      - exactly like        : 1682 
      - exactly what i was looking for: 1590 
      - looks just like     : 1476 
      - just like the picture: 1470 
      - great material      : 1250 
      - true to color       : 188 
      - true to picture     : 133 
      - colors are vivid    : 84 
      - better than the photo: 53 


=== Analyzing: raw_review_Clothing_Shoes_and_Jewelry ===


Scanning: 100%|█████████▉| 999999/1000000 [00:41<00:00, 24091.66it/s]



   [POSITIVE] Keep Signals Found: 178962 (17.90%)
   Keyword Breakdown:
      - comfortable         : 113828 ####################
      - soft                : 57850 ##########
      - true to size        : 15833 ##
      - perfect fit         : 6950 #
      - fits perfectly      : 6642 #
      - high quality        : 5219 
      - exactly what i was looking for: 1638 
      - exactly like        : 1358 
      - just like the picture: 1089 
      - looks just like     : 1063 
      - great material      : 1049 
      - true to color       : 244 
      - true to picture     : 90 
      - colors are vivid    : 72 
      - better than the photo: 44 


=== Analyzing: raw_review_Beauty_and_Personal_Care ===


Scanning: 100%|█████████▉| 999999/1000000 [00:45<00:00, 22190.16it/s]



   [POSITIVE] Keep Signals Found: 87963 (8.80%)
   Keyword Breakdown:
      - soft                : 49433 ####################
      - smooth              : 33406 #############
      - smells great        : 7532 ###
      - long lasting        : 3089 #
      - exactly what i was looking for: 940 
      - exactly like        : 687 
      - authentic           : 546 
      - genuine             : 478 
      - perfect shade       : 281 
      - looks just like     : 229 
      - just like the picture: 222 
      - true to color       : 144 
      - true to picture     : 26 
      - colors are vivid    : 20 
      - better than the photo: 6 


=== Analyzing: raw_review_Home_and_Kitchen ===


Scanning: 100%|█████████▉| 999999/1000000 [00:56<00:00, 17612.48it/s]



   [POSITIVE] Keep Signals Found: 121880 (12.19%)
   Keyword Breakdown:
      - sturdy              : 43152 ####################
      - beautiful           : 38549 #################
      - well made           : 20668 #########
      - looks great         : 10692 ####
      - solid               : 9063 ####
      - easy to assemble    : 7562 ###
      - high quality        : 5792 ##
      - exactly what i was looking for: 2353 #
      - exactly like        : 1161 
      - looks just like     : 947 
      - just like the picture: 898 
      - true to color       : 214 
      - colors are vivid    : 122 
      - true to picture     : 99 
      - better than the photo: 29 


=== Analyzing: raw_review_Electronics ===


Scanning: 100%|█████████▉| 999999/1000000 [00:45<00:00, 21884.34it/s]



   [POSITIVE] Keep Signals Found: 50980 (5.10%)
   Keyword Breakdown:
      - fast                : 28110 ####################
      - works perfectly     : 8191 #####
      - easy to set up      : 6653 ####
      - great quality       : 4987 ###
      - sound is great      : 1517 #
      - exactly what i was looking for: 1270 
      - exactly like        : 764 
      - good condition      : 370 
      - looks just like     : 288 
      - picture is clear    : 214 
      - just like the picture: 143 
      - colors are vivid    : 53 
      - true to color       : 50 
      - true to picture     : 20 
      - better than the photo: 10 


=== Analyzing: raw_review_Cell_Phones_and_Accessories ===


Scanning: 100%|█████████▉| 999999/1000000 [00:40<00:00, 24941.99it/s]



   [POSITIVE] Keep Signals Found: 86481 (8.65%)
   Keyword Breakdown:
      - protection          : 27098 ####################
      - durable             : 23126 #################
      - easy to install     : 18338 #############
      - great case          : 10650 #######
      - fits perfectly      : 9368 ######
      - exactly what i was looking for: 1734 #
      - crystal clear       : 1477 #
      - exactly like        : 1052 
      - looks just like     : 740 
      - just like the picture: 485 
      - true to color       : 129 
      - true to picture     : 58 
      - colors are vivid    : 25 
      - better than the photo: 19 


=== Analyzing: raw_review_Tools_and_Home_Improvement ===


Scanning: 100%|█████████▉| 999999/1000000 [00:49<00:00, 20382.99it/s]



   [POSITIVE] Keep Signals Found: 70881 (7.09%)
   Keyword Breakdown:
      - works great         : 29353 ####################
      - sturdy              : 20856 ##############
      - sharp               : 6908 ####
      - heavy duty          : 5338 ###
      - powerful            : 3890 ##
      - well built          : 2430 #
      - reliable            : 2124 #
      - exactly what i was looking for: 1642 #
      - exactly like        : 777 
      - looks just like     : 382 
      - just like the picture: 238 
      - true to color       : 46 
      - colors are vivid    : 41 
      - better than the photo: 17 
      - true to picture     : 10 


=== Analyzing: raw_review_Automotive ===


Scanning: 100%|█████████▉| 999999/1000000 [00:44<00:00, 22472.01it/s]



   [POSITIVE] Keep Signals Found: 111889 (11.19%)
   Keyword Breakdown:
      - easy to install     : 35683 ####################
      - works great         : 30275 ################
      - perfect fit         : 18943 ##########
      - bright              : 18284 ##########
      - good quality        : 13062 #######
      - exactly like        : 1308 
      - exactly what i was looking for: 982 
      - exact match         : 647 
      - looks just like     : 641 
      - just like the picture: 163 
      - true to color       : 23 
      - true to picture     : 9 
      - better than the photo: 7 
      - colors are vivid    : 7 


=== Analyzing: raw_review_Sports_and_Outdoors ===


Scanning: 100%|█████████▉| 999999/1000000 [00:39<00:00, 25114.37it/s]


   [POSITIVE] Keep Signals Found: 99466 (9.95%)
   Keyword Breakdown:
      - comfortable         : 38858 ####################
      - sturdy              : 29114 ##############
      - fun                 : 18938 #########
      - durable             : 17776 #########
      - exactly what i was looking for: 1571 
      - exactly like        : 715 
      - great grip          : 659 
      - holds up well       : 470 
      - looks just like     : 355 
      - just like the picture: 236 
      - true to color       : 48 
      - true to picture     : 34 
      - colors are vivid    : 29 
      - better than the photo: 11 







# Saving Returned Parquet

In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import os

# Harvest cell: stream each category, keep the first SAMPLES_PER_CATEGORY reviews
# that are low-rated (<= 3 stars) and contain a category keyword, then write all
# collected rows to a single parquet file on Drive.

# --- 1. CONFIGURATION: CATEGORIES & KEYWORDS ---
categories_map = {
    # === GROUP A: HIGH VISUAL-SUBJECTIVITY ===
    "raw_review_Amazon_Fashion": [
        "returned", "returning", "sent back", "too small", "too large",
        "didn't fit", "doesn't fit", "ill fitting", "poor quality",
        "looks different", "color was off", "fabric", "material"
    ],
    "raw_review_Clothing_Shoes_and_Jewelry": [
        "returned", "returning", "sent back", "too small", "too large",
        "didn't fit", "doesn't fit", "ill fitting", "poor quality",
        "looks different", "color was off", "fabric", "material"
    ],
    "raw_review_Beauty_and_Personal_Care": [
        "returned", "returning", "sent back", "smell", "scent",
        "color", "consistency", "fake", "rash", "reaction",
        "texture", "looks different", "dry", "bottle broken"
    ],
    "raw_review_Home_and_Kitchen": [
        "returned", "returning", "sent back", "broken", "damaged",
        "poor quality", "cheap plastic", "missing parts", "leaked",
        "color", "smaller than expected", "wobble", "unstable"
    ],

    # === GROUP B: LOW VISUAL-SUBJECTIVITY ===
    "raw_review_Electronics": [
        "returned", "returning", "sent back", "stopped working", "died",
        "defective", "broken", "glitch", "compatible", "connection",
        "battery life", "plug", "refurbished", "damaged", "screen"
    ],
    "raw_review_Cell_Phones_and_Accessories": [
        "returned", "returning", "sent back", "case", "protector",
        "bubbles", "cracked", "peeled", "fit", "buttons",
        "charge", "cord", "yellowing"
    ],
    "raw_review_Tools_and_Home_Improvement": [
        "returned", "returning", "sent back", "broke", "snapped",
        "bent", "torque", "power", "battery", "flimsy",
        "missing piece", "leaked", "rust"
    ],
    "raw_review_Automotive": [
        "returned", "returning", "sent back", "fit", "install",
        "leak", "light", "bright", "dim", "code",
        "model", "wrong part", "adhesive"
    ],
    "raw_review_Sports_and_Outdoors": [
        "returned", "returning", "sent back", "uncomfortable", "grip",
        "size", "tight", "broke", "snapped", "deflated",
        "leaked", "poor quality", "slippery"
    ]
}

# --- 2. SETTINGS ---
# 50,000 per category x 9 categories = up to 450,000 total rows.
SAMPLES_PER_CATEGORY = 50000
OUTPUT_FILENAME = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_ALL.parquet"

# Fail fast: verify the destination folder exists BEFORE the long harvest, rather
# than discovering a missing/unmounted Drive only when to_parquet() runs at the end.
# (Deliberately not created here: making folders under an unmounted /content/drive
# could interfere with mounting Drive afterwards.)
output_dir = os.path.dirname(OUTPUT_FILENAME)
if not os.path.isdir(output_dir):
    raise FileNotFoundError(f"Output directory not found (is Google Drive mounted?): {output_dir}")

# Buffer of row dicts; converted to a DataFrame once at the end
all_data = []

print(f"🚀 Starting Harvest: Target is {SAMPLES_PER_CATEGORY} returns per category...")

# --- 3. THE HARVEST LOOP ---
for category_name, keywords in categories_map.items():
    print(f"\n--> Streaming: {category_name}")

    try:
        # Load the stream
        dataset_stream = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            category_name,
            split="full",
            streaming=True,
            trust_remote_code=True
        )

        count = 0
        # The context manager closes the progress bar even if streaming raises mid-category
        with tqdm(total=SAMPLES_PER_CATEGORY, desc=f"   Collecting {category_name}") as pbar:
            for review in dataset_stream:
                # FILTER: Low Rating (<=3) AND Keyword Match
                if review['rating'] <= 3.0 and review['text']:
                    text_lower = review['text'].lower()

                    # First keyword (in list order) found as a substring, or None
                    trigger = next((w for w in keywords if w in text_lower), None)

                    if trigger:
                        # Capture the data
                        all_data.append({
                            "parent_asin": review['parent_asin'],
                            "user_id": review['user_id'],
                            "rating": review['rating'],
                            "text": review['text'],
                            "title": review.get('title', ''),
                            "category": category_name,
                            "trigger_keyword": trigger,
                            "label": 1
                        })

                        count += 1
                        pbar.update(1)

                # Stop when we hit the limit for this category
                if count >= SAMPLES_PER_CATEGORY:
                    break

    except Exception as e:
        # Best-effort: rows already collected for this category stay in all_data
        print(f"   [!] Error with {category_name}: {e}")

# --- 4. SAVE TO DISK ---
print(f"\nProcessing complete. Total rows collected: {len(all_data)}")

if len(all_data) > 0:
    df = pd.DataFrame(all_data)

    # Save as Parquet
    df.to_parquet(OUTPUT_FILENAME, index=False)

    print(f"✅ Success! Saved to {OUTPUT_FILENAME}")
    print("   Columns:", df.columns.tolist())
    print("   Breakdown by Category:")
    print(df['category'].value_counts())
else:
    print("⚠️ No data collected. Check internet connection or keywords.")

🚀 Starting Harvest: Target is 50000 returns per category...

--> Streaming: raw_review_Amazon_Fashion


Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

   Collecting raw_review_Amazon_Fashion: 100%|██████████| 50000/50000 [00:53<00:00, 938.17it/s] 



--> Streaming: raw_review_Clothing_Shoes_and_Jewelry


   Collecting raw_review_Clothing_Shoes_and_Jewelry: 100%|██████████| 50000/50000 [02:04<00:00, 402.58it/s]



--> Streaming: raw_review_Beauty_and_Personal_Care


   Collecting raw_review_Beauty_and_Personal_Care: 100%|██████████| 50000/50000 [01:24<00:00, 592.09it/s]



--> Streaming: raw_review_Home_and_Kitchen


   Collecting raw_review_Home_and_Kitchen: 100%|██████████| 50000/50000 [03:57<00:00, 210.67it/s]



--> Streaming: raw_review_Electronics


   Collecting raw_review_Electronics: 100%|██████████| 50000/50000 [01:43<00:00, 480.88it/s] 



--> Streaming: raw_review_Cell_Phones_and_Accessories


   Collecting raw_review_Cell_Phones_and_Accessories: 100%|██████████| 50000/50000 [00:55<00:00, 898.35it/s] 



--> Streaming: raw_review_Tools_and_Home_Improvement


   Collecting raw_review_Tools_and_Home_Improvement: 100%|██████████| 50000/50000 [03:36<00:00, 230.86it/s]



--> Streaming: raw_review_Automotive


   Collecting raw_review_Automotive: 100%|██████████| 50000/50000 [01:37<00:00, 510.40it/s]



--> Streaming: raw_review_Sports_and_Outdoors


   Collecting raw_review_Sports_and_Outdoors: 100%|██████████| 50000/50000 [02:31<00:00, 329.16it/s]



Processing complete. Total rows collected: 450000
✅ Success! Saved to /content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_ALL.parquet
   Columns: ['parent_asin', 'user_id', 'rating', 'text', 'title', 'category', 'trigger_keyword', 'label']
   Breakdown by Category:
category
raw_review_Amazon_Fashion                 50000
raw_review_Clothing_Shoes_and_Jewelry     50000
raw_review_Beauty_and_Personal_Care       50000
raw_review_Home_and_Kitchen               50000
raw_review_Electronics                    50000
raw_review_Cell_Phones_and_Accessories    50000
raw_review_Tools_and_Home_Improvement     50000
raw_review_Automotive                     50000
raw_review_Sports_and_Outdoors            50000
Name: count, dtype: int64


In [None]:
import pandas as pd

# Sanity-check cell: reload the harvested "returned" parquet from Drive and
# print its shape, column names, first rows and per-category row counts.
filename = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_ALL.parquet"


# 1. Load the file
df = pd.read_parquet(filename)

# 2. Check the dimensions (Rows, Columns)
print(f"✅ Loaded: {filename}")
print(f"📊 Shape: {df.shape} (Rows, Columns)")
print(f"features: {df.columns.tolist()}\n")

# 3. View the first 5 rows
# NOTE: pd.set_option changes the display settings for the rest of the kernel
# session, not just for this cell.
print("--- First 5 Rows ---")
pd.set_option('display.max_columns', None)  # Ensure all columns are visible
pd.set_option('display.max_colwidth', 100)  # Expand column width to see text
print(df.head())

# Verify how many rows each category contributed
print("\n--- Category Count ---")
print(df['category'].value_counts())


✅ Loaded: /content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_ALL.parquet
📊 Shape: (450000, 8) (Rows, Columns)
features: ['parent_asin', 'user_id', 'rating', 'text', 'title', 'category', 'trigger_keyword', 'label']

--- First 5 Rows ---
  parent_asin                       user_id  rating  \
0  B089PWHFVW  AHREXOGQPZDA6354MHH4ETSF3MCQ     3.0   
1  B07DXV99Z7  AHX4XWVVQUKT3FCNWCVASDF4Q56Q     1.0   
2  B076H322XJ  AEVZVWZP3S2J57XSTFN5LPQNOONA     3.0   
3  B095CHR38Z  AFZUK3MTBIBEDQOPAK3OATUOUKLA     2.0   
4  B074XF4KG4  AFZUK3MTBIBEDQOPAK3OATUOUKLA     1.0   

                                                                                                  text  \
0  If you are large chested, this isn’t for you.  The pleated bust area, did not cover my chest.  T...   
1                   Run VERY small , cheap material and not slit up the leg like pix shows... returned   
2  I am 5'4&#34;/140 lbs. and ordered the small. 

# Saving "Perfect Item/Positive" Parquet

In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import os

# --- 1. CONFIGURATION: POSITIVE KEYWORDS (Visual/Semantic Match) ---
# Phrases that count as a "matches the listing" signal in every category.
base_positive = [
    "exactly like",
    "just like the picture",
    "true to picture",
    "true to color",
    "better than the photo",
    "exactly what i was looking for",
    "looks just like",
    "colors are vivid",
]

# The two apparel categories use the same category-specific phrases.
_apparel_extras = [
    "perfect fit", "fits perfectly", "true to size", "soft",
    "comfortable", "great material", "high quality",
]

# Category-specific "perfection" phrases. Key order is the harvest order and
# list order decides which keyword is recorded as the trigger, so both are kept.
_extras_by_category = {
    # === GROUP A: HIGH VISUAL-SUBJECTIVITY ===
    "raw_review_Amazon_Fashion": _apparel_extras,
    "raw_review_Clothing_Shoes_and_Jewelry": _apparel_extras,
    "raw_review_Beauty_and_Personal_Care": [
        "authentic", "smells great", "soft", "smooth",
        "perfect shade", "genuine", "long lasting",
    ],
    "raw_review_Home_and_Kitchen": [
        "sturdy", "solid", "beautiful", "easy to assemble",
        "well made", "high quality", "looks great",
    ],

    # === GROUP B: LOW VISUAL-SUBJECTIVITY ===
    "raw_review_Electronics": [
        "works perfectly", "easy to set up", "fast", "great quality",
        "sound is great", "picture is clear", "good condition",
    ],
    "raw_review_Cell_Phones_and_Accessories": [
        "fits perfectly", "great case", "protection",
        "crystal clear", "easy to install", "durable",
    ],
    "raw_review_Tools_and_Home_Improvement": [
        "powerful", "sturdy", "well built", "heavy duty",
        "works great", "sharp", "reliable",
    ],
    "raw_review_Automotive": [
        "perfect fit", "easy to install", "bright",
        "works great", "exact match", "good quality",
    ],
    "raw_review_Sports_and_Outdoors": [
        "comfortable", "sturdy", "great grip",
        "fun", "durable", "holds up well",
    ],
}

# Final map: shared phrases first, then the category's own phrases.
# Each value is a fresh list, so no two categories share a list object.
categories_map_positive = {
    category: base_positive + extras
    for category, extras in _extras_by_category.items()
}

# --- 2. SETTINGS ---
# Matching the Negative Dataset size for statistical balance
SAMPLES_PER_CATEGORY = 50000
OUTPUT_FILENAME = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_ALL.parquet"

# Buffer: one dict per accepted review, turned into a DataFrame once at the end
all_data = []

print(f"🚀 Starting POSITIVE Harvest: Target is {SAMPLES_PER_CATEGORY} perfect matches per category...")

# --- 3. THE HARVEST LOOP ---
# For each category, stream reviews and keep the first SAMPLES_PER_CATEGORY
# 5-star reviews whose lowercased text contains one of the category's keywords.
#
# NOTE(review): keywords are matched as plain substrings, so short ones can hit
# inside longer words (e.g. "fun" in "refund", "fast" in "breakfast"). Left
# unchanged so the harvested dataset stays comparable; consider word-boundary
# matching if the dataset is regenerated.
for category_name, keywords in categories_map_positive.items():
    print(f"\n--> Streaming: {category_name}")

    # Reset before the try so a failed load cannot leave the previous
    # category's count in place for the shortfall check below.
    count = 0

    try:
        # Load the stream
        dataset_stream = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            category_name,
            split="full",
            streaming=True,
            trust_remote_code=True
        )

        # The context manager closes the progress bar even if the stream raises.
        with tqdm(total=SAMPLES_PER_CATEGORY, desc=f"   Collecting {category_name}") as pbar:
            for review in dataset_stream:
                # FILTER: High Rating (5.0) AND Positive Keyword Match
                if review['rating'] == 5.0 and review['text']:
                    text_lower = review['text'].lower()

                    # First keyword (in list order) found in the text, if any
                    trigger = next((w for w in keywords if w in text_lower), None)

                    if trigger:
                        # Capture the data
                        all_data.append({
                            "parent_asin": review['parent_asin'], # Key for Images
                            "user_id": review['user_id'],
                            "rating": review['rating'],
                            "text": review['text'],
                            "title": review.get('title', ''),
                            "category": category_name,
                            "trigger_keyword": trigger,
                            "label": 0                        # 0 = Low Risk / Perfect Match
                        })

                        count += 1
                        pbar.update(1)

                # Stop when we hit the limit for this category
                if count >= SAMPLES_PER_CATEGORY:
                    break

    except Exception as e:
        # Best-effort: rows already collected for this category are kept.
        print(f"   [!] Error with {category_name}: {e}")

    # Report a shortfall (stream exhausted or interrupted) so an unbalanced
    # category is visible in the cell output.
    if count < SAMPLES_PER_CATEGORY:
        print(f"   [!] {category_name}: collected only {count:,} of {SAMPLES_PER_CATEGORY:,} rows.")

# --- 4. SAVE TO DISK ---
print(f"\nProcessing complete. Total rows collected: {len(all_data)}")

if len(all_data) > 0:
    df = pd.DataFrame(all_data)

    # Save as Parquet
    df.to_parquet(OUTPUT_FILENAME, index=False)

    print(f"✅ Success! Saved to {OUTPUT_FILENAME}")
    print("   Columns:", df.columns.tolist())
    print("   Breakdown by Category:")
    print(df['category'].value_counts())
else:
    print("⚠️ No data collected. Check internet connection or keywords.")

🚀 Starting POSITIVE Harvest: Target is 50000 perfect matches per category...

--> Streaming: raw_review_Amazon_Fashion


Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

   Collecting raw_review_Amazon_Fashion: 100%|██████████| 50000/50000 [00:16<00:00, 2967.89it/s]



--> Streaming: raw_review_Clothing_Shoes_and_Jewelry


   Collecting raw_review_Clothing_Shoes_and_Jewelry: 100%|██████████| 50000/50000 [00:14<00:00, 3472.61it/s]



--> Streaming: raw_review_Beauty_and_Personal_Care


   Collecting raw_review_Beauty_and_Personal_Care: 100%|██████████| 50000/50000 [00:26<00:00, 1900.20it/s]



--> Streaming: raw_review_Home_and_Kitchen


   Collecting raw_review_Home_and_Kitchen: 100%|██████████| 50000/50000 [00:29<00:00, 1677.74it/s]



--> Streaming: raw_review_Electronics


   Collecting raw_review_Electronics: 100%|██████████| 50000/50000 [00:45<00:00, 1110.77it/s]



--> Streaming: raw_review_Cell_Phones_and_Accessories


   Collecting raw_review_Cell_Phones_and_Accessories: 100%|██████████| 50000/50000 [00:27<00:00, 1830.78it/s]



--> Streaming: raw_review_Tools_and_Home_Improvement


   Collecting raw_review_Tools_and_Home_Improvement: 100%|██████████| 50000/50000 [00:33<00:00, 1490.66it/s]



--> Streaming: raw_review_Automotive


   Collecting raw_review_Automotive: 100%|██████████| 50000/50000 [00:14<00:00, 3501.16it/s]



--> Streaming: raw_review_Sports_and_Outdoors


   Collecting raw_review_Sports_and_Outdoors: 100%|██████████| 50000/50000 [00:31<00:00, 1611.67it/s]



Processing complete. Total rows collected: 450000
✅ Success! Saved to /content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_ALL.parquet
   Columns: ['parent_asin', 'user_id', 'rating', 'text', 'title', 'category', 'trigger_keyword', 'label']
   Breakdown by Category:
category
raw_review_Amazon_Fashion                 50000
raw_review_Clothing_Shoes_and_Jewelry     50000
raw_review_Beauty_and_Personal_Care       50000
raw_review_Home_and_Kitchen               50000
raw_review_Electronics                    50000
raw_review_Cell_Phones_and_Accessories    50000
raw_review_Tools_and_Home_Improvement     50000
raw_review_Automotive                     50000
raw_review_Sports_and_Outdoors            50000
Name: count, dtype: int64


In [None]:
import pandas as pd

# Sanity-check cell: reload the harvested "positive" parquet from Drive and
# print its shape, first rows, the distinct label/rating values and the
# per-category row counts. Any failure is printed instead of raised.
filename = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_ALL.parquet"


try:
    # 1. Load
    df = pd.read_parquet(filename)

    # 2. Basic Stats
    print(f"✅ Loaded: {filename}")
    print(f"📊 Shape: {df.shape}")
    print(f"   (Should be ~450,000 rows if all categories hit the target)\n")

    # 3. View Content
    # NOTE: pd.set_option changes the display settings for the rest of the
    # kernel session, not just for this cell.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', 100)
    print("--- First 5 Rows (Check if text is positive) ---")
    print(df.head())

    # 4. Verify Integrity
    # The harvest cell writes label 0 and only accepts rating == 5.0, so any
    # other value printed here means the file is not the one that cell wrote.
    print("\n--- Integrity Check ---")
    print(f"Unique Labels (Should only be [0]): {df['label'].unique()}")
    print(f"Unique Ratings (Should only be [5.0]): {df['rating'].unique()}")

    # 5. Category Breakdown
    print("\n--- Category Counts ---")
    print(df['category'].value_counts())

except FileNotFoundError:
    print(f"❌ Error: File not found at {filename}")
except Exception as e:
    # Catch-all so the cell reports the problem without a traceback.
    print(f"❌ Error: {e}")

✅ Loaded: /content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_ALL.parquet
📊 Shape: (450000, 8)
   (Should be ~450,000 rows if all categories hit the target)

--- First 5 Rows (Check if text is positive) ---
  parent_asin                       user_id  rating  \
0  B00LOPVX74  AGBFYI2DDIKXC5Y4FARTYDTQBMFQ     5.0   
1  B097DQPCP2  AHREXOGQPZDA6354MHH4ETSF3MCQ     5.0   
2  B092J4ZT1V  AHREXOGQPZDA6354MHH4ETSF3MCQ     5.0   
3  B07C3P1DKX  AH6CATODIVPVUOJEWHRSRCSKAOHA     5.0   
4  B0856TH4LK  AFSKPY37N3C43SOI5IEXEK5JSIYA     5.0   

                                                                                                  text  \
0  I think this locket is really pretty. The inside back is a solid silver depression and the front...   
1  I really like this Tshirt.  Quality fabric.  Soft.  The puffed sleeves add interest.  I ordered ...   
2  I received this T the other day.  I took it out of the package and was insta

## Gemini Sub Review

# Removing Duplicate Reviews

In [None]:
import pandas as pd
import os

# --- CONFIGURATION (Input Paths) ---
file_neg_in = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_ALL.parquet"
file_pos_in = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_ALL.parquet"

# --- CONFIGURATION (Output Paths for Deduped Files) ---
# Same folder as the inputs; only the filename suffix changes.
file_neg_out, file_pos_out = (
    source_path.replace("_ALL.parquet", "_DEDUPED.parquet")
    for source_path in (file_neg_in, file_pos_in)
)

print("🔄 Loading Raw Datasets...")
df_neg = pd.read_parquet(file_neg_in)
df_pos = pd.read_parquet(file_pos_in)

initial_neg = len(df_neg)
initial_pos = len(df_pos)
print(f"   Raw Negative Rows: {initial_neg:,}")
print(f"   Raw Positive Rows: {initial_pos:,}")

# A review counts as a duplicate when the same user posted the same text.
dedup_keys = ['user_id', 'text']

# --- STEP 1: DEDUPLICATE NEGATIVE (RETURNS) ---
print("\n🧹 Processing Negative Dataset...")
df_neg_clean = df_neg.drop_duplicates(subset=dedup_keys, keep='first')
removed_neg = initial_neg - len(df_neg_clean)

print(f"   Removed: {removed_neg:,} duplicates")
print(f"   Remaining: {len(df_neg_clean):,}")
df_neg_clean.to_parquet(file_neg_out, index=False)
print(f"   ✅ Saved to: {os.path.basename(file_neg_out)}")

# --- STEP 2: DEDUPLICATE POSITIVE (PERFECT) ---
print("\n🧹 Processing Positive Dataset...")
df_pos_clean = df_pos.drop_duplicates(subset=dedup_keys, keep='first')
removed_pos = initial_pos - len(df_pos_clean)

print(f"   Removed: {removed_pos:,} duplicates")
print(f"   Remaining: {len(df_pos_clean):,}")
df_pos_clean.to_parquet(file_pos_out, index=False)
print(f"   ✅ Saved to: {os.path.basename(file_pos_out)}")

🔄 Loading Raw Datasets...
   Raw Negative Rows: 450,000
   Raw Positive Rows: 450,000

🧹 Processing Negative Dataset...
   Removed: 2,706 duplicates
   Remaining: 447,294
   ✅ Saved to: thesis_dataset_returned_DEDUPED.parquet

🧹 Processing Positive Dataset...
   Removed: 8,488 duplicates
   Remaining: 441,512
   ✅ Saved to: thesis_dataset_positive_DEDUPED.parquet


# Removing Short Reviews

In [None]:
import pandas as pd
import os

# --- CONFIGURATION (Input: The Full Deduped Files) ---
file_neg_in = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_DEDUPED.parquet"
file_pos_in = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_DEDUPED.parquet"

# --- CONFIGURATION (Output: The New "Clean" Files) ---
# Derived from the input paths by swapping the filename suffix.
file_neg_out, file_pos_out = (
    deduped_path.replace("_DEDUPED.parquet", "_DEDUPED_NO_SHORT.parquet")
    for deduped_path in (file_neg_in, file_pos_in)
)

# Read both deduplicated review sets back from Drive.
print("🔄 Loading Deduped Datasets...")
df_neg = pd.read_parquet(file_neg_in)
df_pos = pd.read_parquet(file_pos_in)

# Define Threshold (Reviews must have >= 4 words)
MIN_WORD_COUNT = 4

# --- FUNCTION TO FILTER & REPORT ---
def filter_short_text(df, dataset_name):
    """Drop reviews with fewer than MIN_WORD_COUNT whitespace-separated words.

    The caller's frame is left untouched: the word count is computed on the
    side and attached to a new frame, so re-running the cell or reusing `df`
    afterwards does not see an extra column.

    Args:
        df: DataFrame with a 'text' column. Each value is passed through
            str() before splitting, so a missing value counts as one word.
        dataset_name: Label used in the printed report.

    Returns:
        A new DataFrame containing the rows with at least MIN_WORD_COUNT
        words, plus a 'word_count' column. A new frame is returned even when
        no row is removed.
    """
    # Whitespace-token count per review
    word_counts = df['text'].apply(lambda x: len(str(x).split()))

    # assign() returns a new frame, so the input is not mutated
    df_counted = df.assign(word_count=word_counts)

    # Identify short ones
    mask_short = df_counted['word_count'] < MIN_WORD_COUNT
    short_count = mask_short.sum()

    print(f"\n🔍 Analyzing {dataset_name}...")
    print(f"   Total Rows: {len(df_counted):,}")
    print(f"   Found {short_count:,} reviews < {MIN_WORD_COUNT} words.")

    if short_count > 0:
        # Keep only long enough reviews
        df_filtered = df_counted[~mask_short].copy()
        print(f"   Removed {short_count:,} rows.")
        print(f"   Final Count: {len(df_filtered):,}")
        return df_filtered

    print("   No short reviews found.")
    return df_counted

# --- APPLY FILTERS ---
# Negative set first, then positive, so the printed reports keep their order.
df_neg_final = filter_short_text(df_neg, "Negative (Return)")
df_pos_final = filter_short_text(df_pos, "Positive (Perfect)")

# --- FINAL SAVE ---
divider = "=" * 40
print("\n" + divider)
print("💾 SAVING NEW 'NO_SHORT' DATASETS...")

# Write both filtered frames before reporting either as saved.
for cleaned_frame, destination in (
    (df_neg_final, file_neg_out),
    (df_pos_final, file_pos_out),
):
    cleaned_frame.to_parquet(destination, index=False)

print(f"✅ Saved CLEAN Negative data to:\n   {os.path.basename(file_neg_out)}")
print(f"✅ Saved CLEAN Positive data to:\n   {os.path.basename(file_pos_out)}")

# Check Class Balance Ratio (reported as 0 when the positive set is empty)
if len(df_pos_final) > 0:
    ratio = len(df_neg_final) / len(df_pos_final)
else:
    ratio = 0
print(f"\n   Final Balance Ratio (Neg/Pos): {ratio:.2f}")
print(divider)

🔄 Loading Deduped Datasets...

🔍 Analyzing Negative (Return)...
   Total Rows: 447,294
   Found 7,126 reviews < 4 words.
   Removed 7,126 rows.
   Final Count: 440,168

🔍 Analyzing Positive (Perfect)...
   Total Rows: 441,512
   Found 14,205 reviews < 4 words.
   Removed 14,205 rows.
   Final Count: 427,307

💾 SAVING NEW 'NO_SHORT' DATASETS...
✅ Saved CLEAN Negative data to:
   thesis_dataset_returned_DEDUPED_NO_SHORT.parquet
✅ Saved CLEAN Positive data to:
   thesis_dataset_positive_DEDUPED_NO_SHORT.parquet

   Final Balance Ratio (Neg/Pos): 1.03


# Get Metadata, Descriptions, and Image URLs

In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import os

# --- 1. CONFIGURATION ---
# Inputs are the deduplicated review sets (before the short-review filter).
file_neg = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_DEDUPED.parquet"
file_pos = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_DEDUPED.parquet"

output_meta_file = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_metadata_images.parquet"

print("🔄 Loading Target IDs from both datasets...")
try:
    df_neg = pd.read_parquet(file_neg)
    df_pos = pd.read_parquet(file_pos)

    # An item is identified by its ASIN together with the review category.
    key_columns = ['parent_asin', 'category']
    targets_neg = df_neg[key_columns].drop_duplicates()
    targets_pos = df_pos[key_columns].drop_duplicates()

    # Master list: every distinct (ASIN, category) pair seen in either set
    df_targets = pd.concat([targets_neg, targets_pos]).drop_duplicates()

    # Set of (parent_asin, category) tuples for fast membership checks
    target_set = {
        (asin, category)
        for asin, category in zip(df_targets['parent_asin'], df_targets['category'])
    }

    print(f"✅ Targets Loaded.")
    print(f"   Unique Items to Find: {len(target_set):,}")
    print(f"   (Derived from {len(df_neg):,} Returns + {len(df_pos):,} Perfect Matches)")

except Exception as e:
    # Leave an empty target set so the scanning step is skipped.
    print(f"❌ Error loading files: {e}")
    target_set = set()

# --- 2. THE META SCANNING LOOP ---
def _usable_urls(candidates):
    """Return the non-empty string URLs in `candidates` (a str, list/tuple, or None)."""
    if isinstance(candidates, str):
        return [candidates] if candidates else []
    if isinstance(candidates, (list, tuple)):
        return [u for u in candidates if isinstance(u, str) and u]
    return []


def _join_text(value):
    """Collapse a metadata text field to one string; None when the field is empty."""
    if not value:
        return None
    if isinstance(value, (list, tuple)):
        # Skip None entries so one bad element cannot abort the whole category
        return " ".join(str(part) for part in value if part is not None)
    return value


if len(target_set) > 0:
    # Identify which categories we need to scan
    categories_to_scan = df_targets['category'].unique()

    meta_buffer = []

    print(f"\n🚀 Starting Meta Scan across {len(categories_to_scan)} categories...")

    for review_cat in categories_to_scan:
        # Convert "raw_review_X" -> "raw_meta_X"
        meta_cat = review_cat.replace("raw_review", "raw_meta")
        print(f"\n--> Scanning Metadata: {meta_cat}")

        # ASINs of this category that have not been located yet. Removing each
        # one as it is found lets the scan stop early and skips repeated
        # entries (which the final drop_duplicates would discard anyway).
        remaining = {asin for asin, cat in target_set if cat == review_cat}

        try:
            # Load the meta stream
            dataset_stream = load_dataset(
                "McAuley-Lab/Amazon-Reviews-2023",
                meta_cat,
                split="full",
                streaming=True,
                trust_remote_code=True
            )

            found_count = 0

            # The context manager closes the bar on early exit or error.
            with tqdm(dataset_stream, desc=f"   Searching {meta_cat}") as progress:
                for item in progress:
                    asin = item['parent_asin']

                    # Check if this item is in our target list
                    if asin not in remaining:
                        continue
                    remaining.discard(asin)

                    # 1. Get Best Image: prefer 'hi_res', fall back to 'large'.
                    # Empty entries are dropped first, so a 'hi_res' value with
                    # no usable URL no longer blocks the fallback, and an item
                    # with no usable URL is stored as None (counted as missing
                    # in the quality report below).
                    # NOTE(review): assumes `images` is a dict keyed by size,
                    # as the original .get('hi_res') call implies.
                    image_url = None
                    images = item.get('images')
                    if isinstance(images, dict):
                        image_url = (
                            _usable_urls(images.get('hi_res'))
                            or _usable_urls(images.get('large'))
                            or None
                        )

                    # 2. Get Description
                    desc = _join_text(item.get('description'))

                    # 3. Get Features (Bullet Points)
                    feats = _join_text(item.get('features'))

                    # 4. Get Title
                    title = item.get('title', '')

                    # Save it
                    meta_buffer.append({
                        "parent_asin": asin,
                        "category": review_cat, # Keep review category name for easy joining
                        "title": title,
                        "image_url": image_url,
                        "description": desc,
                        "features": feats
                    })

                    found_count += 1

                    # Every target of this category is located: stop streaming
                    if not remaining:
                        break

            print(f"   Found metadata for {found_count} items in this category.")
            if remaining:
                print(f"   [!] {len(remaining):,} target items were not present in {meta_cat}.")

        except Exception as e:
            # Best-effort: items already buffered for this category are kept.
            print(f"   [!] Error loading {meta_cat}: {e}")

    # --- 3. SAVE THE META DATABASE ---
    print("\n" + "="*40)
    print("💾 Saving Metadata...")

    if len(meta_buffer) > 0:
        df_meta = pd.DataFrame(meta_buffer)

        df_meta = df_meta.drop_duplicates(subset=['parent_asin', 'category'])

        df_meta.to_parquet(output_meta_file, index=False)

        print(f"✅ Success! Saved {len(df_meta):,} items to:")
        print(f"   {output_meta_file}")

        # Quality Check
        missing_imgs = df_meta['image_url'].isnull().sum()
        print(f"\n   Quality Report:")
        print(f"   Items with Images: {len(df_meta) - missing_imgs:,}")
        print(f"   Items missing Images: {missing_imgs:,}")
    else:
        print("⚠️ Warning: No metadata found. Check your ASIN matching logic.")

🔄 Loading Target IDs from both datasets...
✅ Targets Loaded.
   Unique Items to Find: 578,489
   (Derived from 447,294 Returns + 441,512 Perfect Matches)

🚀 Starting Meta Scan across 9 categories...

--> Scanning Metadata: raw_meta_Amazon_Fashion


   Searching raw_meta_Amazon_Fashion: 826108it [09:25, 1460.77it/s]


   Found metadata for 79491 items in this category.

--> Scanning Metadata: raw_meta_Clothing_Shoes_and_Jewelry


   Searching raw_meta_Clothing_Shoes_and_Jewelry: 5146352it [1:09:29, 1580.15it/s]'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 83c0cc57-1164-4ad3-b3af-a28f34587280)')' thrown while requesting GET https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Clothing_Shoes_and_Jewelry.jsonl
Retrying in 1s [Retry 1/5].
   Searching raw_meta_Clothing_Shoes_and_Jewelry: 5409311it [1:13:36, 1274.10it/s]'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: b05d460b-a29a-49d7-b86d-95d2b857ee69)')' thrown while requesting GET https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Clothing_Shoes_and_Jewelry.jsonl
Retrying in 1s [Retry 1/5].
   Searching raw_meta_Clothing_Shoes_and_Jewelry: 6112878it [1:25:33, 1667.44it/s]'(ReadTimeoutError("HTTPSConnectionPool(

   Found metadata for 74139 items in this category.

--> Scanning Metadata: raw_meta_Beauty_and_Personal_Care


   Searching raw_meta_Beauty_and_Personal_Care: 1028914it [16:36, 1032.38it/s]


   Found metadata for 54363 items in this category.

--> Scanning Metadata: raw_meta_Home_and_Kitchen


   Searching raw_meta_Home_and_Kitchen: 2552054it [45:05, 1479.81it/s]'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 6deb1d39-9b54-46ca-b70d-bba3a2093dd5)')' thrown while requesting GET https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Home_and_Kitchen.jsonl
Retrying in 1s [Retry 1/5].
   Searching raw_meta_Home_and_Kitchen: 3735584it [1:05:50, 945.71it/s] 


   Found metadata for 74225 items in this category.

--> Scanning Metadata: raw_meta_Electronics


   Searching raw_meta_Electronics: 46010it [00:55, 1225.23it/s]'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 325b1fdd-2f16-4c47-bc84-c2ed64c13d38)')' thrown while requesting GET https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Electronics.jsonl
Retrying in 1s [Retry 1/5].
   Searching raw_meta_Electronics: 921952it [16:13, 1543.84it/s]'('Connection broken: IncompleteRead(0 bytes read, 5251072 more expected)', IncompleteRead(0 bytes read, 5251072 more expected))' thrown while requesting GET https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Electronics.jsonl
Retrying in 1s [Retry 1/5].
   Searching raw_meta_Electronics: 1237297it [21:21, 1590.56it/s]'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 17db6424-3900-4155-ab66-f9e68bfe28f

   Found metadata for 53376 items in this category.

--> Scanning Metadata: raw_meta_Cell_Phones_and_Accessories


   Searching raw_meta_Cell_Phones_and_Accessories: 89717it [01:44, 1482.12it/s]'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7225341d-2f5f-4c7f-bc38-1032f8b04d1e)')' thrown while requesting GET https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Cell_Phones_and_Accessories.jsonl
Retrying in 1s [Retry 1/5].
   Searching raw_meta_Cell_Phones_and_Accessories: 1288490it [20:57, 1024.26it/s]


   Found metadata for 56812 items in this category.

--> Scanning Metadata: raw_meta_Tools_and_Home_Improvement


   Searching raw_meta_Tools_and_Home_Improvement: 1473810it [25:19, 970.14it/s] 


   Found metadata for 60800 items in this category.

--> Scanning Metadata: raw_meta_Automotive


   Searching raw_meta_Automotive: 302248it [04:25, 1659.77it/s]'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 4592bc24-28d5-41f4-83d1-a017b4b42c3a)')' thrown while requesting GET https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Automotive.jsonl
Retrying in 1s [Retry 1/5].
   Searching raw_meta_Automotive: 1426538it [21:54, 1660.14it/s]'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 08b28861-1f78-41e1-bc8f-d77ba1927966)')' thrown while requesting GET https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Automotive.jsonl
Retrying in 1s [Retry 1/5].
   Searching raw_meta_Automotive: 2003129it [30:06, 1108.94it/s]


   Found metadata for 65723 items in this category.

--> Scanning Metadata: raw_meta_Sports_and_Outdoors


   Searching raw_meta_Sports_and_Outdoors: 1587421it [21:33, 1227.33it/s]


   Found metadata for 59560 items in this category.

💾 Saving Metadata...
✅ Success! Saved 578,489 items to:
   /content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_metadata_images.parquet

   Quality Report:
   Items with Images: 578,489
   Items missing Images: 0


# Downloading Images

In [None]:
import pandas as pd
import requests
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from PIL import Image
from io import BytesIO
import numpy as np

# --- 1. CONFIGURATION ---
# Inputs: the metadata table (one row per item, with image URLs) and the two
# cleaned review sets that will be joined to the downloaded images.
meta_file = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_metadata_images.parquet"
neg_file = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_DEDUPED_NO_SHORT.parquet"
pos_file = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_DEDUPED_NO_SHORT.parquet"

# Downloaded images are written here as <parent_asin>.jpg
output_folder = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/images/"
os.makedirs(output_folder, exist_ok=True)

neg_out = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_FINAL.parquet"
pos_out = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_FINAL.parquet"

# --- 2. LOAD DATA ---
print("🔄 Loading Metadata & Reviews...")
try:
    df_meta = pd.read_parquet(meta_file)
    df_neg = pd.read_parquet(neg_file)
    df_pos = pd.read_parquet(pos_file)
    print(f"   Unique Images to Download: {len(df_meta):,}")
except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    # Re-raise to stop the cell. In a notebook, exit() asks the kernel itself
    # to shut down (losing the session state) instead of just aborting here.
    raise

# --- 3. ROBUST DOWNLOADER FUNCTION ---
def download_image(row):
    """Download one product image into `output_folder` as '<parent_asin>.jpg'.

    Args:
        row: Mapping with 'image_url' (a URL string, or a list/array of
            candidate URLs) and 'parent_asin'.

    Returns:
        The local file path when a usable image is already on disk or was
        downloaded and verified; None when there is no usable URL or the
        download/verification fails (failures are deliberately non-fatal so a
        single bad item does not stop the thread pool).
    """
    import tempfile  # local import: only this function needs it

    url = row['image_url']
    asin = row['parent_asin']

    # Sanitize URL: for list-like values take the first non-empty string,
    # instead of giving up when only the first entry is empty.
    if not isinstance(url, str):
        if isinstance(url, dict) or not hasattr(url, '__len__'):
            return None
        try:
            candidates = list(url)
        except TypeError:
            return None
        url = next((u for u in candidates if isinstance(u, str) and u), None)

    if not url:
        return None

    filename = f"{asin}.jpg"
    file_path = os.path.join(output_folder, filename)

    # CHECK 1: reuse an existing file only if it is larger than 100 bytes;
    # anything smaller is treated as a failed download and fetched again.
    if os.path.exists(file_path) and os.path.getsize(file_path) > 100:
        return file_path

    # CHECK 2: send a browser-like User-Agent with the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code != 200:
            return None

        # Verify the payload parses as an image before anything is written
        Image.open(BytesIO(response.content)).verify()

        # Write to a temp file in the same folder and rename it into place, so
        # an interrupted run (or two workers handling the same ASIN) can never
        # leave a truncated '<asin>.jpg' that CHECK 1 would later accept.
        fd, tmp_path = tempfile.mkstemp(dir=output_folder, suffix=".part")
        try:
            with os.fdopen(fd, 'wb') as f:
                f.write(response.content)
            os.replace(tmp_path, file_path)
        finally:
            # Still present only if the write or rename failed
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        return file_path
    except Exception:
        # Best-effort download: network, decode and disk errors all count as
        # "no image". KeyboardInterrupt/SystemExit are no longer swallowed.
        return None

# --- 4. EXECUTE DOWNLOADS ---
print("\n🚀 Starting Multi-Threaded Download...")
rows = df_meta.to_dict('records')

# executor.map yields results in input order, so `results` lines up with
# df_meta row for row.
with ThreadPoolExecutor(max_workers=20) as executor:
    results = list(tqdm(executor.map(download_image, rows), total=len(rows), desc="Downloading"))

# --- 5. VERIFY & SAVE ---
df_meta['local_image_path'] = results

# Check how many actually worked
valid_count = df_meta['local_image_path'].notna().sum()
print(f"\n🔍 Download Summary:")
print(f"   Total Targets: {len(df_meta):,}")
print(f"   Successful Downloads: {valid_count:,}")
print(f"   Failed: {len(df_meta) - valid_count:,}")

if valid_count == 0:
    print("\n❌ CRITICAL: No images downloaded. Amazon might be blocking requests without headers.")
    print("   (The headers added in this script should fix it.)")
    # Raise to stop the cell. In a notebook, exit() asks the kernel itself to
    # shut down instead of just aborting this cell.
    raise RuntimeError("No images were downloaded; aborting before the review files are written.")

# Map successful downloads: ASIN -> local image path / product description.
# Keyed by ASIN only, so if an ASIN appears under several categories the last
# row wins.
valid_images = df_meta.dropna(subset=['local_image_path'])
asin_to_path = dict(zip(valid_images['parent_asin'], valid_images['local_image_path']))
asin_to_desc = dict(zip(valid_images['parent_asin'], valid_images['description']))

print("\n📊 Mapping Images to Reviews...")

def process_dataset(df, name, output_path):
    """Attach image paths and descriptions to reviews and save those with an image.

    Looks up each review's 'parent_asin' in the module-level `asin_to_path`
    and `asin_to_desc` dicts, drops reviews without a downloaded image, prints
    a short report and writes the result to `output_path` (nothing is written
    when no review is left).

    The input frame is not modified: the new columns are added to a copy, so
    re-running the cell does not stack state onto `df`.

    Args:
        df: Review DataFrame with a 'parent_asin' column.
        name: Label used in the printed report.
        output_path: Destination parquet path.
    """
    initial_count = len(df)

    # Map the data onto a new frame (assign does not touch the caller's df)
    df_mapped = df.assign(
        image_path=df['parent_asin'].map(asin_to_path),
        product_description=df['parent_asin'].map(asin_to_desc),
    )

    # Drop rows that didn't get an image
    df_final = df_mapped.dropna(subset=['image_path'])

    final_count = len(df_final)

    print(f"\n   --- {name} Results ---")
    print(f"   Started: {initial_count:,}")
    print(f"   Kept (Has Image): {final_count:,}")
    print(f"   Dropped: {initial_count - final_count:,}")

    # SAVE CHECK
    if final_count > 0:
        df_final.to_parquet(output_path, index=False)
        print(f"   ✅ Saved to: {os.path.basename(output_path)}")
    else:
        print(f"   ⚠️ WARNING: Resulting dataset is empty! No file saved.")

process_dataset(df_neg, "NEGATIVE (Returns)", neg_out)
process_dataset(df_pos, "POSITIVE (Perfect)", pos_out)

print("\n✅ PIPELINE COMPLETE")

🔄 Loading Metadata & Reviews...
   Unique Images to Download: 578,489

🚀 Starting Multi-Threaded Download...


Downloading: 100%|██████████| 578489/578489 [2:11:43<00:00, 73.19it/s]



🔍 Download Summary:
   Total Targets: 578,489
   Successful Downloads: 526,627
   Failed: 51,862

📊 Mapping Images to Reviews...

   --- NEGATIVE (Returns) Results ---
   Started: 440,168
   Kept (Has Image): 403,660
   Dropped: 36,508
   ✅ Saved to: thesis_dataset_returned_FINAL.parquet

   --- POSITIVE (Perfect) Results ---
   Started: 427,307
   Kept (Has Image): 397,303
   Dropped: 30,004
   ✅ Saved to: thesis_dataset_positive_FINAL.parquet

✅ PIPELINE COMPLETE


In [None]:
import os
import pandas as pd

# The paths we expected to save to
neg_out = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_returned_FINAL.parquet"
pos_out = "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/26. Bachelors Thesis/Data/thesis_dataset_positive_FINAL.parquet"

def check_file(path):
    """Report whether a parquet file exists, how large it is, and if it loads.

    Prints the file name being checked, then either a not-found message or the
    size in MB followed by the loaded frame's shape and column names. A read
    failure is reported rather than raised.

    Args:
        path: Location of the parquet file to inspect.

    Returns:
        None. All results are printed.
    """
    filename = os.path.basename(path)
    print(f"\nChecking: {filename}...")

    # Guard clause: nothing more to inspect when the file is missing.
    if not os.path.exists(path):
        print(f"❌ NOT FOUND at: {path}")
        return

    megabytes = os.path.getsize(path) / (1024 * 1024)
    print(f"✅ FOUND! Size: {megabytes:.2f} MB")

    # Loading the file is the integrity check: a corrupt parquet fails here.
    try:
        frame = pd.read_parquet(path)
        print(f"   Shape: {frame.shape} (Rows, Columns)")
        print(f"   Columns: {frame.columns.tolist()}")
    except Exception as err:
        print(f"   ⚠️ File exists but cannot be read: {err}")

# Inspect both expected outputs: existence, size on disk, and whether the
# parquet can actually be loaded.
check_file(neg_out)
check_file(pos_out)

## Make FINAL.parquet

In [None]:
import pandas as pd
import os
from tqdm import tqdm

# --- CONFIGURATION ---
# Folder that is scanned for <parent_asin>.jpg image files
image_folder = (
    "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/"
    "26. Bachelors Thesis/Data/images/"
)

# Inputs (The Clean Text)
neg_file_in = (
    "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/"
    "26. Bachelors Thesis/Data/thesis_dataset_returned_DEDUPED_NO_SHORT.parquet"
)
pos_file_in = (
    "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/"
    "26. Bachelors Thesis/Data/thesis_dataset_positive_DEDUPED_NO_SHORT.parquet"
)

# Outputs (The Final Files)
neg_out = (
    "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/"
    "26. Bachelors Thesis/Data/thesis_dataset_returned_FINAL.parquet"
)
pos_out = (
    "/content/drive/MyDrive/03. Resources/03. Education/39 Bachelors IU/"
    "26. Bachelors Thesis/Data/thesis_dataset_positive_FINAL.parquet"
)

# --- 1. SCAN IMAGES ---
print("🔍 Scanning Image Folder...")
if not os.path.isdir(image_folder):
    print("❌ Error: Image folder not found.")
    # Raise instead of calling exit(): inside a notebook kernel exit() requests
    # a kernel shutdown (all in-memory state is lost) rather than simply
    # stopping this cell. An exception aborts the cell and keeps the session.
    raise FileNotFoundError(f"Image folder not found: {image_folder}")

# Get set of all ASINs that have a file
files = os.listdir(image_folder)
# Strip only the trailing ".jpg". str.replace would also remove ".jpg" from the
# middle of a name, yielding an ASIN that no longer matches the file on disk
# when the path is rebuilt as "<asin>.jpg".
valid_asins = {f[:-len(".jpg")] for f in tqdm(files) if f.endswith(".jpg")}

print(f"✅ Found {len(valid_asins):,} images.")

# --- 2. BUILD PARQUETS ---
def build_final(input_path, output_path, dataset_name):
    """Filter a review parquet down to rows with a local image and save it.

    Reads ``input_path``, keeps the rows whose ``parent_asin`` is in the
    module-level ``valid_asins`` set, adds an ``image_path`` column pointing at
    ``<image_folder>/<parent_asin>.jpg``, prints the row counts, and writes the
    result to ``output_path`` when at least one row remains.

    Args:
        input_path: Parquet file to read.
        output_path: Parquet file to write.
        dataset_name: Label used in the progress messages.

    Returns:
        None. If the input cannot be loaded, the reason is printed and the
        function returns without writing anything.
    """
    print(f"\n🔄 Building {dataset_name}...")
    try:
        df = pd.read_parquet(input_path)
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate, and show the cause instead of hiding it.
        print(f"   ❌ Could not load {input_path}: {exc}")
        return

    initial_len = len(df)

    # Filter: Keep only rows that have an image (.copy() so the new column is
    # added to an independent frame, not a view of df)
    df_final = df[df['parent_asin'].isin(valid_asins)].copy()

    # Add the local path column
    df_final['image_path'] = df_final['parent_asin'].apply(lambda x: os.path.join(image_folder, f"{x}.jpg"))

    final_len = len(df_final)

    print(f"   Original Rows: {initial_len:,}")
    print(f"   Matched Images: {final_len:,}")

    # Skip the write entirely when nothing matched, so no empty parquet is produced
    if final_len > 0:
        df_final.to_parquet(output_path, index=False)
        print(f"   💾 Saved: {os.path.basename(output_path)}")
    else:
        print("   ⚠️ No matches found (Dataset empty).")

# --- 3. EXECUTE ---
# Build each FINAL parquet from its deduplicated input: negatives first, then
# positives. build_final reports load failures and empty results itself.
build_final(neg_file_in, neg_out, "NEGATIVE (Returns)")
build_final(pos_file_in, pos_out, "POSITIVE (Perfect)")

# Printed unconditionally; it does not confirm that either file was written.
print("\n✅ Dataset Sync Complete.")

## Gemini Judge Test

In [None]:
import google.generativeai as genai

import os

# Read the key from the GOOGLE_API_KEY environment variable so the real secret
# never has to be pasted into (and saved with) the notebook. The placeholder
# remains the fallback when the variable is unset or empty.
API_KEY = os.environ.get("GOOGLE_API_KEY") or "YOUR_API_KEY"
genai.configure(api_key=API_KEY)

print("🔍 Checking available models...")
try:
    # Only report models that advertise the generateContent method.
    for m in genai.list_models():
        if 'generateContent' in m.supported_generation_methods:
            print(f"✅ Available: {m.name}")
except Exception as e:
    # Listing is a diagnostic step: report the failure instead of aborting the cell.
    print(f"❌ Error: {e}")

🔍 Checking available models...
✅ Available: models/gemini-2.5-flash
✅ Available: models/gemini-2.5-pro
✅ Available: models/gemini-2.0-flash-exp
✅ Available: models/gemini-2.0-flash
✅ Available: models/gemini-2.0-flash-001
✅ Available: models/gemini-2.0-flash-exp-image-generation
✅ Available: models/gemini-2.0-flash-lite-001
✅ Available: models/gemini-2.0-flash-lite
✅ Available: models/gemini-2.0-flash-lite-preview-02-05
✅ Available: models/gemini-2.0-flash-lite-preview
✅ Available: models/gemini-exp-1206
✅ Available: models/gemini-2.5-flash-preview-tts
✅ Available: models/gemini-2.5-pro-preview-tts
✅ Available: models/gemma-3-1b-it
✅ Available: models/gemma-3-4b-it
✅ Available: models/gemma-3-12b-it
✅ Available: models/gemma-3-27b-it
✅ Available: models/gemma-3n-e4b-it
✅ Available: models/gemma-3n-e2b-it
✅ Available: models/gemini-flash-latest
✅ Available: models/gemini-flash-lite-latest
✅ Available: models/gemini-pro-latest
✅ Available: models/gemini-2.5-flash-lite
✅ Available: models