In [None]:
!pip install "datasets<3.0.0"

Collecting datasets<3.0.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets<3.0.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

In [None]:
from datasets import load_dataset
import re

# --- CONFIGURATION ---
# Phrases that suggest the reviewer returned (or wanted to return) the item.
# The bare keyword "fit" is intentionally NOT in this list: as a substring it
# matches almost every sizing remark ("fits fine", "outfit", ...) and it made
# the "didn't fit" entry redundant, which inflated the return-signal count.
RETURN_KEYWORDS = ["returned", "returning", "sent back", "send back", "too small", "too large", "didn't fit"]

# How many streamed reviews to sample before stopping.
SCAN_LIMIT = 100000

# Counter
counts = {"return_signal": 0, "total_checked": 0}

# Load stream (no download)
dataset_stream = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Clothing_Shoes_and_Jewelry",
    split="full",
    streaming=True, # Stream it so we don't crash RAM
    trust_remote_code=True
)

print(f"Scanning for return signals (stopping at {SCAN_LIMIT:,} reviews)...")

for review in dataset_stream:
    counts["total_checked"] += 1

    # 1. Check Rating (Must be low for a return proxy)
    if review['rating'] <= 3.0:
        # Text can be missing or None; treat that as "no signal" instead of crashing.
        text = (review.get('text') or "").lower()

        # 2. Check for Keywords (plain substring match)
        if any(word in text for word in RETURN_KEYWORDS):
            counts["return_signal"] += 1

    # Stop once the sample is large enough to estimate a ratio
    if counts["total_checked"] >= SCAN_LIMIT:
        break

# Guard against an empty stream so the summary never divides by zero.
total_checked = counts["total_checked"]
ratio = (counts["return_signal"] / total_checked) * 100 if total_checked else 0.0
print(f"Results: Found {counts['return_signal']} potential returns out of {counts['total_checked']}.")
print(f"Estimated Positive Class Ratio: {ratio:.2f}%")

if counts["return_signal"] < 100:
    print("WARNING: Signal is very weak. Consider 2018 dataset or broader keywords.")
else:
    print("SUCCESS: Enough data exists to build a balanced dataset!")

Scanning for return signals (stopping at 10k)...
Results: Found 6088 potential returns out of 100000.
Estimated Positive Class Ratio: 6.09%
SUCCESS: Enough data exists to build a balanced dataset!


In [None]:
from datasets import load_dataset
from tqdm import tqdm

# --- CONFIGURATION ---
# Phrases that indicate the item matched the listing ("Perfect Match" keywords).
# Matching is a plain lowercase substring test.
KEEP_KEYWORDS = [
    "exactly like", "just like the picture", "true to picture",
    "true to color", "perfect fit", "fits perfectly", "true to size",
    "better than the photo", "exactly what i was looking for",
    "better than i thought", "colors are vivid", "looks just like"
]

# Counters
counts = {"perfect_keeps": 0, "total_checked": 0}
scan_limit = 100000  # Check the first 100k reviews

# --- LOAD DATASTREAM ---
print("Initializing Stream...")
dataset_stream = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Clothing_Shoes_and_Jewelry",
    split="full",
    streaming=True,
    trust_remote_code=True
)

print(f"Scanning first {scan_limit} reviews for 'Perfect Matches'...")

# The context manager closes the progress bar even if the scan raises or the
# user interrupts the kernel, so no half-drawn bar is left behind.
with tqdm(total=scan_limit) as pbar:
    for review in dataset_stream:
        counts["total_checked"] += 1

        # LOGIC: 5 Stars AND contains a "Perfect Match" keyword
        if review['rating'] == 5.0:
            # Text can be missing or None; lowercase once, not once per keyword.
            text = (review.get('text') or "").lower()
            if any(word in text for word in KEEP_KEYWORDS):
                counts["perfect_keeps"] += 1

        pbar.update(1)

        if counts["total_checked"] >= scan_limit:
            break

# --- RESULTS ---
# Guard against an empty stream so the summary never divides by zero.
total_checked = counts["total_checked"]
ratio = (counts["perfect_keeps"] / total_checked) * 100 if total_checked else 0.0
print(f"\nResults: Found {counts['perfect_keeps']} 'Perfect Matches' out of {counts['total_checked']}.")
print(f"Estimated Density: {ratio:.2f}%")

if counts["perfect_keeps"] > 1000:
    print("SUCCESS: High density! You can easily build the dataset.")
else:
    print("WARNING: These are rare. You might need to relax the keywords.")

Initializing Stream...


Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Scanning first 100000 reviews for 'Perfect Matches'...


100%|██████████| 100000/100000 [00:03<00:00, 26849.75it/s]


Results: Found 4767 'Perfect Matches' out of 100000.
Estimated Density: 4.77%
SUCCESS: High density! You can easily build the dataset.





# Check the fashion categories (Clothing_Shoes_and_Jewelry vs. Amazon_Fashion)

In [None]:
from datasets import load_dataset

# SETTINGS
# Candidate review configurations:
#   "Clothing_Shoes_and_Jewelry" = 66M reviews (Huge, but noisier)
#   "Amazon_Fashion" = 2.5M reviews (Cleaner, strictly fashion items)
CATEGORY = "raw_review_Clothing_Shoes_and_Jewelry"

# Substrings treated as evidence of a return (expanded list for better capture)
RETURN_KEYWORDS = [
    "returned", "returning", "sent back", "send back",
    "too small", "too large", "too short", "too long",
    "didn't fit", "doesn't fit", "ill fitting", "poor quality",
    "looked different", "color was off", "misleading"
]

counts = {"return_signal": 0, "total_checked": 0}

print(f"Streaming {CATEGORY} to check return ratio...")

try:
    # Open the dataset as a stream (no download required)
    dataset_stream = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        CATEGORY,
        split="full",
        streaming=True,
        trust_remote_code=True
    )

    for seen, review in enumerate(dataset_stream, start=1):
        counts["total_checked"] = seen

        # Rewrite a single status line once every 1,000 reviews
        if seen % 1000 == 0:
            print(f"Checked {seen}...", end="\r")

        # Step 1: only low-rated reviews (<= 3 stars) are candidates.
        # The text is looked up only for those; it may be None or empty.
        text = review.get('text', "") if review['rating'] <= 3.0 else None
        if text:
            lowered = text.lower()
            # Step 2: count the review once, on the first keyword that matches
            for keyword in RETURN_KEYWORDS:
                if keyword in lowered:
                    counts["return_signal"] += 1
                    break

        # Sample size: stop after 50k reviews to get a stable ratio
        if seen >= 50000:
            break

    # SUMMARY
    found = counts["return_signal"]
    total = counts["total_checked"]
    ratio = (found / total) * 100
    print(f"\n\n--- RESULTS FOR {CATEGORY} ---")
    print(f"Total Scanned: {total}")
    print(f"Potential Returns Found: {found}")
    print(f"Estimated Positive Class Ratio: {ratio:.2f}%")

    if found >= 500:
        print("SUCCESS: Signal is strong enough. Standard training should work.")
    else:  # fewer than 500 hits, i.e. under 1% of a 50k sample
        print("WARNING: Severe Class Imbalance. You MUST use SMOTE or Weighted Loss.")

except Exception as err:
    print(f"Error: {err}")
    print("Tip: Make sure you have 'pip install datasets' and an internet connection.")

Streaming raw_review_Clothing_Shoes_and_Jewelry to check return ratio...


Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Checked 50000...

--- RESULTS FOR raw_review_Clothing_Shoes_and_Jewelry ---
Total Scanned: 50000
Potential Returns Found: 1510
Estimated Positive Class Ratio: 3.02%
SUCCESS: Signal is strong enough. Standard training should work.


In [None]:
from datasets import load_dataset
from tqdm import tqdm

# --- CONFIGURATION ---
# We use the specific Fashion subset now
CATEGORY = "raw_review_Amazon_Fashion"

# The "Perfect Match" keywords (Low Visual-Semantic Discrepancy).
# Matching is a plain lowercase substring test.
KEEP_KEYWORDS = [
    "exactly like", "just like the picture", "true to picture",
    "true to color", "perfect fit", "fits perfectly", "true to size",
    "better than the photo", "exactly what i was looking for",
    "better than i thought", "colors are vivid", "looks just like"
]

# Counters
counts = {"perfect_keeps": 0, "total_checked": 0}
scan_limit = 100000  # Check the first 100k reviews

# --- LOAD DATASTREAM ---
print(f"Initializing Stream for {CATEGORY}...")
try:
    dataset_stream = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        CATEGORY,
        split="full",
        streaming=True,
        trust_remote_code=True
    )

    print(f"Scanning first {scan_limit} reviews for 'Perfect Matches'...")

    # The context manager closes the progress bar before the `except` branch
    # prints, so an error message is never drawn over a still-open bar.
    with tqdm(total=scan_limit) as pbar:
        for review in dataset_stream:
            counts["total_checked"] += 1

            # LOGIC: 5 Stars AND contains a "Perfect Match" keyword
            if review['rating'] == 5.0:
                # Text can be missing or None; lowercase once, not once per keyword.
                text = (review.get('text') or "").lower()
                if any(word in text for word in KEEP_KEYWORDS):
                    counts["perfect_keeps"] += 1

            pbar.update(1)

            if counts["total_checked"] >= scan_limit:
                break

    # --- RESULTS ---
    # An empty stream yields a 0% ratio instead of a confusing
    # "Error: division by zero" from the handler below.
    total_checked = counts["total_checked"]
    ratio = (counts["perfect_keeps"] / total_checked) * 100 if total_checked else 0.0
    print(f"\nResults: Found {counts['perfect_keeps']} 'Perfect Matches' out of {counts['total_checked']}.")
    print(f"Estimated Density: {ratio:.2f}%")

    if counts["perfect_keeps"] > 1000:
        print("SUCCESS: High density! You have plenty of 'Low Discrepancy' data.")
    else:
        print("WARNING: These are rare in this subset. You might need to switch back to 'Clothing_Shoes_and_Jewelry'.")

except Exception as e:
    # Best-effort cell: report the failure (e.g. network/stream errors) rather than raising.
    print(f"Error: {e}")

Initializing Stream for raw_review_Amazon_Fashion...
Scanning first 100000 reviews for 'Perfect Matches'...


100%|██████████| 100000/100000 [00:04<00:00, 21802.04it/s]


Results: Found 2711 'Perfect Matches' out of 100000.
Estimated Density: 2.71%
SUCCESS: High density! You have plenty of 'Low Discrepancy' data.



