In [7]:
# Configuration
from pathlib import Path
import math
import pandas as pd
import random

# Input file (CSV only)
# NOTE(review): absolute, machine-specific path; point this at your local copy before running.
INPUT_CSV = r'C:\Users\Ven\Documents\Coding\pambihira\CRAZY\poop\ShopeeAnalysis_Amalgam\consolidated_trimed_50.csv'

# Trimming behavior
TRIM_PERCENT = 0.01  # fraction of distinct product IDs to keep (0.01 == 1%)
RANDOM_SEED = 42  # passed as random_state to the sampler so the kept IDs are reproducible


def format_percent_label(fraction):
    """Return `fraction` expressed as a percentage string suitable for a file name.

    The product is rounded before formatting because binary floats can land
    just below the exact value (0.29 * 100 == 28.999999999999996); truncating
    with int() would then label the file "28pct". Sub-1% fractions keep their
    decimals ("0.5") instead of collapsing to "0".

    Args:
        fraction: Share to keep, as a number where 1.0 means 100%.

    Returns:
        The percentage without trailing zeros, e.g. "1", "29", "0.5".
    """
    return f'{round(fraction * 100, 6):g}'


# Output file (auto-named using TRIM_PERCENT)
OUTPUT_CSV = f'consolidated_file_cleaned_v1_trimmed_by_product_ids_{format_percent_label(TRIM_PERCENT)}pct.csv'

In [8]:
# Load CSV and inspect structure
csv_path = Path(INPUT_CSV)
if not csv_path.exists():
    # Include the path that was checked so a wrong INPUT_CSV is obvious from the traceback.
    raise FileNotFoundError(f"CSV input not found: {csv_path}. Ensure the CSV file exists.")

# The whole file is read into memory in one call.
df = pd.read_csv(csv_path)

# Quick structural summary: column names, row count, and the first rows.
print('Columns:', df.columns.tolist())
print('Rows:', len(df))
print('Sample rows:')
print(df.head())

Columns: ['product', 'time', 'avg.sku_price(₱)', 'sold/day', 'revenue/day(₱)', 'sold/m', 'product_sales_rate(%)', 'price(₱)', 'sku', 'sold', 'sold/month(₱)', 'revenue/month', 'new_ratings', 'ratings', 'ratings_rate', 'likes', 'rating_star', 'new_likes', 'id', 'top-level_category', 'seller_from', 'listing_time', 'active_months', 'suitable_for_seasonal_analysis']
Rows: 3777313
Sample rows:
                                             product        time  \
0     Cute Different Designs  button accessories ...  2022-03-01   
1     Cute Different Designs  button accessories ...  2022-04-01   
2     Cute Different Designs  button accessories ...  2022-05-01   
3     Cute Different Designs  button accessories ...  2022-06-01   
4     Cute Different Designs  button accessories ...  2022-07-01   

   avg.sku_price(₱)  sold/day  revenue/day(₱)  sold/m  product_sales_rate(%)  \
0               NaN       NaN             NaN     NaN                    NaN   
1               NaN       NaN           

In [9]:
# Build the list of distinct product IDs from the 'id' column.
# The IDs are the group keys of a groupby on 'id'; pandas' groupby defaults
# (key ordering, handling of missing ids) therefore apply to this list.
grouped = df.groupby('id')
product_ids = [*grouped.groups]
n_unique = len(product_ids)

print('Unique product IDs:', n_unique)

Unique product IDs: 83794


In [10]:
# Sample product IDs to keep (keep TRIM_PERCENT)
# Clamp the configured fraction into [0, 1] so an out-of-range value cannot
# request a negative count or more IDs than exist.
keep_fraction = max(0.0, min(1.0, TRIM_PERCENT))

# Round before flooring: binary floats can land just below the exact product
# (100 * 0.29 == 28.999999999999996), which would silently drop one ID.
keep_count = math.floor(round(n_unique * keep_fraction, 9))

if keep_count == 0 and n_unique > 0:
    keep_count = 1  # keep at least one product

# Deterministic sampling: the fixed random_state makes the chosen IDs repeatable
# for the same product_ids list.
if n_unique > 0:
    sampled_ids = pd.Series(product_ids).sample(n=keep_count, random_state=RANDOM_SEED).tolist()
else:
    sampled_ids = []

# Keep every row belonging to a sampled product ID.
trimmed_df = df[df['id'].isin(sampled_ids)]

print(f'Products total: {n_unique:,}')
print(f'Products kept (~{keep_fraction*100:.1f}%): {len(sampled_ids):,}')
print('Sample kept product IDs:', sampled_ids[:5])

Products total: 83,794
Products kept (~1.0%): 837
Sample kept product IDs: [23286022697, 26781562347, 20246220530, 22424011026, 13032304534]


In [11]:
# Write the trimmed frame to OUTPUT_CSV (without the index column), then print a short preview
trimmed_df.to_csv(OUTPUT_CSV, index=False)
print('Saved trimmed dataset to:', OUTPUT_CSV)

# The preview is skipped when the trimmed frame is empty.
preview_available = not trimmed_df.empty
if preview_available:
    first_row = trimmed_df.iloc[0]
    first_pid = first_row['id']
    preview_columns = trimmed_df.columns.tolist()
    print('Preview product ID:', first_pid)
    print('Preview product keys:', preview_columns)

Saved trimmed dataset to: consolidated_file_cleaned_v1_trimmed_by_product_ids_1pct.csv
Preview product ID: 23943860856
Preview product keys: ['product', 'time', 'avg.sku_price(₱)', 'sold/day', 'revenue/day(₱)', 'sold/m', 'product_sales_rate(%)', 'price(₱)', 'sku', 'sold', 'sold/month(₱)', 'revenue/month', 'new_ratings', 'ratings', 'ratings_rate', 'likes', 'rating_star', 'new_likes', 'id', 'top-level_category', 'seller_from', 'listing_time', 'active_months', 'suitable_for_seasonal_analysis']
