In [5]:
import os
import shutil

import kagglehub
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# -----------------------------
# CONFIG PATHS
# -----------------------------
# Working directories, relative to the current working directory. They are
# created up front so the later writes cannot fail on a missing folder.
DATA_DIR_RAW = "./data/raw"
DATA_DIR_PROCESSED = "./data/processed"
os.makedirs(DATA_DIR_RAW, exist_ok=True)
os.makedirs(DATA_DIR_PROCESSED, exist_ok=True)

# -----------------------------
# DOWNLOAD DATASET FROM KAGGLE
# -----------------------------
path = kagglehub.dataset_download("sanskar21072005/amazon-best-sellers-2025")
print("Path to dataset files:", path)

# Find CSV file in downloaded folder.
# os.listdir() returns entries in arbitrary order, so sort the matches to make
# sure the same file is selected on every run if the dataset holds several CSVs.
csv_files = sorted(f for f in os.listdir(path) if f.endswith(".csv"))
if not csv_files:
    raise FileNotFoundError("No CSV file found in Kaggle dataset")
raw_file_path = os.path.join(path, csv_files[0])

# Copy the file to data/raw for consistency.
# A byte-for-byte copy keeps the raw file exactly as downloaded; parsing it
# with pandas and writing it back would re-serialise the contents.
raw_path = os.path.join(DATA_DIR_RAW, "amazon_bestsellers_2025.csv")
shutil.copyfile(raw_file_path, raw_path)
print(f"Raw dataset copied to: {raw_path}")

# -----------------------------
# CLEANING FUNCTIONS
# -----------------------------
def fill_missing_median(df, columns=None):
    """Return a copy of ``df`` with missing values replaced by column medians.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Columns to impute. When ``None``, every numeric column
            (as selected by ``select_dtypes(include="number")``) is imputed.

    Returns:
        A new DataFrame in which each selected column has its missing
        entries filled with that column's median. Columns that are not
        selected are left untouched.
    """
    filled = df.copy()
    targets = (
        filled.select_dtypes(include="number").columns
        if columns is None
        else columns
    )
    for name in targets:
        series = filled[name]
        # The median is computed per column, from that column's own values.
        filled[name] = series.fillna(series.median())
    return filled

def drop_missing(df, threshold=0.5):
    """Return a copy of ``df`` without columns that are mostly missing.

    Args:
        df: Input DataFrame; it is not modified.
        threshold: Fraction of missing values a column may have. Columns
            whose missing fraction is strictly greater than this value are
            removed.

    Returns:
        A new DataFrame containing only the columns whose missing fraction
        is less than or equal to ``threshold``.
    """
    working = df.copy()
    # Mean of the boolean NA mask gives the fraction of missing values per column.
    missing_share = working.isna().mean()
    too_sparse = missing_share.index[missing_share > threshold]
    return working.drop(columns=too_sparse)

def normalize_data(df, columns=None):
    """Return a copy of ``df`` with the selected columns min-max scaled.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Columns to scale. When ``None``, every numeric column
            (as selected by ``select_dtypes(include="number")``) is scaled.

    Returns:
        A new DataFrame in which the selected columns have been transformed
        by ``MinMaxScaler`` with its default settings. If there is nothing
        to scale (no selected columns, or the frame has no rows), the
        unmodified copy is returned.
    """
    df_copy = df.copy()
    if columns is None:
        columns = df_copy.select_dtypes(include="number").columns
    # MinMaxScaler rejects input with zero features or zero samples, so
    # return early instead of raising when there is nothing to scale.
    if len(columns) == 0 or df_copy.empty:
        return df_copy
    scaler = MinMaxScaler()
    df_copy[columns] = scaler.fit_transform(df_copy[columns])
    return df_copy

# -----------------------------
# LOAD RAW DATA
# -----------------------------
# Read the project-local raw copy and show a preview plus per-column NA counts.
df_raw = pd.read_csv(raw_path)
print("===== ORIGINAL DATA =====")
print(df_raw.head())
print("\nMissing values per column:\n", df_raw.isna().sum())

# -----------------------------
# CLEANING PIPELINE
# -----------------------------
# Step order: drop mostly-empty columns, impute the remaining numeric gaps
# with the median, then min-max scale the numeric columns.
df_clean = (
    df_raw
    .pipe(drop_missing)
    .pipe(fill_missing_median)
    .pipe(normalize_data)
)

# -----------------------------
# SAVE CLEANED DATA
# -----------------------------
processed_path = os.path.join(
    DATA_DIR_PROCESSED, "amazon_bestsellers_2025_cleaned.csv"
)
df_clean.to_csv(processed_path, index=False)

# -----------------------------
# COMPARE ORIGINAL VS CLEANED
# -----------------------------
print("\n===== CLEANED DATA =====")
print(df_clean.head())
print("\nOriginal shape:", df_raw.shape)
print("Cleaned shape:", df_clean.shape)
print(f"\n✅ Cleaned dataset saved to: {processed_path}")

# -----------------------------
# DOCUMENTATION / ASSUMPTIONS
# -----------------------------
# The string below is a bare expression kept as written; as the last
# expression of a notebook cell it is echoed as the cell's result.
"""
Assumptions:
1. Numeric missing values are filled with median for robustness.
2. Columns with >50% missing values are dropped (assumed low information).
3. Min-Max normalization scales numeric features to [0,1].
4. Non-numeric columns remain unchanged.
5. Saved cleaned dataset to /data/processed/ for reproducibility.
"""


Downloading from https://www.kaggle.com/api/v1/datasets/download/sanskar21072005/amazon-best-sellers-2025?dataset_version_number=1...


100%|██████████| 59.3k/59.3k [00:00<00:00, 15.2MB/s]

Extracting files...
Path to dataset files: C:\Users\sarda\.cache\kagglehub\datasets\sanskar21072005\amazon-best-sellers-2025\versions\1
Raw dataset copied to: ./data/raw\amazon_bestsellers_2025.csv
===== ORIGINAL DATA =====
   Unnamed: 0  rank        asin  \
0           0     1  B073VKKNN9   
1           1     2  B07PQZJ6Y8   
2           2     3  B0D1KL34JM   
3           3     4  B07B9YYLGG   
4           4     5  B073VLGMZ4   

                                       product_title product_price  \
0  Kaspersky | Premium - Total Security (Ultimate...       ₹469.00   
1  K7 Security K7, Total Security, 1 User, 1 Year...       ₹370.00   
2  Microsoft Office 2021 Professional - Lifetime ...     ₹1,799.00   
3  Bitdefender - 1 Device,1 Year - Mobile Securit...        ₹94.00   
4  McAfee Total Protection 2025 | 1 Device, 3 Yea...     ₹1,699.00   

   product_star_rating  product_num_ratings  \
0                  4.3              13324.0   
1                  4.4               2291.0   
2  




'\nAssumptions:\n1. Numeric missing values are filled with median for robustness.\n2. Columns with >50% missing values are dropped (assumed low information).\n3. Min-Max normalization scales numeric features to [0,1].\n4. Non-numeric columns remain unchanged.\n5. Saved cleaned dataset to /data/processed/ for reproducibility.\n'