# E-Commerce Dataset Cleaning - FIXED VERSION

This notebook handles the cleaning and preprocessing of the e-commerce dataset for the product recommendation system.

## Objectives:
- Load and explore the dataset
- Remove duplicates and handle missing values
- Standardize data formats
- Add product categories
- Prepare data for vectorization
- Export cleaned data to database

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
import sys
import re
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# local `services` package can be resolved from this notebook.
sys.path.append('..')
try:
    from services.database import DatabaseService
except ImportError as import_error:
    # The notebook keeps going without the database service; only a warning
    # is shown. NOTE(review): `DatabaseService` stays undefined in this case.
    print(f"⚠️ Database service not available: {import_error}")
else:
    print("✅ Database service imported successfully")

# Pandas display settings: show every column, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot appearance: matplotlib defaults first, then the seaborn "husl" palette
# (the order matters, the palette is applied on top of the reset style)
plt.style.use('default')
sns.set_palette("husl")

print("📚 Libraries imported successfully!")

In [None]:
def create_sample_data(n_samples=1000, seed=42):
    """Create sample e-commerce data for demonstration with categories.

    Each row is built by picking a random category, then a random product
    description from that category, a random unit price and a random country.

    Args:
        n_samples: Number of rows to generate. Defaults to 1000.
        seed: Seed passed to ``np.random.seed``. Defaults to 42 so repeated
            calls return identical data; pass ``None`` for non-reproducible
            output.

    Returns:
        A ``pd.DataFrame`` with the columns ``StockCode``, ``Description``,
        ``Category``, ``UnitPrice`` and ``Country`` (also when ``n_samples``
        is 0, in which case the frame is empty but keeps its columns).

    Raises:
        ValueError: If ``n_samples`` is negative.

    Note:
        This reseeds NumPy's *global* random state as a side effect, so any
        ``np.random`` calls made afterwards are affected by ``seed`` too.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    np.random.seed(seed)

    # Sample product categories and descriptions
    categories_data = {
        'Electronics': [
            'Wireless Headphones Premium Quality',
            'Smartphone Android Latest Model',
            'Laptop Computer Gaming Performance',
            'Tablet iPad Pro Professional',
            'Smart Watch Fitness Tracker',
            'Bluetooth Speaker Portable Sound',
            'Computer Mouse Wireless Ergonomic',
            'Keyboard Mechanical RGB Gaming',
            'USB Hub Multi-Port Expansion',
            'Laptop Stand Adjustable Height',
        ],
        'Clothing': [
            'T-Shirt Cotton Comfortable Casual',
            'Jeans Denim Classic Blue',
            'Dress Summer Elegant Style',
            'Jacket Winter Warm Coat',
            'Sneakers Running Sport Shoes',
            'Hat Baseball Cap Fashion',
            'Shirt Business Professional',
            'Sweater Wool Cozy Warm',
        ],
        'Home & Kitchen': [
            'Coffee Maker Automatic Brewing',
            'Vacuum Cleaner Powerful Suction',
            'Plant Pot Ceramic Decorative',
            'Lamp LED Modern Design',
            'Cushion Soft Decorative Pillow',
            'Candle Scented Relaxing Aroma',
            'Teapot Ceramic Traditional Design',
            'Glass Teapot Heat Resistant',
            'Stainless Steel Teapot Modern',
        ],
        'Sports': [
            'Yoga Mat Non-Slip Exercise',
            'Dumbbells Weight Training Set',
            'Running Shoes Athletic Performance',
            'Water Bottle Stainless Steel',
            'Fitness Tracker Smart Health',
            'Tennis Racket Professional Grade',
        ],
        'Books': [
            'Programming Book Python Guide',
            'Novel Fiction Bestseller Story',
            'Cookbook Healthy Recipe Collection',
            'Biography Inspiring Life Story',
            'Science Book Educational Learning',
            'Art Book Creative Inspiration',
        ],
        'Beauty': [
            'Face Cream Anti-Aging Formula',
            'Lipstick Matte Long-Lasting',
            'Shampoo Natural Organic Care',
            'Perfume Floral Fragrance Scent',
            'Nail Polish Glossy Finish',
            'Moisturizer Hydrating Skin Care',
        ],
        'Antiques': [
            'Vintage Wooden Clock Antique Timepiece',
            'Antique Brass Compass Navigation',
            'Victorian Era Jewelry Box Ornate',
            'Vintage Mirror Decorative Frame',
            'Antique Vase Ceramic Collectible',
        ],
    }

    countries = ['USA', 'UK', 'Canada', 'Australia', 'Germany', 'France', 'Japan', 'China', 'Italy', 'Spain']

    # Built once instead of on every iteration; dict order is insertion order
    category_names = list(categories_data)

    # Generate sample data with categories.
    # The order of the random draws per row (category, description, price,
    # country) is deliberate: changing it would change the generated data
    # for a given seed.
    data = []
    for i in range(n_samples):
        # Randomly select category
        category = np.random.choice(category_names)
        # Randomly select product from that category
        description = np.random.choice(categories_data[category])

        data.append({
            'StockCode': f'SKU{i+1:04d}',
            'Description': description,
            'Category': category,
            'UnitPrice': round(np.random.uniform(5.0, 500.0), 2),
            'Country': np.random.choice(countries),
        })

    # Explicit columns keep the schema stable even when no rows are generated
    return pd.DataFrame(
        data,
        columns=['StockCode', 'Description', 'Category', 'UnitPrice', 'Country'],
    )

# Load dataset from zip file or create sample data.
# The first CSV found in ../data/dataset.zip is loaded into `df`; when the
# archive is missing or holds no CSV, generated sample data is used instead.
data_path = '../data/dataset.zip'
extract_dir = '../data/temp'

if os.path.exists(data_path):
    print("📁 Loading dataset from zip file...")
    with zipfile.ZipFile(data_path, 'r') as zip_ref:
        # List files in zip
        file_list = zip_ref.namelist()
        print("Files in dataset.zip:")
        for member_name in file_list:
            print(f"  - {member_name}")

        # Extract to temporary directory
        zip_ref.extractall(extract_dir)

    # Find CSV files. The match is case-insensitive (e.g. "DATA.CSV") and
    # skips the "__MACOSX/" resource-fork entries that macOS adds to archives,
    # which carry a .csv suffix but are not parseable CSV.
    csv_files = [
        f for f in file_list
        if f.lower().endswith('.csv') and not f.startswith('__MACOSX/')
    ]
    if csv_files:
        # Load the first CSV file
        csv_file = csv_files[0]
        csv_path = f'{extract_dir}/{csv_file}'
        try:
            df = pd.read_csv(csv_path)
        except UnicodeDecodeError:
            # pandas decodes as UTF-8 by default; fall back to ISO-8859-1,
            # which can decode any byte sequence.
            # NOTE(review): confirm this matches the dataset's real encoding.
            print("⚠️ File is not valid UTF-8, retrying with ISO-8859-1 encoding...")
            df = pd.read_csv(csv_path, encoding='ISO-8859-1')
        print(f"\n✅ Loaded dataset: {csv_file}")
        print(f"📊 Shape: {df.shape}")
    else:
        print("❌ No CSV files found in the zip archive")
        print("🔄 Creating sample data for demonstration...")
        df = create_sample_data()
else:
    print("❌ Dataset.zip not found.")
    print("🔄 Creating sample data for demonstration...")
    df = create_sample_data()

# Display basic information about the dataset
print("\n📋 Dataset Info:")
# DataFrame.info() writes its summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n📝 First 5 rows:")
print(df.head())
print("\n📊 Dataset shape:", df.shape)