In [1]:
# NOTE(review): bare expression whose value is not stored or used below --
# looks like a leftover kernel smoke test; confirm it can be deleted.
1+7

8

In [4]:
import pandas as pd

# Raw string so the Windows backslashes in the path are kept literally.
file_path = r"C:\project\amazon project\data\amazon_india_complete_2015_2025.csv"

try:
    df = pd.read_csv(file_path)
    print("✅ Dataset Loaded Successfully!")
    print(f"Total Rows: {len(df)}")
except FileNotFoundError:
    print("❌ Error: The file was not found at the specified path.")
    # Raw string here as well: in a normal literal "\a" is the BEL control
    # character and "\p" is an invalid escape sequence, which garbled this hint.
    print(r"Double-check if the folder 'data' exists inside 'C:\project\amazon project\'")

  print("Double-check if the folder 'data' exists inside 'C:\project\amazon project\'")


✅ Dataset Loaded Successfully!
Total Rows: 1127609


In [5]:
#1. The Date Format Challenge
def challenge_1_dates(df):
    """Parse the mixed-format ``order_date`` column into ``datetime64`` values.

    Each value is parsed on its own. ISO-style text (``YYYY-MM-DD``, optionally
    with a time) is read strictly; anything else goes through ``dateutil`` with
    ``dayfirst=True`` so ``05/03/2021`` means 5 March. Missing or unparseable
    values become ``NaT``. The time of day is discarded.

    Args:
        df: DataFrame with an ``order_date`` column. Modified in place.

    Returns:
        The same DataFrame with ``order_date`` converted.
    """
    from datetime import datetime

    from dateutil import parser

    def robust_parse(date_str):
        if pd.isna(date_str):
            return None
        text = str(date_str).strip()
        try:
            # Year-first ISO text is unambiguous; parse it strictly so the
            # day-first preference below can never swap its month and day.
            return datetime.fromisoformat(text).date()
        except ValueError:
            pass
        try:
            return parser.parse(text, dayfirst=True).date()
        except Exception:
            # Still best-effort, but no longer a bare ``except``: Ctrl-C and
            # SystemExit now propagate during a long ``apply``.
            return None

    df['order_date'] = df['order_date'].apply(robust_parse)
    df['order_date'] = pd.to_datetime(df['order_date'])
    return df

In [7]:
#2. The Currency & Symbol Challenge
def challenge_2_prices(df):
    """Convert ``original_price_inr`` to floats, stripping rupee signs and commas.

    Missing values, values containing the word "request" (case-insensitive)
    and anything that still is not a number after stripping become ``0.0``.

    Args:
        df: DataFrame with an ``original_price_inr`` column. Modified in place.

    Returns:
        The same DataFrame with ``original_price_inr`` as floats.
    """
    # Local import: the notebook only imports ``re`` in a later cell, so this
    # function would otherwise raise NameError if called before that cell runs.
    import re

    # U+20B9 is the rupee sign. It is written as an escape because the literal
    # character had been mis-encoded in this file, leaving a character class
    # that never matched the real symbol.
    strip_pattern = re.compile('[\u20b9,]')

    def clean_currency(value):
        if pd.isna(value) or "request" in str(value).lower():
            return 0.0
        cleaned = strip_pattern.sub('', str(value)).strip()
        try:
            return float(cleaned)
        except ValueError:
            return 0.0

    df['original_price_inr'] = df['original_price_inr'].apply(clean_currency)
    return df

In [8]:
#3. The Product Rating Normalization
def challenge_3_ratings(df):
    """Normalise ``product_rating`` to a float on a 0-5 scale.

    Only the first whitespace-separated token of each value is used:

    * ``"4.5"`` / ``"4 stars"``  -> the number itself;
    * ``"4/5"``, ``"8/10"``, ``"90/100"`` -> rescaled from the stated denominator;
    * a bare number above 5 (no denominator) -> assumed to be out of 100 and
      divided by 20;
    * missing, empty or non-numeric values -> ``NaN``.

    Args:
        df: DataFrame with a ``product_rating`` column. Modified in place.

    Returns:
        The same DataFrame with ``product_rating`` as floats.
    """
    # ``float('nan')`` instead of ``np.nan``: numpy is never imported in this
    # notebook, so the old failure path raised NameError.
    missing = float('nan')

    def normalize_rating(rating):
        if pd.isna(rating):
            return missing
        tokens = str(rating).split()
        if not tokens:
            # Empty / whitespace-only text used to raise IndexError.
            return missing
        value_text, _, scale_text = tokens[0].partition('/')
        try:
            val = float(value_text)
        except ValueError:
            return missing
        try:
            scale = float(scale_text)
        except ValueError:
            scale = None
        if scale:
            # Explicit non-zero denominator: honour it rather than guessing.
            return val if scale == 5 else val * 5 / scale
        return val if val <= 5 else val / 20  # bare value > 5: treat as 100-base

    df['product_rating'] = df['product_rating'].apply(normalize_rating)
    return df

In [9]:
#4. City Name Fuzzy Matching
def challenge_4_cities(df):
    """Standardise ``customer_city`` spellings.

    Values are lower-cased and trimmed, known aliases are swapped for their
    canonical name (whole-value matches only), and the result is title-cased.
    Missing values stay missing.

    Args:
        df: DataFrame with a string ``customer_city`` column. Modified in place.

    Returns:
        The same DataFrame.
    """
    canonical_names = {
        'Bengaluru': ('bangalore', 'bengaluru'),
        'Mumbai': ('bombay', 'mumbai'),
        'Delhi': ('new delhi', 'delhi'),
    }
    alias_lookup = {
        alias: canonical
        for canonical, aliases in canonical_names.items()
        for alias in aliases
    }
    normalized = df['customer_city'].str.lower().str.strip()
    df['customer_city'] = normalized.replace(alias_lookup).str.title()
    return df

In [10]:
#5. Handling Discount Outliers
def challenge_5_discounts(df):
    """Reset invalid ``discount_percent`` values to 0.

    Values are coerced to numbers; anything non-numeric or missing becomes 0.
    Discounts above 90 or below 0 are also replaced with 0 -- they are reset,
    not clipped to the nearest bound.

    Args:
        df: DataFrame with a ``discount_percent`` column. Modified in place.

    Returns:
        The same DataFrame.
    """
    pct = pd.to_numeric(df['discount_percent'], errors='coerce').fillna(0)
    # Keep values inside the closed range [0, 90]; everything else becomes 0.
    df['discount_percent'] = pct.where(pct.between(0, 90), 0)
    return df

In [11]:
#6. Logic Verification (Cross-Column Check)
def challenge_6_logic(df):
    """Cross-check ``final_amount_inr`` against price and discount.

    Adds a ``calc_final`` column equal to
    ``original_price_inr * (1 - discount_percent / 100)`` and overwrites
    ``final_amount_inr`` with it wherever the two differ by more than 1.

    NOTE(review): the recomputed value is the price after discount only. If
    ``final_amount_inr`` is meant to include quantity or delivery charges,
    this check would overwrite valid totals -- confirm against the data.

    Args:
        df: DataFrame with numeric ``original_price_inr``,
            ``discount_percent`` and ``final_amount_inr``. Modified in place.

    Returns:
        The same DataFrame, including the helper column ``calc_final``.
    """
    discount_factor = 1 - df['discount_percent'] / 100
    df['calc_final'] = df['original_price_inr'] * discount_factor
    off_by_more_than_one = df['final_amount_inr'].sub(df['calc_final']).abs().gt(1)
    df.loc[off_by_more_than_one, 'final_amount_inr'] = df['calc_final']
    return df

In [12]:
#7. Delivery Days Extraction
def challenge_7_delivery(df, default_days=7):
    """Derive an integer ``delivery_days_clean`` column from ``delivery_days``.

    Rules, applied to the lower-cased text of each value:

    * contains ``"same"``            -> 0;
    * contains one or more numbers   -> their average, truncated to an int
      (so ``"3-5 days"`` -> 4);
    * no number at all (incl. NaN)   -> ``default_days``.

    Args:
        df: DataFrame with a ``delivery_days`` column. Modified in place.
        default_days: Fallback used when no number can be extracted.

    Returns:
        The same DataFrame with the extra ``delivery_days_clean`` column.
    """
    # Local import: ``re`` is only imported in a later notebook cell.
    import re

    # Match decimals as ONE number. The previous ``\d+`` pattern split "5.0"
    # into 5 and 0 and averaged them to 2.
    number_pattern = re.compile(r'\d+(?:\.\d+)?')

    def extract_days(s):
        s = str(s).lower()
        if 'same' in s:
            return 0
        nums = [float(tok) for tok in number_pattern.findall(s)]
        if nums:
            return int(sum(nums) / len(nums))  # average of a range
        return default_days

    df['delivery_days_clean'] = df['delivery_days'].apply(extract_days)
    return df

In [13]:
#8. Categorical Grouping
def challenge_8_categories(df):
    """Add a coarse ``category_group`` column derived from ``category``.

    The lower-cased category text is checked against keyword groups in order;
    the first group with a matching substring wins, otherwise ``'Others'``.

    Args:
        df: DataFrame with a ``category`` column. Modified in place.

    Returns:
        The same DataFrame with the extra ``category_group`` column.
    """
    # Order matters: earlier groups take precedence over later ones.
    keyword_groups = (
        (('phone', 'mobile'), 'Electronics'),
        (('shirt', 'wear'), 'Fashion'),
        (('cook', 'home'), 'Home & Kitchen'),
    )

    def simplify_cat(cat):
        label = str(cat).lower()
        for keywords, group in keyword_groups:
            if any(keyword in label for keyword in keywords):
                return group
        return 'Others'

    df['category_group'] = df['category'].apply(simplify_cat)
    return df

In [14]:
#9. Duplicate Transaction Removal
def challenge_9_duplicates(df):
    """Drop duplicate rows, then keep one row per ``transaction_id``.

    Exact duplicate rows are removed first. If a ``transaction_id`` column
    exists, only the row with the latest ``order_date`` is kept for each id;
    rows with a missing date are only kept when no dated row exists for that
    id. Without an ``order_date`` column the last occurrence is kept.

    Args:
        df: Input DataFrame. Not modified; a new frame is returned.

    Returns:
        De-duplicated DataFrame (sorted by ``order_date`` when that column and
        ``transaction_id`` are both present, missing dates first).
    """
    df = df.drop_duplicates()
    if 'transaction_id' in df.columns:
        if 'order_date' in df.columns:
            # na_position='first': with the default ('last') a row with a
            # missing date sorted to the end and won ``keep='last'`` over a
            # properly dated row. kind='stable' makes ties deterministic.
            df = df.sort_values('order_date', kind='stable', na_position='first')
        df = df.drop_duplicates(subset=['transaction_id'], keep='last')
    return df

In [15]:
#10. Missing Value Imputation (Prime Membership)
def challenge_10_impute(df):
    """Convert ``is_prime_member`` to real booleans; missing means ``False``.

    A value is ``True`` when its trimmed, lower-cased text is one of
    ``true`` / ``1`` / ``yes`` / ``y`` or when it is numerically equal to 1
    (so the float ``1.0`` counts). Everything else, including missing values,
    is ``False``.

    Args:
        df: DataFrame with an ``is_prime_member`` column. Modified in place.

    Returns:
        The same DataFrame.
    """
    truthy = {'true', '1', 'yes', 'y'}

    def as_bool(value):
        # Missing values map straight to False, which makes a separate
        # ``fillna(False)`` pass over the column unnecessary.
        if pd.isna(value):
            return False
        text = str(value).strip().lower()  # strip: ' Yes ' used to be False
        if text in truthy:
            return True
        try:
            # A float column holds 1.0, whose text '1.0' is not in ``truthy``.
            return float(text) == 1.0
        except ValueError:
            return False

    df['is_prime_member'] = df['is_prime_member'].apply(as_bool)
    return df

In [17]:
import pandas as pd
import re

# --- STEP 1: DEFINE HELPER FUNCTIONS ---

def price_to_float(price_val):
    """Return a price as a float, ignoring everything but digits and dots.

    Missing values, blank strings and text that is not a valid number once
    the other characters are removed all yield ``0.0``.
    """
    if pd.isna(price_val):
        return 0.0
    text = str(price_val)
    if not text.strip():
        return 0.0
    # Drop every character that is neither a digit nor a decimal point.
    digits_only = re.sub(r'[^\d.]', '', text)
    try:
        number = float(digits_only)
    except ValueError:
        number = 0.0
    return number

def rating_to_float(rating_val):
    """Return the leading number of a rating (``'4.2 stars'`` -> ``4.2``).

    Only the first whitespace-separated token is converted. Missing values,
    empty strings and tokens that are not plain numbers yield ``None``.
    """
    if not pd.isna(rating_val):
        try:
            leading_token = str(rating_val).split()[0]
            return float(leading_token)
        except (ValueError, IndexError):
            pass
    return None

def clean_city(city_val):
    """Return the city name trimmed and title-cased; ``'Unknown'`` if missing."""
    if not pd.isna(city_val):
        return str(city_val).strip().title()
    return "Unknown"

def to_bool(val):
    """Convert strings/numbers to a proper Boolean.

    ``True`` when the trimmed, lower-cased text is one of ``true`` / ``1`` /
    ``yes`` / ``y`` / ``prime`` or when the value is numerically equal to 1
    (so the float ``1.0`` counts). Missing values and everything else are
    ``False``.
    """
    if pd.isna(val):
        return False
    text = str(val).lower().strip()
    if text in ('true', '1', 'yes', 'y', 'prime'):
        return True
    try:
        # A float column holds 1.0, whose text '1.0' is not in the list above.
        return float(text) == 1.0
    except ValueError:
        return False

# --- STEP 2: APPLY CLEANING LOGIC ---

print("Starting data cleaning...")

# Every step is skipped when its source column is absent and writes to a new
# ``*_clean`` column, so the raw values stay available for comparison.

# Price Cleaning
if 'original_price_inr' in df.columns:
    df['original_price_inr_clean'] = df['original_price_inr'].apply(price_to_float)

# Rating Cleaning
if 'product_rating' in df.columns:
    df['product_rating_clean'] = df['product_rating'].apply(rating_to_float)

# City Cleaning
if 'customer_city' in df.columns:
    df['customer_city_clean'] = df['customer_city'].apply(clean_city)

# Boolean Columns
bool_cols = ['is_prime_member', 'is_prime_eligible', 'is_festival_sale']
for col in bool_cols:
    if col in df.columns:
        df[col + '_clean'] = df[col].apply(to_bool)

print("✅ Data Cleaning Complete!")
# Preview only the cleaned columns that were actually created above; indexing
# the fixed list directly raised KeyError whenever a source column was missing.
preview_cols = [
    name
    for name in ('original_price_inr_clean', 'product_rating_clean', 'customer_city_clean')
    if name in df.columns
]
print(df[preview_cols].head())

Starting data cleaning...
✅ Data Cleaning Complete!
   original_price_inr_clean  product_rating_clean customer_city_clean
0                  27340.84                   3.5             Kolkata
1                  32907.49                   4.5            Ludhiana
2                  47052.18                   4.3           Bangalore
3                 238725.44                   3.6           Bangalore
4                  25970.76                   3.7               Kochi


In [18]:
#2 Brand and Date Normalization
# === BRAND NORMALIZATION ===
if 'brand' in df.columns:
    # Lower-case, trim, drop every character that is not a-z, 0-9 or a space,
    # then collapse known aliases (whole-value matches only).
    df['brand_clean'] = (
        df['brand']
        .str.lower()
        .str.strip()
        .str.replace(r'[^a-z0-9 ]', '', regex=True)
        .replace({
            'samsung electronics': 'samsung',
            'one plus': 'oneplus',
            'mi': 'xiaomi'
        })
    )

# === DERIVED COLUMNS (DATES) ===
# Skipped entirely when ``order_date_clean`` has not been created yet.
if 'order_date_clean' in df.columns:
    df['order_year'] = df['order_date_clean'].dt.year
    df['order_month'] = df['order_date_clean'].dt.month
    # weekday: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    df['is_weekend'] = df['order_date_clean'].dt.weekday.ge(5)

In [19]:
#3. Outlier Handling and Deduplication
# === DISCOUNT & QUANTITY OUTLIERS ===
if 'discount_percent' in df.columns:
    # Discounts above 90 are blanked out (set to missing).
    df.loc[df['discount_percent'].gt(90), 'discount_percent'] = None

if 'quantity' in df.columns:
    # Quantities at or below 0, or above 20, are replaced with 1.
    df.loc[df['quantity'].le(0) | df['quantity'].gt(20), 'quantity'] = 1

# === CUSTOMER DEDUPLICATION ===
if 'customer_email' in df.columns:
    # One integer group number per distinct e-mail address.
    # NOTE(review): this replaces any existing ``customer_id`` column -- confirm.
    df['customer_id'] = df.groupby('customer_email').ngroup()

# === TRANSACTION DUPLICATES ===
if 'transaction_id' in df.columns:
    # Keeps the first row seen for each transaction id.
    df = df.drop_duplicates(subset=['transaction_id'])

In [21]:
# 4. Category Cleanup
# === CATEGORY CLEANUP ===
# Runs only when a ``category_clean`` column already exists.
if 'category_clean' in df.columns:
    category_map = {
        'mobiles & accessories': 'Mobiles',
        'mobile accessories': 'Mobiles',
        'home & kitchen': 'Home'
    }
    # Look up the lower-cased value; where there is no mapping, keep the
    # original (un-lowered) value.
    lowered = df['category_clean'].str.lower()
    remapped = lowered.map(category_map)
    df['category_clean'] = remapped.fillna(df['category_clean'])

In [23]:
# Check how many nulls are in each column
print("--- Missing Values Before Handling ---")
print(df.isna().sum())

# Fill gaps in the cleaned columns instead of dropping rows:
# 0.0 for the numeric columns, 'Unknown' for the city.
for _column, _default in (
    ('original_price_inr_clean', 0.0),
    ('product_rating_clean', 0.0),
    ('customer_city_clean', 'Unknown'),
):
    df[_column] = df[_column].fillna(_default)

# Re-count; only the three columns filled above are affected.
print("\n--- Missing Values After Handling ---")
print(df.isna().sum())

--- Missing Values Before Handling ---
transaction_id                   0
order_date                       0
customer_id                      0
product_id                       0
product_name                     0
category                         0
subcategory                      0
brand                            0
original_price_inr               0
discount_percent                 0
discounted_price_inr             0
quantity                         0
subtotal_inr                     0
delivery_charges             90201
final_amount_inr                 0
customer_city                    0
customer_state                   0
customer_tier                    0
customer_spending_tier           0
customer_age_group          135315
payment_method                   0
delivery_days                    0
delivery_type                    0
is_prime_member                  0
is_festival_sale                 0
festival_name               777736
customer_rating             341696
return_status   

In [7]:
# Parse the raw order dates. ``format='mixed'`` infers the format of each value
# individually (requires pandas >= 2.0), ``dayfirst=True`` reads ambiguous
# dates as DD/MM, and ``errors='coerce'`` turns unparseable values into NaT.
df['order_date_clean'] = pd.to_datetime(df['order_date'], format='mixed', dayfirst=True, errors='coerce')

# Drop only the rows whose date could not be parsed. ``.copy()`` gives the
# filtered frame its own data, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=['order_date_clean']).copy()

# Re-extract the calendar features from the cleaned dates.
df['order_year'] = df['order_date_clean'].dt.year
df['order_month'] = df['order_date_clean'].dt.month
df['is_weekend'] = df['order_date_clean'].dt.weekday >= 5  # Sat=5, Sun=6

print(f"✅ Fixed! New row count: {len(df)}")
print(f"Remaining NaNs in year: {df['order_year'].isna().sum()}")

✅ Fixed! New row count: 1127609
Remaining NaNs in year: 0


In [8]:
import pandas as pd
import os

# --- STEP 1: LOAD THE DATA ---
# NOTE(review): this re-reads the raw CSV and rebinds ``df``, so columns that
# earlier cells added to the previous ``df`` (e.g. the ``*_clean`` columns) are
# not part of the file saved below -- confirm this is intended.
file_path = r"C:\project\amazon project\data\amazon_india_complete_2015_2025.csv"

try:
    df = pd.read_csv(file_path)
    print("✅ Dataset Loaded Successfully!")

    # --- STEP 2: HANDLE MISSING VALUES ---
    # Fix delivery charges (assume NaN is Free Delivery)
    if 'delivery_charges' in df.columns:
        df['delivery_charges'] = df['delivery_charges'].fillna(0.0)

    # Fix age group (label NaN as Unknown)
    if 'customer_age_group' in df.columns:
        df['customer_age_group'] = df['customer_age_group'].fillna('Unknown')

    print("✅ Missing values handled!")

    # --- STEP 3: SAVE THE FINAL CSV ---
    save_path = r"C:\project\amazon project\data\amazon_india_cleaned_final.csv"

    # Save without the index column; UTF-8 so non-ASCII text survives.
    df.to_csv(save_path, index=False, encoding='utf-8')

    print("-" * 30)
    print("🎉 SUCCESS! FINAL CSV SAVED")
    print(f"📍 Path: {save_path}")
    print(f"📊 Total Rows: {len(df)}")
    print("-" * 30)

except FileNotFoundError:
    print(f"❌ Error: Could not find the file at {file_path}")
except Exception as e:
    # Deliberately broad: any other failure while loading or saving is
    # reported instead of aborting the notebook run.
    print(f"❌ An error occurred: {e}")

✅ Dataset Loaded Successfully!
✅ Missing values handled!
------------------------------
🎉 SUCCESS! FINAL CSV SAVED
📍 Path: C:\project\amazon project\data\amazon_india_cleaned_final.csv
📊 Total Rows: 1127609
------------------------------


In [5]:
 # Shows the count of missing values for each column
print(df.isna().sum())

transaction_id                 0
order_date                     0
customer_id                    0
product_id                     0
product_name                   0
category                       0
subcategory                    0
brand                          0
original_price_inr             0
discount_percent               0
discounted_price_inr           0
quantity                       0
subtotal_inr                   0
delivery_charges               0
final_amount_inr               0
customer_city                  0
customer_state                 0
customer_tier                  0
customer_spending_tier         0
customer_age_group             0
payment_method                 0
delivery_days                  0
delivery_type                  0
is_prime_member                0
is_festival_sale               0
festival_name             777736
customer_rating           341696
return_status                  0
order_month                    0
order_year                     0
order_quar