In [4]:
import pandas as pd

# Remember to use the 'r' before the quotes for the file path!
file_path = r"C:\Users\noahi\Downloads\delivery_routes.csv\delivery_routes.csv"
try:
    # We only load the first 5 rows to save your RAM
    df_preview = pd.read_csv(file_path, nrows=5)
    print("✅ File detected!")
    print(f"--- Columns Found ---")
    print(df_preview.columns.tolist())
except Exception as e:
    print(f"❌ Error: {e}")

✅ File detected!
--- Columns Found ---
['delivery_id', 'robot_id', 'start_time', 'start_lat', 'start_lon', 'end_lat', 'end_lon', 'distance', 'duration', 'obstacles_encountered', 'traffic_level', 'end_time']


In [3]:
import pandas as pd
import numpy as np
import os

# 1. Setup Paths
input_path = r"C:\Users\noahi\Downloads\demand_forecasts.csv\demand_forecasts.csv" # Update if name differs
output_path = r"C:\Users\noahi\Downloads\demand_forecasts_PROFESSIONAL.csv"

try:
    df = pd.read_csv(input_path)
    raw_rows = len(df)

    # 2. DEDUPLICATION
    # Removing rows where the same campus, time, and day are repeated
    df = df.drop_duplicates(subset=['timestamp', 'campus_id'])

    # 3. REMOVE BLANKS
    # Drop rows missing the primary order_count or campus_id
    df = df.dropna(subset=['order_count', 'campus_id'])

    # 4. MEDIAN OUTLIER REPLACEMENT
    # Target: order_count (to fix sensor glitches or input errors)
    col = 'order_count'
    col_median = df[col].median()
    
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Replace impossible order counts with the median
    df.loc[(df[col] < lower_bound) | (df[col] > upper_bound), col] = col_median

    # 5. DATA OPTIMIZATION (Laptop Protection)
    # Downcasting numeric columns to save RAM
    if 'campus_id' in df.columns:
        df['campus_id'] = pd.to_numeric(df['campus_id'], downcast='integer')
    df['order_count'] = pd.to_numeric(df['order_count'], downcast='float')

    # 6. FINAL EXPORT
    df.to_csv(output_path, index=False)
    
    print("--- CAMPUS LOGISTICS: CLEANING AUDIT ---")
    print(f"Original Rows:    {raw_rows:,}")
    print(f"Cleaned Rows:     {len(df):,}")
    print(f"Noise Removed:    {raw_rows - len(df):,}")
    print(f"Status:           PROFESSIONAL GRADE ✅")

except Exception as e:
    print(f"❌ Error during cleanup: {e}")

--- CAMPUS LOGISTICS: CLEANING AUDIT ---
Original Rows:    737,904
Cleaned Rows:     726,888
Noise Removed:    11,016
Status:           PROFESSIONAL GRADE ✅


In [9]:
import pandas as pd
import numpy as np
import os

# 1. Setup Paths
input_path = r"C:\Users\noahi\Downloads\delivery_routes.csv\delivery_routes.csv"
output_path = r"C:\Users\noahi\Downloads\delivery_routes_PROFESSIONAL.csv"

try:
    df = pd.read_csv(input_path)
    raw_rows = len(df)

    # 2. DEDUPLICATION
    # Each delivery should be unique
    df = df.drop_duplicates(subset=['delivery_id'])

    # 3. REMOVE BLANKS
    # Drop rows where critical tracking data is missing
    df = df.dropna(subset=['delivery_id', 'robot_id', 'distance', 'duration'])

    # 4. SURGICAL OUTLIER FIX (Numeric Only)
    # This identifies columns with numbers and ignores text like "medium"
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    
    for col in numeric_cols:
        # We don't want to change IDs even if they look like outliers
        if 'id' in col.lower():
            continue
            
        col_median = df[col].median()
        
        # IQR Calculation
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Replace sensor glitches with the median
        df.loc[(df[col] < lower_bound) | (df[col] > upper_bound), col] = col_median

    # 5. LAPTOP PROTECTION (Memory Optimization)
    # Downcast floats and ints to smaller types to save RAM
    df['distance'] = pd.to_numeric(df['distance'], downcast='float')
    df['duration'] = pd.to_numeric(df['duration'], downcast='float')

    # 6. FINAL EXPORT
    df.to_csv(output_path, index=False)
    
    print("--- DELIVERY ROUTES: CLEANING AUDIT ---")
    print(f"Original Rows:    {raw_rows:,}")
    print(f"Cleaned Rows:     {len(df):,}")
    print(f"Glitches Removed: {raw_rows - len(df):,}")
    print(f"Status:           PROFESSIONAL GRADE ✅")

except Exception as e:
    print(f"❌ Error during cleanup: {e}")

--- DELIVERY ROUTES: CLEANING AUDIT ---
Original Rows:    737,904
Cleaned Rows:     735,271
Glitches Removed: 2,633
Status:           PROFESSIONAL GRADE ✅


In [12]:
import os

# Paths for the Delivery Routes files
raw_path = r"C:\Users\noahi\Downloads\delivery_routes.csv\delivery_routes.csv"
clean_path = r"C:\Users\noahi\Downloads\delivery_routes.csv\delivery_routes_PROFESSIONAL.csv"

def get_mb(path):
    return os.path.getsize(path) / (1024 * 1024)

if os.path.exists(raw_path) and os.path.exists(clean_path):
    raw_size = get_mb(raw_path)
    clean_size = get_mb(clean_path)
    
    print("--- DELIVERY ROUTES: SIZE COMPARISON ---")
    print(f"Original File: {raw_size:.2f} MB")
    print(f"Cleaned File:  {clean_size:.2f} MB")
    print(f"Difference:    {clean_size - raw_size:+.2f} MB")
    
    if clean_size > raw_size:
        print("\nNote: Size increased slightly because we filled missing data gaps with Median values.")
    else:
        print("\nNote: Size decreased because we removed thousands of duplicate or 'noisy' rows.")
else:
    print("❌ One of the files was not found. Please check the file names.")

--- DELIVERY ROUTES: SIZE COMPARISON ---
Original File: 131.01 MB
Cleaned File:  118.62 MB
Difference:    -12.39 MB

Note: Size decreased because we removed thousands of duplicate or 'noisy' rows.
