In [3]:
import os
import pandas as pd

# 1. Let's see what is actually in your Downloads
path = r"C:\Users\noahi\Downloads\delivery_cost_comparison\delivery_cost_comparison.csv"
files = [f for f in os.listdir(path) if f.endswith('.csv')]

print("--- Files found in Downloads ---")
for i, f in enumerate(files):
    print(f"{i}: {f}")

# 2. Automatically try to grab the most recent one that ISN'T our clean files
# (Adjust the index [i] if it picks the wrong one)
try:
    target_file = os.path.join(path, files[-1]) 
    print(f"\nAttempting to open: {target_file}")
    
    df_preview = pd.read_csv(target_file, nrows=5)
    print("\n--- Success! Columns found: ---")
    print(df_preview.columns.tolist())
except Exception as e:
    print(f"\n‚ùå Error: {e}")

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\noahi\\Downloads\\delivery_cost_comparison\\delivery_cost_comparison.csv'

In [4]:
import pandas as pd

# We point directly to the file, not the folder
target_file = r"C:\Users\noahi\Downloads\delivery_cost_comparison\delivery_cost_comparison.csv"

try:
    # Let's peek at the first 5 rows
    df_preview = pd.read_csv(target_file, nrows=5)
    print("--- Success! Dataset Loaded ---")
    print("\nColumns:", df_preview.columns.tolist())
    print("\nSample Data:\n", df_preview.head())
    
    # Check the total size while we are at it
    # This helps us know if we need the 'Streaming Method'
    total_rows = pd.read_csv(target_file, usecols=[0]).shape[0]
    print(f"\nTotal Row Count: {total_rows:,}")

except Exception as e:
    print(f"‚ùå Still having trouble: {e}")

‚ùå Still having trouble: [Errno 2] No such file or directory: 'C:\\Users\\noahi\\Downloads\\delivery_cost_comparison\\delivery_cost_comparison.csv'


In [7]:
import pandas as pd
import os

# Let's try the direct file path
file_path = r"C:\Users\noahi\Downloads\delivery_cost_comparison"

if os.path.exists(file_path):
    print("üéØ File Found! Loading preview...")
    df = pd.read_csv(file_path, nrows=5)
    print("\n--- Columns ---")
    print(df.columns.tolist())
    print("\n--- Data Sample ---")
    print(df)
else:
    print("‚ùå Path is still wrong. Let's check the Downloads folder for the real name:")
    downloads = r"C:\Users\noahi\Downloads"
    # This will list only FOLDERS in your downloads to find the right one
    folders = [f for f in os.listdir(downloads) if os.path.isdir(os.path.join(downloads, f))]
    print(f"Folders found in Downloads: {folders}")

‚ùå Path is still wrong. Let's check the Downloads folder for the real name:
Folders found in Downloads: ['.ipynb_checkpoints', 'Application', 'CDIR', 'Certificate & Evalution', 'Communication', 'DA YUU', 'daikibo-telemetry-data.json', 'delivery_cost_comparison.csv', 'delivery_financials.csv', 'delivery_reliability.csv', 'Excel in work place', 'Reflection Report - 1756243885133_files', 'scaling_forecasts.csv', 'SIP', 'Software', 'Tech Essential']


In [8]:
import pandas as pd

# Using your exact verified path
path = r"C:\Users\noahi\Downloads\delivery_cost_comparison.csv\delivery_cost_comparison.csv"

try:
    df_preview = pd.read_csv(path, nrows=5)
    print("--- 4th Dataset: Comparison Data ---")
    print(f"Columns: {df_preview.columns.tolist()}")
    print("\n--- Data Sample ---")
    print(df_preview.head())
except Exception as e:
    print(f"‚ùå Error: {e}")

--- 4th Dataset: Comparison Data ---
Columns: ['delivery_id', 'timestamp', 'type', 'cost', 'time_taken', 'distance']

--- Data Sample ---
   delivery_id            timestamp   type       cost  time_taken  distance
0          1.0  2024-11-06 18:32:00  human  16.300128   28.276063  2.809946
1          2.0  2025-05-26 10:26:00  robot   6.189864   19.796571  0.694347
2          3.0  2022-02-27 03:01:00  robot   2.452831   25.755434  4.849879
3          4.0  2022-08-13 04:12:00  robot  17.155278   20.705736  2.121815
4          5.0  2021-02-07 04:09:00  human  17.500188   22.654103  3.174608


In [10]:
import pandas as pd
import numpy as np

# Your verified double-extension path
input_path = r"C:\Users\noahi\Downloads\delivery_cost_comparison.csv\delivery_cost_comparison.csv"
output_path = r"C:\Users\noahi\Downloads\delivery_cost_comparison_PROFESSIONAL.csv"

# 1. Load Data
df = pd.read_csv(input_path)

# 2. DROP BLANKS (Removes any row that is completely empty or missing critical data)
df = df.dropna(how='any') 

# 3. Basic Cleanup
df = df.drop_duplicates(subset=['delivery_id'])
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# FIX: Added .str before .strip() to avoid that AttributeError
df['type'] = df['type'].str.lower().str.strip() 

# 4. Surgical Outlier Removal (The IQR Method)
for col in ['cost', 'time_taken', 'distance']:
    # Ensure column is numeric
    df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Find the "normal" range
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Replace "999 million" type placeholders with the median
    median_val = df[col].median()
    df.loc[(df[col] < lower_bound) | (df[col] > upper_bound), col] = median_val
    
    # Round for professional readability
    df[col] = df[col].round(2)

# 5. Save
df.to_csv(output_path, index=False)
print("‚úÖ Comparison Dataset Cleaned, Blanks Dropped, and Rounded!")

# 6. The "Big Reveal": Robot vs Human
print("\n--- FINAL COST COMPARISON AUDIT ---")
print(df.groupby('type')[['cost', 'time_taken', 'distance']].mean())

‚úÖ Comparison Dataset Cleaned, Blanks Dropped, and Rounded!

--- FINAL COST COMPARISON AUDIT ---
              cost  time_taken  distance
type                                    
human    10.484397   17.493650  2.551748
invalid  10.532591   17.163290  2.585946
robot    10.497891   17.491233  2.548761


In [11]:
import pandas as pd

# Load our newly cleaned professional file
path = r"C:\Users\noahi\Downloads\delivery_cost_comparison_PROFESSIONAL.csv"
df = pd.read_csv(path)

# 1. Remove the "invalid" rows
df = df[df['type'] != 'invalid']

# 2. Let's look at the MEDIAN instead of the Mean
print("\n--- REFINED AUDIT (MEDIANS) ---")
print(df.groupby('type')[['cost', 'time_taken', 'distance']].median())

# 3. Let's see the Count - how many of each do we actually have?
print("\n--- DATA VOLUME ---")
print(df['type'].value_counts())


--- REFINED AUDIT (MEDIANS) ---
        cost  time_taken  distance
type                              
human  10.48       17.49      2.55
robot  10.48       17.49      2.55

--- DATA VOLUME ---
type
human    414321
robot    412591
Name: count, dtype: int64


In [12]:
import os

orig_path = r"C:\Users\noahi\Downloads\delivery_cost_comparison.csv\delivery_cost_comparison.csv"
final_path = r"C:\Users\noahi\Downloads\delivery_cost_comparison_PROFESSIONAL.csv"

def get_mb(path):
    return f"{os.path.getsize(path) / (1024 * 1024):.2f} MB"

print(f"{'Version':<20} | {'File Size':<10}")
print("-" * 35)
print(f"{'Original Raw':<20} | {get_mb(orig_path)}")
print(f"{'Professional Final':<20} | {get_mb(final_path)}")

Version              | File Size 
-----------------------------------
Original Raw         | 71.91 MB
Professional Final   | 40.97 MB


In [13]:
import pandas as pd

# Path to your newest file (adjusting for the typo just in case)
path = r"C:\Users\noahi\Downloads\customer_satisfaction_metrics.csv\customer_satisfaction_metrics.csv"

try:
    df_preview = pd.read_csv(path, nrows=5)
    print("--- 6th Dataset: Customer Satisfaction ---")
    print(f"Columns: {df_preview.columns.tolist()}")
    print("\n--- Data Sample ---")
    print(df_preview)
except Exception as e:
    print(f"‚ùå Error: {e}. Check if the file is named 'customer_satisfaction_metrics.csv' or 'cumstomer_satisfaction_metrics.csv'")

--- 6th Dataset: Customer Satisfaction ---
Columns: ['customer_id', 'timestamp', 'rating', 'retention_flag', 'feedback_length']

--- Data Sample ---
   customer_id            timestamp  rating  retention_flag  feedback_length
0       8737.0  2020-11-09 16:45:00     4.0             1.0             99.0
1       4989.0  2025-09-16 02:14:00     5.0             1.0            168.0
2       2928.0  2024-07-15 09:22:00     5.0             1.0            170.0
3       9810.0  2021-10-20 13:54:00     4.0             1.0            131.0
4       1650.0  2020-05-03 02:53:00     1.0             1.0            197.0


In [14]:
import pandas as pd
import numpy as np

# Path to your file
input_path = r"C:\Users\noahi\Downloads\customer_satisfaction_metrics.csv\customer_satisfaction_metrics.csv"
output_path = r"C:\Users\noahi\Downloads\customer_satisfaction_PROFESSIONAL.csv"

# 1. Load Data
df = pd.read_csv(input_path)

# 2. Drop rows with missing Customer IDs or Ratings
df = df.dropna(subset=['customer_id', 'rating'])

# 3. Clean the Ratings (Force 1-5 scale)
# If a rating is outside this, we'll cap it at the nearest bound
df['rating'] = df['rating'].clip(lower=1, upper=5).round(0).astype(int)

# 4. Clean Retention Flag (Ensure it's just 0 or 1)
df['retention_flag'] = df['retention_flag'].apply(lambda x: 1 if x >= 0.5 else 0)

# 5. Scrub Feedback Length Outliers (IQR Method)
Q1 = df['feedback_length'].quantile(0.25)
Q3 = df['feedback_length'].quantile(0.75)
IQR = Q3 - Q1
upper_limit = Q3 + 1.5 * IQR

# Replace reviews longer than the limit with the median length
median_len = df['feedback_length'].median()
df.loc[df['feedback_length'] > upper_limit, 'feedback_length'] = median_len
df['feedback_length'] = df['feedback_length'].round(0).astype(int)

# 6. Save
df.to_csv(output_path, index=False)
print("‚úÖ Satisfaction Metrics Cleaned!")

# 7. Quick Professional Insight
print("\n--- SATISFACTION SUMMARY ---")
print(f"Average Rating: {df['rating'].mean():.2f} / 5.0")
print(f"Retention Rate: {df['retention_flag'].mean()*100:.1f}%")
print("\n--- Ratings Distribution ---")
print(df['rating'].value_counts().sort_index())

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [18]:
import pandas as pd
import numpy as np

# Path to your file
input_path = r"C:\Users\noahi\Downloads\customer_satisfaction_metrics.csv\customer_satisfaction_metrics.csv"
output_path = r"C:\Users\noahi\Downloads\customer_satisfaction_PROFESSIONAL.csv"

# 1. Load Data
df = pd.read_csv(input_path)

# 2. DROP ALL BLANKS AND INFINITIES (The Fix)
# This removes rows with missing values in critical columns
df = df.dropna(subset=['customer_id', 'rating', 'retention_flag', 'feedback_length'])

# Also replace any hidden 'inf' with the median to prevent casting errors
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# 3. Clean the Ratings (Force 1-5 scale)
df['rating'] = df['rating'].clip(lower=1, upper=5).round(0).astype(int)

# 4. Clean Retention Flag (Ensure it's just 0 or 1)
df['retention_flag'] = df['retention_flag'].round(0).astype(int)

# 5. Scrub Feedback Length Outliers
Q1 = df['feedback_length'].quantile(0.25)
Q3 = df['feedback_length'].quantile(0.75)
IQR = Q3 - Q1
upper_limit = Q3 + 1.5 * IQR

median_len = df['feedback_length'].median()
df.loc[df['feedback_length'] > upper_limit, 'feedback_length'] = median_len
df['feedback_length'] = df['feedback_length'].round(0).astype(int)

# 6. Save
df.to_csv(output_path, index=False)
print("‚úÖ Satisfaction Metrics Cleaned and Blank Rows Removed!")

# 7. Professional Insight Audit
print("\n--- FINAL SATISFACTION AUDIT ---")
print(f"Total Responses: {len(df):,}")
print(f"Average Rating: {df['rating'].mean():.2f} / 5.0")
print(f"Retention Rate: {df['retention_flag'].mean()*100:.1f}%")

‚úÖ Satisfaction Metrics Cleaned and Blank Rows Removed!

--- FINAL SATISFACTION AUDIT ---
Total Responses: 823,588
Average Rating: 3.00 / 5.0
Retention Rate: -702055416933944448.0%


In [19]:
import pandas as pd
import numpy as np

# Path to your file
input_path = r"C:\Users\noahi\Downloads\customer_satisfaction_metrics.csv\customer_satisfaction_metrics.csv"
output_path = r"C:\Users\noahi\Downloads\customer_satisfaction_PROFESSIONAL.csv"

# 1. Load Data
df = pd.read_csv(input_path)

# 2. DROP DUPLICATES (Immediate cleanup)
df = df.drop_duplicates()

# 3. CONVERT TO NUMERIC (The Fix)
# This turns anything weird into "NaN" so we can drop it easily
cols_to_fix = ['rating', 'retention_flag', 'feedback_length']
for col in cols_to_fix:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. DROP ALL BLANKS AND INFINITIES
# This removes the rows that were causing your IntCastingNaNError
df = df.dropna(subset=cols_to_fix)
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=cols_to_fix)

# 5. HARD FILTER (The Ghost-Buster)
# This removes the crazy numbers that broke your Retention Rate
df = df[(df['rating'] >= 1) & (df['rating'] <= 5)]
df = df[df['retention_flag'].isin([0, 1])]

# 6. NOW cast to Integer (This will not fail now)
df['rating'] = df['rating'].astype(int)
df['retention_flag'] = df['retention_flag'].astype(int)
df['feedback_length'] = df['feedback_length'].astype(int)

# 7. Save
df.to_csv(output_path, index=False)
print("‚úÖ NUCLEAR CLEAN COMPLETE!")

# 8. FINAL AUDIT
print("\n--- FINAL SATISFACTION AUDIT ---")
print(f"Total Valid Rows: {len(df):,}")
print(f"Average Rating: {df['rating'].mean():.2f} / 5.0")
print(f"Retention Rate: {df['retention_flag'].mean()*100:.2f}%")

‚úÖ NUCLEAR CLEAN COMPLETE!

--- FINAL SATISFACTION AUDIT ---
Total Valid Rows: 826,834
Average Rating: 3.00 / 5.0
Retention Rate: 89.98%


In [20]:
import os
import pandas as pd

# Paths to your files
raw_path = r"C:\Users\noahi\Downloads\customer_satisfaction_metrics.csv\customer_satisfaction_metrics.csv"
clean_path = r"C:\Users\noahi\Downloads\customer_satisfaction_PROFESSIONAL.csv"

def get_stats(file_path):
    # Get size in Megabytes
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    # Get row count
    df = pd.read_csv(file_path)
    rows = len(df)
    return size_mb, rows

# Execute Audit
raw_size, raw_rows = get_stats(raw_path)
clean_size, clean_rows = get_stats(clean_path)

print("--- FILE SIZE & INTEGRITY AUDIT ---")
print(f"{'Metric':<20} | {'Raw File':<15} | {'Clean File':<15}")
print("-" * 55)
print(f"{'Size (MB)':<20} | {raw_size:>12.2f} MB | {clean_size:>12.2f} MB")
print(f"{'Row Count':<20} | {raw_rows:>15,} | {clean_rows:>15,}")
print(f"{'Rows Removed':<20} | {'-':>15} | {raw_rows - clean_rows:>15,}")

--- FILE SIZE & INTEGRITY AUDIT ---
Metric               | Raw File        | Clean File     
-------------------------------------------------------
Size (MB)            |        32.08 MB |        27.81 MB
Row Count            |         835,071 |         826,834
Rows Removed         |               - |           8,237


In [22]:
import pandas as pd
import numpy as np

# Paths
input_path = r"C:\Users\noahi\Downloads\customer_satisfaction_metrics.csv\customer_satisfaction_metrics.csv"
output_path = r"C:\Users\noahi\Downloads\customer_satisfaction_PROFESSIONAL.csv"

# 1. Load and Remove Duplicates immediately
df = pd.read_csv(input_path)
df = df.drop_duplicates()

# 2. Drop Blank Rows for critical IDs
# We drop if customer_id or timestamp is blank because we can't "guess" those
df = df.dropna(subset=['customer_id', 'timestamp'])

# 3. Handle Numerical Columns (Rating & Feedback Length)
cols_to_fix = ['rating', 'feedback_length', 'retention_flag']

for col in cols_to_fix:
    # Convert to numeric, forcing errors to NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Calculate the Median for this column
    col_median = df[col].median()
    
    # Identify Outliers (IQR Method)
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # REPLACEMENT LOGIC:
    # If it's a NaN OR an outlier, replace with the Median
    df.loc[(df[col].isna()) | (df[col] < lower_bound) | (df[col] > upper_bound), col] = col_median

# 4. Final Type Fixing (Clean whole numbers)
df['rating'] = df['rating'].round(0).astype(int)
df['retention_flag'] = df['retention_flag'].round(0).astype(int)
df['feedback_length'] = df['feedback_length'].round(0).astype(int)

# 5. Save and Audit
df.to_csv(output_path, index=False)
print("‚úÖ SUCCESS: Blanks removed, Duplicates dropped, and Outliers replaced with Medians!")

# Before/After Size Check
import os
raw_size = os.path.getsize(input_path) / (1024 * 1024)
clean_size = os.path.getsize(output_path) / (1024 * 1024)

print(f"\n--- SIZE REPORT ---")
print(f"Raw File Size:   {raw_size:.2f} MB")
print(f"Clean File Size: {clean_size:.2f} MB")
print(f"Final Row Count: {len(df):,}")

‚úÖ SUCCESS: Blanks removed, Duplicates dropped, and Outliers replaced with Medians!

--- SIZE REPORT ---
Raw File Size:   32.08 MB
Clean File Size: 27.99 MB
Final Row Count: 830,540


In [1]:
import pandas as pd

# Update this filename for whichever of the 5 you want to do next
file_name = "customer_behavior.csv" 
path = rf"C:\Users\noahi\Downloads\customer_behavior.csv\customer_behavior.csv"

df_preview = pd.read_csv(path, nrows=5)
print(f"--- Dataset: {file_name} ---")
print(f"Columns: {df_preview.columns.tolist()}")
print("\n--- Data Sample ---")
print(df_preview.head())

--- Dataset: customer_behavior.csv ---
Columns: ['customer_id', 'timestamp', 'action_type', 'item_id', 'session_duration', 'device_type']

--- Data Sample ---
   customer_id            timestamp action_type  item_id  session_duration  \
0        571.0  2023-04-08 21:00:00       order    210.0         23.146134   
1        407.0  2022-10-20 03:38:00     abandon    176.0        250.761297   
2       6270.0  2023-08-30 19:48:00        view    499.0        134.534415   
3       9844.0  2023-01-10 17:17:00     abandon     72.0        214.301357   
4       1597.0  2024-11-05 21:21:00     abandon    276.0        276.243561   

  device_type  
0         app  
1      mobile  
2     desktop  
3     desktop  
4         app  


In [2]:
import pandas as pd
import numpy as np

# 1. Retrieve the data
input_path = r"C:\Users\noahi\Downloads\customer_behavior.csv\customer_behavior.csv"
output_path = r"C:\Users\noahi\Downloads\customer_behavior_PROFESSIONAL.csv"

df = pd.read_csv(input_path)
print(f"Initial Rows: {len(df):,}")

# 2. Drop Duplicates
df = df.drop_duplicates()

# 3. Remove Blanks
# We drop rows if essential IDs (who, what, when) are missing
df = df.dropna(subset=['customer_id', 'timestamp', 'action_type'])

# 4. Replace Outliers with Median (for 'session_duration')
# First, ensure it's a number
df['session_duration'] = pd.to_numeric(df['session_duration'], errors='coerce')

# Fill actual NaN values with median before outlier detection
duration_median = df['session_duration'].median()
df['session_duration'] = df['session_duration'].fillna(duration_median)

# Use IQR to find "Glitch" durations (like 999999 seconds)
Q1 = df['session_duration'].quantile(0.25)
Q3 = df['session_duration'].quantile(0.75)
IQR = Q3 - Q1
upper_limit = Q3 + 1.5 * IQR

# Replace any duration above the limit with the median
df.loc[df['session_duration'] > upper_limit, 'session_duration'] = duration_median

# 5. Final Formatting
df['session_duration'] = df['session_duration'].round(1) # Keeping one decimal for seconds

# Save the professional version
df.to_csv(output_path, index=False)

print("\n--- CLEANING COMPLETE ---")
print(f"Final Row Count: {len(df):,}")
print(f"Median Session Duration: {duration_median:.2f} seconds")

Initial Rows: 737,904


  values = self.values.round(decimals)  # type: ignore[union-attr]



--- CLEANING COMPLETE ---
Final Row Count: 734,038
Median Session Duration: 150.39 seconds


In [4]:
import pandas as pd

# Path to the reliability dataset
path = r"C:\Users\noahi\Downloads\customer_behavior.csv\customer_behavior.csv"

df_preview = pd.read_csv(path, nrows=5)
print(f"--- Dataset: delivery_reliability.csv ---")
print(f"Columns: {df_preview.columns.tolist()}")
print("\n--- Data Sample ---")
print(df_preview.head())

--- Dataset: delivery_reliability.csv ---
Columns: ['customer_id', 'timestamp', 'action_type', 'item_id', 'session_duration', 'device_type']

--- Data Sample ---
   customer_id            timestamp action_type  item_id  session_duration  \
0        571.0  2023-04-08 21:00:00       order    210.0         23.146134   
1        407.0  2022-10-20 03:38:00     abandon    176.0        250.761297   
2       6270.0  2023-08-30 19:48:00        view    499.0        134.534415   
3       9844.0  2023-01-10 17:17:00     abandon     72.0        214.301357   
4       1597.0  2024-11-05 21:21:00     abandon    276.0        276.243561   

  device_type  
0         app  
1      mobile  
2     desktop  
3     desktop  
4         app  


In [5]:
import pandas as pd
import numpy as np

# 1. Retrieve the data
input_path = r"C:\Users\noahi\Downloads\customer_behavior.csv\customer_behavior.csv"
output_path = r"C:\Users\noahi\Downloads\delivery_reliability_PROFESSIONAL.csv"

try:
    df = pd.read_csv(input_path)
    initial_count = len(df)

    # 2. Step: Remove Duplicates
    df = df.drop_duplicates()
    after_dup_count = len(df)

    # 3. Step: Remove Blank Rows
    # We drop rows where delivery_id or status is missing
    df = df.dropna(subset=['delivery_id', 'status'])

    # 4. Step: Replace Outliers with Median
    # Identifying numeric columns automatically
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    for col in numeric_cols:
        col_median = df[col].median()
        
        # IQR Outlier Detection
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        
        # Swap outliers/NaNs with the median
        df.loc[(df[col] < lower_limit) | (df[col] > upper_limit) | (df[col].isna()), col] = col_median

    # 5. Save the Professional version
    df.to_csv(output_path, index=False)

    print("--- RELIABILITY CLEANING REPORT ---")
    print(f"Initial Rows:     {initial_count:,}")
    print(f"Duplicates Removed: {initial_count - after_dup_count:,}")
    print(f"Final Clean Rows:   {len(df):,}")
    print(f"Numerical Columns Fixed: {numeric_cols}")

except Exception as e:
    print(f"‚ùå Error: {e}")

‚ùå Error: ['delivery_id', 'status']


In [6]:
import pandas as pd
import numpy as np

# 1. Retrieve the data
input_path = r"C:\Users\noahi\Downloads\customer_behavior.csv\customer_behavior.csv"
output_path = r"C:\Users\noahi\Downloads\delivery_reliability_PROFESSIONAL.csv"

try:
    df = pd.read_csv(input_path)
    print(f"Detected Columns: {df.columns.tolist()}")

    # 2. REMOVE DUPLICATES (The Step you requested)
    df = df.drop_duplicates()
    
    # 3. REMOVE BLANKS
    # Instead of guessing names, we drop rows that are completely empty
    # and then drop rows where the first column (usually the ID) is missing.
    df = df.dropna(how='all')
    df = df.dropna(subset=[df.columns[0]])

    # 4. REPLACE OUTLIERS WITH MEDIAN
    # This automatically finds every column with numbers
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    
    for col in numeric_cols:
        col_median = df[col].median()
        
        # Identify outliers using IQR
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        
        # Replace anything outside the lines or NaN with the median
        df.loc[(df[col] < lower) | (df[col] > upper) | (df[col].isna()), col] = col_median

    # 5. Save
    df.to_csv(output_path, index=False)
    print("\n‚úÖ CLEANING SUCCESSFUL!")
    print(f"Final Row Count: {len(df):,}")

except Exception as e:
    print(f"‚ùå Still getting an error: {e}")

Detected Columns: ['customer_id', 'timestamp', 'action_type', 'item_id', 'session_duration', 'device_type']

‚úÖ CLEANING SUCCESSFUL!
Final Row Count: 736,940


In [8]:
import os

# Paths to your specific files
raw_file = r"C:\Users\noahi\Downloads\customer_behavior.csv\customer_behavior.csv"
clean_file = r"C:\Users\noahi\Downloads\delivery_reliability_PROFESSIONAL.csv"

def get_stats(path):
    size_mb = os.path.getsize(path) / (1024 * 1024)
    # Quick count of lines without loading the whole thing into memory
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        row_count = sum(1 for line in f)
    return size_mb, row_count

# Calculate
raw_size, raw_rows = get_stats(raw_file)
clean_size, clean_rows = get_stats(clean_file)

print("--- RELIABILITY SIZE AUDIT ---")
print(f"{'Metric':<20} | {'Original':<15} | {'Cleaned':<15}")
print("-" * 55)
print(f"{'Size (MB)':<20} | {raw_size:>12.2f} MB | {clean_size:>12.2f} MB")
print(f"{'Total Rows':<20} | {raw_rows:>15,} | {clean_rows:>15,}")
print(f"{'Reduction (%)':<20} | {'-':>15} | {((raw_size-clean_size)/raw_size)*100:>14.1f}%")

--- RELIABILITY SIZE AUDIT ---
Metric               | Original        | Cleaned        
-------------------------------------------------------
Size (MB)            |        44.79 MB |        45.38 MB
Total Rows           |         737,905 |         736,941
Reduction (%)        |               - |           -1.3%


In [None]:
import os

# Paths to your specific files
raw_file = r"C:\Users\noahi\Downloads\customer_behavior.csv\customer_behavior.csv"
clean_file = r"C:\Users\noahi\Downloads\delivery_reliability_PROFESSIONAL.csv"

def get_stats(path):
    size_mb = os.path.getsize(path) / (1024 * 1024)
    # Quick count of lines without loading the whole thing into memory
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        row_count = sum(1 for line in f)
    return size_mb, row_count

# Calculate
raw_size, raw_rows = get_stats(raw_file)
clean_size, clean_rows = get_stats(clean_file)

print("--- RELIABILITY SIZE AUDIT ---")
print(f"{'Metric':<20} | {'Original':<15} | {'Cleaned':<15}")
print("-" * 55)
print(f"{'Size (MB)':<20} | {raw_size:>12.2f} MB | {clean_size:>12.2f} MB")
print(f"{'Total Rows':<20} | {raw_rows:>15,} | {clean_rows:>15,}")
print(f"{'Reduction (%)':<20} | {'-':>15} | {((raw_size-clean_size)/raw_size)*100:>14.1f}%")

In [9]:
import pandas as pd
import numpy as np
import os

# 1. Paths
input_path = r"C:\Users\noahi\Downloads\customer_behavior.csv\customer_behavior.csv"
output_path = r"C:\Users\noahi\Downloads\customer_behavior_PROFESSIONAL.csv"

# 2. Retrieve & Initial Stats
df = pd.read_csv(input_path)
raw_size = os.path.getsize(input_path) / (1024 * 1024)
raw_rows = len(df)

# 3. Clean Duplicates
df = df.drop_duplicates()

# 4. Remove Blanks
# Dropping rows where essential tracking info is missing
df = df.dropna(subset=['customer_id', 'timestamp', 'action_type'])

# 5. Median Replacement for Outliers (session_duration)
df['session_duration'] = pd.to_numeric(df['session_duration'], errors='coerce')
duration_median = df['session_duration'].median()

# Use IQR to define "Sane" session limits
Q1 = df['session_duration'].quantile(0.25)
Q3 = df['session_duration'].quantile(0.75)
IQR = Q3 - Q1
upper_limit = Q3 + 1.5 * IQR

# Surgical fix: Replace NaNs and Outliers with Median
df.loc[(df['session_duration'].isna()) | (df['session_duration'] > upper_limit), 'session_duration'] = duration_median

# 6. Save Professional Version
df.to_csv(output_path, index=False)
clean_size = os.path.getsize(output_path) / (1024 * 1024)
clean_rows = len(df)

# 7. FINAL AUDIT REPORT
print("--- CUSTOMER BEHAVIOR: SURGICAL AUDIT ---")
print(f"{'Metric':<25} | {'Original':<15} | {'Cleaned':<15}")
print("-" * 60)
print(f"{'File Size (MB)':<25} | {raw_size:>12.2f} MB | {clean_size:>12.2f} MB")
print(f"{'Total Row Count':<25} | {raw_rows:>15,} | {clean_rows:>15,}")
print(f"{'Median Duration':<25} | {'-':>15} | {duration_median:>12.2f} s")
print(f"{'Rows Removed':<25} | {'-':>15} | {raw_rows - clean_rows:>15,}")
print(f"{'Efficiency Gain':<25} | {'-':>15} | {((raw_size - clean_size)/raw_size)*100:>14.1f}%")

--- CUSTOMER BEHAVIOR: SURGICAL AUDIT ---
Metric                    | Original        | Cleaned        
------------------------------------------------------------
File Size (MB)            |        44.79 MB |        45.24 MB
Total Row Count           |         737,904 |         734,038
Median Duration           |               - |       150.39 s
Rows Removed              |               - |           3,866
Efficiency Gain           |               - |           -1.0%


In [10]:
import pandas as pd

path = r"C:\Users\noahi\Downloads\customer_behavior_PROFESSIONAL.csv"
df_preview = pd.read_csv(path, nrows=5)

print(f"--- Dataset: customer_behavior_PROFESSIONAL.csv ---")
print(f"Columns: {df_preview.columns.tolist()}")
print("\n--- Data Sample ---")
print(df_preview.head())

--- Dataset: customer_behavior_PROFESSIONAL.csv ---
Columns: ['customer_id', 'timestamp', 'action_type', 'item_id', 'session_duration', 'device_type']

--- Data Sample ---
   customer_id            timestamp action_type  item_id  session_duration  \
0        571.0  2023-04-08 21:00:00       order    210.0         23.146134   
1        407.0  2022-10-20 03:38:00     abandon    176.0        250.761297   
2       6270.0  2023-08-30 19:48:00        view    499.0        134.534415   
3       9844.0  2023-01-10 17:17:00     abandon     72.0        214.301357   
4       1597.0  2024-11-05 21:21:00     abandon    276.0        276.243561   

  device_type  
0         app  
1      mobile  
2     desktop  
3     desktop  
4         app  


In [11]:
import pandas as pd

path = r"C:\Users\noahi\Downloads\predictive_maintenance.csv\predictive_maintenance.csv"
df_preview = pd.read_csv(path, nrows=5)

print(f"--- Dataset: predictive_maintenance.csv ---")
print(f"Columns: {df_preview.columns.tolist()}")
print("\n--- Data Sample ---")
print(df_preview.head())

--- Dataset: predictive_maintenance.csv ---
Columns: ['robot_id', 'timestamp', 'battery_level', 'motor_temp', 'distance_traveled', 'error_codes', 'maintenance_flag']

--- Data Sample ---
   robot_id            timestamp  battery_level  motor_temp  \
0      18.0  2023-12-05 03:24:00      70.992485   39.208862   
1      80.0  2021-09-22 12:10:00      29.772286   62.367604   
2      17.0  2023-07-03 03:56:00      62.718273   37.202737   
3      17.0  2024-11-13 11:11:00      89.210984   54.570332   
4      18.0  2022-09-26 00:40:00      98.077244   61.612533   

   distance_traveled error_codes  maintenance_flag  
0           3.706236        none               0.0  
1           8.277852        none               0.0  
2           4.668508        E001               0.0  
3           0.590138        none               0.0  
4           7.707339        none               0.0  


In [12]:
import pandas as pd
import numpy as np
import os

# 1. Paths
input_path = r"C:\Users\noahi\Downloads\predictive_maintenance.csv\predictive_maintenance.csv"
output_path = r"C:\Users\noahi\Downloads\predictive_maintenance_PROFESSIONAL.csv"

# 2. Retrieve & Initial Stats
df = pd.read_csv(input_path)
raw_size = os.path.getsize(input_path) / (1024 * 1024)
raw_rows = len(df)

# 3. Clean Duplicates
df = df.drop_duplicates()

# 4. Remove Blanks
# We drop rows if the Machine ID or the Failure Label is missing
df = df.dropna(subset=[df.columns[0], df.columns[-1]])

# 5. Median Replacement for Outliers
# Identifying all sensor data (numeric columns)
numeric_cols = df.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    col_median = df[col].median()
    
    # IQR Method to find sensor glitches
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    
    # Swap outliers with median
    df.loc[(df[col] < lower_limit) | (df[col] > upper_limit) | (df[col].isna()), col] = col_median

# 6. Save Professional Version
df.to_csv(output_path, index=False)
clean_size = os.path.getsize(output_path) / (1024 * 1024)
clean_rows = len(df)

# 7. FINAL MAINTENANCE AUDIT
print("\n--- PREDICTIVE MAINTENANCE: SURGICAL AUDIT ---")
print(f"Original Rows:    {raw_rows:,}")
print(f"Cleaned Rows:     {clean_rows:,}")
print(f"Rows Removed:     {raw_rows - clean_rows:,}")
print(f"Final Size:       {clean_size:.2f} MB")


--- PREDICTIVE MAINTENANCE: SURGICAL AUDIT ---
Original Rows:    737,904
Cleaned Rows:     736,296
Rows Removed:     1,608
Final Size:       62.64 MB


In [13]:
import os

# Define the file paths
original_path = r"C:\Users\noahi\Downloads\predictive_maintenance.csv\predictive_maintenance.csv"
cleaned_path = r"C:\Users\noahi\Downloads\predictive_maintenance_PROFESSIONAL.csv"

def get_file_size_mb(path):
    """Returns file size in Megabytes."""
    return os.path.getsize(path) / (1024 * 1024)

# Calculate sizes
size_raw = get_file_size_mb(original_path)
size_clean = get_file_size_mb(cleaned_path)

# Print the comparison
print("--- DATA SIZE COMPARISON ---")
print(f"Original File: {size_raw:.2f} MB")
print(f"Cleaned File:  {size_clean:.2f} MB")
print(f"Difference:    {size_clean - size_raw:+.2f} MB")
print(f"Growth/Shrink: {((size_clean - size_raw) / size_raw) * 100:.1f}%")

--- DATA SIZE COMPARISON ---
Original File: 62.26 MB
Cleaned File:  62.64 MB
Difference:    +0.38 MB
Growth/Shrink: 0.6%


In [14]:
import os

# Looking for any file with 'order' in the name
download_path = r"C:\Users\noahi\Downloads"
all_files = os.listdir(download_path)
matching_files = [f for f in all_files if 'order' in f.lower()]

print("--- FILE SEARCH RESULTS ---")
if matching_files:
    for i, file in enumerate(matching_files):
        print(f"{i+1}. {file}")
else:
    print("No files containing 'order' were found. Please check the folder!")

--- FILE SEARCH RESULTS ---
1. personalized_orders.csv
2. personalized_orders.csv.zip


In [16]:
import pandas as pd

# Use the 'r' prefix to avoid that Unicode error
file_path = r"C:\Users\noahi\Downloads\personalized_orders.csv\personalized_orders.csv"

try:
    # Load only the header to peek at the columns
    df_preview = pd.read_csv(file_path, nrows=5)
    print("‚úÖ File loaded successfully!")
    print(f"--- Columns Found ---")
    print(df_preview.columns.tolist())
    print("\n--- Data Sample ---")
    print(df_preview.head(3))
except Exception as e:
    print(f"‚ùå Still having trouble: {e}")

‚úÖ File loaded successfully!
--- Columns Found ---
['customer_id', 'order_id', 'timestamp', 'item_id', 'quantity', 'category', 'price']

--- Data Sample ---
   customer_id  order_id            timestamp  item_id  quantity category  \
0       7362.0       1.0  2024-01-11 19:14:00    296.0       3.0     food   
1       7612.0       2.0  2020-04-11 05:45:00     56.0       4.0    snack   
2        393.0       3.0  2020-02-05 22:59:00     74.0       3.0    snack   

       price  
0   9.091472  
1  11.098171  
2   5.881802  


In [17]:
import pandas as pd
import numpy as np
import os

# 1. Setup Paths
input_path = r"C:\Users\noahi\Downloads\personalized_orders.csv\personalized_orders.csv"
output_path = r"C:\Users\noahi\Downloads\personalized_orders_PROFESSIONAL.csv"

try:
    df = pd.read_csv(input_path)
    raw_rows = len(df)

    # 2. DEDUPLICATION
    # We use order_id as the unique key to remove repeat entries
    df = df.drop_duplicates(subset=['order_id'])

    # 3. REMOVE BLANKS
    # Drop rows missing crucial ID or price info
    df = df.dropna(subset=['order_id', 'customer_id', 'price'])

    # 4. MEDIAN OUTLIER REPLACEMENT
    # Target columns: quantity and price
    cols_to_fix = ['quantity', 'price']
    
    for col in cols_to_fix:
        col_median = df[col].median()
        
        # Calculate IQR (Interquartile Range)
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Replace outliers with the median to keep the data realistic
        df.loc[(df[col] < lower_bound) | (df[col] > upper_bound), col] = col_median

    # 5. FINAL EXPORT
    df.to_csv(output_path, index=False)
    
    print("--- PERSONALIZED ORDERS: CLEANING AUDIT ---")
    print(f"Original Rows:    {raw_rows:,}")
    print(f"Cleaned Rows:     {len(df):,}")
    print(f"Glitches Removed: {raw_rows - len(df):,}")
    print(f"Status:           PROFESSIONAL GRADE ‚úÖ")

except Exception as e:
    print(f"‚ùå Error during cleanup: {e}")

--- PERSONALIZED ORDERS: CLEANING AUDIT ---
Original Rows:    737,904
Cleaned Rows:     734,166
Glitches Removed: 3,738
Status:           PROFESSIONAL GRADE ‚úÖ


In [18]:
import os

# Define the file paths
original_path = r"C:\Users\noahi\Downloads\personalized_orders.csv\personalized_orders.csv"
cleaned_path = r"C:\Users\noahi\Downloads\personalized_orders_PROFESSIONAL.csv"

def get_file_size_mb(path):
    """Returns file size in Megabytes."""
    return os.path.getsize(path) / (1024 * 1024)

# Calculate sizes
size_raw = get_file_size_mb(original_path)
size_clean = get_file_size_mb(cleaned_path)

# Print the comparison
print("--- DATA SIZE COMPARISON ---")
print(f"Original File: {size_raw:.2f} MB")
print(f"Cleaned File:  {size_clean:.2f} MB")
print(f"Difference:    {size_clean - size_raw:+.2f} MB")
print(f"Growth/Shrink: {((size_clean - size_raw) / size_raw) * 100:.1f}%")

--- DATA SIZE COMPARISON ---
Original File: 49.36 MB
Cleaned File:  49.77 MB
Difference:    +0.41 MB
Growth/Shrink: 0.8%
