In [3]:
# TASK 4.9 - PART 1: CUSTOMER DATA INTEGRATION

# Import libraries
import pandas as pd
import numpy as np
import os

# Set project path
# The project root can be overridden with the INSTACART_PROJECT_PATH
# environment variable so the notebook runs on other machines; when the
# variable is unset, the original local path is used unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    '/Users/josephadamski/Instacart Basket Analysis',
)

In [None]:
# LOAD DATA
# Resolve both input locations under <path>/Data first, then read them:
# the customer CSV from 'Original Data' and the orders/products pickle
# from 'Prepared Data'.
customers_csv = os.path.join(path, 'Data', 'Original Data', 'customers.csv')
orders_pickle = os.path.join(path, 'Data', 'Prepared Data', 'ords_prods_merge.pkl')

customers = pd.read_csv(customers_csv)
ords_prods_merge = pd.read_pickle(orders_pickle)

# Quick orientation on the customer frame: shape, column names, sample rows.
print("Customer data shape:", customers.shape)
print("\nCustomer columns:", customers.columns.tolist(), sep="\n")
print("\nFirst few rows:")
customers.head()

In [None]:
# CHECK FOR ISSUES
# Read-only diagnostics on `customers`: missing values per column, fully
# duplicated rows, and columns whose values span more than one Python type.
print("=== MISSING VALUES ===")
print(customers.isnull().sum())

print("\n=== DUPLICATES ===")
duplicates = customers[customers.duplicated()]
print(f"Number of duplicate rows: {len(duplicates)}")

print("\n=== CHECKING FOR MIXED TYPES ===")
for col in customers.columns:
    # A column is "mixed" when it holds more than one distinct Python type
    # (for example str values alongside float NaN). Counting distinct types
    # gives the same answer as comparing every row against row 0, but it
    # does not index row 0 (so an empty frame no longer raises IndexError)
    # and it relies on Series.map rather than the newer DataFrame.map.
    if customers[col].map(type).nunique() > 1:
        print(f"Mixed types found in column: {col}")

In [None]:
# FIX ISSUES
# Cleans the `customers` frame in three steps: fix the misspelled surname
# column, make 'First Name' uniformly string-typed, and label missing first
# names as 'Unknown'. Re-running the cell leaves the frame unchanged.
print("=== FIXING DATA QUALITY ISSUES ===\n")

print("1. Renaming 'Surnam' to 'Surname'")
customers = customers.rename(columns={'Surnam': 'Surname'})
print("   ✓ Column renamed")

print("\n2. Converting 'First Name' to string type")
# Record which names are genuinely missing BEFORE casting. Detecting them
# afterwards via the literal text 'nan' would also overwrite a real value
# spelled "nan", and would find nothing if the string cast keeps missing
# values as missing instead of turning them into the text 'nan'.
first_name_missing = customers['First Name'].isna()
customers['First Name'] = customers['First Name'].astype(str)
print("   ✓ Data type fixed")

print("\n3. Handling missing values in 'First Name'")
# Fill by the recorded mask, so only truly missing entries are relabelled.
customers.loc[first_name_missing, 'First Name'] = 'Unknown'
# Counted from the column (not the mask) so the figure is stable on re-run.
unknown_count = (customers['First Name'] == 'Unknown').sum()
print(f"   ✓ {unknown_count} missing values marked as 'Unknown'")

print("\n=== UPDATED DATA ===")
print("Columns:", customers.columns.tolist())
print(f"Shape: {customers.shape}")

In [None]:
# MERGE
# Report the join-key dtype on each side; the customers dtype shown here is
# the one in effect before the alignment step below.
orders_key_dtype = ords_prods_merge['user_id'].dtype
print("user_id dtype in ords_prods_merge:", orders_key_dtype)
print("user_id dtype in customers:", customers['user_id'].dtype)

# Cast the customers key to the orders key dtype when the two differ.
if orders_key_dtype != customers['user_id'].dtype:
    customers['user_id'] = customers['user_id'].astype(orders_key_dtype)

# Inner join on user_id: only rows whose user_id appears in both frames
# are kept in the result.
ords_prods_customers = pd.merge(
    ords_prods_merge,
    customers,
    on='user_id',
    how='inner',
)

# Shapes before and after, followed by a sample of the merged rows.
print(
    f"\nOriginal dataframe shape: {ords_prods_merge.shape}",
    f"Customer dataframe shape: {customers.shape}",
    f"Merged dataframe shape: {ords_prods_customers.shape}",
    sep="\n",
)
print("\nFirst few rows of merged data:")
ords_prods_customers.head()

In [None]:
#EXPORT
# Write the merged frame as a pickle into <path>/Data/Prepared Data.
prepared_dir = os.path.join(path, 'Data', 'Prepared Data')
export_path = os.path.join(prepared_dir, 'ords_prods_customers.pkl')
ords_prods_customers.to_pickle(export_path)

# Confirmation summary: file name, full location, and exported shape.
print("\n✓ Data exported successfully!")
print("File: ords_prods_customers.pkl")
print(f"Location: {export_path}")
print(f"Shape: {ords_prods_customers.shape}")

### Data Wrangling Summary

**Issues Addressed:**
1. Renamed 'Surnam' to 'Surname' for consistency
2. Fixed mixed data types in 'First Name' column
3. Replaced 11,259 missing 'First Name' values with 'Unknown'

**Rationale:** Preserving all customer records, rather than dropping the roughly 5.5% of customers with a missing first name, maintains analytical integrity while clearly flagging the incomplete records.