## CONTENTS

1. Import Libraries and Data
2. Data Security (PII Removal)
3. Regional Segmentation
4. Low-Activity Exclusion
5. Customer Profiling
6. Visualizations
7. Aggregations
8. Regional Analysis
9. Export

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Show every column, and up to 100 rows, whenever a frame is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Read the merged dataset produced in Exercise 4.9
df = pd.read_pickle('../Data/Prepared Data/ords_prods_customers.pkl')

# Quick orientation: size, column names, first rows
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

In [None]:
# Check for PII columns
print("Checking for PII data...")
print(f"\nColumns in dataset: {df.columns.tolist()}")

# Column names treated as direct PII. Names are compared after trimming and
# lower-casing so that capitalisation differences do not let a column through.
# 'surnam' is listed next to 'surname' because this step's own notes refer to
# the column by that truncated spelling; an exact match on 'Surname' alone
# would leave such a column in the data.
pii_names = {'first name', 'surname', 'surnam', 'email', 'phone'}
pii_columns = [col for col in df.columns if str(col).strip().lower() in pii_names]

if pii_columns:
    print(f"\nPII columns found: {pii_columns}")
    print("Dropping PII columns for security...")
    df = df.drop(columns=pii_columns)
    print(f"Dataset shape after removing PII: {df.shape}")
else:
    print("\nNo direct PII columns found.")

In [None]:
# Region -> member states. Kept as one table so the reverse lookup below is
# built a single time instead of on every call.
_REGION_STATES = {
    'Northeast': ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire',
                  'Rhode Island', 'Vermont', 'New Jersey', 'New York', 'Pennsylvania'],
    'Midwest': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin',
                'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska',
                'North Dakota', 'South Dakota'],
    'South': ['Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina',
              'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia',
              'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas',
              'Louisiana', 'Oklahoma', 'Texas'],
    'West': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico',
             'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington'],
}

# State -> region, so each call is a single dictionary access.
_STATE_TO_REGION = {
    state: region
    for region, states in _REGION_STATES.items()
    for state in states
}


def assign_region(state):
    """Assign US region based on state.

    Parameters
    ----------
    state : object
        Full state name, matched exactly (case-sensitive, no trimming).

    Returns
    -------
    str
        'Northeast', 'Midwest', 'South' or 'West'; 'Unknown' for anything
        that is not one of the listed state names (including missing values).
    """
    try:
        return _STATE_TO_REGION.get(state, 'Unknown')
    except TypeError:
        # Unhashable values (e.g. a list) cannot be a state name.
        return 'Unknown'

# Derive a region label for every row from its state name
df['region'] = df['STATE'].apply(assign_region)

# How the rows spread across regions, and how many states were not matched
region_counts = df['region'].value_counts()
print("\nRegion distribution:")
print(region_counts)
print(f"\nUnknown regions: {(df['region'] == 'Unknown').sum()}")

# Share of each spending flag within a region (rows sum to 100);
# only possible when the spending_flag column from 4.9 is present
if 'spending_flag' in df.columns:
    print("\nSpending habits by region:")
    region_spending = pd.crosstab(
        index=df['region'],
        columns=df['spending_flag'],
        normalize='index',
    ).mul(100)
    print(region_spending.round(2))


In [None]:
# Distinct order count for each customer
orders_per_customer = df.groupby('user_id')['order_id'].nunique()
print(f"\nOrders per customer statistics:")
print(orders_per_customer.describe())

# Flag every row whose customer placed fewer than 5 distinct orders
fewer_than_five = orders_per_customer.lt(5)
df['low_activity'] = df['user_id'].map(fewer_than_five)

print(f"\nLow-activity customers: {df['low_activity'].sum():,} records")
print(f"Percentage: {(df['low_activity'].sum() / len(df) * 100):.2f}%")

# Keep only rows explicitly flagged False; copy so later column
# assignments do not write into a view of df
keep_rows = df['low_activity'].eq(False)
df_active = df.loc[keep_rows].copy()
print(f"\nDataset shape after excluding low-activity customers: {df_active.shape}")


In [None]:
def create_customer_profile(row):
    """
    Label a customer by life stage, using:
    - Age ('Age'): under 30 is young, 30-49 is middle, 50 and over is senior
    - Dependents ('n_dependants'): none versus one or more

    Income is not part of the label.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Age' and 'n_dependants'.

    Returns
    -------
    str
        'Young Single Adult', 'Young Parent', 'Middle-Aged Single',
        'Established Family', 'Senior Single', 'Senior with Dependents',
        or 'Other' when age or dependents is missing or dependents is negative.
    """
    age = row['Age']
    dependents = row['n_dependants']

    # A missing value compares False against every threshold, which would
    # otherwise drop an unknown age straight into the senior band.
    if pd.isna(age) or pd.isna(dependents):
        return 'Other'

    # Pick the pair of labels for the age band first...
    if age < 30:
        without_dependents, with_dependents = 'Young Single Adult', 'Young Parent'
    elif age < 50:
        without_dependents, with_dependents = 'Middle-Aged Single', 'Established Family'
    else:
        without_dependents, with_dependents = 'Senior Single', 'Senior with Dependents'

    # ...then choose within the pair by whether there are dependents.
    if dependents == 0:
        return without_dependents
    if dependents > 0:
        return with_dependents
    return 'Other'

# Label each row of the active-customer frame (row-wise apply)
df_active['customer_profile'] = df_active.apply(create_customer_profile, axis='columns')

# Absolute counts per profile
print("\nCustomer profile distribution:")
print(df_active['customer_profile'].value_counts())

# Same breakdown as percentages of all rows
print(f"\nPercentage distribution:")
print((df_active['customer_profile'].value_counts(normalize=True) * 100).round(2))

In [None]:
# Rows per profile, largest first (value_counts ordering)
profile_counts = df_active['customer_profile'].value_counts()
bar_positions = range(len(profile_counts))

# One bar per profile, labelled with the profile name
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, profile_counts.values, color='steelblue')
plt.xticks(bar_positions, profile_counts.index, rotation=45, ha='right')
plt.title('Distribution of Customer Profiles')
plt.xlabel('Customer Profile')
plt.ylabel('Number of Records')
plt.tight_layout()

# Save before show() so the written file contains the drawn figure
plt.savefig(
    '../Analysis/Visualizations/customer_profile_distribution.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# One row per customer: distinct orders placed and summed prices
customer_metrics = (
    df_active.groupby('user_id')
    .agg(
        order_frequency=('order_id', 'nunique'),
        total_expenditure=('prices', 'sum'),
    )
    .reset_index()
)

# Attach each customer's profile label to their metrics
customer_profiles = df_active[['user_id', 'customer_profile']].drop_duplicates()
customer_metrics = pd.merge(customer_metrics, customer_profiles, on='user_id')

# Min / mean / max of both metrics within each profile
summary_stats = ['min', 'mean', 'max']
profile_aggregations = (
    customer_metrics.groupby('customer_profile')
    .agg({'order_frequency': summary_stats, 'total_expenditure': summary_stats})
    .round(2)
)

print("\nProfile-level aggregations:")
print(profile_aggregations)

# Write the table out for the report
profile_aggregations.to_csv('../Analysis/Reports/profile_aggregations.csv')

In [None]:
# Bring in department names so later output is readable.
# NOTE(review): assumes the CSV has a 'department_id' key and a 'department'
# column that df_active does not already carry - confirm against the file.
depts = pd.read_csv('../Data/Prepared Data/departments_wrangled.csv')
df_active = pd.merge(df_active, depts, how='left', on='department_id')

# Share of each profile within a region (each region column sums to 100)
print("\nCustomer profiles by region:")
profile_region = pd.crosstab(
    index=df_active['customer_profile'],
    columns=df_active['region'],
    normalize='columns',
).mul(100)
print(profile_region.round(2))

# Visualize profile distribution by region.
# The Axes is created explicitly and handed to DataFrame.plot: without `ax`,
# pandas opens a figure of its own, which would leave any figure created
# beforehand empty.
fig, ax = plt.subplots(figsize=(14, 8))
profile_region.T.plot(kind='bar', stacked=False, ax=ax)
ax.set_xlabel('Region')
ax.set_ylabel('Percentage (%)')
ax.set_title('Customer Profile Distribution by Region')
# Legend sits outside the plot area so it does not cover the bars
ax.legend(title='Customer Profile', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig('../Analysis/Visualizations/profiles_by_region.png', dpi=300, bbox_inches='tight')
plt.show()

# Five most frequent departments within each profile, by row count
print("\nTop 5 departments by customer profile:")
for profile in df_active['customer_profile'].unique():
    print(f"\n{profile}:")
    in_profile = df_active['customer_profile'] == profile
    profile_depts = df_active.loc[in_profile, 'department'].value_counts().head(5)
    print(profile_depts)

# Same idea as a single table: row count per (profile, department) pair...
profile_dept_summary = (
    df_active.groupby(['customer_profile', 'department'])
    .size()
    .reset_index(name='count')
)

# ...ordered by profile, then by descending count, keeping the first three per profile
ranked_depts = profile_dept_summary.sort_values(
    ['customer_profile', 'count'],
    ascending=[True, False],
)
top_depts_by_profile = ranked_depts.groupby('customer_profile').head(3)

print("\nTop 3 departments per profile:")
print(top_depts_by_profile)

# Visualize spending by profile and region.
# NOTE: `prices` is averaged over rows, not over orders, so the y-axis is
# labelled as an average price per row item rather than an order value.
spending_profile_region = (
    df_active.groupby(['customer_profile', 'region'])['prices']
    .mean()
    .reset_index()
)

plt.figure(figsize=(14, 8))
sns.barplot(data=spending_profile_region, x='customer_profile', y='prices', hue='region')
plt.xlabel('Customer Profile')
plt.ylabel('Average Item Price ($)')
plt.title('Average Spending by Customer Profile and Region')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Region')
plt.tight_layout()
plt.savefig('../Analysis/Visualizations/spending_by_profile_region.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Persist the full active-customer frame
df_active.to_pickle('../Data/Prepared Data/ords_prods_customers_final.pkl')
print(f"\nFinal dataset exported: {df_active.shape}")

# Write a reproducible random sample (at most 10,000 rows, fixed seed)
# of the reporting columns as CSV for easy viewing
report_columns = [
    'user_id', 'order_id', 'product_name', 'department',
    'region', 'customer_profile', 'Age', 'income',
    'n_dependants', 'prices',
]
sample_size = min(10000, len(df_active))
sample_for_report = df_active[report_columns].sample(n=sample_size, random_state=42)
sample_for_report.to_csv('../Data/Prepared Data/final_sample_for_report.csv', index=False)
print("Sample dataset exported for reporting")

# Closing summary
banner = "="*80
print("\n" + banner)
print("TASK 4.10 PART 1 COMPLETE")
print(banner)
print(f"\nFinal dataset shape: {df_active.shape}")
print(f"Customer profiles created: {df_active['customer_profile'].nunique()}")
print(f"Regions analyzed: {df_active['region'].nunique()}")
print("\nNext steps: Complete Part 2 (Final Report Template)")