# Data Wrangling - Invoice Data
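This notebook generates synthetic marketing invoice/budget data, then cleans it, aggregates it by medium and by month, visualizes budget versus actual spend, and saves the results to CSV.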

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from datetime import datetime, timedelta

# Set style
# NOTE(review): the 'seaborn-v0_8-*' style-sheet names are the ones shipped
# with Matplotlib 3.6+; older releases name this style 'seaborn-darkgrid'.
# Confirm the installed Matplotlib version if this line raises.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's 'pastel' palette the default color cycle for later plots.
# Presumably this must stay after the style call, since a style sheet can
# also define the color cycle -- verify before reordering.
sns.set_palette('pastel')

## Utility Functions

In [None]:
def missing_values(df, percentage):
    """
    Return a copy of ``df`` without its sparsely populated columns.

    A column is removed when the share of null cells in it, expressed as a
    percentage of the row count, is strictly greater than ``percentage``.
    The number of removed columns is printed; ``df`` itself is not modified.
    """
    # Percentage of null cells per column, in the same order as df.columns.
    null_share = df.isnull().sum() * 100 / len(df)

    too_sparse = [
        name
        for name, share in zip(df.columns, null_share)
        if share > percentage
    ]
    print(f"Dropping {len(too_sparse)} columns with >{percentage}% missing values")

    return df.drop(too_sparse, axis=1)


def convert_string_to_date(df_colname, format='%Y-%m-%d'):
    """
    Parse date strings into pandas datetimes.

    ``df_colname`` is handed to ``pd.to_datetime`` together with ``format``
    (a strftime-style pattern, ISO ``YYYY-MM-DD`` by default), and the
    parsed result is returned unchanged.
    """
    parsed = pd.to_datetime(df_colname, format=format)
    return parsed

## Generate Sample Invoice Data

In [None]:
# Build a reproducible synthetic invoice/budget data set
np.random.seed(42)

# One entry per month start in 2018
months = pd.date_range('2018-01-01', '2018-12-31', freq='MS')

# Categories the random invoices are drawn from
invoice_mediums = ['SEM', 'Social Media', 'Display Ads', 'Email Marketing', 
                   'Print Ads', 'Events', 'PR', 'Newsletter', 'Webinar']
vendors = ['Google', 'Facebook', 'LinkedIn', 'Twitter', 'Magazine Publisher', 
           'Event Organizer', 'PR Firm', 'Email Platform', 'Webinar Platform']


def _random_invoice(invoice_date):
    """Draw one random invoice record dated ``invoice_date``.

    The dict values are evaluated top to bottom, so the order of the
    ``np.random`` calls below determines the generated data; do not reorder.
    """
    return {
        'Invoice Date': invoice_date,
        'Invoice Medium': np.random.choice(invoice_mediums),
        'Vendor': np.random.choice(vendors),
        'Invoice Number': f'INV-{np.random.randint(10000, 99999)}',
        'Marketing Invoice': np.random.uniform(500, 15000),
        'Marketing Estimate': np.random.uniform(400, 14000),
        'Campaign Type': np.random.choice(['Brand Awareness', 'Lead Gen', 'Product Launch', 'Retention'])
    }


# Several invoices per month: the count is drawn first, then the records
invoice_data = []
for month in months:
    # randint's upper bound is exclusive, so this yields 15-25 invoices
    n_invoices = np.random.randint(15, 26)
    invoice_data.extend(_random_invoice(month) for _ in range(n_invoices))

invoice_df = pd.DataFrame(invoice_data)

# Variance (absolute and relative) is derived from the unrounded amounts
raw_variance = invoice_df['Marketing Invoice'] - invoice_df['Marketing Estimate']
invoice_df['Variance'] = raw_variance
invoice_df['Variance %'] = (raw_variance / invoice_df['Marketing Estimate'] * 100).round(2)

# Currency columns are rounded to cents only after the variance is computed
currency_columns = ['Marketing Invoice', 'Marketing Estimate', 'Variance']
invoice_df[currency_columns] = invoice_df[currency_columns].round(2)

earliest = invoice_df['Invoice Date'].min()
latest = invoice_df['Invoice Date'].max()
print(f"Generated {len(invoice_df)} invoice records")
print(f"Date range: {earliest} to {latest}")
invoice_df.head(10)

## Data Cleaning and Validation

In [None]:
# Report the number of nulls in every column
print("Missing values per column:")
print(invoice_df.isnull().sum())

# Keep only rows whose invoice AND estimate amounts are strictly positive
amount_columns = ['Marketing Invoice', 'Marketing Estimate']
has_positive_amounts = (invoice_df[amount_columns] > 0).all(axis=1)
invoice_valid_df = invoice_df.loc[has_positive_amounts].copy()

total_records = len(invoice_df)
valid_records = len(invoice_valid_df)
print(f"\nValid records: {valid_records} out of {total_records}")
print(f"Records removed: {total_records - valid_records}")

## Add Time-based Features

In [None]:
# Create month-based features from the invoice date
invoice_dates = invoice_valid_df['Invoice Date']
invoice_valid_df['Year'] = invoice_dates.dt.year
invoice_valid_df['Quarter'] = invoice_dates.dt.quarter
invoice_valid_df['Month_Name'] = invoice_dates.dt.strftime('%B')
# Truncate each date to the first day of its month. Copying the raw invoice
# date only yields monthly buckets when every date already is a month start;
# with mid-month dates a groupby on 'Month' would silently group by day.
# For month-start dates the result equals the original date.
invoice_valid_df['Month'] = invoice_dates.dt.to_period('M').dt.to_timestamp()

print("Date features added successfully")
invoice_valid_df[['Invoice Date', 'Year', 'Quarter', 'Month_Name']].head()

## Aggregate by Medium

In [None]:
# Totals and invoice counts per marketing medium
medium_totals = (
    invoice_valid_df
    .groupby('Invoice Medium')
    .agg({
        'Marketing Invoice': 'sum',
        'Marketing Estimate': 'sum',
        'Variance': 'sum',
        'Invoice Number': 'count',
    })
    .round(2)
    .rename(columns={'Invoice Number': 'Invoice Count'})
)

# Relative variance is computed from the already-rounded totals
medium_totals['Variance %'] = (
    medium_totals['Variance'] / medium_totals['Marketing Estimate'] * 100
).round(2)

# Largest spend first; the medium becomes a regular column again
invoice_by_medium_df = (
    medium_totals
    .sort_values('Marketing Invoice', ascending=False)
    .reset_index()
)

print("\nInvoice Summary by Medium:")
print(invoice_by_medium_df)

## Aggregate by Month

In [None]:
# Sum invoice, estimate and variance amounts for every 'Month' value
monthly_amount_columns = ['Marketing Invoice', 'Marketing Estimate', 'Variance']
invoice_by_month_df = (
    invoice_valid_df
    .groupby('Month')[monthly_amount_columns]
    .sum()
    .round(2)
    .reset_index()
)

print("\nInvoice Summary by Month:")
print(invoice_by_month_df)

## Visualizations

In [None]:
# Marketing Invoice by Medium: one bar per medium, in the row order of
# invoice_by_medium_df
fig, ax = plt.subplots(figsize=(12, 6))

# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the suggested replacement (hue=..., legend=False) is not
# accepted by older seaborn, so confirm the installed version before changing.
sns.barplot(
    data=invoice_by_medium_df,
    x='Invoice Medium',
    y='Marketing Invoice',
    ax=ax,
    palette='pastel'
)

# Rotate the existing tick labels in place. Re-assigning them through
# set_xticklabels(get_xticklabels()) is discouraged by Matplotlib unless the
# tick positions were fixed first, and can trigger a UserWarning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Show the y axis as whole dollars with thousands separators
ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
ax.set_title('Marketing Invoice by Medium', fontsize=16, fontweight='bold')
ax.set_xlabel('Marketing Medium', fontsize=12)
ax.set_ylabel('Total Invoice Amount ($)', fontsize=12)

plt.tight_layout()
plt.show()

print(f"\nTotal Marketing Invoice: ${invoice_by_medium_df['Marketing Invoice'].sum():,.2f}")

In [None]:
# Budget vs Actual by Month: grouped bars, estimate on the left and invoice
# on the right of every month position
fig, ax = plt.subplots(figsize=(14, 6))

width = 0.35
x = range(len(invoice_by_month_df))
half_width = width / 2
month_labels = [d.strftime('%b %Y') for d in invoice_by_month_df['Month']]

# Estimate bars are shifted half a bar width to the left of each tick
bars1 = ax.bar(
    [pos - half_width for pos in x],
    invoice_by_month_df['Marketing Estimate'],
    width,
    label='Budget (Estimate)',
    color='lightblue',
    alpha=0.8,
)

# Invoice bars are shifted half a bar width to the right of each tick
bars2 = ax.bar(
    [pos + half_width for pos in x],
    invoice_by_month_df['Marketing Invoice'],
    width,
    label='Actual (Invoice)',
    color='coral',
    alpha=0.8,
)

# Axis decoration: one tick per month, dollar-formatted y axis
ax.set_xticks(x)
ax.set_xticklabels(month_labels, rotation=45, ha='right')
ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Amount ($)', fontsize=12)
ax.set_title('Monthly Budget vs Actual Spending', fontsize=16, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

In [None]:
# Budget Variance Analysis: two horizontal bar panels sharing one layout,
# absolute variance on the left and relative variance on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (data column, panel title, x-axis label, tick format) for each panel
variance_panels = [
    ('Variance', 'Budget Variance by Medium', 'Variance ($)', '${x:,.0f}'),
    ('Variance %', 'Budget Variance % by Medium', 'Variance (%)', '{x:.1f}%'),
]

for panel_ax, (column, title, xlabel, tick_format) in zip(axes, variance_panels):
    values = invoice_by_medium_df[column]
    # Negative variance is drawn green, everything else red
    colors = ['green' if v < 0 else 'red' for v in values]
    panel_ax.barh(invoice_by_medium_df['Invoice Medium'], values, color=colors, alpha=0.7)
    # Thin reference line at zero separates the two sides
    panel_ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
    panel_ax.set_title(title, fontweight='bold', fontsize=12)
    panel_ax.set_xlabel(xlabel, fontsize=10)
    panel_ax.xaxis.set_major_formatter(mtick.StrMethodFormatter(tick_format))

plt.tight_layout()
plt.show()

print("\nGreen = Under Budget, Red = Over Budget")

In [None]:
# Spending by Campaign Type: total invoiced amount per type, largest first
campaign_spending = (
    invoice_valid_df
    .groupby('Campaign Type')[['Marketing Invoice']]
    .sum()
    .sort_values('Marketing Invoice', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))

# Pie chart with one wedge per campaign type and its percentage share
wedges, texts, autotexts = ax.pie(
    campaign_spending['Marketing Invoice'],
    labels=campaign_spending.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette('pastel'),
)

# Style all percentage labels at once: bold white text
plt.setp(autotexts, color='white', fontweight='bold')

ax.set_title('Marketing Spend by Campaign Type', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

## Save Processed Data

In [None]:
# Save to CSV: output file name -> frame written to it (without the index)
csv_outputs = {
    'invoice_data_processed.csv': invoice_valid_df,
    'invoice_by_medium.csv': invoice_by_medium_df,
    'invoice_by_month.csv': invoice_by_month_df,
}

for filename, frame in csv_outputs.items():
    frame.to_csv(filename, index=False)

# Report only after every file has been written
print("Data saved successfully!")
for filename, frame in csv_outputs.items():
    print(f"- {filename} ({len(frame)} records)")