# Data Wrangling - Ads Data
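This notebook generates a synthetic advertising dataset (spend, impressions, clicks, and conversions), cleans it, aggregates it by ad medium and by month, visualizes the results, and saves the processed tables to CSV.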

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from datetime import datetime, timedelta

# Set style
# Notebook-wide plot styling: apply a matplotlib style sheet, then make
# seaborn's 'pastel' palette the default color cycle.
# NOTE(review): the 'seaborn-v0_8-*' style names are only shipped with
# matplotlib >= 3.6 — confirm the environment's matplotlib version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('pastel')

## Utility Functions

In [None]:
def missing_values(df, percentage):
    """
    Remove every column whose share of null entries exceeds a threshold.

    The number of columns being removed is reported on stdout.

    Parameters:
    -----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified
    percentage : float
        Cut-off on the 0-100 scale; a column is removed only when its
        null share is strictly greater than this value

    Returns:
    --------
    pandas.DataFrame
        New frame without the columns that exceeded the cut-off
    """
    # Null share per column, expressed on the 0-100 scale
    null_share = df.isnull().sum() * 100 / len(df)

    # Labels of the columns that sit strictly above the cut-off
    over_threshold = null_share[null_share > percentage].index.tolist()
    print(f"Dropping {len(over_threshold)} columns with >{percentage}% missing values")

    return df.drop(columns=over_threshold)


def convert_string_to_date(df_colname, format='%Y-%m-%d'):
    """
    Parse a column of date strings into datetimes.

    Parameters:
    -----------
    df_colname : pandas.Series
        Values to parse
    format : str
        strftime-style pattern the values are parsed with
        (defaults to '%Y-%m-%d')

    Returns:
    --------
    pandas.Series
        The parsed datetime values
    """
    parsed = pd.to_datetime(df_colname, format=format)
    return parsed

## Generate Sample Ads Data

In [None]:
# Generate sample advertising data
# Seed the legacy global NumPy generator: every np.random call below draws from
# this one stream, so the order of the calls determines the generated values.
np.random.seed(42)

# Define date range
start_date = datetime(2018, 1, 1)
end_date = datetime(2018, 12, 31)
date_range = pd.date_range(start_date, end_date, freq='D')

# Define ad mediums and vendors
# NOTE(review): the two lists look position-aligned (e.g. 'Google Ads' / 'Google'),
# but 'Vendor' is sampled independently of 'Ads Medium' below — confirm whether
# a fixed medium -> vendor mapping was intended.
ad_mediums = ['Google Ads', 'Facebook', 'LinkedIn', 'Twitter', 'Instagram', 
              'Display Network', 'Retargeting', 'Newsletter']
vendors = ['Google', 'Facebook', 'LinkedIn', 'Twitter', 'Instagram', 
           'eEffective', 'Netline', 'Various']

# Create sample data
n_records = 500
ads_data = {
    'Date': np.random.choice(date_range, n_records),
    'Ads Medium': np.random.choice(ad_mediums, n_records),
    'Vendor': np.random.choice(vendors, n_records),
    'Campaign Name': [f'Campaign_{i}' for i in range(n_records)],
    'Ads Spend': np.random.uniform(100, 5000, n_records).round(2),
    'Impressions': np.random.randint(1000, 100000, n_records),
    'Clicks': np.random.randint(10, 5000, n_records),
    'Add To Cart': np.random.randint(0, 50, n_records),
    'Lead Submission': np.random.randint(0, 100, n_records),
    'Whitepaper Download': np.random.randint(0, 30, n_records)
}

ads_df = pd.DataFrame(ads_data)

# Clicks (10..4999) and impressions (1000..99999) are drawn independently, so a
# row can end up with more clicks than impressions and a CTR above 100%.
# Cap clicks at the row's impressions; this is done after all random draws so
# every other generated value is unaffected.
ads_df['Clicks'] = np.minimum(ads_df['Clicks'], ads_df['Impressions'])

# Calculate derived metrics
# CTR and Conversion Rate are percentages; CPC is spend per click.
ads_df['CTR'] = (ads_df['Clicks'] / ads_df['Impressions'] * 100).round(2)
ads_df['CPC'] = (ads_df['Ads Spend'] / ads_df['Clicks']).round(2)
ads_df['Total Conversions'] = (ads_df['Add To Cart'] + 
                                ads_df['Lead Submission'] + 
                                ads_df['Whitepaper Download'])
# NOTE: conversions are drawn independently of clicks (up to 177 conversions
# against as few as 10 clicks), so Conversion Rate can exceed 100% on some rows.
ads_df['Conversion Rate'] = (ads_df['Total Conversions'] / ads_df['Clicks'] * 100).round(2)

print(f"Generated {len(ads_df)} ad records")
print(f"Date range: {ads_df['Date'].min()} to {ads_df['Date'].max()}")
ads_df.head()

## Data Cleaning and Validation

In [None]:
# Report how many nulls each column holds
print("Missing values per column:")
null_counts = ads_df.isnull().sum()
print(null_counts)

# Discard any column that is more than 50% null
ads_df = missing_values(ads_df, 50)

# A record is kept only when spend, impressions and clicks are all strictly positive
has_spend = ads_df['Ads Spend'].gt(0)
has_impressions = ads_df['Impressions'].gt(0)
has_clicks = ads_df['Clicks'].gt(0)
ads_valid_df = ads_df.loc[has_spend & has_impressions & has_clicks].copy()

n_total = len(ads_df)
n_valid = len(ads_valid_df)
print(f"\nValid records: {n_valid} out of {n_total}")
print(f"Records removed: {n_total - n_valid}")

## Add Time-based Features

In [None]:
# Derive calendar features from the 'Date' column
record_dates = ads_valid_df['Date'].dt

# 'Month' is the timestamp of the monthly period each date falls in
ads_valid_df['Month'] = record_dates.to_period('M').dt.to_timestamp()
ads_valid_df['Year'] = record_dates.year
ads_valid_df['Quarter'] = record_dates.quarter
ads_valid_df['Month_Name'] = record_dates.strftime('%B')

print("Date features added successfully")
date_feature_columns = ['Date', 'Month', 'Year', 'Quarter', 'Month_Name']
ads_valid_df.loc[:, date_feature_columns].head()

## Aggregate by Medium

In [None]:
# Per-medium totals plus a campaign count, ordered from highest to lowest spend
adspend_by_medium_df = (
    ads_valid_df
    .groupby('Ads Medium')
    .agg({
        'Ads Spend': 'sum',
        'Impressions': 'sum',
        'Clicks': 'sum',
        'Total Conversions': 'sum',
        'Campaign Name': 'count',
    })
    .round(2)
    .rename(columns={'Campaign Name': 'Campaign Count'})
    .sort_values('Ads Spend', ascending=False)
    .reset_index()
)

print("\nAd Spend by Medium:")
print(adspend_by_medium_df)

## Aggregate by Month

In [None]:
# Monthly totals for spend, reach and each conversion type
monthly_sum_columns = [
    'Ads Spend',
    'Impressions',
    'Clicks',
    'Add To Cart',
    'Lead Submission',
    'Whitepaper Download',
    'Total Conversions',
]

ads_by_month_df = (
    ads_valid_df
    .groupby('Month')[monthly_sum_columns]
    .sum()
    .round(2)
    .reset_index()
)

print("\nAd Performance by Month:")
print(ads_by_month_df)

## Visualizations

In [None]:
# Ad Spend by Medium
# Bar chart of total spend per advertising medium, one pastel color per bar.
fig, ax = plt.subplots(figsize=(12, 6))

# The medium is passed as `hue` as well as `x`: seaborn 0.13 deprecates
# `palette` without `hue`. `dodge=False` keeps one full-width bar per medium.
sns.barplot(
    data=adspend_by_medium_df,
    x='Ads Medium',
    y='Ads Spend',
    hue='Ads Medium',
    dodge=False,
    palette='pastel',
    ax=ax
)

# The hue only repeats the x axis, so a legend adds nothing. Some seaborn
# versions draw one anyway; remove it if present.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

# Restyle the existing tick labels in place instead of re-setting them with
# set_xticklabels, which can emit matplotlib's fixed-locator warning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
ax.set_title('Ad Spend by Medium', fontsize=16, fontweight='bold')
ax.set_xlabel('Advertising Medium', fontsize=12)
ax.set_ylabel('Total Spend ($)', fontsize=12)

plt.tight_layout()
plt.show()

print(f"\nTotal Ad Spend: ${adspend_by_medium_df['Ads Spend'].sum():,.2f}")

In [None]:
# Monthly spend trend: a marked line with a translucent fill beneath it
months = ads_by_month_df['Month']
monthly_spend = ads_by_month_df['Ads Spend']
trend_color = 'steelblue'

fig, ax = plt.subplots(figsize=(14, 6))

ax.plot(months, monthly_spend, marker='o', linewidth=2, markersize=8, color=trend_color)
ax.fill_between(months, monthly_spend, alpha=0.3, color=trend_color)

# Dollar-formatted y axis and descriptive labels
ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
ax.set_title('Monthly Ad Spend Trend', fontsize=16, fontweight='bold')
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Ad Spend ($)', fontsize=12)
ax.grid(True, alpha=0.3)

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Stacked bars: conversions per medium, split by conversion type
conversion_columns = ['Add To Cart', 'Lead Submission', 'Whitepaper Download']
conversions_by_medium = ads_valid_df.groupby('Ads Medium')[conversion_columns].sum()

fig, ax = plt.subplots(figsize=(12, 6))

# One color per conversion type, in the same order as conversion_columns
conversions_by_medium.plot.bar(
    stacked=True,
    ax=ax,
    color=['#FF6B6B', '#4ECDC4', '#45B7D1'],
)

ax.set_title('Conversions by Medium and Type', fontsize=16, fontweight='bold')
ax.set_xlabel('Advertising Medium', fontsize=12)
ax.set_ylabel('Number of Conversions', fontsize=12)
# Anchor the legend just outside the right edge of the axes
ax.legend(title='Conversion Type', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Performance metrics comparison: a 2x2 grid of horizontal bar charts
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One grouping shared by all four per-medium summaries
by_medium = ads_valid_df.groupby('Ads Medium')

ctr_by_medium = by_medium['CTR'].mean().sort_values(ascending=False)
cpc_by_medium = by_medium['CPC'].mean().sort_values(ascending=True)
conv_rate_by_medium = by_medium['Conversion Rate'].mean().sort_values(ascending=False)
total_conv_by_medium = by_medium['Total Conversions'].sum().sort_values(ascending=False)

# (axes, data, bar color, title, x-axis label) for each panel
panels = [
    (axes[0, 0], ctr_by_medium, 'coral', 'Average CTR by Medium', 'CTR (%)'),
    (axes[0, 1], cpc_by_medium, 'lightgreen', 'Average CPC by Medium', 'CPC ($)'),
    (axes[1, 0], conv_rate_by_medium, 'skyblue', 'Average Conversion Rate by Medium', 'Conversion Rate (%)'),
    (axes[1, 1], total_conv_by_medium, 'plum', 'Total Conversions by Medium', 'Number of Conversions'),
]

for panel_ax, metric, bar_color, panel_title, x_label in panels:
    panel_ax.barh(metric.index, metric.values, color=bar_color)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_xlabel(x_label)

# Only the CPC panel shows dollar amounts on its x axis
axes[0, 1].xaxis.set_major_formatter(mtick.StrMethodFormatter('${x:.2f}'))

plt.tight_layout()
plt.show()

## Save Processed Data

In [None]:
# Write each processed table to its own CSV file (row index omitted)
csv_outputs = {
    'ads_data_processed.csv': ads_valid_df,
    'ads_by_medium.csv': adspend_by_medium_df,
    'ads_by_month.csv': ads_by_month_df,
}

for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)

# Summarize what was written and how many rows each file holds
print("Data saved successfully!")
for csv_name, frame in csv_outputs.items():
    print(f"- {csv_name} ({len(frame)} records)")