In [None]:
import pandas as pd
import numpy as np

marketing_data = pd.read_csv("Datasets/Marketing_campaign_dataset.csv")
print(f"Original Shape: {marketing_data.shape}")
print("Columns:", marketing_data.columns.tolist())

marketing_data['time'] = pd.to_datetime(marketing_data['time'])

#Since there is only one value to these columns they will be dropped
exchange_rate_unique = marketing_data["exchange_rate"].unique()
print("Unique values in exchange_rate:", exchange_rate_unique)
network_margin_unique = marketing_data["network_margin"].unique()
print("Unique values in network_margin:", network_margin_unique)
max_bid_unique = marketing_data["max_bid_cpm"].unique()
print("Unique values in max_bid_cpm:", max_bid_unique)

#Since these columns are empty they will be dropped
position_in_content_unique = marketing_data["position_in_content"].unique()
unique_reach_unique = marketing_data["unique_reach"].unique()
total_reach_unique = marketing_data["total_reach"].unique()

print("Unique values in position_in_content:", position_in_content_unique)
print("Unique values in unique_reach:", unique_reach_unique)
print("Unique values in total_reach:", total_reach_unique)

marketing_data.drop(columns=["ext_service_id", "creative_id", "creative_width", "creative_height", "search_tags",
                             "template_id", "landing_page", "advertiser_id", "advertiser_name", 
                             "network_id", "channel_id", "advertiser_currency", "channel_id", "max_bid_cpm", 
                             "network_margin", "stats_currency", "currency_code", "exchange_rate", "cmi_currency_code",
                             "position_in_content", "unique_reach", "total_reach"], inplace=True)

# Drop row duplicates (if any)
marketing_data = marketing_data.drop_duplicates()

print(f"New Shape: {marketing_data.shape}")
print("Columns:", marketing_data.columns.tolist())
marketing_data.head()
marketing_data.info()

Original Shape: (72612, 35)
Columns: ['campaign_item_id', 'no_of_days', 'time', 'ext_service_id', 'ext_service_name', 'creative_id', 'creative_width', 'creative_height', 'search_tags', 'template_id', 'landing_page', 'advertiser_id', 'advertiser_name', 'network_id', 'approved_budget', 'advertiser_currency', 'channel_id', 'channel_name', 'max_bid_cpm', 'network_margin', 'campaign_budget_usd', 'impressions', 'clicks', 'stats_currency', 'currency_code', 'exchange_rate', 'media_cost_usd', 'position_in_content', 'unique_reach', 'total_reach', 'search_tag_cat', 'cmi_currency_code', 'timezone', 'weekday_cat', 'keywords']
Unique values in exchange_rate: [1]
Unique values in network_margin: [0.]
Unique values in max_bid_cpm: [nan  1.]
Unique values in position_in_content: [nan]
Unique values in unique_reach: [nan]
Unique values in total_reach: [nan]
New Shape: (72612, 17)
Columns: ['campaign_item_id', 'no_of_days', 'time', 'ext_service_name', 'approved_budget', 'channel_name', 'campaign_budget_u

In [7]:
numeric_cols = marketing_data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = marketing_data.select_dtypes(include=['object']).columns

for col in marketing_data.columns:
    print(f"number of missing values in {col}: {marketing_data[col].isnull().sum()}")

    # We want to handle Ordinal features specifically later.
    # So we skip them in this general loop.
    if col in ['time', 'keywords']:
        continue
    
    # Check if column has missing values
    if marketing_data[col].isnull().sum() > 0:
        
        # --- STRATEGY A: DROP (For Target Variables) ---
        if col in ['campaign_budget_usd', 'approved_budget']:
            # WHY: These variables define the 'Fitness Function' (Profit/Loss).
            # If we impute (guess) these values, we might accidentally label a non-converting user as profitable (e.g., filling with average ROI).
            print(f"Dropping rows with missing {col} (Critical Ground Truth)...")
            marketing_data = marketing_data.dropna(subset=[col])
        
        # --- STRATEGY B: IMPUTE MEDIAN (For Numerical Features) ---
        elif col in numeric_cols:
            # WHY MEDIAN: Marketing data often has outliers (e.g., 'view_time' where a user leaves a tab open for hours). 
            # The Mean is sensitive to these outliers (pulling the average up to unrealistic levels).
            # The Median represents the 'typical' user behavior more accurately.
            median_val = marketing_data[col].median()
            marketing_data[col] = marketing_data[col].fillna(median_val)
            
        # --- STRATEGY C: IMPUTE MODE (For Categorical Features) ---
        elif col in categorical_cols:
            # WHY MODE: For categories like 'Location' or 'Device', we cannot calculate an average. 
            # The safest statistical assumption is the most Frequent Value (Mode), as it is the most probable category for an unknown user.
            mode_val = marketing_data[col].mode()[0]
            marketing_data[col] = marketing_data[col].fillna(mode_val)

 # WHY WE DON'T DROP ROWS:
    # 1. Statistical Power: Dropping rows reduces sample size. If multiple columns have small gaps, we could lose 30-50% of the data.
    # 2. Bias Prevention: If data is missing systematically (e.g., mobile  devices failing to log 'view_time'), dropping rows would bias the 
    #    model against that group (Mobile users).

print("Missing values handled.")

number of missing values in campaign_item_id: 0
number of missing values in no_of_days: 0
number of missing values in time: 0
number of missing values in ext_service_name: 0
number of missing values in approved_budget: 406
Dropping rows with missing approved_budget (Critical Ground Truth)...
number of missing values in channel_name: 0
number of missing values in campaign_budget_usd: 0
number of missing values in impressions: 0
number of missing values in clicks: 0
number of missing values in media_cost_usd: 0
number of missing values in position_in_content: 72206
number of missing values in unique_reach: 72206
number of missing values in total_reach: 72206
number of missing values in search_tag_cat: 0
number of missing values in timezone: 0
number of missing values in weekday_cat: 0
number of missing values in keywords: 0
Missing values handled.


In [4]:
# Extract Day of Week (0=Monday, 6=Sunday)
# WHY: Weekend behavior (gaming, shopping) differs from workday behavior (news, business).
marketing_data['day_of_week'] = marketing_data['time'].dt.dayofweek

# Create a 'Is_Weekend' Binary Feature
# WHY: This simplifies the signal for the model. 
marketing_data['is_weekend'] = marketing_data['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

# Drop original time-related columns that are no longer needed since we dont have an hour we cant use timezone and the weekday_cat can be seperated into day_of_week and is_weekend
marketing_data.drop(columns=["timezone", "weekday_cat"], inplace=True)

print("Time features extracted.")
marketing_data[['time', 'day_of_week', 'is_weekend']].head()
marketing_data.info()

Time features extracted.
<class 'pandas.core.frame.DataFrame'>
Index: 72206 entries, 0 to 72611
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   campaign_item_id     72206 non-null  int64         
 1   no_of_days           72206 non-null  int64         
 2   time                 72206 non-null  datetime64[ns]
 3   ext_service_name     72206 non-null  object        
 4   approved_budget      72206 non-null  float64       
 5   channel_name         72206 non-null  object        
 6   campaign_budget_usd  72206 non-null  float64       
 7   impressions          72206 non-null  int64         
 8   clicks               72206 non-null  int64         
 9   media_cost_usd       72206 non-null  float64       
 10  position_in_content  0 non-null      float64       
 11  unique_reach         0 non-null      float64       
 12  total_reach          0 non-null      float64       
 13  search_tag_

In [5]:
ext_service_name = marketing_data["ext_service_name"].unique()
channel_name = marketing_data["channel_name"].unique()
search_tag_cat = marketing_data["search_tag_cat"].unique()
keywords = marketing_data["keywords"].unique()

print("Unique Device Types:", ext_service_name.size)
print("Unique Channel Names:", channel_name.size)
print("Unique Search Tag Categories:", search_tag_cat.size)
print("Unique Keywords:", keywords.size)

Unique Device Types: 3
Unique Channel Names: 5
Unique Search Tag Categories: 5
Unique Keywords: 118
