In [None]:
# Meta advertising data 
# Anonymous organization's social media ad campaign
# Calculates CAC 
# https://www.kaggle.com/code/chrisbow/an-introduction-to-facebook-ad-analysis-using-r
# https://www.kaggle.com/datasets/loveall/clicks-conversion-tracking?resource=download 

# Features & Label
# Gender. Male or female. 
# Interest. A code specifying the category to which the person’s interest belongs (interests are as mentioned in the person’s Meta/Facebook public profile).
# Spent. Amount paid by company xyz to Meta/Facebook, to show that ad as CPM (cost per 1,000 impressions), CPC (cost per click), or CPA (cost per approved conversion).
# Impressions. The number of times the ad was shown.
# Clicks. Number of clicks on that ad.
# Total conversion. Total number of people who enquired about the product after seeing the ad.
# Approved conversion. Total number of people who bought the product after seeing the ad.

# Requirements
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, SGDRegressor, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [18]:
# Load the ad-campaign data; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="sales_conversion.csv")

In [19]:
# Customer acquisition cost:
#   CAC = Total Marketing & Sales Spend / Number of New Customers Acquired
# Spend is the sum of 'Spent'; new customers is the sum of 'Approved_Conversion'.
ad_spend = df['Spent'].sum()
new_customers = df['Approved_Conversion'].sum()
cac = ad_spend / new_customers

# Report the two inputs first, then the resulting ratio.
print("Ad spend:", f"${ad_spend:,.2f}")
print("New customers:", f"{new_customers:,}")
print("Customer acquisition costs (CAC):", f"${cac:,.2f}")


Ad spend: $58,705.23
New customers: 1,079
Customer acquisition costs (CAC): $54.41


In [20]:
# Lead metrics (leads are the sum of 'Total_Conversion'):
#   CPL = Total Marketing & Sales Spend / New Leads
#   LCR = New Customers / Total Leads, expressed as a percentage
# Reuses ad_spend and new_customers computed in the CAC cell.
leads = df['Total_Conversion'].sum()
cpl = ad_spend / leads
lcr = (new_customers / leads) * 100

print("Leads:", f"{leads:,}")
print("Cost per lead (CPL):", f"${cpl:,.2f}")
print("Lead conversion rate:", f"{lcr:,.2f}%")

Leads: 3,264
Cost per lead (CPL): $17.99
Lead conversion rate: 33.06%


In [21]:
# CPM = cost per 1,000 ad impressions, computed per ad and then averaged.
# NOTE(review): only ads with 0 < Spent < 10 are included — confirm this cutoff is intentional.
in_spend_window = (df['Spent'] > 0) & (df['Spent'] < 10)
windowed_ads = df[in_spend_window]

# Per-ad CPM; the printed figure is the unweighted mean across the selected ads.
cpm = windowed_ads['Spent'] / windowed_ads['Impressions'] * 1000
print("Average cost per 1,000 impressions (CPM):", f"${cpm.mean():.2f}")

Average cost per 1,000 impressions (CPM): $0.31


In [22]:
# CTR (%) = clicks / impressions x 100
#
# Meta ad CTRs range from 0.5% to 2%.
# A low CTR suggests:
# - The ads were not engaging (low-quality creative).
# - Targeting was too broad or mismatched.
# - Dataset might be synthetic or incorrect.
#
# The rate below is the aggregate over all ads (total clicks / total impressions),
# not a mean of per-ad rates.
clicks = df['Clicks'].sum()
print("Clicks:", f"{clicks:,}")
impressions = df['Impressions'].sum()
print("Impressions:", f"{impressions:,}")
ctr = (clicks / impressions) * 100
print("Click through rate (CTR):", f"{ctr:,.3f}%")

Clicks: 38,165
Impressions: 213,434,828
Click through rate (CTR): 0.018%


In [23]:
# List the available columns before choosing the features and the label.
print(df.columns)

Index(['ad_id', 'xyz_campaign_id', 'fb_campaign_id', 'age', 'gender',
       'interest', 'Impressions', 'Clicks', 'Spent', 'Total_Conversion',
       'Approved_Conversion'],
      dtype='object')


In [24]:
# Label: approved conversions per ad.
y = df['Approved_Conversion']

# Features: every remaining column except the identifiers, age and the label itself.
non_feature_columns = [
    'ad_id',
    'xyz_campaign_id',
    'fb_campaign_id',
    'age',
    'Approved_Conversion',
]
X = df.drop(columns=non_feature_columns)

In [25]:
# Column groups consumed by the preprocessing step.
# NOTE(review): 'interest' is described as a category code in the notebook header
# but is listed as a numeric column here — confirm this is intended.
categorical = [
    'gender',
]
numeric = [
    'interest',
    'Spent',
    'Impressions',
    'Clicks',
    'Total_Conversion',
]

In [52]:
# Preprocessing: one-hot encode the categorical columns and pass the numeric
# columns through unchanged.
# handle_unknown='ignore' lets transform accept categories that were not seen during fit.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
column_steps = [
    ('cat', one_hot_encoder, categorical),
    ('num', 'passthrough', numeric),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Data pipelines
#
# Each pipeline runs the column preprocessor, standardises the resulting feature
# matrix and then fits a linear model. Every pipeline receives its own clone of
# the preprocessor: Pipeline keeps its steps by reference and fits them in place,
# so sharing one instance would let fitting one pipeline silently refit the
# transformer held by the others.

# Linear Regression (baseline, no regularization)
mlr = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('scaler', StandardScaler()),          # Scale features
    ('regressor', LinearRegression())
])

# Ridge Regression
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0, random_state=42))  # L2 regularization
])

# SGD Regressor
sgd_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('scaler', StandardScaler()),
    ('sgd', SGDRegressor(
        loss="squared_error",    # linear regression
        penalty="l2",            # L2 regularization (like Ridge)
        alpha=0.0001,            # regularization strength
        max_iter=1000,
        tol=1e-3,
        random_state=42          # fixed seed so the stochastic fit is repeatable
    ))
])

In [54]:
# Train/test split: hold out 20% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

In [None]:
# Fit Models
# All three pipelines are trained on the same training split.
for pipeline_to_fit in (mlr, ridge_pipeline):
    pipeline_to_fit.fit(X_train, y_train)

# Kept as the cell's final expression so the notebook still displays the fitted pipeline.
sgd_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,1.0
,l1_ratio,0.5
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42


In [63]:
# Report the score of every fitted pipeline on the training and the test split.
# The labels describe the value as R²; it is scaled by 100 and printed as a percentage.
score_report = (
    ("Baseline Model Training Accuracy (R²): ", mlr, X_train, y_train),
    ("Baseline Model Test Accuracy (R²): ", mlr, X_test, y_test),
    ("Ridge Training Accuracy (R²): ", ridge_pipeline, X_train, y_train),
    ("Ridge Test Accuracy (R²): ", ridge_pipeline, X_test, y_test),
    ("SGD Training Accuracy (R²): ", sgd_pipeline, X_train, y_train),
    ("SGD Test Accuracy (R²): ", sgd_pipeline, X_test, y_test),
)

for label, pipeline, features, target in score_report:
    print(label, f"{pipeline.score(features, target)*100:,.2f}%")

Baseline Model Training Accuracy (R²):  74.90%
Baseline Model Test Accuracy (R²):  77.62%
Ridge Training Accuracy (R²):  74.90%
Ridge Test Accuracy (R²):  77.67%
SGD Training Accuracy (R²):  74.76%
SGD Test Accuracy (R²):  77.54%


In [57]:
# Coefficients of the fitted baseline linear regression (the 'regressor' step of mlr).
baseline_regressor = mlr.named_steps['regressor']
print(baseline_regressor.coef_)

[-0.02349597  0.02349597 -0.06446615 -0.41666354  0.51987794 -0.11907362
  1.45966086]


In [58]:
# Predict approved conversions (new customers) for one hypothetical ad using the
# baseline pipeline. The keys match the feature columns the pipeline was trained on.
scenario = {
    'gender': "M",
    'interest': 16,
    'Spent': 150,
    'Impressions': 1000000,
    'Clicks': 95,
    'Total_Conversion': 26,
}
conversion = pd.DataFrame([scenario])

# predict returns one value per input row; there is a single row here.
prediction = mlr.predict(conversion)[0]
print("Predicted new customers:", prediction)

Predicted new customers: 9.534925864736039


In [None]:
# Next steps: 
# Collect more data to obtain a more robust CTR estimate