In [1]:
import pandas as pd

# Read the cleaned ads CSV into `df` and preview the first rows
cleaned_csv_path = "../data/facebook_ads_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)
df.head()

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion,CTR
0,708746,17/08/2017,17/08/2017,916,103916,30-34,M,15,17,17,7350.0,1,1.43,2.0,1.0,0.000136
1,708749,17/08/2017,17/08/2017,916,103917,30-34,M,16,19,21,17861.0,2,1.82,2.0,0.0,0.000112
2,708771,17/08/2017,17/08/2017,916,103920,30-34,M,20,25,22,693.0,0,0.0,1.0,0.0,0.0
3,708815,30/08/2017,30/08/2017,916,103928,30-34,M,28,32,32,4259.0,1,1.25,1.0,0.0,0.000235
4,708818,17/08/2017,17/08/2017,916,103928,30-34,M,28,33,32,4133.0,1,1.29,1.0,1.0,0.000242


In [2]:
# Parse the report start date into datetime64.
# The raw values are day-first strings (e.g. "17/08/2017"), so the format is
# stated explicitly. Letting pandas infer it emits a parsing warning and,
# depending on the pandas version and on which value is inspected first, can
# swap day and month or coerce valid dates to NaT.
df['reporting_start'] = pd.to_datetime(
    df['reporting_start'],
    format='%d/%m/%Y',
    errors='coerce',  # values that do not match the format become NaT
)

# Drop rows whose start date could not be parsed (NaT after coercion).
# NOTE(review): `reporting_end` is left as the raw string — parse it the same
# way if a later step needs it as a date.
df = df.dropna(subset=['reporting_start'])

  df['reporting_start'] = pd.to_datetime(df['reporting_start'], errors='coerce')


In [3]:
# Derive calendar features from the parsed start date.
start_dates = df['reporting_start']
weekend_days = ['Saturday', 'Sunday']

df['day_of_week'] = start_dates.dt.day_name()
# NOTE(review): the source dates carry no time component, so `hour` is 0 for
# every row shown below — confirm whether this feature is worth keeping.
df['hour'] = start_dates.dt.hour
# 1 when the start date falls on a Saturday or Sunday, otherwise 0
df['is_weekend'] = df['day_of_week'].isin(weekend_days).astype(int)

# Preview the new columns next to the date they were derived from
df[['reporting_start', 'day_of_week', 'hour', 'is_weekend']].head()

Unnamed: 0,reporting_start,day_of_week,hour,is_weekend
0,2017-08-17,Thursday,0,0
1,2017-08-17,Thursday,0,0
2,2017-08-17,Thursday,0,0
3,2017-08-30,Wednesday,0,0
4,2017-08-17,Thursday,0,0


### Normalize Numeric Features

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale; each one ends up with min 0 and max 1
cols_to_scale = ['spent', 'impressions', 'clicks', 'total_conversion', 'approved_conversion']

# Fit the scaler on the unscaled frame and compute the rescaled values.
# NOTE(review): the summary below reports fewer non-null values for the two
# conversion columns than for the others, so missing values appear to pass
# through the scaler unchanged — confirm that this is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[cols_to_scale])

# Write the scaled values into a copy so that `df` keeps the raw numbers
df_scaled = df.copy()
df_scaled[cols_to_scale] = scaled_values

# Summary statistics of the scaled columns
df_scaled[cols_to_scale].describe()

Unnamed: 0,spent,impressions,clicks,total_conversion,approved_conversion
count,1139.0,1139.0,1139.0,761.0,761.0
mean,0.027595,0.022597,0.034315,0.036027,0.036606
std,0.075775,0.067833,0.080555,0.067703,0.078878
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,4.9e-05,0.002941,0.016667,0.0
50%,0.002406,0.001044,0.005882,0.016667,0.0
75%,0.013548,0.009233,0.025,0.033333,0.047619
max,1.0,1.0,1.0,1.0,1.0


In [5]:
# Persist the scaled frame as CSV, leaving out the index column
scaled_csv_path = "../data/facebook_ads_features_step2_scaled.csv"
df_scaled.to_csv(scaled_csv_path, index=False)
print("✅ Scaled features saved to: facebook_ads_features_step2_scaled.csv")

✅ Scaled features saved to: facebook_ads_features_step2_scaled.csv


### One-hot Encoding

In [6]:
# Categorical features that are replaced by indicator (dummy) columns
categorical_cols = ['gender', 'age', 'day_of_week', 'hour']

# drop_first=True keeps k-1 indicator columns for a feature with k levels.
# NOTE(review): `hour` has a single level in the preview of the date features,
# so it contributes no indicator column at all here.
# NOTE(review): the encoded frame contains columns such as `age_64` and
# `age_7`, which do not look like age brackets such as `30-34` — verify the
# contents of `age` upstream.
df_encoded = pd.get_dummies(
    data=df_scaled,
    columns=categorical_cols,
    drop_first=True,
)

# Preview the encoded frame
df_encoded.head()

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,interest1,interest2,interest3,impressions,clicks,...,age_64,age_65,age_66,age_7,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,708746,2017-08-17,17/08/2017,916,103916,15,17,17,0.002408,0.002941,...,False,False,False,False,False,False,False,True,False,False
1,708749,2017-08-17,17/08/2017,916,103917,16,19,21,0.005852,0.005882,...,False,False,False,False,False,False,False,True,False,False
2,708771,2017-08-17,17/08/2017,916,103920,20,25,22,0.000227,0.0,...,False,False,False,False,False,False,False,True,False,False
3,708815,2017-08-30,30/08/2017,916,103928,28,32,32,0.001395,0.002941,...,False,False,False,False,False,False,False,False,False,True
4,708818,2017-08-17,17/08/2017,916,103928,28,33,32,0.001354,0.002941,...,False,False,False,False,False,False,False,True,False,False


In [7]:
# Persist the encoded frame as CSV, leaving out the index column
final_csv_path = "../data/facebook_ads_final_preprocessed.csv"
df_encoded.to_csv(final_csv_path, index=False)
print("✅ Final encoded dataset saved to: facebook_ads_final_preprocessed.csv")

✅ Final encoded dataset saved to: facebook_ads_final_preprocessed.csv
