In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv("Datasets/advertising_dataset.csv")
print(f"Original Shape: {df.shape}")
print("Columns:", df.columns.tolist())

# We select columns that end with ".1"
# This drops 'click_through_rate.1' and 'conversion_rate.1 (Duplicate columns)'
df = df.loc[:, ~df.columns.str.endswith('.1')]

# Since you need to MERGE this later, we must NOT drop them yet.
# However, they are not useful for the ML model (Prediction).
# So We will set them as the Index, so they stay in the dataframe but don't interfere with calculations.
df.set_index(['user_id', 'ad_id'], inplace=True)

df['timestamp'] = pd.to_datetime(df['timestamp'])

# Drop row duplicates (if any)
# We do this because of the time stamp, there should be no exact duplicates.
df = df.drop_duplicates()

print(f"Shape after cleaning: {df.shape}")
print("Columns after cleaning:", df.columns.tolist())
df.head()

Original Shape: (1000, 18)
Columns: ['user_id', 'timestamp', 'device_type', 'location', 'age_group', 'gender', 'ad_id', 'content_type', 'ad_topic', 'ad_target_audience', 'click_through_rate', 'conversion_rate', 'engagement_level', 'view_time', 'cost_per_click', 'click_through_rate.1', 'conversion_rate.1', 'ROI']
Shape after cleaning: (1000, 14)
Columns after cleaning: ['timestamp', 'device_type', 'location', 'age_group', 'gender', 'content_type', 'ad_topic', 'ad_target_audience', 'click_through_rate', 'conversion_rate', 'engagement_level', 'view_time', 'cost_per_click', 'ROI']


Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,device_type,location,age_group,gender,content_type,ad_topic,ad_target_audience,click_through_rate,conversion_rate,engagement_level,view_time,cost_per_click,ROI
user_id,ad_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
896,A5630,2025-11-18 22:20:05,Tablet,UK,55+,Male,Image,Health,Young Adults,0.010588,0.0657,Ignored,21,0.35,0.2
827,A6480,2025-11-12 05:00:19,Desktop,Germany,45-54,Female,Text,Electronics,Family Oriented,0.022195,0.0237,Ignored,28,0.15,0.35
211,A7800,2025-10-22 05:21:08,Tablet,India,18-24,Female,Text,Health,Travel Lovers,0.081724,0.1055,Commented,47,1.28,0.67
897,A1514,2025-03-02 14:17:27,Desktop,USA,25-34,Male,Video,Health,Fitness Lovers,0.123995,0.0332,Liked,55,1.2,0.34
293,A8366,2025-07-25 09:47:22,Desktop,Germany,25-34,Male,Image,Health,Tech Enthusiasts,0.044757,0.1258,Shared,40,0.67,0.84


In [6]:
for col in df.columns:
    print(f"number of missing values in {col}: {df[col].isnull().sum()}")


number of missing values in timestamp: 0
number of missing values in device_type: 0
number of missing values in location: 0
number of missing values in age_group: 0
number of missing values in gender: 0
number of missing values in content_type: 0
number of missing values in ad_topic: 0
number of missing values in ad_target_audience: 0
number of missing values in click_through_rate: 0
number of missing values in conversion_rate: 0
number of missing values in engagement_level: 0
number of missing values in view_time: 0
number of missing values in cost_per_click: 0
number of missing values in ROI: 0


In [None]:
#TODO podemos ver se é no inicio do mes / num feriado 
#TODO testar varias versoes diferentes ex: horas, altura do dia, fim de semana, etc

# Extract Hour of Day (0-23)
# WHY: Bidding strategies rely heavily on time. Ads at 3 AM perform differently than at 7 PM.
df['hour_of_day'] = df['timestamp'].dt.hour

# Extract Day of Week (0=Monday, 6=Sunday)
# WHY: Weekend behavior (gaming, shopping) differs from workday behavior (news, business).
df['day_of_week'] = df['timestamp'].dt.dayofweek

# Create a 'Is_Weekend' Binary Feature
# WHY: This simplifies the signal for the model. 
df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

print("Time features extracted.")
df[['timestamp', 'hour_of_day', 'day_of_week', 'is_weekend']].head()

Time features extracted.


Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,hour_of_day,day_of_week,is_weekend
user_id,ad_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
896,A5630,2025-11-18 22:20:05,22,1,0
827,A6480,2025-11-12 05:00:19,5,2,0
211,A7800,2025-10-22 05:21:08,5,2,0
897,A1514,2025-03-02 14:17:27,14,6,1
293,A8366,2025-07-25 09:47:22,9,4,0


In [None]:
device = df["device_type"].unique()
location = df["location"].unique()
gender = df["gender"].unique()
ad_topic = df["ad_topic"].unique()
ad_target_audience = df["ad_target_audience"].unique()

print("Unique Device Types:", device)
print("Unique Locations:", location)
print("Unique Genders:", gender)
print("Unique Ad Topics:", ad_topic)
print("Unique Ad Target Audiences:", ad_target_audience)

#TODO faz sentido adicionar OTHER em todos!

Unique Device Types: ['Tablet' 'Desktop' 'Mobile']
Unique Locations: ['UK' 'Germany' 'India' 'USA' 'Canada']
Unique Genders: ['Male' 'Female']
Unique Ad Topics: ['Health' 'Electronics' 'Fashion' 'Travel' 'Automotive']
Unique Ad Target Audiences: ['Young Adults' 'Family Oriented' 'Travel Lovers' 'Fitness Lovers'
 'Tech Enthusiasts']


In [None]:
from sklearn.preprocessing import LabelEncoder

# ORDINAL ENCODING (For Age)
# WHY: 'Age Group' has a hierarchy. 55+ is 'older' than 18-24. 
# We map this manually to preserve that order (0, 1, 2, 3, 4).
# If we used One-Hot encoding, the model might lose the concept that 35 is "between" 25 and 45.
age_map = {
    '18-24': 0,
    '25-34': 1,
    '35-44': 2,
    '45-54': 3,
    '55+': 4
}

# Map the column
# Existing NaNs stay NaN. New NaNs might appear if a text value doesn't match the map.
df['age_group_encoded'] = df['age_group'].map(age_map)

# Drop the original text column to clean up
df = df.drop('age_group', axis=1)

# ORDINAL ENCODING (For Engagement)
# WHY: 'Engagement Level' represents a hierarchy of user interest (Intensity). 'Shared' implies higher value than 'Ignored'.
# We map this manually to enforce the correct logical order (0 to 3).
# If we used LabelEncoder, it would sort Alphabetically (Commented=0, Ignored=1), which is mathematically incorrect.
engagement_map = {
    'Ignored': 0,
    'Liked': 5,
    'Commented': 15,
    'Shared': 50
}
#TODO tentar ver melhor como definir os valores do engagement e justificar

# Map the column
# Existing NaNs stay NaN. New NaNs might appear if a text value doesn't match the map.
df['engagement_level_encoded'] = df['engagement_level'].map(engagement_map)

# Drop the original text column to clean up
df = df.drop('engagement_level', axis=1)

# C. ONE-HOT ENCODING (For Nominal Categories)
# WHY: For 'Device', 'Location', 'Gender', there is no order (UK is not 'greater than' USA).
# We convert these into binary columns (is_UK: 0 or 1, is_Mobile: 0 or 1).
# This prevents the model from assuming false relationships.

# Identify columns to encode
nominal_cols = ['device_type', 'location', 'gender', 'content_type', 'ad_topic', 'ad_target_audience']

# Apply One-Hot Encoding
# drop_first=True reduces redundancy (e.g., if is_Male=0, we know it's Female).
df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

# Convert boolean (True/False) to Integer (1/0) for consistency
df = df.replace({True: 1, False: 0})

print(f"Encoding Complete. Final Column Count: {len(df.columns)}")
df.head()

Encoding Complete. Final Column Count: 28


  df = df.replace({True: 1, False: 0})


Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,click_through_rate,conversion_rate,view_time,cost_per_click,ROI,hour_of_day,day_of_week,is_weekend,age_group_encoded,...,content_type_Text,content_type_Video,ad_topic_Electronics,ad_topic_Fashion,ad_topic_Health,ad_topic_Travel,ad_target_audience_Fitness Lovers,ad_target_audience_Tech Enthusiasts,ad_target_audience_Travel Lovers,ad_target_audience_Young Adults
user_id,ad_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
896,A5630,2025-11-18 22:20:05,0.010588,0.0657,21,0.35,0.2,22,1,0,4,...,0,0,0,0,1,0,0,0,0,1
827,A6480,2025-11-12 05:00:19,0.022195,0.0237,28,0.15,0.35,5,2,0,3,...,1,0,1,0,0,0,0,0,0,0
211,A7800,2025-10-22 05:21:08,0.081724,0.1055,47,1.28,0.67,5,2,0,0,...,1,0,0,0,1,0,0,0,1,0
897,A1514,2025-03-02 14:17:27,0.123995,0.0332,55,1.2,0.34,14,6,1,1,...,0,1,0,0,1,0,1,0,0,0
293,A8366,2025-07-25 09:47:22,0.044757,0.1258,40,0.67,0.84,9,4,0,1,...,0,0,0,0,1,0,0,1,0,0


In [10]:
# --- FINAL SANITY CHECK ---
# Ensure all data is numeric (except timestamp) and no NaNs exist.
print(f"Final Shape: {df.shape}")
print(f"Remaining Missing Values: {df.isnull().sum().sum()}")

# Check data types to ensure everything is int/float
print(df.dtypes)

# --- SAVE THE DATASET ---
df.to_csv("Cleaned_Datasets/advertising_cleaned_prepared.csv")

print("Dataset successfully cleaned, encoded, and saved.")

Final Shape: (1000, 28)
Remaining Missing Values: 0
timestamp                              datetime64[ns]
click_through_rate                            float64
conversion_rate                               float64
view_time                                       int64
cost_per_click                                float64
ROI                                           float64
hour_of_day                                     int32
day_of_week                                     int32
is_weekend                                      int64
age_group_encoded                               int64
engagement_level_encoded                        int64
device_type_Mobile                              int64
device_type_Tablet                              int64
location_Germany                                int64
location_India                                  int64
location_UK                                     int64
location_USA                                    int64
gender_Male                   