# Will the Ad be clicked?

## Installing libraries, dependencies, and data

In [403]:
# Import the required libraries and dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


In [404]:
# Read csv file as DataFrame, with the first column as row index
df = pd.read_csv("Resources/Dataset_Ads.csv")

# Preview the dataset
df.head()

Unnamed: 0,Age,Gender,Income,Location,Ad Type,Ad Topic,Ad Placement,Clicks,Click Time,Conversion Rate,CTR
0,61,Male,35717.43,Urban,Banner,Travel,Social Media,3,2024-01-18 20:45:56.898459,0.0981,0.0737
1,41,Male,47453.25,Rural,Video,Travel,Search Engine,5,2023-04-24 20:45:56.898459,0.0937,0.0592
2,49,Female,68126.35,Rural,Text,Food,Social Media,4,2024-02-24 20:45:56.898459,0.1912,0.0563
3,68,Female,64585.73,Suburban,Text,Health,Website,6,2023-12-13 20:45:56.898459,0.1122,0.0232
4,63,Male,21109.4,Urban,Native,Fashion,Search Engine,5,2023-07-02 20:45:56.898459,0.1426,0.0539


## Data Cleanup and Preparation

In [405]:
# Checking the data types
df.dtypes

Age                  int64
Gender              object
Income             float64
Location            object
Ad Type             object
Ad Topic            object
Ad Placement        object
Clicks               int64
Click Time          object
Conversion Rate    float64
CTR                float64
dtype: object

In [406]:
# Checking for null values
df.isna().sum() / len(df)

Age                0.0
Gender             0.0
Income             0.0
Location           0.0
Ad Type            0.0
Ad Topic           0.0
Ad Placement       0.0
Clicks             0.0
Click Time         0.0
Conversion Rate    0.0
CTR                0.0
dtype: float64

In [407]:
# Convert Click Time into Datetime
df["Click Time"] = pd.to_datetime(df["Click Time"], errors="coerce")
df.dtypes

Age                         int64
Gender                     object
Income                    float64
Location                   object
Ad Type                    object
Ad Topic                   object
Ad Placement               object
Clicks                      int64
Click Time         datetime64[ns]
Conversion Rate           float64
CTR                       float64
dtype: object

In [408]:
# Creating a DataFrame for the X (features) value
X = df.drop(columns=["Clicks", "Click Time", "Conversion Rate","CTR"])
X

Unnamed: 0,Age,Gender,Income,Location,Ad Type,Ad Topic,Ad Placement
0,61,Male,35717.43,Urban,Banner,Travel,Social Media
1,41,Male,47453.25,Rural,Video,Travel,Search Engine
2,49,Female,68126.35,Rural,Text,Food,Social Media
3,68,Female,64585.73,Suburban,Text,Health,Website
4,63,Male,21109.40,Urban,Native,Fashion,Search Engine
...,...,...,...,...,...,...,...
9995,7,Male,69925.29,Rural,Text,Fashion,Social Media
9996,35,Male,42182.75,Urban,Native,Technology,Website
9997,42,Male,55084.12,Suburban,Text,Finance,Search Engine
9998,34,Male,39382.80,Rural,Native,Technology,Social Media


In [409]:
# Creating DataFrames for the y (target) values
y_clicks = df["Clicks"]
y_conversion = df["Conversion Rate"]
y_ctr = df["CTR"]
y_clicks



0       3
1       5
2       4
3       6
4       5
       ..
9995    5
9996    9
9997    3
9998    4
9999    2
Name: Clicks, Length: 10000, dtype: int64

In [410]:
y_conversion

0       0.0981
1       0.0937
2       0.1912
3       0.1122
4       0.1426
         ...  
9995    0.2460
9996    0.2664
9997    0.2354
9998    0.1725
9999    0.0307
Name: Conversion Rate, Length: 10000, dtype: float64

In [411]:
y_ctr

0       0.0737
1       0.0592
2       0.0563
3       0.0232
4       0.0539
         ...  
9995    0.1045
9996    0.0407
9997    0.0390
9998    0.0455
9999    0.0323
Name: CTR, Length: 10000, dtype: float64

In [412]:
# Creating a OneHotEncoder column function
def oheEncodeColumn(X_column):
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    encoded_column = ohe.fit(X_train[X_column].values.reshape(-1, 1))
    return encoded_column

In [413]:
# This is a column that might contain variables with a specific order
X["Location"].value_counts()

Location
Rural       3408
Suburban    3307
Urban       3285
Name: count, dtype: int64

In [414]:
# Creating an OrdinalEncoder column function
def ordEncodeColumn(X_column):
    ord = OrdinalEncoder(categories=[["Rural", "Suburban", "Urban"]], encoded_missing_value=-1, handle_unknown="use_encoded_value", unknown_value=-1)
    encoded_column = ord.fit(X_train[X_column].values.reshape(-1, 1))
    return encoded_column

In [415]:
# Creating an encode function
def X_preprocess(X_data):
    # Creating an encoder for each X column
    # and Transforming them into numpy arrays

    # Gender column
    gender_ohe = oheEncodeColumn("Gender")
    gender_encoded = gender_ohe.transform(X_data["Gender"].values.reshape(-1,1))
    # Location column
    location_ord = ordEncodeColumn("Location")
    location_encoded = location_ord.transform(X_data["Location"].values.reshape(-1,1))
    # Ad Type column
    type_ohe = oheEncodeColumn("Ad Type")
    type_encoded = type_ohe.transform(X_data["Ad Type"].values.reshape(-1,1))
    # Ad Topic column
    topic_ohe = oheEncodeColumn("Ad Topic")
    topic_encoded = topic_ohe.transform(X_data["Ad Topic"].values.reshape(-1,1))
    # Ad Placement column
    placement_ohe = oheEncodeColumn("Ad Placement")
    placement_encoded = placement_ohe.transform(X_data["Ad Placement"].values.reshape(-1,1))

    # Turning each numpy array into a DataFrame
    gender_df = pd.DataFrame(gender_encoded, columns=gender_ohe.get_feature_names_out())
    type_df = pd.DataFrame(type_encoded, columns=type_ohe.get_feature_names_out())
    topic_df = pd.DataFrame(topic_encoded, columns=topic_ohe.get_feature_names_out())
    placement_df = pd.DataFrame(placement_encoded, columns=placement_ohe.get_feature_names_out())

    # Creating an encoded DataFrame
    out_df = pd.concat([gender_df, type_df, topic_df, placement_df], axis=1)
    out_df["Location"] = location_encoded
    # Return the DataFrame
    return out_df



## Question 1: What factors make someone more likely to click on an ad?
Summary

In [416]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [417]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_clicks, random_state=42)

X_train.describe()



Unnamed: 0,Age,Income
count,7500.0,7500.0
mean,34.197067,50000.151733
std,14.765025,19848.020159
min,-21.0,-38932.64
25%,24.0,36943.58
50%,34.0,49973.3
75%,44.0,63023.9375
max,86.0,126635.8


In [418]:
y = df.drop(columns=["Age","Gender","Income","Location","Ad Type","Ad Topic","Ad Placement"])
y

Unnamed: 0,Clicks,Click Time,Conversion Rate,CTR
0,3,2024-01-18 20:45:56.898459,0.0981,0.0737
1,5,2023-04-24 20:45:56.898459,0.0937,0.0592
2,4,2024-02-24 20:45:56.898459,0.1912,0.0563
3,6,2023-12-13 20:45:56.898459,0.1122,0.0232
4,5,2023-07-02 20:45:56.898459,0.1426,0.0539
...,...,...,...,...
9995,5,2023-05-31 20:45:56.927349,0.2460,0.1045
9996,9,2023-09-01 20:45:56.927349,0.2664,0.0407
9997,3,2023-11-15 20:45:56.927349,0.2354,0.0390
9998,4,2023-05-23 20:45:56.927349,0.1725,0.0455


In [419]:
#Using PCA for feature analysis

In [420]:
# Saving the preprocess function to a variable
X = X_preprocess(X_train)

In [421]:
#Preprocess the training data
X_preprocess(X_train)

Unnamed: 0,x0_Female,x0_Male,x0_Other,x0_Banner,x0_Native,x0_Text,x0_Video,x0_Fashion,x0_Finance,x0_Food,x0_Health,x0_Technology,x0_Travel,x0_Search Engine,x0_Social Media,x0_Website,Location
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
7496,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7497,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
7498,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [422]:
# Preprocess the testing data
X_preprocess(X_test)

Unnamed: 0,x0_Female,x0_Male,x0_Other,x0_Banner,x0_Native,x0_Text,x0_Video,x0_Fashion,x0_Finance,x0_Food,x0_Health,x0_Technology,x0_Travel,x0_Search Engine,x0_Social Media,x0_Website,Location
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
2496,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0
2497,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
2498,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0


In [423]:

# Using StandardScaler() for scaling the data

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA weight for the desired number of components (e.g., 3)
pca = PCA(n_components=3)

# Apply PCA transformation to the scaled data
X_reduced = pca.fit_transform(X_scaled)



#Predicting Clicks


#Predicting Conversion Rate (clicks are included as desired action from an Ad)


#Predicting Click-Through Rate (CTR)

## Question 2: What Regression Model is best at predicting the data?
Summary

## Question 3: Can a Classification model predict if someone clicks on an ad?
Summary

## Question 4: What is the most desirable ad?
Summary