In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Clicked Ads Dataset.csv')

In [3]:
# Display all columns
pd.set_option('display.max_columns', None)

In [4]:
# Basic investigation
print("--- First 5 rows ---")
print(df.head())

print("\n--- Dataset Info ---")
df.info()

print("\n--- Summary Statistics ---")
print(df.describe())

print("\n--- Missing Values ---")
print(df.isnull().sum())

print("\n--- Duplicated Rows ---")
print(df.duplicated().sum())

--- First 5 rows ---
   Unnamed: 0  Daily Time Spent on Site  Age  Area Income  \
0           0                     68.95   35  432837300.0   
1           1                     80.23   31  479092950.0   
2           2                     69.47   26  418501580.0   
3           3                     74.15   29  383643260.0   
4           4                     68.37   35  517229930.0   

   Daily Internet Usage       Male        Timestamp Clicked on Ad  \
0                256.09  Perempuan   3/27/2016 0:53            No   
1                193.77  Laki-Laki    4/4/2016 1:39            No   
2                236.50  Perempuan  3/13/2016 20:35            No   
3                245.89  Laki-Laki   1/10/2016 2:31            No   
4                225.58  Perempuan    6/3/2016 3:36            No   

            city                       province    category  
0  Jakarta Timur  Daerah Khusus Ibukota Jakarta   Furniture  
1       Denpasar                           Bali        Food  
2       Sur

In [5]:
# Print distinct values
print("\nDistinct Values for Each Column:")
for col in df.columns:
    unique_values = df[col].unique()
    print(f"{col}: {unique_values[:20]}{'...' if len(unique_values) > 20 else ''} (Total: {len(unique_values)})")


Distinct Values for Each Column:
Unnamed: 0: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]... (Total: 1000)
Daily Time Spent on Site: [68.95 80.23 69.47 74.15 68.37 59.99 88.91 66.   74.53 69.88 47.64 83.07
 69.57 79.52 42.95 63.45 55.39 82.03 54.7  74.58]... (Total: 891)
Age: [35 31 26 29 23 33 48 30 20 49 37 24 41 36 40 52 28 34 22 57]... (Total: 43)
Area Income: [4.3283730e+08 4.7909295e+08 4.1850158e+08 3.8364326e+08 5.1722993e+08
 4.1833092e+08 3.7696995e+08 1.7215331e+08 4.8203400e+08 3.8949624e+08
 3.1942757e+08 4.3743707e+08 3.6145844e+08 3.6217741e+08 2.1683200e+08
 3.6527561e+08 1.6755802e+08 5.0057756e+08 2.1761278e+08 1.6675204e+08]... (Total: 988)
Daily Internet Usage: [256.09 193.77 236.5  245.89 225.58 226.74 208.36 131.76 221.51 183.82
 122.02 230.87 113.12 214.23 143.56 140.64 129.41 187.53    nan 135.51]... (Total: 956)
Male: ['Perempuan' 'Laki-Laki' nan] (Total: 3)
Timestamp: ['3/27/2016 0:53' '4/4/2016 1:39' '3/13/2016 20:35' '1/10/2016 2:31'
 '6/3/

In [6]:
# Handling Missing Values
# We will fill numerical values with the median and categorical values with the mode
for col in df.columns:
    if df[col].dtype == 'float64' or df[col].dtype == 'int64':
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

# Checking Missing Values
print("Number of Missing Values:\n", df.isnull().sum())

Number of Missing Values:
 Unnamed: 0                  0
Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Male                        0
Timestamp                   0
Clicked on Ad               0
city                        0
province                    0
category                    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [7]:
from sklearn.preprocessing import LabelEncoder

# Label Encoding for Binary Columns
le = LabelEncoder()
df['Male'] = le.fit_transform(df['Male'])
df['Clicked on Ad'] = le.fit_transform(df['Clicked on Ad'])

# One-Hot Encoding for Nominal Categorical Columns
df = pd.get_dummies(df, columns=['city', 'province', 'category'], drop_first=True)

In [8]:
# Splitting Features (X) and Target (y)
X = df.drop('Clicked on Ad', axis=1)
y = df['Clicked on Ad']

In [9]:
# Time Feature Extraction
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Year'] = df['Timestamp'].dt.year
df['Month'] = df['Timestamp'].dt.month
df['Week'] = df['Timestamp'].dt.isocalendar().week
df['Day'] = df['Timestamp'].dt.day

# Dropping the 'Timestamp' Column
df.drop('Timestamp', axis=1, inplace=True)

In [11]:
# Dropping the 'Unnamed' Column
df.drop('Unnamed: 0', axis=1, inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 63 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Daily Time Spent on Site                1000 non-null   float64
 1   Age                                     1000 non-null   int64  
 2   Area Income                             1000 non-null   float64
 3   Daily Internet Usage                    1000 non-null   float64
 4   Male                                    1000 non-null   int32  
 5   Clicked on Ad                           1000 non-null   int32  
 6   city_Bandar Lampung                     1000 non-null   bool   
 7   city_Bandung                            1000 non-null   bool   
 8   city_Banjarmasin                        1000 non-null   bool   
 9   city_Batam                              1000 non-null   bool   
 10  city_Bekasi                             1000 non-null   bool 