In [2]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('advertising.csv')

In [4]:
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB


In [6]:
data.describe().round()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,65.0,36.0,55000.0,180.0,0.0,0.0
std,16.0,9.0,13415.0,44.0,0.0,1.0
min,33.0,19.0,13996.0,105.0,0.0,0.0
25%,51.0,29.0,47032.0,139.0,0.0,0.0
50%,68.0,35.0,57012.0,183.0,0.0,0.0
75%,79.0,42.0,65471.0,219.0,1.0,1.0
max,91.0,61.0,79485.0,270.0,1.0,1.0


In [7]:
data.describe(include = 'object')

Unnamed: 0,Ad Topic Line,City,Country,Timestamp
count,1000,1000,1000,1000
unique,1000,969,237,1000
top,Cloned 5thgeneration orchestration,Lisamouth,France,2016-03-27 00:53:11
freq,1,3,9,1


In [8]:
data.drop(['Ad Topic Line', 'City', 'Timestamp'], axis= 1, inplace= True)

In [9]:
data.duplicated().sum()

0

In [10]:
data.isna().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Male                        0
Country                     0
Clicked on Ad               0
dtype: int64

## Preprocessing

In [11]:
# define features and Target
x = data.drop('Clicked on Ad',axis = 1)
y = data['Clicked on Ad']

In [12]:
x

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Country
0,68.95,35,61833.90,256.09,0,Tunisia
1,80.23,31,68441.85,193.77,1,Nauru
2,69.47,26,59785.94,236.50,0,San Marino
3,74.15,29,54806.18,245.89,1,Italy
4,68.37,35,73889.99,225.58,0,Iceland
...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,1,Lebanon
996,51.30,45,67782.17,134.42,1,Bosnia and Herzegovina
997,51.63,51,42415.72,120.37,1,Mongolia
998,55.55,19,41920.79,187.95,0,Guatemala


In [13]:
y

0      0
1      0
2      0
3      0
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: Clicked on Ad, Length: 1000, dtype: int64

In [14]:
x.shape,y.shape

((1000, 6), (1000,))

In [15]:
y.value_counts()
# balanced

Clicked on Ad
0    500
1    500
Name: count, dtype: int64

In [16]:
# split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 0, stratify= y)


## Handle Numeric Features

In [17]:
scaling_cols = x_train.columns.drop(['Country', 'Male'])
scaling_cols

Index(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage'],
      dtype='object')

In [18]:
from sklearn.preprocessing import RobustScaler

rc = RobustScaler()

x_train[scaling_cols] = rc.fit_transform(x_train[scaling_cols])
x_test[scaling_cols] = rc.transform(x_test[scaling_cols])

In [19]:
x_train.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Country
600,0.59484,1.083333,-0.569248,-0.580319,1,Kyrgyz Republic
737,0.120824,1.416667,-0.863072,-0.750515,0,Sweden
33,-0.459354,-1.0,-1.483623,0.374008,0,Senegal
519,-1.224016,0.416667,-0.615119,-0.392377,1,Mongolia
341,0.157944,1.083333,-0.246821,-0.840987,0,Mexico


## Handle Categorical Features

In [21]:
from category_encoders import BinaryEncoder

be = BinaryEncoder()

be_train = be.fit_transform(x_train[['Country']])
be_test = be.transform(x_test[['Country']])

In [22]:
be_train

Unnamed: 0,Country_0,Country_1,Country_2,Country_3,Country_4,Country_5,Country_6,Country_7
600,0,0,0,0,0,0,0,1
737,0,0,0,0,0,0,1,0
33,0,0,0,0,0,0,1,1
519,0,0,0,0,0,1,0,0
341,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...
622,1,1,0,0,1,1,0,1
682,0,0,0,1,1,1,0,1
357,0,0,1,0,0,1,1,0
918,1,1,0,1,0,1,0,0


In [23]:
x_train = pd.concat([x_train, be_train], axis= 1).drop('Country', axis = 1)

x_test = pd.concat([x_test, be_test], axis= 1).drop('Country', axis = 1)

In [24]:
x_train

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Country_0,Country_1,Country_2,Country_3,Country_4,Country_5,Country_6,Country_7
600,0.594840,1.083333,-0.569248,-0.580319,1,0,0,0,0,0,0,0,1
737,0.120824,1.416667,-0.863072,-0.750515,0,0,0,0,0,0,0,1,0
33,-0.459354,-1.000000,-1.483623,0.374008,0,0,0,0,0,0,0,1,1
519,-1.224016,0.416667,-0.615119,-0.392377,1,0,0,0,0,0,1,0,0
341,0.157944,1.083333,-0.246821,-0.840987,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,-0.287491,2.083333,-0.603832,-0.388754,1,1,1,0,0,1,1,0,1
682,-0.905902,0.083333,-0.343783,-0.717901,1,0,0,0,1,1,1,0,1
357,-0.673163,0.333333,-0.627936,-0.887973,0,0,0,1,0,0,1,1,0
918,0.401448,-0.250000,0.609726,0.407873,1,1,1,0,1,0,1,0,0


## Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression

LG = LogisticRegression(penalty= None)
LG.fit(x_train, y_train)

y_prd = LG.predict(x_test)

In [37]:
print('Train Accuracy :',LG.score(x_train, y_train) * 100)
print('Test Accuracy :',LG.score(x_test, y_test) * 100)


Train Accuracy : 97.375
Test Accuracy : 95.0


In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
# Precision: How many predicted positives are actually positive?
precision = precision_score(y_prd, y_test)

# Recall: How many actual positives were correctly predicted?
recall = recall_score(y_prd, y_test)

# F1 Score: Harmonic mean of precision and recall
f1 = f1_score(y_prd, y_test)

# Confusion Matrix: Shows TP, TN, FP, FN
cm = confusion_matrix(y_prd, y_test)


In [39]:
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", cm)

Precision: 0.92
Recall: 0.9787234042553191
F1 Score: 0.9484536082474226
Confusion Matrix:
 [[98  8]
 [ 2 92]]


In [40]:
cm

array([[98,  8],
       [ 2, 92]], dtype=int64)