In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the ad-click CSV into a DataFrame and preview its first rows.
DATASET_PATH = 'Clicked Ads Dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Univariate analysis of the Age, Daily Internet Usage, and Daily Time Spent on Site columns

In [None]:
# Columns examined one at a time in the univariate analysis.
Uni = [
    'Age',
    'Daily Internet Usage',
    'Daily Time Spent on Site',
]

In [None]:
# Draw one distribution plot per univariate column, each in its own figure.
# A histogram (binned counts) is used instead of countplot: countplot treats
# every distinct value as a separate category and draws one bar per value,
# which is unreadable for these numeric columns.
for column in Uni:
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.histplot(data=df, x=column, color='green', ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# Age distribution split by click outcome, with a KDE overlay.
# displot is figure-level and draws the hue legend itself, so plt.legend()
# is intentionally not called: the axes carry no labelled artists, so the
# call would only warn and could add a stray legend.
age_grid = sns.displot(data=df, x="Age", hue="Clicked on Ad", kde=True)
age_grid.set(title="Age by Clicked on Ad")
plt.show()


In [None]:
# Daily internet usage distribution split by click outcome, with a KDE overlay.
# displot is figure-level and draws the hue legend itself, so plt.legend()
# is intentionally not called: the axes carry no labelled artists, so the
# call would only warn and could add a stray legend.
usage_grid = sns.displot(data=df, x="Daily Internet Usage", hue="Clicked on Ad", kde=True)
usage_grid.set(title="Daily Internet Usage by Clicked on Ad")
plt.show()


In [None]:
# Daily time spent on site split by click outcome, with a KDE overlay.
# displot is figure-level and draws the hue legend itself, so plt.legend()
# is intentionally not called: the axes carry no labelled artists, so the
# call would only warn and could add a stray legend.
time_grid = sns.displot(data=df, x="Daily Time Spent on Site", hue="Clicked on Ad", kde=True)
time_grid.set(title="Daily Time Spent on Site by Clicked on Ad")
plt.show()


Bivariate analysis of the variables examined in the univariate analysis

In [None]:
# Age against daily internet usage, coloured by click outcome.
age_usage_ax = sns.scatterplot(
    x='Age',
    y='Daily Internet Usage',
    hue='Clicked on Ad',
    data=df,
)
age_usage_ax.set_title('Age vs Daily Internet Usage')
plt.show()

In [None]:
# Age against daily time spent on site, coloured by click outcome.
age_time_ax = sns.scatterplot(
    x='Age',
    y='Daily Time Spent on Site',
    hue='Clicked on Ad',
    data=df,
)
age_time_ax.set_title('Age vs Daily Time Spent on Site')
plt.show()


In [None]:
# Daily internet usage against daily time spent on site, coloured by click outcome.
usage_time_ax = sns.scatterplot(
    x='Daily Internet Usage',
    y='Daily Time Spent on Site',
    hue='Clicked on Ad',
    data=df,
)
usage_time_ax.set_title('Daily Internet Usage vs Daily Time Spent on Site')
plt.show()

Compute the correlations between columns and perform multivariate analysis

In [None]:
# Pairwise correlation of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where corr() no longer
# silently skips non-numeric columns and raises on them instead.
df.corr(numeric_only=True)

In [None]:
# Correlation heatmap of the numeric columns, also written to heatmap.png.
# numeric_only=True is required on pandas >= 2.0, where corr() raises on
# non-numeric columns instead of silently skipping them.
plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(df.corr(numeric_only=True), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)
# Save the heatmap as a .png file:
# dpi         - resolution of the saved image in dots per inch
# bbox_inches - 'tight' keeps the labels from being cropped
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Convert the Timestamp column to datetime values; entries that cannot be
# parsed become NaT instead of raising (errors='coerce').
parsed_timestamps = pd.to_datetime(df['Timestamp'], errors='coerce')
df['Timestamp'] = parsed_timestamps

In [None]:
# Give the 'Male' column the clearer name 'Gender'.
df = df.rename({'Male': 'Gender'}, axis='columns')


Clean the dataset of missing values and duplicate rows


In [None]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
new_df = df.dropna(axis=0, how='any')

In [None]:
# Confirm that no missing values remain after dropping incomplete rows.
new_df.isna().sum()

In [None]:
# Count rows that repeat an earlier row across all columns.
# This only reports the count; no rows are removed here.
new_df.duplicated(keep='first').sum()

### Feature encoding on the dataset


In [None]:
# Summarise the cleaned frame's columns and dtypes before choosing which
# columns to encode.
new_df.info()

In [None]:
# Columns that are one-hot encoded further down.
# NOTE(review): 'Clicked on Ad' (the target) and 'Timestamp' are in this list,
# so each of them is expanded into one indicator column per distinct value.
Categorical = [
    'Gender',
    'Timestamp',
    'Clicked on Ad',
    'city' ,
    'province',
    'category',
]

# Numeric feature columns.
Numerical = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
]

In [None]:
# Work on a copy so the cleaned frame stays untouched, then expand every
# column listed in Categorical into one indicator column per distinct value.
df_enc = new_df.copy(deep=True)
df_encoded = pd.get_dummies(data=df_enc, columns=Categorical)

df_encoded.head()

### Split data into features and targets

In [None]:
from sklearn.model_selection import train_test_split

# One-hot encoding 'Clicked on Ad' produced one indicator column per outcome.
# Every one of them must be removed from the features: leaving the complement
# of 'Clicked on Ad_Yes' in X would hand the model the answer (target leakage).
target_dummy_columns = [
    col for col in df_encoded.columns if str(col).startswith('Clicked on Ad_')
]
X = df_encoded.drop(columns=target_dummy_columns)
y = df_encoded['Clicked on Ad_Yes']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

print("Total training data:", X_train.shape[0])
print("Number of test data:", X_test.shape[0])

Extract time-related features from the Timestamp column

In [None]:
# Derive calendar features from the parsed timestamps kept in df_enc
# (df_encoded no longer has the original Timestamp column after encoding).
timestamps = df_enc['Timestamp']
time_features = {
    'Tahun': timestamps.dt.year,              # year
    'Bulan': timestamps.dt.month,             # month
    'Pekan': timestamps.dt.isocalendar().week,  # ISO week number
    'Hari': timestamps.dt.day,                # day of the month
}
for feature_name, feature_values in time_features.items():
    df_encoded[feature_name] = feature_values
df_encoded

Split the data into the training set and the test set used by the models below

In [None]:
from sklearn.model_selection import train_test_split

# One-hot encoding 'Clicked on Ad' produced one indicator column per outcome.
# Every one of them must be removed from the features: leaving the complement
# of 'Clicked on Ad_Yes' in X would hand the model the answer (target leakage).
# NOTE(review): the 'Timestamp_*' indicator columns are still part of X; the
# extracted year/month/week/day columns likely make them redundant - confirm.
target_dummy_columns = [
    col for col in df_encoded.columns if str(col).startswith('Clicked on Ad_')
]
X = df_encoded.drop(columns=target_dummy_columns)
y = df_encoded['Clicked on Ad_Yes']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

print("Total training data:", X_train.shape[0])
print("Number of test data:", X_test.shape[0])

## Building the Model

### Before Normalization

Logistic Regression

In [None]:
# Logistic regression fitted on the unscaled features.
lr = LogisticRegression(random_state=0, C=1000.0)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Print every metric under its heading, one after another.
report_sections = (
    ('Accuracy: ', accuracy_score),
    ('Classification report: ', classification_report),
    ('Confusion Matrix', confusion_matrix),
    ('Cohen kappa score: ', cohen_kappa_score),
)
for heading, metric in report_sections:
    print(heading)
    print(f'{metric(y_test, y_pred)}')

Random Forest

In [None]:
# Random forest (10 entropy-based trees) fitted on the unscaled features.
rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=1,
    n_jobs=1,
)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Print every metric under its heading, one after another.
report_sections = (
    ('Accuracy: ', accuracy_score),
    ('Classification report: ', classification_report),
    ('Confusion Matrix', confusion_matrix),
    ('Cohen kappa score: ', cohen_kappa_score),
)
for heading, metric in report_sections:
    print(heading)
    print(f'{metric(y_test, y_pred)}')



Support Vector Machines (SVM)

In [None]:
# RBF-kernel support vector classifier fitted on the unscaled features.
svc = SVC(C=1.0, kernel='rbf', random_state=0)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Print every metric under its heading, one after another.
report_sections = (
    ('Accuracy: ', accuracy_score),
    ('Classification report: ', classification_report),
    ('Confusion Matrix', confusion_matrix),
    ('Cohen kappa score: ', cohen_kappa_score),
)
for heading, metric in report_sections:
    print(heading)
    print(f'{metric(y_test, y_pred)}')

### After Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler object
scaler = MinMaxScaler()

# Fit the min/max on the training data only, then normalize it
X_train_scaled = scaler.fit_transform(X_train)

# Normalize the test data with the parameters learned from the training data
X_test_scaled = scaler.transform(X_test)

# Show the same four rows before and after scaling. Positional indexing is
# used because train_test_split shuffles the rows: a label slice such as
# .loc[1:4] can fail or select unrelated rows, and would not correspond to
# rows 1-4 of the scaled array.
print("X_train without normalising features")
print("--------------------------------------")
print(X_train.iloc[1:5])

print("")
print("X_train normalising features")
print("--------------------------------------")
print(X_train_scaled[1:5, :])




In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler object
sc = StandardScaler()

# Standardize the (min-max scaled) training data; mean and standard deviation
# are learned from the training set only
X_train_standard = sc.fit_transform(X_train_scaled)

# Standardize the test data. It must go through the same pipeline as the
# training data, so the min-max scaled test set is transformed here rather
# than the raw X_test, whose value ranges differ from what `sc` was fitted on.
X_test_standard = sc.transform(X_test_scaled)

# Show the same four rows before and after scaling. Positional indexing is
# used because train_test_split shuffles the rows: a label slice such as
# .loc[1:4] can fail or select unrelated rows.
print("X_train without standardising features")
print("--------------------------------------")
print(X_train.iloc[1:5])
print("")
print("X_train standardising features")
print("--------------------------------------")
print(X_train_standard[1:5, :])


In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Toy single-column training set used to illustrate StandardScaler.
example_train_data = np.array([10, 20, 30]).reshape(-1, 1)

# Learn mean and standard deviation from the training data, then scale it.
example_sc = StandardScaler().fit(example_train_data)
example_train_data_scaled = example_sc.transform(example_train_data)

print("Example train data", "------------------", sep="\n")
print(example_train_data)
print("Example train data scaled", "------------------------", sep="\n")
print(example_train_data_scaled)
print("----------------------------------------------", "", sep="\n")
print(
    "What would happen if, instead of scaling the test dataset with the training scaling parameters, we scaled",
    "with the test scaling parameters?",
    sep="\n",
)

# Toy test set; it is transformed with the scaler fitted on the training data.
example_test_data = np.array([5, 6, 7]).reshape(-1, 1)
example_test_data_scaled = example_sc.transform(example_test_data)

print("Example test data", "-----------------", sep="\n")
print(example_test_data)
print("Example test data scaled", "-----------------------", sep="\n")
print(example_test_data_scaled)


Logistic Regression

In [None]:
# Logistic regression fitted on the standardized features.
lr = LogisticRegression(random_state=0, C=1000.0)
lr.fit(X_train_standard, y_train)
Y_pred_Logit = lr.predict(X_test_standard)

# Print every metric under its heading, one after another.
report_sections = (
    ('Accuracy: ', accuracy_score),
    ('Classification report: ', classification_report),
    ('Confusion Matrix', confusion_matrix),
    ('Cohen kappa score: ', cohen_kappa_score),
)
for heading, metric in report_sections:
    print(heading)
    print(f'{metric(y_test, Y_pred_Logit)}')



Random Forest

In [None]:
# Random forest (10 entropy-based trees) fitted on the standardized features.
rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=1,
    n_jobs=1,
)
rfc.fit(X_train_standard, y_train)
Y_pred_RF = rfc.predict(X_test_standard)

# Print every metric under its heading, one after another.
report_sections = (
    ('Accuracy: ', accuracy_score),
    ('Classification report: ', classification_report),
    ('Confusion Matrix', confusion_matrix),
    ('Cohen kappa score: ', cohen_kappa_score),
)
for heading, metric in report_sections:
    print(heading)
    print(f'{metric(y_test, Y_pred_RF)}')

Support Vector Machines (SVM)

In [None]:
# Linear-kernel support vector classifier fitted on the standardized features.
svm = SVC(C=1.0, kernel='linear', random_state=0)
svm.fit(X_train_standard, y_train)
Y_pred_SVM = svm.predict(X_test_standard)

# Print every metric under its heading, one after another.
report_sections = (
    ('Accuracy: ', accuracy_score),
    ('Classification report: ', classification_report),
    ('Confusion Matrix', confusion_matrix),
    ('Cohen kappa score: ', cohen_kappa_score),
)
for heading, metric in report_sections:
    print(heading)
    print(f'{metric(y_test, Y_pred_SVM)}')

Evaluate each model with a confusion matrix and show the most
important features according to the model results

Logistic Regression

In [None]:
from sklearn.metrics import confusion_matrix

# Refit logistic regression on the standardized features and plot its
# confusion matrix on the test set.
lr = LogisticRegression(C = 1000.0, random_state = 0 )
lr.fit(X_train_standard, y_train)
Y_pred_Logit = lr.predict(X_test_standard)
# Use this model's own predictions: `y_pred` is left over from the models
# trained before normalization and would plot a different model's matrix.
cm = confusion_matrix(y_test, Y_pred_Logit)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", cbar=False)
plt.xlabel("Predictions")
plt.ylabel("Original value")
plt.title("Confusion Matrix")
plt.show()




Random Forest

In [None]:
from sklearn.metrics import confusion_matrix

# Refit the random forest on the standardized features and plot its
# confusion matrix on the test set.
rfc = RandomForestClassifier(criterion = 'entropy'
                                , n_estimators = 10
                                , random_state = 1
                                , n_jobs = 1)
rfc.fit(X_train_standard, y_train)
# Predict on the standardized test set: the forest was trained on
# standardized features, so raw X_test would be on a different scale.
Y_pred_RF = rfc.predict(X_test_standard)
# Use this model's own predictions: `y_pred` is left over from the models
# trained before normalization and would plot a different model's matrix.
cm = confusion_matrix(y_test, Y_pred_RF)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", cbar=False)
plt.xlabel("Predictions")
plt.ylabel("Original value")
plt.title("Confusion Matrix")
plt.show()



In [None]:
def show_feature_importance(rfc, feature_names=None, top_n=25):
    """Plot the largest feature-importance scores of a fitted model.

    Parameters
    ----------
    rfc : estimator
        Fitted model exposing ``feature_importances_``.
    feature_names : sequence, optional
        Labels for the importances, in the same order as the columns the
        model was trained on. Defaults to the notebook-level ``X.columns``.
    top_n : int, default 25
        Number of highest-scoring features to draw.
    """
    if feature_names is None:
        # Fall back to the feature frame defined by the split cell.
        feature_names = X.columns

    feat_importances = pd.Series(rfc.feature_importances_, index=feature_names)
    ax = feat_importances.nlargest(top_n).plot(kind= 'barh', figsize=(10, 8))
    # nlargest sorts descending; invert the axis so the top feature is drawn first.
    ax.invert_yaxis()

    ax.set_xlabel('score')
    ax.set_ylabel('feature')
    ax.set_title('feature importance score')

show_feature_importance(rfc)

Support Vector Machines (SVM)

In [None]:
from sklearn.metrics import confusion_matrix

# Refit the linear SVM on the standardized features and plot its
# confusion matrix on the test set.
svm = SVC(kernel = 'linear', C = 1.0, random_state = 0)
svm.fit(X_train_standard, y_train)
Y_pred_SVM = svm.predict(X_test_standard)
# Use this model's own predictions: `y_pred` is left over from the models
# trained before normalization and would plot a different model's matrix.
cm = confusion_matrix(y_test, Y_pred_SVM)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", cbar=False)
plt.xlabel("Predictions")
plt.ylabel("Original value")
plt.title("Confusion Matrix")
plt.show()