In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import VotingClassifier

In [4]:
# Load the dataset
df = pd.read_csv('advertising_ef.csv')

# Display the first few rows of the dataset
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Gender,Country,Clicked on Ad
0,68.95,35.0,61833.9,256.09,Wrightburgh,Female,Tunisia,0
1,,31.0,68441.85,193.77,West Jodi,Male,Nauru,0
2,69.47,26.0,59785.94,236.5,Davidton,Female,San Marino,0
3,74.15,29.0,54806.18,245.89,West Terrifurt,Male,Italy,0
4,68.37,35.0,73889.99,225.58,South Manuel,Female,Iceland,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1005 non-null   float64
 1   Age                       998 non-null    float64
 2   Area Income               998 non-null    float64
 3   Daily Internet Usage      1005 non-null   float64
 4   City                      998 non-null    object 
 5   Gender                    1009 non-null   object 
 6   Country                   996 non-null    object 
 7   Clicked on Ad             1009 non-null   int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 63.2+ KB


In [8]:
df.shape

(1009, 8)

In [10]:
df.isnull().sum()

Daily Time Spent on Site     4
Age                         11
Area Income                 11
Daily Internet Usage         4
City                        11
Gender                       0
Country                     13
Clicked on Ad                0
dtype: int64

In [12]:
# Check for missing values
print(df.isnull().sum())

# Fill missing values with the mean for numerical columns and mode for categorical columns
df['Daily Time Spent on Site'].fillna(df['Daily Time Spent on Site'].mean(), inplace=True)
df['Area Income'].fillna(df['Area Income'].mean(), inplace=True)
df['City'].fillna(df['City'].mode()[0], inplace=True)
df['Country'].fillna(df['Country'].mode()[0], inplace=True)
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

Daily Time Spent on Site     4
Age                         11
Area Income                 11
Daily Internet Usage         4
City                        11
Gender                       0
Country                     13
Clicked on Ad                0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Daily Time Spent on Site'].fillna(df['Daily Time Spent on Site'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Area Income'].fillna(df['Area Income'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work 

In [34]:
# Check for missing values again
print(df.isnull().sum())

Daily Time Spent on Site     0
Age                         11
Area Income                  0
Daily Internet Usage         4
City                         0
Gender                       0
Country                      0
Clicked on Ad                0
dtype: int64


In [36]:
# Drop rows with missing values
df.dropna(inplace=True)

In [38]:
# Check for missing values again
print(df.isnull().sum())

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
City                        0
Gender                      0
Country                     0
Clicked on Ad               0
dtype: int64


In [58]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables
label_encoder = LabelEncoder()
df['City'] = label_encoder.fit_transform(df['City'])
df['Country'] = label_encoder.fit_transform(df['Country'])
df['Gender'] = label_encoder.fit_transform(df['Gender'])

In [60]:
# Define features (X) and target (y)
X = df.drop('Clicked on Ad', axis=1)
y = df['Clicked on Ad']

In [62]:
from sklearn.preprocessing import StandardScaler

# Standardize numerical features
scaler = StandardScaler()
X[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']] = scaler.fit_transform(X[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']])

In [76]:
# Separate numerical and categorical features
numerical_features = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']
categorical_features = ['City', 'Country', 'Gender']

X_numerical = X[numerical_features]
X_categorical = X[categorical_features]

In [78]:
from sklearn.preprocessing import StandardScaler

# Standardize numerical features
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical)

In [82]:
from sklearn.preprocessing import MinMaxScaler

# Scale categorical features using MinMaxScaler (to avoid negative values)
minmax_scaler = MinMaxScaler()
X_categorical_scaled = minmax_scaler.fit_transform(X_categorical)

In [84]:
import numpy as np

# Combine numerical and categorical features
X_combined = np.hstack((X_numerical_scaled, X_categorical_scaled))

In [106]:
from sklearn.model_selection import train_test_split

# Split the combined dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=42)

In [108]:
# Separate categorical features from the combined dataset
X_train_categorical = X_train[:, len(numerical_features):]  # Categorical features are after numerical features
X_test_categorical = X_test[:, len(numerical_features):]

In [110]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train_categorical, y_train)  # Use only categorical features for MultinomialNB
y_pred_mnb = mnb.predict(X_test_categorical)

# Evaluate Multinomial Naive Bayes
print("Multinomial Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_mnb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_mnb))
print("Classification Report:\n", classification_report(y_test, y_pred_mnb))

Multinomial Naive Bayes Accuracy: 0.49498327759197325
Confusion Matrix:
 [[ 52 101]
 [ 50  96]]
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.34      0.41       153
           1       0.49      0.66      0.56       146

    accuracy                           0.49       299
   macro avg       0.50      0.50      0.48       299
weighted avg       0.50      0.49      0.48       299



In [112]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

# Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

# Evaluate Gaussian Naive Bayes
print("Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_gnb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gnb))
print("Classification Report:\n", classification_report(y_test, y_pred_gnb))

# Bernoulli Naive Bayes
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)

# Evaluate Bernoulli Naive Bayes
print("Bernoulli Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_bnb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bnb))
print("Classification Report:\n", classification_report(y_test, y_pred_bnb))

Gaussian Naive Bayes Accuracy: 0.9698996655518395
Confusion Matrix:
 [[148   5]
 [  4 142]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       153
           1       0.97      0.97      0.97       146

    accuracy                           0.97       299
   macro avg       0.97      0.97      0.97       299
weighted avg       0.97      0.97      0.97       299

Bernoulli Naive Bayes Accuracy: 0.9264214046822743
Confusion Matrix:
 [[144   9]
 [ 13 133]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       153
           1       0.94      0.91      0.92       146

    accuracy                           0.93       299
   macro avg       0.93      0.93      0.93       299
weighted avg       0.93      0.93      0.93       299



In [114]:
from sklearn.ensemble import VotingClassifier

# Ensemble the models using VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('gnb', gnb),
    ('mnb', mnb),
    ('bnb', bnb)
], voting='soft')

ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluate the Ensemble Model
print("Ensemble Model Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ensemble))
print("Classification Report:\n", classification_report(y_test, y_pred_ensemble))

ValueError: Negative values in data passed to MultinomialNB (input X)

In [56]:
# Bernoulli Naive Bayes
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)

# Evaluate Bernoulli Naive Bayes
print("Bernoulli Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_bnb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bnb))
print("Classification Report:\n", classification_report(y_test, y_pred_bnb))

Bernoulli Naive Bayes Accuracy: 0.9264214046822743
Confusion Matrix:
 [[144   9]
 [ 13 133]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       153
           1       0.94      0.91      0.92       146

    accuracy                           0.93       299
   macro avg       0.93      0.93      0.93       299
weighted avg       0.93      0.93      0.93       299



In [104]:
from sklearn.ensemble import VotingClassifier

# Ensemble the models using VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('gnb', gnb),
    ('mnb', mnb),
    ('bnb', bnb)
], voting='soft')

ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluate the Ensemble Model
print("Ensemble Model Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ensemble))
print("Classification Report:\n", classification_report(y_test, y_pred_ensemble))

ValueError: Negative values in data passed to MultinomialNB (input X)

In [74]:
# Get probabilities from each model
probs_gnb = gnb.predict_proba(X_test)
probs_mnb = mnb.predict_proba(X_test)
probs_bnb = bnb.predict_proba(X_test)

# Combine probabilities (e.g., average)
combined_probs = (probs_gnb + probs_mnb + probs_bnb) / 3

# Predict the class with the highest combined probability
y_pred_combined = np.argmax(combined_probs, axis=1)

# Evaluate the combined model
print("Combined Model Accuracy:", accuracy_score(y_test, y_pred_combined))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_combined))
print("Classification Report:\n", classification_report(y_test, y_pred_combined))

AttributeError: 'MultinomialNB' object has no attribute 'feature_log_prob_'