In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
# cross_val_score is imported so the model can optionally be cross-validated
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

In [45]:
# Load the dataset from a plain CSV file; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv("Social_Network_Ads.csv")

In [46]:
# Exploratory Data Analysis: start with a look at the leading rows
head_rows = df.head()
print("First 5 rows of the dataset:\n", head_rows)

First 5 rows of the dataset:
     User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0


In [47]:
# Show the last 10 rows of the dataset
df.tail(10)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
390,15807837,Male,48,33000,1
391,15592570,Male,47,23000,1
392,15748589,Female,45,45000,1
393,15635893,Male,60,42000,1
394,15757632,Female,39,59000,0
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0
399,15594041,Female,49,36000,1


In [48]:
# Show 10 randomly chosen rows. A fixed seed keeps the displayed sample
# identical across kernel restarts, so the notebook output is reproducible.
df.sample(10, random_state=42)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
396,15706071,Male,51,23000,1
345,15716781,Female,41,63000,0
383,15707634,Male,49,28000,1
261,15680587,Male,36,144000,1
323,15619465,Female,48,30000,1
27,15633531,Female,47,30000,1
355,15606472,Male,60,34000,1
133,15638963,Female,21,68000,0
93,15699284,Female,29,28000,0
286,15631070,Female,37,62000,0


In [49]:
# Dimensions of the dataset as (rows, columns)
df.shape

(400, 5)

In [50]:
# Data type of each column
df.dtypes

Unnamed: 0,0
User ID,int64
Gender,object
Age,int64
EstimatedSalary,int64
Purchased,int64


In [51]:
# Per-column non-null counts and dtypes, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [52]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
summary_stats = df.describe()
print("\nDataset statistics:\n", summary_stats)


Dataset statistics:
             User ID         Age  EstimatedSalary   Purchased
count  4.000000e+02  400.000000       400.000000  400.000000
mean   1.569154e+07   37.655000     69742.500000    0.357500
std    7.165832e+04   10.482877     34096.960282    0.479864
min    1.556669e+07   18.000000     15000.000000    0.000000
25%    1.562676e+07   29.750000     43000.000000    0.000000
50%    1.569434e+07   37.000000     70000.000000    0.000000
75%    1.575036e+07   46.000000     88000.000000    1.000000
max    1.581524e+07   60.000000    150000.000000    1.000000


In [53]:
# Data Cleaning
# drop_duplicates() returns a new DataFrame rather than modifying df, so the
# result is assigned back; otherwise the de-duplicated frame is only displayed
# and then discarded. The index is rebuilt so it stays contiguous if any rows
# are removed.
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [54]:
# Count missing values in each column
df.isnull().sum()

Unnamed: 0,0
User ID,0
Gender,0
Age,0
EstimatedSalary,0
Purchased,0


In [55]:
# Same missing-value count via isna(); in pandas, isnull() is an alias of
# isna(), so this repeats the previous check.
df.isna().sum()

Unnamed: 0,0
User ID,0
Gender,0
Age,0
EstimatedSalary,0
Purchased,0


In [56]:
# Count the rows whose EstimatedSalary is exactly zero
zero_salary_mask = df['EstimatedSalary'] == 0
zero_salary_rows = df[zero_salary_mask]
print("No of zero  values in Estimated Salary", zero_salary_rows.shape[0])

No of zero  values in Estimated Salary 0


In [57]:
# Count the rows whose Purchased value is zero
zero_purchased_rows = df[df['Purchased'] == 0]
print("No of zero  values in Purchased", zero_purchased_rows.shape[0])

No of zero  values in Purchased 257


In [58]:
# Replace zero values with mean
# (left disabled: the zero-value check on 'EstimatedSalary' above reported 0 rows)
# df['EstimatedSalary'].replace(0,df['EstimatedSalary'].mean(),inplace=True)

In [59]:
# Turn the categorical Gender column into integer codes.
# LabelEncoder sorts the class labels, so Female -> 0 and Male -> 1.
label_encoder = LabelEncoder()
gender_codes = label_encoder.fit_transform(df['Gender'])
df['Gender'] = gender_codes

In [60]:
# Split the frame into the label column and the feature matrix
target_name = 'Purchased'
target = df[target_name]
# 'User ID' is dropped together with the label, as it is likely irrelevant
non_feature_columns = [target_name, 'User ID']
data = df.drop(columns=non_feature_columns)

In [61]:
# Scale numerical features.
# The scaler is fitted on the training rows only, so the test rows do not leak
# into the mean/std used for standardisation. The actual split happens in the
# next cell, so the training row positions are derived here with the SAME
# test_size and random_state: train_test_split shuffles row positions based
# only on the number of rows and the seed, so both calls select the same rows.
# Keep these two arguments in sync with the train-test split cell.
numeric_cols = ['Age', 'EstimatedSalary']
train_pos, _ = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

scaler = StandardScaler()
# Raw values are read from df (data was derived from df and has the same rows
# in the same order), which keeps this cell idempotent when it is re-run.
scaler.fit(df.iloc[train_pos][numeric_cols])
data[numeric_cols] = scaler.transform(df[numeric_cols])

In [62]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [63]:
# Fit a Gaussian Naive Bayes classifier on the training split
nb = GaussianNB()
nb.fit(X=X_train, y=y_train)

In [64]:
# Predict labels for the held-out rows and check how many predictions were made
nb_pred = nb.predict(X_test)
np.shape(nb_pred)

(80,)

In [65]:
# Model evaluation: mean accuracy on the training and test splits
train_accuracy = nb.score(X_train, y_train)
test_accuracy = nb.score(X_test, y_test)

print("Model Training Complete")
print("Train set accuracy:", train_accuracy)
print("Test set accuracy:", test_accuracy)

Model Training Complete
Train set accuracy: 0.8625
Test set accuracy: 0.9375


In [66]:
# Confusion matrix and per-class precision / recall / F1 on the test split
conf_matrix = confusion_matrix(y_test, nb_pred)
report = classification_report(y_test, nb_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)


Confusion Matrix:
 [[50  2]
 [ 3 25]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95        52
           1       0.93      0.89      0.91        28

    accuracy                           0.94        80
   macro avg       0.93      0.93      0.93        80
weighted avg       0.94      0.94      0.94        80

