In [14]:
import numpy as np
import pandas as pd

# Create synthetic Social Network Ads dataset since CSV file is not available
print("Creating synthetic Social Network Ads dataset...")

# Set random seed for reproducibility
np.random.seed(42)

# Create synthetic data
n_samples = 400

# Generate User IDs
user_ids = np.random.randint(15000000, 16000000, n_samples)

# Generate Gender
genders = np.random.choice(['Male', 'Female'], n_samples)

# Generate Age (18-65)
ages = np.random.randint(18, 65, n_samples)

# Generate Estimated Salary (15000-150000)
salaries = np.random.randint(15000, 150000, n_samples)

# Generate Purchase decision based on age and salary with some logic
# Higher age and higher salary tend to purchase more
purchase_probability = (ages - 18) / 47 * 0.3 + (salaries - 15000) / 135000 * 0.7
purchase_probability = np.clip(purchase_probability, 0.1, 0.9)  # Keep between 0.1 and 0.9

# Add some randomness
purchased = np.random.binomial(1, purchase_probability, n_samples)

# Create DataFrame
dataset = pd.DataFrame({
    'User ID': user_ids,
    'Gender': genders,
    'Age': ages,
    'EstimatedSalary': salaries,
    'Purchased': purchased
})

print(f"Dataset created successfully with {len(dataset)} samples")
print(f"Purchase rate: {dataset['Purchased'].mean():.2%}")
print("\nDataset preview:")
dataset

Creating synthetic Social Network Ads dataset...
Dataset created successfully with 400 samples
Purchase rate: 50.00%

Dataset preview:


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15121958,Male,58,15846,0
1,15671155,Female,36,25954,0
2,15131932,Female,29,94239,0
3,15365838,Female,26,61514,0
4,15259178,Female,24,103236,0
...,...,...,...,...,...
395,15912495,Male,32,98691,0
396,15657917,Male,23,73606,0
397,15824792,Female,47,117665,1
398,15737378,Female,55,120373,1


In [15]:
#Identifying dependent and independent variables
X=dataset.iloc[:,[2,3]].values # Independent variables, Extract all rows from 2 and 3 columns
y=dataset.iloc[:,4].values #dependent variable, Extract all rows from 4 column
print(X)

[[    58  15846]
 [    36  25954]
 [    29  94239]
 [    26  61514]
 [    24 103236]
 [    45 112579]
 [    31  57101]
 [    48  79323]
 [    36 106922]
 [    64 142534]
 [    33  82584]
 [    22 119537]
 [    52  93477]
 [    29  57533]
 [    42  16828]
 [    38  16917]
 [    53  35618]
 [    40  47254]
 [    33 120292]
 [    56  62362]
 [    62  35128]
 [    59 138781]
 [    56  80545]
 [    31  97180]
 [    48 102958]
 [    22 148291]
 [    52  54809]
 [    40 131748]
 [    46  20287]
 [    60  23512]
 [    28 147414]
 [    35 123872]
 [    64 108070]
 [    29  94767]
 [    26 112829]
 [    27  57078]
 [    61 121288]
 [    34  98524]
 [    55  27219]
 [    24  15235]
 [    63  57929]
 [    30 142309]
 [    57  82444]
 [    59 116834]
 [    26  99555]
 [    44  88698]
 [    19 139450]
 [    22  58925]
 [    46 102388]
 [    54  23007]
 [    55  73596]
 [    36 136101]
 [    25 103861]
 [    62 101740]
 [    18  30524]
 [    39 146149]
 [    34 140501]
 [    24  35611]
 [    42  2906

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test=train_test_split(X,y,test_size=1/3,random_state=42)

In [17]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)  # Use transform, not fit_transform for test data
print("Feature scaling completed!")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print("\nFirst 5 rows of scaled training data:")
print(X_train[:5])

Feature scaling completed!
Training set shape: (266, 2)
Test set shape: (134, 2)

First 5 rows of scaled training data:
[[ 1.02029749 -0.32157485]
 [-1.47769128 -1.04040041]
 [ 0.21212465 -1.40664154]
 [ 0.06518414  1.56380493]
 [-0.7429887   0.1496695 ]]


In [18]:
from sklearn.naive_bayes import GaussianNB
classifier=GaussianNB() # create an instance
classifier.fit(X_train,y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [19]:
# Model testing/usage
y_pred=classifier.predict(X_test)

In [20]:
y_pred

array([1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1], dtype=int32)

In [21]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("\nComparison of actual and predicted labels:")
print(results.head(50))


Comparison of actual and predicted labels:
    Actual  Predicted
0        1          1
1        0          0
2        0          1
3        1          1
4        0          0
5        0          1
6        0          0
7        1          0
8        0          0
9        1          1
10       1          1
11       0          1
12       0          1
13       1          1
14       0          0
15       1          1
16       1          0
17       0          1
18       1          0
19       1          1
20       0          0
21       1          0
22       0          1
23       0          1
24       0          0
25       0          0
26       1          0
27       0          0
28       0          0
29       1          1
30       0          1
31       0          0
32       1          1
33       1          0
34       1          1
35       0          1
36       1          1
37       0          1
38       0          0
39       1          0
40       1          1
41       1          1
42       1

In [22]:
# Model Evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
cm=confusion_matrix(y_test,y_pred)

In [23]:
cm

array([[40, 24],
       [26, 44]])

In [24]:
print("Confusion Matrix:\n", cm)
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Precision:", metrics.precision_score(y_test,y_pred))
print("Recall:", metrics.recall_score(y_test,y_pred))
print("F1 score:", metrics.f1_score(y_test,y_pred))

Confusion Matrix:
 [[40 24]
 [26 44]]
Accuracy: 0.6268656716417911
Precision: 0.6470588235294118
Recall: 0.6285714285714286
F1 score: 0.6376811594202898


In [25]:
# Validate the model on Unseen data
print("Testing on new unseen data...")
test_new = np.array([[45, 89999]])
print(f"Original test sample: Age={test_new[0][0]}, Salary={test_new[0][1]}")

# Scale the test data using the same scaler
test_new_scaled = sc.transform(test_new)
print(f"Scaled test sample: {test_new_scaled[0]}")

# Make prediction
pred = classifier.predict(test_new_scaled)
pred_proba = classifier.predict_proba(test_new_scaled)

print(f"Prediction: {pred[0]} ({'Purchase' if pred[0] == 1 else 'No Purchase'})")
print(f"Prediction probabilities: No Purchase = {pred_proba[0][0]:.4f}, Purchase = {pred_proba[0][1]:.4f}")
print(f"Confidence: {max(pred_proba[0]):.4f}")

pred

Testing on new unseen data...
Original test sample: Age=45, Salary=89999
Scaled test sample: [0.28559491 0.17341154]
Prediction: 1 (Purchase)
Prediction probabilities: No Purchase = 0.4161, Purchase = 0.5839
Confidence: 0.5839


array([1], dtype=int32)

In [26]:
import numpy as np

# Assuming 'classifier' and 'sc' (scaler) are already defined and trained in the context

# Test multiple cases
print("Testing multiple unseen data samples...")
test_cases = [
    [25, 30000],    # Young, low salary
    [35, 75000],    # Mid-age, high salary
    [50, 40000],    # Older, medium salary
    [60, 120000],   # Old, very high salary
    [22, 25000],    # Very young, low salary
]

print("\nAge | Salary  | Prediction | Confidence | Decision")
print("-" * 55)

for age, salary in test_cases:
    test_sample = np.array([[age, salary]])
    test_scaled = sc.transform(test_sample)
    prediction = classifier.predict(test_scaled)[0]
    probabilities = classifier.predict_proba(test_scaled)[0]
    confidence = max(probabilities)
    decision = "Purchase" if prediction == 1 else "No Purchase"
    
    print(f"{age:2d}  | {salary:6d}  |     {prediction}      |   {confidence:.3f}    | {decision}")

print(f"\nModel Summary:")
print(f"- Dataset: Synthetic Social Network Ads (400 samples)")
print(f"- Features: Age, Estimated Salary")
print(f"- Algorithm: Gaussian Naive Bayes")
print(f"- Accuracy: 62.69%")
print(f"- The model predicts customer purchase behavior based on age and salary")

Testing multiple unseen data samples...

Age | Salary  | Prediction | Confidence | Decision
-------------------------------------------------------
25  |  30000  |     0      |   0.903    | No Purchase
35  |  75000  |     0      |   0.584    | No Purchase
50  |  40000  |     0      |   0.798    | No Purchase
60  | 120000  |     1      |   0.800    | Purchase
22  |  25000  |     0      |   0.925    | No Purchase

Model Summary:
- Dataset: Synthetic Social Network Ads (400 samples)
- Features: Age, Estimated Salary
- Algorithm: Gaussian Naive Bayes
- Accuracy: 62.69%
- The model predicts customer purchase behavior based on age and salary
