In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [14]:
class NaiveBayesClassifier:
    def __init__(self):
        self.class_probabilities = {}
        self.feature_probabilities = {}
    
    def fit(self, X_train, y_train):
        # Calculate class probabilities
        self.calculate_class_probabilities(y_train)
        
        # Calculate feature probabilities for each class
        self.calculate_feature_probabilities(X_train, y_train)
    
    def predict(self, X_test):
        predictions = []
        for sample in X_test:
            prediction = self.predict_sample(sample)
            predictions.append(prediction)
        return predictions
    
    def predict_sample(self, sample):
        best_class = None
        best_probability = -1
        
        for class_label, class_probability in self.class_probabilities.items():
            feature_probabilities = self.feature_probabilities[class_label]
            probability = class_probability
            
            for i in range(len(sample)):
                feature_value = sample[i]
                if feature_value in feature_probabilities[i]:
                    probability *= feature_probabilities[i][feature_value]
            
            if probability > best_probability:
                best_class = class_label
                best_probability = probability
        
        return best_class
    
    def calculate_class_probabilities(self, y_train):
        total_samples = len(y_train)
        class_counts = {}
        
        for class_label in y_train:
            if class_label in class_counts:
                class_counts[class_label] += 1
            else:
                class_counts[class_label] = 1
        
        for class_label, count in class_counts.items():
            self.class_probabilities[class_label] = count / total_samples
    
    def calculate_feature_probabilities(self, X_train, y_train):
        num_features = len(X_train[0])
        class_counts = {}
        self.feature_probabilities = {}
        
        for i in range(len(X_train)):
            sample = X_train[i]
            class_label = y_train[i]
            
            if class_label not in class_counts:
                class_counts[class_label] = 1
                self.feature_probabilities[class_label] = [{} for _ in range(num_features)]
            else:
                class_counts[class_label] += 1
            
            feature_probabilities = self.feature_probabilities[class_label]
            
            for j in range(num_features):
                feature_value = sample[j]
                if feature_value in feature_probabilities[j]:
                    feature_probabilities[j][feature_value] += 1
                else:
                    feature_probabilities[j][feature_value] = 1
        
        for class_label, count in class_counts.items():
            feature_probabilities = self.feature_probabilities[class_label]
            for j in range(num_features):
                total_count = count + len(feature_probabilities[j])
                for feature_value in feature_probabilities[j]:
                    feature_probabilities[j][feature_value] /= total_count


In [15]:
# Load the dataset
df = pd.read_csv('Salary_Data.csv')
df = df.dropna()

In [16]:
# Split into features (X) and target variable (y) for classification
salary_threshold = data_encoded["Salary"].mean()
data_encoded["Salary_Class"] = np.where(data_encoded["Salary"] >= salary_threshold, "High", "Low")
X_cls = data_encoded.drop(["Salary", "Salary_Class"], axis=1)
y_cls = data_encoded["Salary_Class"]

In [17]:
# Convert categorical features to numerical labels
X_nb = X_cls.values
label_encoders = {}
for col in range(X_nb.shape[1]):
    unique_values = np.unique(X_nb[:, col])
    label_encoders[col] = {value: i for i, value in enumerate(unique_values)}
    X_nb[:, col] = np.array([label_encoders[col][value] for value in X_nb[:, col]])

In [18]:
# Split the dataset into training and test sets
X_nb_train, X_nb_test, y_nb_train, y_nb_test = train_test_split(X_nb, y_cls.values, test_size=0.2, random_state=42)

In [19]:
# Create and train the Naive Bayes classifier
nb_model = NaiveBayesClassifier()
nb_model.fit(X_nb_train, y_nb_train)

In [20]:
# Make predictions on the test set
nb_predictions = nb_model.predict(X_nb_test)

In [21]:
# Evaluate the model
accuracy = np.mean(nb_predictions == y_nb_test)
print("Accuracy:", accuracy)

Accuracy: 0.7104477611940299
