In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load dataset from CSV file
def load_dataset(filename):
    df = pd.read_csv(filename)
    return df

# Train and evaluate Naïve Bayes classifier
def naive_bayes_classifier(filename):
    # Load dataset
    df = load_dataset(filename)
    
    # Encode categorical columns
    label_encoders = {}
    for column in df.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
    
    # Assuming the last column is the target (label) column
    X = df.iloc[:, :-1]  # Features
    y = df.iloc[:, -1]   # Target
    
    # Split into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the Naïve Bayes classifier
    model = GaussianNB()
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy of Naïve Bayes Classifier: {accuracy * 100:.2f}%')

# Example usage (replace 'dataset.csv' with actual filename)
filename = 'modified.csv'
naive_bayes_classifier(filename)

Accuracy of Naïve Bayes Classifier: 64.12%


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load dataset from CSV file
def load_dataset(filename):
    df = pd.read_csv(filename)
    return df

# Train and evaluate Naïve Bayes classifier
def naive_bayes_classifier(filename):
    # Load dataset
    df = load_dataset(filename)
    
    # Encode categorical columns
    label_encoders = {}
    for column in df.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
    
    # Assuming the last column is the target (label) column
    X = df.iloc[:, :-1]  # Features
    y = df.iloc[:, -1]   # Target
    
    # Split into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the Naïve Bayes classifier
    model = GaussianNB()
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy of Naïve Bayes Classifier: {accuracy * 100:.2f}%')
    
    # Confusion Matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # Classification Report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Display Predicted vs Actual Labels for first 10 samples
    print("\nSample Predictions:")
    for i in range(min(10, len(y_test))):
        print(f'Actual: {y_test.iloc[i]}, Predicted: {y_pred[i]}, Probabilities: {np.round(y_prob[i], 2)}')

# Example usage (replace 'dataset.csv' with actual filename)
filename = 'modified.csv'
naive_bayes_classifier(filename)


Accuracy of Naïve Bayes Classifier: 64.12%

Confusion Matrix:
[[1119    6]
 [ 623    5]]

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.99      0.78      1125
           1       0.45      0.01      0.02       628

    accuracy                           0.64      1753
   macro avg       0.55      0.50      0.40      1753
weighted avg       0.58      0.64      0.51      1753


Sample Predictions:
Actual: 0, Predicted: 0, Probabilities: [0.6 0.4]
Actual: 1, Predicted: 0, Probabilities: [0.58 0.42]
Actual: 1, Predicted: 0, Probabilities: [0.67 0.33]
Actual: 1, Predicted: 0, Probabilities: [0.67 0.33]
Actual: 0, Predicted: 0, Probabilities: [0.68 0.32]
Actual: 1, Predicted: 0, Probabilities: [0.68 0.32]
Actual: 1, Predicted: 0, Probabilities: [0.63 0.37]
Actual: 0, Predicted: 0, Probabilities: [0.6 0.4]
Actual: 0, Predicted: 0, Probabilities: [0.67 0.33]
Actual: 1, Predicted: 0, Probabilities: [0.72 0.28]


In [4]:
# Naïve Bayes Classifier without any libraries

def load_dataset(filename):
    dataset = []
    with open(filename, 'r') as file:
        lines = file.readlines()
        for line in lines[1:]:  # Skip header
            dataset.append(line.strip().split(','))
    return dataset

# Convert categorical to numerical
def encode_categorical(dataset):
    column_values = {}
    for col in range(len(dataset[0])):
        unique_values = list(set(row[col] for row in dataset))
        if any(not value.replace('.', '', 1).isdigit() for value in unique_values):
            column_values[col] = {value: i for i, value in enumerate(unique_values)}
    for row in dataset:
        for col, mapping in column_values.items():
            row[col] = mapping[row[col]]
    return dataset

# Split dataset into train and test sets manually
def split_dataset(dataset, test_size=0.2):
    for i in range(len(dataset)):
        j = i + (len(dataset) - i) % (i + 1)  # Simple shuffling
        dataset[i], dataset[j % len(dataset)] = dataset[j % len(dataset)], dataset[i]
    split_index = int(len(dataset) * (1 - test_size))
    return dataset[:split_index], dataset[split_index:]

# Separate data by class
def separate_by_class(dataset):
    separated = {}
    for row in dataset:
        class_value = row[-1]
        if class_value not in separated:
            separated[class_value] = []
        separated[class_value].append([float(x) for x in row[:-1]])
    return separated

# Compute mean and variance manually
def mean(numbers):
    total = sum(numbers)
    return total / len(numbers)

def variance(numbers, mean_value):
    return sum((x - mean_value) ** 2 for x in numbers) / len(numbers)

# Summarize dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), variance(column, mean(column))) for column in zip(*dataset)]
    return summaries

# Calculate probability manually
def calculate_probability(x, mean, var):
    pi = 3.141592653589793
    exponent = 2.718281828459045 ** (-((x - mean) ** 2 / (2 * var)))
    return (1 / ((2 * pi * var) ** 0.5)) * exponent

# Compute class probabilities
def calculate_class_probabilities(summaries, row):
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = 1
        for i in range(len(class_summaries)):
            mean_val, var_val = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean_val, var_val)
    return probabilities

# Make a prediction
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    return max(probabilities, key=probabilities.get)

# Evaluate model
def evaluate_model(filename):
    dataset = load_dataset(filename)
    dataset = encode_categorical(dataset)
    dataset = [[float(value) for value in row] for row in dataset]
    training_set, test_set = split_dataset(dataset)
    separated = separate_by_class(training_set)
    summaries = {class_value: summarize_dataset(rows) for class_value, rows in separated.items()}
    
    correct = 0
    for row in test_set:
        actual = row[-1]
        predicted = predict(summaries, row[:-1])
        if actual == predicted:
            correct += 1
    
    accuracy = (correct / len(test_set)) * 100
    print(f'Accuracy: {accuracy:.2f}%')

# Run model
filename = 'modified.csv'
evaluate_model(filename)


Accuracy: 54.14%
