In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Example documents and labels
documents = [
    "I love this movie",
    "This is a great movie",
    "This movie is terrible",
    "I hate this movie"
]
labels = ["positive", "positive", "negative", "negative"]

In [3]:
# Create a CountVectorizer to convert text documents to a matrix of token counts
vectorizer = CountVectorizer()

# Fit the vectorizer on the training documents and transform them to feature vectors
features = vectorizer.fit_transform(documents)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Create a Multinomial Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train, y_train)

In [4]:
# Predict the labels for the test set
predictions = classifier.predict(X_test)

# Print the predicted labels and compare them to the actual labels
for predicted_label, actual_label in zip(predictions, y_test):
    print("Predicted label:", predicted_label)
    print("Actual label:", actual_label)
    print()

Predicted label: negative
Actual label: positive



In [5]:
# Predict the label for a new document
new_document = ["I like this movie"]
new_document_features = vectorizer.transform(new_document)
predicted_label = classifier.predict(new_document_features)
print("Predicted label:", predicted_label[0])

Predicted label: negative


In [11]:
import math

class NaiveBayesClassifier:
    def __init__(self):
        self.vocab = set()
        self.class_counts = {}
        self.word_counts = {}
        self.class_probabilities = {}

    def train(self, documents, labels):
        # Count the number of documents in each class
        for label in labels:
            if label in self.class_counts:
                self.class_counts[label] += 1
            else:
                self.class_counts[label] = 1

        # Count the number of occurrences of each word in each class
        for document, label in zip(documents, labels):
            words = document.split()
            for word in words:
                self.vocab.add(word)
                if label in self.word_counts:
                    if word in self.word_counts[label]:
                        self.word_counts[label][word] += 1
                    else:
                        self.word_counts[label][word] = 1
                else:
                    self.word_counts[label] = {word: 1}

        # Calculate the class probabilities
        total_documents = len(documents)
        for label, count in self.class_counts.items():
            self.class_probabilities[label] = count / total_documents

    def predict(self, document):
        words = document.split()
        scores = {}

        for label in self.class_counts.keys():
            score = math.log(self.class_probabilities[label])

            for word in words:
                if word in self.vocab:
                    word_count = self.word_counts[label].get(word, 0) + 1
                    word_total = sum(self.word_counts[label].values()) + len(self.vocab)
                    word_probability = word_count / word_total
                    score += math.log(word_probability)

            scores[label] = score

        # Find the label with the highest score
        predicted_label = max(scores, key=scores.get)
        return predicted_label

# Example usage
documents = [
    "I love this movie",
    "This is a great movie",
    "This movie is terrible",
    "I hate this movie"
]
labels = ["positive", "positive", "negative", "negative"]

# Create and train the classifier
classifier = NaiveBayesClassifier()
classifier.train(documents, labels)

# Predict the label for a new document
new_document = "I like this movie"
predicted_label = classifier.predict(new_document)
print("Predicted label:", predicted_label)


Predicted label: negative


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Example documents and labels
documents = [
    "I love this movie",
    "This is a great movie",
    "This movie is terrible",
    "I hate this movie"
]
labels = ["positive", "positive", "negative", "negative"]

# Create a CountVectorizer to convert text documents to a matrix of token counts
vectorizer = CountVectorizer()

# Fit the vectorizer on the documents and transform them to feature vectors
features = vectorizer.fit_transform(documents)

# Create a Multinomial Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(features, labels)

# Predict the labels for the training set
train_predictions = classifier.predict(features)

# Calculate the accuracy of the classifier
accuracy = (labels == train_predictions).mean()
print("Accuracy:", accuracy)


Accuracy: 1.0


In [13]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

# Train the Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = gnb.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Gaussian Naive Bayes model accuracy: {:.2f}%".format(accuracy * 100))


Gaussian Naive Bayes model accuracy: 94.30%


In [17]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import numpy as np

In [18]:

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names

In [19]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

# Train the Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [20]:

# Make predictions on the testing set
y_pred = gnb.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Gaussian Naive Bayes model accuracy: {:.2f}%".format(accuracy * 100))

Gaussian Naive Bayes model accuracy: 94.30%


In [21]:

# Print classification report
print("\nClassification Report:")
report = metrics.classification_report(y_test, y_pred, target_names=target_names)
print(report)


Classification Report:
              precision    recall  f1-score   support

   malignant       0.91      0.93      0.92        80
      benign       0.96      0.95      0.96       148

    accuracy                           0.94       228
   macro avg       0.94      0.94      0.94       228
weighted avg       0.94      0.94      0.94       228



In [22]:
# Print confusion matrix
print("\nConfusion Matrix:")
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)


Confusion Matrix:
[[ 74   6]
 [  7 141]]


In [24]:
# Calculate class probabilities for a sample
sample = X_test[0]
class_probabilities = gnb.predict_proba([sample])[0]
class_labels = ["Negative", "Positive"]
print("\nClass Probabilities for Sample:")
for label, prob in zip(class_labels, class_probabilities):
    print("{}: {:.2f}%".format(label, prob * 100))


Class Probabilities for Sample:
Negative: 100.00%
Positive: 0.00%


In [25]:
# Analyze feature importance
feature_importances = gnb.theta_[1]
sorted_indices = np.argsort(feature_importances)[::-1]
print("\nFeature Importance:")
for i in sorted_indices:
    print("{}: {:.2f}".format(feature_names[i], feature_importances[i]))


Feature Importance:
worst area: 547.28
mean area: 454.20
worst perimeter: 86.02
mean perimeter: 77.33
worst texture: 23.35
area error: 21.23
mean texture: 17.93
worst radius: 13.24
mean radius: 12.03
perimeter error: 2.02
texture error: 1.24
radius error: 0.29
worst symmetry: 0.27
worst compactness: 0.18
mean symmetry: 0.17
worst concavity: 0.16
worst smoothness: 0.12
mean smoothness: 0.09
mean compactness: 0.08
worst fractal dimension: 0.08
worst concave points: 0.07
mean fractal dimension: 0.06
mean concavity: 0.05
concavity error: 0.03
mean concave points: 0.03
compactness error: 0.02
symmetry error: 0.02
concave points error: 0.01
smoothness error: 0.01
fractal dimension error: 0.00
