In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# 1. Implement in python program of the following problems using Bayes Theorem.
# a) Of the students in the college, 60% of the students reside in the hostel and 40% of the students are day
# scholars. Previous year results report that 30% of all students who stay in the hostel scored A Grade and 20%
# of day scholars scored A grade. At the end of the year, one student is chosen at random and found that he/she
# has an A grade. What is the probability that the student is a hosteler?
# b) Suppose you're testing for a rare disease, and you have the following information:
#  The disease has a prevalence of 0.01 (1% of the population has the disease).
#  The test is not perfect:
#  The test correctly identifies the disease (true positive) 99% of the time (sensitivity).
#  The test incorrectly indicates the disease (false positive) 2% of the time (1 - specificity).
# Calculate the probability of having the disease given a positive test result using Bayes' theorem.

In [2]:
prob_hosteler = 0.60
prob_day_scholar = 0.40
prob_A_given_hosteler = 0.30
prob_A_given_day_scholar = 0.20

prob_A = prob_A_given_hosteler * prob_hosteler + prob_A_given_day_scholar * prob_day_scholar

prob_hosteler_given_A = (prob_A_given_hosteler * prob_hosteler) / prob_A

print(f"Probability: {prob_hosteler_given_A:.4f}")

Probability: 0.6923


In [3]:
prob_disease = 0.01
prob_no_disease = 0.99
prob_positive_given_disease = 0.99
prob_positive_given_no_disease = 0.02

prob_positive = prob_positive_given_disease * prob_disease + prob_positive_given_no_disease * prob_no_disease

prob_disease_given_positive = (prob_positive_given_disease * prob_disease) / prob_positive

print(f"Probability: {prob_disease_given_positive:.4f}")

Probability: 0.3333


In [None]:
# 2. Develop a function python code for Naïve Bayes classifier from scratch without using scikit-learn library,
# to predict whether the buyer should buy computer or not. Consider a following sample training dataset stored
# in a CSV file containing information about following buyer conditions (such as “<=30,” “medium,” “Yes,”
# and “fair”) and whether the player played golf (“Yes” or “No”)

In [24]:
df1=pd.read_csv("data1.csv")
df1

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31…40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31…40,low,yes,excellent,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


In [23]:
class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}
        self.feature_probs = {}
        self.classes = None

    def fit(self, X, y):
        self.classes = y.unique()
        self.class_probs = y.value_counts(normalize=True).to_dict()

        self.feature_probs = {}
        for feature in X.columns:
            self.feature_probs[feature] = {}
            for cls in self.classes:
                subset = X[y == cls]
                feature_prob = subset.groupby(feature).size() / subset.shape[0]
                self.feature_probs[feature][cls] = feature_prob.to_dict()

    def predict(self, X):
        predictions = []
        for _, row in X.iterrows():
            class_probs = {}
            for cls in self.classes:
                prob = self.class_probs[cls]
                for feature in X.columns:
                    value = row[feature]
                    feature_prob = self.feature_probs.get(feature, {}).get(cls, {}).get(value, 0)
                    prob *= feature_prob
                print(prob)
                class_probs[cls] = prob
            predicted_class = max(class_probs, key=class_probs.get)
            predictions.append(predicted_class)
        return predictions

data = pd.read_csv('data1.csv')

X = data.drop('buys_computer', axis=1)
y = data['buys_computer']

nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X, y)

test_data = pd.DataFrame([{
    'age': '<=30',
    'income': 'medium',
    'student': 'yes',
    'credit_rating': 'fair'
}])

predictions = nb_classifier.predict(test_data)
print(f'Prediction for the test data: {predictions[0]}')

0.006857142857142858
0.02821869488536155
Prediction for the test data: yes


In [25]:
class NaiveBayesTextClassifier:
    def __init__(self):
        self.class_probs = {}
        self.word_probs = {}
        self.vocabulary = set()
        self.classes = None

    def fit(self, X, y):
        self.classes = y.unique()

        class_counts = y.value_counts()
        total_count = len(y)
        self.class_probs = {cls: count / total_count for cls, count in class_counts.items()}

        self.word_probs = {cls: {} for cls in self.classes}
        self.vocabulary = set()

        for cls in self.classes:
            subset = X[y == cls]
            words = ' '.join(subset).split()
            self.vocabulary.update(words)
            word_counts = pd.Series(words).value_counts()
            total_words = len(words)
            for word in self.vocabulary:
                # Applying Laplace smoothing
                self.word_probs[cls][word] = (word_counts.get(word, 0) + 1) / (total_words + len(self.vocabulary))

    def predict(self, X):
        predictions = []
        for text in X:
            words = text.split()
            class_probs = {}
            for cls in self.classes:
                prob = self.class_probs[cls]
                for word in words:
                    prob *= self.word_probs[cls].get(word, 1 / (sum(self.word_probs[cls].values()) + len(self.vocabulary)))
                class_probs[cls] = prob
            predicted_class = max(class_probs, key=class_probs.get)
            predictions.append(predicted_class)
        return predictions

df = pd.read_csv("data2.csv")

nb_classifier = NaiveBayesTextClassifier()
nb_classifier.fit(df['Text'], df['Tag'])

y_true = df['Tag']
y_pred = nb_classifier.predict(df['Text'])


test_sentence = ["A very close game"]
predicted_tag = nb_classifier.predict(test_sentence)
print(f'The sentence "A very close game" is classified as: {predicted_tag[0]}')

The sentence "A very close game" is classified as: Sports
