In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

## Q1

In [2]:
# Inputs: priors P(H), P(D) and the likelihoods of A under each.
P_H = 0.60
P_D = 0.40
P_A_given_H = 0.30
P_A_given_D = 0.20

# Joint terms P(A, H) and P(A, D); their sum is P(A) by the law of
# total probability.
P_A_and_H = P_A_given_H * P_H
P_A_and_D = P_A_given_D * P_D
P_A = P_A_and_H + P_A_and_D

# Bayes' theorem: P(H | A) = P(A, H) / P(A)
P_H_given_A = P_A_and_H / P_A

print(f"P(H | A): {P_H_given_A:.4f}")


P(H | A): 0.6923


In [3]:
# Inputs: prior P(D), its complement, and the probability of a positive
# test T^+ given D and given not-D.
P_D = 0.01
P_not_D = 1 - P_D
P_T_given_D = 0.99
P_T_given_not_D = 0.02

# Joint terms P(T^+, D) and P(T^+, not D); their sum is P(T^+) by the
# law of total probability.
P_T_and_D = P_T_given_D * P_D
P_T_and_not_D = P_T_given_not_D * P_not_D
P_T = P_T_and_D + P_T_and_not_D

# Bayes' theorem: P(D | T^+) = P(T^+, D) / P(T^+)
P_D_given_T = P_T_and_D / P_T

print(f"P(D | T^+): {P_D_given_T:.4f}")


P(D | T^+): 0.3333


## Q2

In [4]:
# Load dataset
def load_data(file_path):
    """Read a CSV into a DataFrame.

    Args:
        file_path: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame

# Encode categorical data
def encode_data(df):
    """Integer-encode every text-valued column of ``df``.

    Each distinct value of a text column is mapped to an integer code in
    order of first appearance (0, 1, 2, ...). Other columns are left as-is.

    The input frame is NOT modified; the encoding is applied to a copy so
    that re-running the calling cell always starts from the raw data.

    Args:
        df: DataFrame to encode.

    Returns:
        tuple: ``(encoded_df, encoders)`` where ``encoded_df`` is the encoded
        copy and ``encoders`` maps column name -> ``{original value: code}``
        for every column that was encoded.
    """
    encoded = df.copy()
    encoders = {}
    for column in encoded.columns:
        series = encoded[column]
        # Accept both legacy object columns and pandas' dedicated string
        # dtypes; comparing the dtype to 'object' alone misses the latter.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        mapping = {value: idx for idx, value in enumerate(series.unique())}
        encoders[column] = mapping
        encoded[column] = series.map(mapping)
    return encoded, encoders

# Calculate probabilities
def calculate_probabilities(df):
    """Estimate the tables of a categorical Naive Bayes model.

    The label column is ``'buys_computer'``; every other column is treated
    as a feature, wherever the label column sits in the frame.

    Args:
        df: DataFrame containing a ``'buys_computer'`` column plus features.

    Returns:
        tuple: ``(class_probs, conditional_probs)`` where

        * ``class_probs`` maps label -> relative frequency of that label;
        * ``conditional_probs[col][value][label]`` is the number of rows with
          ``col == value`` and that label, divided by the number of rows
          with that label. It is a nested ``defaultdict``, and no smoothing
          is applied here.
    """
    target = 'buys_computer'
    labels = df[target]
    class_probs = labels.value_counts(normalize=True).to_dict()
    total_counts = labels.value_counts().to_dict()

    conditional_probs = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    # Select features by name: slicing off the last column would silently
    # use the label as a feature whenever it is not the final column.
    for col in df.columns:
        if col == target:
            continue
        for value, label in zip(df[col], labels):
            conditional_probs[col][value][label] += 1

    # Turn joint counts into P(value | label).
    for value_table in conditional_probs.values():
        for label_counts in value_table.values():
            for label in label_counts:
                label_counts[label] /= total_counts.get(label, 1)

    return class_probs, conditional_probs

# Predict function
def predict(class_probs, conditional_probs, encoders, data_point):
    """Return the most probable label for one encoded data point.

    For each label the score is the class prior multiplied by the
    conditional probability of every feature value. A feature, value or
    label missing from ``conditional_probs`` contributes a factor of 1e-6.

    The scores are not normalised: dividing all of them by the same total
    cannot change which one is largest, and it fails with a division by
    zero when every score is 0.

    Args:
        class_probs: Mapping label -> prior probability.
        conditional_probs: Nested mapping ``[col][value][label]`` ->
            probability. Plain dicts and defaultdicts both work; the mapping
            is only read, never modified.
        encoders: Unused; kept so existing callers keep working.
        data_point: Mapping of feature name -> value (a pandas Series or a
            dict; anything with ``.items()``).

    Returns:
        The label with the highest score (the first one on ties).

    Raises:
        ValueError: If ``class_probs`` is empty.
    """
    if not class_probs:
        raise ValueError("class_probs is empty; cannot predict a label")

    scores = {}
    for label, prior in class_probs.items():
        score = prior
        for col, value in data_point.items():
            # Chained .get() keeps the lookup read-only: indexing a
            # defaultdict would insert an entry for every unseen column.
            score *= conditional_probs.get(col, {}).get(value, {}).get(label, 1e-6)
        scores[label] = score

    return max(scores, key=scores.get)

# Main execution
def main():
    """Fit the categorical Naive Bayes tables on comp.csv and classify two points.

    Loads 'comp.csv', integer-encodes its text columns, estimates the class
    and conditional probabilities, then prints the predicted (encoded) label
    for two hard-coded, already-encoded data points.
    """
    raw_df = load_data('comp.csv')
    df_encoded, encoders = encode_data(raw_df)
    class_probs, conditional_probs = calculate_probabilities(df_encoded)

    # NOTE(review): these integer codes are hard-coded. encode_data assigns
    # codes in order of first appearance in the file, so the intended
    # meanings below only hold for the current row order of comp.csv -
    # confirm against the returned `encoders` if the data changes.
    test_points = [
        # intended: '<=30', 'medium', 'no', 'fair'
        {'age': 0, 'income': 1, 'student': 0, 'credit_rating': 0},
        # intended: '>40', 'low', 'yes', 'excellent'
        {'age': 2, 'income': 2, 'student': 1, 'credit_rating': 1},
    ]
    test_data = pd.DataFrame(test_points)

    # The printed prediction is the encoded label, not the original text.
    for i, row in test_data.iterrows():
        result = predict(class_probs, conditional_probs, encoders, row)
        print(f"Data point {i + 1}: {row.to_dict()} => Prediction: {result}")


if __name__ == "__main__":
    main()


Data point 1: {'age': 0, 'income': 1, 'student': 0, 'credit_rating': 0} => Prediction: 1
Data point 2: {'age': 2, 'income': 2, 'student': 1, 'credit_rating': 1} => Prediction: 0


## Q3

In [5]:
# Load the dataset
def load_data(file_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Path or file-like object handed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame (pandas default parsing options).
    """
    dataset = pd.read_csv(file_path)
    return dataset

# Preprocess and tokenize text
def preprocess_text(text):
    """Tokenise ``text`` into lowercase, whitespace-separated words.

    Double-quote characters are removed before splitting; all other
    punctuation is kept attached to its word.

    Args:
        text: The string to tokenise.

    Returns:
        list[str]: The tokens, in order of appearance.
    """
    lowered = text.lower()
    unquoted = lowered.replace('"', '')
    tokens = unquoted.split()
    return tokens

# Calculate probabilities
def calculate_probabilities(df):
    """Estimate a multinomial Naive Bayes text model with add-one smoothing.

    Args:
        df: DataFrame with a ``'Text'`` column (documents) and a ``'Tag'``
            column (class labels).

    Returns:
        tuple: ``(class_probs, conditional_probs, vocab)`` where

        * ``class_probs`` maps tag -> relative frequency of that tag;
        * ``conditional_probs[tag][word]`` is
          ``(count of word in tag + 1) / (words in tag + vocabulary size)``
          for every word of the vocabulary;
        * ``vocab`` is the set of all tokens seen in any document.
    """
    class_probs = df['Tag'].value_counts(normalize=True).to_dict()

    # Per-tag token frequencies.
    word_counts = {tag: defaultdict(int) for tag in class_probs}
    for tag, text in zip(df['Tag'], df['Text']):
        tag_counts = word_counts[tag]
        for token in preprocess_text(text):
            tag_counts[token] += 1

    vocab = set().union(*word_counts.values())
    vocab_size = len(vocab)

    conditional_probs = defaultdict(lambda: defaultdict(float))
    for tag, tag_counts in word_counts.items():
        # Total number of tokens observed for this tag.
        denominator = sum(tag_counts.values()) + vocab_size
        for word in vocab:
            conditional_probs[tag][word] = (tag_counts[word] + 1) / denominator

    return class_probs, conditional_probs, vocab

# Predict function
def predict(class_probs, conditional_probs, vocab, text):
    """Return the most probable tag for ``text`` under the Naive Bayes model.

    Scores are accumulated as log-probabilities. Multiplying many raw
    probabilities underflows to 0 for long texts, which made every tag
    score 0 and the normalisation step divide by zero.

    Args:
        class_probs: Mapping tag -> prior probability.
        conditional_probs: Mapping tag -> {word: P(word | tag)}.
        vocab: Collection of known words; only its size is used.
        text: The raw sentence to classify (tokenised via
            ``preprocess_text``).

    Returns:
        The tag with the highest score (the first one on ties).

    Raises:
        ValueError: If ``class_probs`` is empty.
    """
    if not class_probs:
        raise ValueError("class_probs is empty; cannot predict a tag")

    words = preprocess_text(text)
    log_scores = {}

    for tag, prior in class_probs.items():
        word_probs = conditional_probs.get(tag, {})

        # Fallback probability for words missing from this tag's table.
        # Computed once per tag: as a .get() default it was re-evaluated
        # (a sum over the whole table) for every word, known or not.
        denominator = sum(word_probs.values()) + len(vocab)
        unseen_prob = 1 / denominator if denominator else 1.0

        if prior <= 0:
            log_scores[tag] = float('-inf')
            continue

        score = np.log(prior)
        for word in words:
            prob = word_probs.get(word, unseen_prob)
            if prob <= 0:
                # A zero factor makes the whole product zero.
                score = float('-inf')
                break
            score += np.log(prob)
        log_scores[tag] = score

    return max(log_scores, key=log_scores.get)

# Evaluate the model
def evaluate_model(df, class_probs, conditional_probs, vocab):
    """Score the model's predictions on ``df``.

    Every entry of ``df['Text']`` is classified with ``predict`` and compared
    with ``df['Tag']``. Precision and recall are binary metrics with
    ``'Sports'`` as the positive label.

    Args:
        df: DataFrame with ``'Text'`` and ``'Tag'`` columns.
        class_probs: Tag priors, passed through to ``predict``.
        conditional_probs: Word likelihood tables, passed through to ``predict``.
        vocab: Vocabulary, passed through to ``predict``.

    Returns:
        tuple: ``(accuracy, precision, recall)``.
    """
    def classify(text):
        return predict(class_probs, conditional_probs, vocab, text)

    y_true = df['Tag']
    y_pred = df['Text'].apply(classify)

    binary_options = {'pos_label': 'Sports', 'average': 'binary'}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **binary_options),
        recall_score(y_true, y_pred, **binary_options),
    )

# Main execution
def main():
    """Train, evaluate and apply the Naive Bayes text classifier on text.csv.

    The data is split 60/40 first and the evaluated model is fitted on the
    training part only, so the reported metrics are measured on documents
    the model has not seen. The model used for the final example sentence
    is then refitted on the full dataset.
    """
    # Load the dataset
    df = load_data('text.csv')

    # Split BEFORE fitting: fitting on the full frame and then scoring a
    # subset of it evaluates the model on its own training data.
    train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)

    # Fit on the training split and evaluate on the held-out split
    class_probs, conditional_probs, vocab = calculate_probabilities(train_df)
    accuracy, precision, recall = evaluate_model(test_df, class_probs, conditional_probs, vocab)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    # Refit on all available data for the final prediction
    class_probs, conditional_probs, vocab = calculate_probabilities(df)

    # Predict the tag for a new sentence
    test_sentence = "A very close game"
    prediction = predict(class_probs, conditional_probs, vocab, test_sentence)
    print(f"Sentence: '{test_sentence}' => Predicted Tag: {prediction}")


if __name__ == "__main__":
    main()


Accuracy: 1.0000
Precision: 0.0000
Recall: 0.0000
Sentence: 'A very close game' => Predicted Tag: Sports


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
