In [1]:
import pandas as pd
import re

class BayesianSentimentAnalysis:
    """
    Implements Bayes’ Theorem manually to compute
    P(Positive | keyword) for given keywords.

    Expects ``self.data`` to be a DataFrame with a text column ``review``
    and a numeric column ``sentiment`` (1 = positive, 0 = negative), which
    is what ``load_dataset`` produces.

    Keyword matching is a plain substring test on the cleaned review text,
    so 'excellent' also counts reviews containing 'excellently'.
    """

    # Everything that is not a lowercase ASCII letter or whitespace is removed
    # during cleaning. Compiled once because it is applied to every review.
    _NON_LETTER_RE = re.compile(r'[^a-z\s]')

    def __init__(self):
        self.data = None
        # Choose 2–4 positive and 2–4 negative keywords
        self.positive_keywords = ['excellent', 'amazing', 'wonderful', 'fantastic']
        # Negative keywords are kept defined but are not used in the analysis.
        self.negative_keywords = ['terrible', 'awful', 'boring', 'disappointing']
        # keyword -> {'prior', 'likelihood', 'marginal', 'posterior'}
        self.results = {}

    # Load dataset from CSV (for Google Colab)
    def load_dataset(self, path='/content/IMDB Dataset.csv'):
        """
        Load the CSV at ``path``, encode sentiment as 1/0 and drop rows
        without review text.

        The value 'positive' maps to 1; every other value maps to 0.
        Stores the frame on ``self.data``, prints a short summary and
        returns the frame.
        """
        data = pd.read_csv(path)
        data['sentiment'] = (data['sentiment'] == 'positive').astype(int)
        data = data.dropna(subset=['review'])
        # Assign only after the frame is fully prepared, so a failure above
        # never leaves a half-processed dataset on the instance.
        self.data = data

        n_total = len(data)
        n_positive = data['sentiment'].sum()
        print(f"Loaded {n_total} reviews "
              f"({n_positive} positive, "
              f"{n_total - n_positive} negative).")
        return self.data

    # Text Preprocessing
    def preprocess(self, text):
        """Clean text: lowercase, remove non-letter characters, trim ends."""
        return self._NON_LETTER_RE.sub('', text.lower()).strip()

    def _keyword_hits(self, keyword, reviews):
        """
        Return a list with one bool per review: True when the cleaned
        review text contains the keyword.

        The keyword is lowercased and stripped of non-letter characters,
        mirroring what ``preprocess`` does to reviews; otherwise a keyword
        such as 'Excellent' or 'well-made' could never match. Surrounding
        whitespace in the keyword is kept on purpose.
        """
        needle = self._NON_LETTER_RE.sub('', keyword.lower())
        if keyword and not needle:
            # The keyword consisted only of characters that cleaning removes
            # (e.g. "!!!"), so it cannot occur in any cleaned review.
            return [False] * len(reviews)
        return [needle in self.preprocess(review) for review in reviews]

    # Prior Probability: P(Positive)
    def compute_prior(self):
        """
        Prior = proportion of positive reviews in the dataset.
        Interprets our belief that a random review is positive,
        *before* any keyword.

        Returns 0.0 for an empty dataset.
        """
        total = len(self.data)
        if total == 0:
            return 0.0
        return self.data['sentiment'].sum() / total

    # Likelihood: P(keyword | Positive)
    def compute_likelihood(self, keyword):
        """
        Likelihood = proportion of positive reviews that contain the keyword.
        This measures how strongly the keyword is associated
        with positive sentiment.

        Returns 0 when there are no positive reviews.
        """
        pos_reviews = self.data.loc[self.data['sentiment'] == 1, 'review']
        total_pos = len(pos_reviews)
        if total_pos == 0:
            return 0
        return sum(self._keyword_hits(keyword, pos_reviews)) / total_pos

    # Marginal Probability: P(keyword)
    def compute_marginal(self, keyword):
        """
        Marginal = proportion of all reviews that contain the keyword.
        This tells us how common the keyword is overall.

        Returns 0 for an empty dataset.
        """
        total = len(self.data)
        if total == 0:
            return 0
        return sum(self._keyword_hits(keyword, self.data['review'])) / total

    # Posterior: P(Positive | keyword)
    def compute_posterior(self, keyword):
        """
        Apply Bayes’ theorem:
        P(Positive | keyword) = (P(keyword|Positive) * P(Positive)) / P(keyword)

        Prints the intermediate quantities, stores them in
        ``self.results[keyword]`` and returns the posterior. The posterior
        is 0 when the keyword occurs in no review.
        """
        print("\n" + "="*50)
        print(f"Calculating probabilities for '{keyword.upper()}'")
        print("="*50)

        prior = self.compute_prior()

        # Scan and clean the reviews once, then derive both the likelihood
        # and the marginal (and the counts shown below) from that single pass.
        hits = self._keyword_hits(keyword, self.data['review'])
        is_positive = (self.data['sentiment'] == 1).tolist()

        total_reviews = len(hits)
        total_pos = sum(is_positive)
        count_total = sum(hits)
        count_pos = sum(hit and pos for hit, pos in zip(hits, is_positive))

        likelihood = count_pos / total_pos if total_pos > 0 else 0
        marginal = count_total / total_reviews if total_reviews > 0 else 0

        print(f"Prior P(Positive) = {total_pos}/{total_reviews} = {prior:.4f}")
        print(f"Likelihood P({keyword}|Positive) = {count_pos}/{total_pos} = {likelihood:.4f}")
        print(f"Marginal P({keyword}) = {count_total}/{total_reviews} = {marginal:.4f}")

        # Bayes’ Theorem
        posterior = (likelihood * prior) / marginal if marginal > 0 else 0

        print(f"\nApplying Bayes' Theorem:")
        print(f"P(Positive|{keyword}) = (P({keyword}|Positive) × P(Positive)) / P({keyword})")
        print(f"P(Positive|{keyword}) = ({likelihood:.4f} × {prior:.4f}) / {marginal:.4f} = {posterior:.4f}")

        # Store results for summary
        self.results[keyword] = {
            'prior': prior,
            'likelihood': likelihood,
            'marginal': marginal,
            'posterior': posterior
        }
        return posterior

    # Run analysis for all keywords
    def analyze_keywords(self):
        """Compute posterior for all selected positive keywords."""
        print("\nSelected positive keywords for analysis:")
        print(f"Positive: {self.positive_keywords}")
        # Only analyze positive keywords
        for kw in self.positive_keywords:
            self.compute_posterior(kw)

    # Display results clearly
    def display_results(self):
        """
        Show a summary table of probabilities for positive keywords,
        followed by a one-line interpretation per keyword.

        A keyword is labelled a strong indicator when its posterior
        exceeds 0.5.
        """
        print("\n" + "="*60)
        print("SUMMARY OF BAYESIAN ANALYSIS FOR POSITIVE KEYWORDS")
        print("="*60)
        # Filter results to only include positive keywords
        positive_results = {k: v for k, v in self.results.items() if k in self.positive_keywords}

        if not positive_results:
            print("No results to display for positive keywords.")
            print("="*60)
            return

        print(f"{'Keyword':<15}{'Prior':<10}{'Likelihood':<15}{'Marginal':<15}{'Posterior':<15}")
        print("-"*60)
        for k, v in positive_results.items():
            print(f"{k:<15}{v['prior']:<10.3f}{v['likelihood']:<15.3f}{v['marginal']:<15.3f}{v['posterior']:<15.3f}")
        print("="*60)

        print("\nInterpretation for Positive Keywords:")
        for k, v in positive_results.items():
            if v['posterior'] > 0.5:
                print(f"'{k}' → Strong Positive Indicator (P(Positive|{k})={v['posterior']:.3f})")
            else:
                print(f"'{k}' → Weaker Positive Indicator (P(Positive|{k})={v['posterior']:.3f})")

    # Run the full pipeline
    def run(self, path='/content/IMDB Dataset.csv'):
        """Execute the full Bayesian pipeline: load, analyze, display."""
        self.load_dataset(path)
        self.analyze_keywords()
        self.display_results()

# Run the analysis
# Entry point: builds the analyzer and runs the full pipeline (load the CSV,
# analyze the positive keywords, print the summary) using run()'s default
# dataset path.
if __name__ == "__main__":
    analysis = BayesianSentimentAnalysis()
    analysis.run()

Loaded 50000 reviews (25000 positive, 25000 negative).

Selected positive keywords for analysis:
Positive: ['excellent', 'amazing', 'wonderful', 'fantastic']

Calculating probabilities for 'EXCELLENT'
Prior P(Positive) = 25000/50000 = 0.5000
Likelihood P(excellent|Positive) = 2936/25000 = 0.1174
Marginal P(excellent) = 3625/50000 = 0.0725

Applying Bayes' Theorem:
P(Positive|excellent) = (P(excellent|Positive) × P(Positive)) / P(excellent)
P(Positive|excellent) = (0.1174 × 0.5000) / 0.0725 = 0.8099

Calculating probabilities for 'AMAZING'
Prior P(Positive) = 25000/50000 = 0.5000
Likelihood P(amazing|Positive) = 1850/25000 = 0.0740
Marginal P(amazing) = 2479/50000 = 0.0496

Applying Bayes' Theorem:
P(Positive|amazing) = (P(amazing|Positive) × P(Positive)) / P(amazing)
P(Positive|amazing) = (0.0740 × 0.5000) / 0.0496 = 0.7463

Calculating probabilities for 'WONDERFUL'
Prior P(Positive) = 25000/50000 = 0.5000
Likelihood P(wonderful|Positive) = 2665/25000 = 0.1066
Marginal P(wonderful) = 3