In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import pandas as pd
from collections import defaultdict
import math
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import pandas as pd
from collections import defaultdict
import math
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag


class NaiveBayesFromScratch:
    """Multinomial Naive Bayes text classifier with add-one smoothing.

    Each training sample is a ``(features, label)`` pair where ``features``
    is a mapping whose ``'book'`` entry holds the text to classify.

    Attributes:
        class_priors: label -> fraction of training samples with that label.
        feature_probs: label -> word -> smoothed P(word | label).
        vocabulary: every distinct token seen by the latest ``train`` call.
        total_samples: number of samples seen by the latest ``train`` call.
    """

    def __init__(self):
        self.class_priors = defaultdict(float)
        self.feature_probs = defaultdict(lambda: defaultdict(float))
        self.vocabulary = set()
        self.total_samples = 0

    def tokenize(self, text):
        """Return the lower-cased, whitespace-separated tokens of ``text``."""
        return text.lower().split()

    def train(self, labeled_data):
        """Fit the model on ``labeled_data``, replacing anything learned before.

        Args:
            labeled_data: iterable of ``(features, label)`` pairs; the text is
                read from ``features['book']``.
        """
        # Start from a clean slate. Without this, a second call would keep
        # priors and probabilities for labels that are absent from the new
        # data, and classify() could return one of those stale labels.
        self.class_priors.clear()
        self.feature_probs.clear()
        self.vocabulary.clear()
        self.total_samples = 0

        label_counts = defaultdict(int)
        word_given_label_counts = defaultdict(lambda: defaultdict(int))

        for features, label in labeled_data:
            label_counts[label] += 1
            tokens = self.tokenize(features['book'])
            self.vocabulary.update(tokens)
            for word in tokens:
                word_given_label_counts[label][word] += 1

        self.total_samples = sum(label_counts.values())
        vocab_size = len(self.vocabulary)

        for label, count in label_counts.items():
            self.class_priors[label] = count / self.total_samples

            word_counts = word_given_label_counts[label]
            # Add-one smoothing: every vocabulary word gets a non-zero
            # probability under every label, so math.log() never sees 0.
            denominator = sum(word_counts.values()) + vocab_size
            label_probs = self.feature_probs[label]
            for word in self.vocabulary:
                # .get() avoids inserting zero entries into the count table.
                label_probs[word] = (word_counts.get(word, 0) + 1) / denominator

    def classify(self, features):
        """Return the label with the highest log-posterior for ``features``.

        Tokens that were not seen during training are ignored. When several
        labels tie, the one encountered first during training wins.

        Raises:
            ValueError: if the model has not been trained on any sample.
        """
        if not self.class_priors:
            raise ValueError(
                "classify() called before train(): no classes have been learned"
            )

        known_tokens = [
            word
            for word in self.tokenize(features['book'])
            if word in self.vocabulary
        ]

        scores = {}
        for label, prior in self.class_priors.items():
            word_probs = self.feature_probs[label]
            # Sum logs instead of multiplying probabilities to avoid underflow.
            log_prob = math.log(prior)
            for word in known_tokens:
                log_prob += math.log(word_probs[word])
            scores[label] = log_prob

        return max(scores, key=scores.get)



def extract_nouns(tokens):
    """Return the tokens whose part-of-speech tag begins with ``'N'``.

    The tokens are tagged with ``pos_tag``; order of the input is preserved.
    """
    noun_tokens = []
    for token, tag in pos_tag(tokens):
        if tag.startswith('N'):
            noun_tokens.append(token)
    return noun_tokens

def book_features(book_name):
    """Build the classifier feature dict for a title: ``{'book': <lower-cased title>}``."""
    lowered_title = book_name.lower()
    return dict(book=lowered_title)

def predict_author(book_name):
    """Return the author the module-level ``author_classifier`` predicts for ``book_name``."""
    return author_classifier.classify(book_features(book_name))



# Load the book table; the 'Book' and 'Author' columns are read below.
df = pd.read_csv('books3.csv')

# One (feature dict, author) training pair per CSV row.
author_labeled_data = [
    (book_features(record['Book']), record['Author'])
    for _index, record in df.iterrows()
]

# Fit the classifier once at start-up; predict_author() relies on it.
author_classifier = NaiveBayesFromScratch()
author_classifier.train(author_labeled_data)


# Lower-cased title -> title as spelled in the CSV. Queries are lower-cased
# before matching, so comparing them against the raw 'Book' column would miss
# every title that contains a capital letter.
titles_by_lowercase = {}
for raw_title in df['Book']:
    lowered_title = str(raw_title).strip().lower()
    if lowered_title:
        titles_by_lowercase.setdefault(lowered_title, raw_title)

# Longest first, so the most specific title wins when one title contains another.
titles_longest_first = sorted(titles_by_lowercase, key=len, reverse=True)

print("Welcome! Ask who wrote a book, type 'exit' to quit.\n")

while True:
    user_input = input("Enter your query: ").strip().lower()
    if user_input in ["exit", "no thanks", "bye"]:
        print("Have a nice day! Goodbye!")
        break

    tokens = word_tokenize(user_input)
    processed_tokens = [token for token in tokens if token.isalnum()]
    entity_words = extract_nouns(processed_tokens)
    last_entity = entity_words[-1] if entity_words else None

    # Look for a known title as a run of whole words in the query. Looking up
    # only the last noun can never match a multi-word title.
    # NOTE(review): titles containing punctuation cannot match here because
    # non-alphanumeric tokens are dropped above.
    query_words = " ".join(processed_tokens)
    padded_query = f" {query_words} "
    matched_key = next(
        (key for key in titles_longest_first if f" {key} " in padded_query),
        None,
    )

    if matched_key is None and not last_entity:
        print("Sorry, I couldn't understand your query.")
        continue

    # Prefer the matched title for messages; otherwise fall back to the last noun.
    subject = titles_by_lowercase[matched_key] if matched_key is not None else last_entity

    if "author" in user_input and matched_key is not None:
        author = predict_author(subject)
        print(f"The author of '{subject}' is: {author}")
    else:
        print(f"Sorry, no information available for '{subject}'.")
