In [1]:
import re
from collections import Counter
import math

# Sample paragraphs
paragraphs = {
    "Physics": "Quantum mechanics describes the behavior of matter at the atomic scale. The uncertainty principle and wave-particle duality challenge our understanding of reality. Recent experiments with quantum computers have demonstrated potential advantages over classical computing methods. Researchers continue to debate interpretations of quantum theory and its implications for our understanding of the physical world.",
    "Economics": "Market equilibrium occurs when supply and demand forces balance, establishing optimal prices. Macroeconomic policies influence inflation, unemployment, and economic growth across entire economies. Behavioral economics has revealed how psychological factors affect financial decision-making. As markets become increasingly global, economists develop models to account for international trade relationships.",
    "Biology": "Evolution has shaped the diversity of species through natural selection over millions of years. Cellular processes convert nutrients into energy through complex biochemical pathways. Genetic information flows from DNA to RNA to proteins, with regulatory mechanisms controlling gene expression. Conservation efforts focus on maintaining biodiversity in ecosystems to ensure their long-term stability.",
    "Psychology": "Cognitive development progresses through stages as children build more complex mental models. Social psychology studies how individuals are influenced by others in various contexts. The brain demonstrates plasticity, forming new neural connections throughout life during learning experiences. Mental health treatments integrate various therapeutic approaches with consideration of individual differences."
}

# Common stop words
stop_words = set([
    "the", "and", "of", "to", "in", "a", "is", "that", "it", "with", "as", "for", "on", "are", "this", "by", "be", "have", "has", "or", "at", "an", "from", "their", "which", "these", "those", "was", "were", "been", "being", "through", "during", "its", "how", "into", "over", "across", "each", "more", "other", "such", "some", "any", "only", "also", "when", "than", "but", "not", "they", "we", "our", "us", "you", "your", "he", "she", "his", "her", "them", "its", "my", "mine", "your", "yours", "our", "ours", "their", "theirs"
])

# Preprocessing function
def preprocess(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split()  # Split into words
    words = [word for word in words if word not in stop_words]  # Remove stop words
    return words

# Preprocess all paragraphs
preprocessed_paragraphs = {field: preprocess(text) for field, text in paragraphs.items()}

# Create vocabulary
vocabulary = set()
for words in preprocessed_paragraphs.values():
    vocabulary.update(words)

print("Vocabulary:", vocabulary)

Vocabulary: {'methods', 'natural', 'markets', 'international', 'evolution', 'researchers', 'revealed', 'entire', 'dna', 'challenge', 'recent', 'differences', 'relationships', 'throughout', 'stability', 'physical', 'brain', 'diversity', 'proteins', 'forming', 'therapeutic', 'progresses', 'information', 'controlling', 'influenced', 'demonstrates', 'new', 'conservation', 'balance', 'continue', 'demand', 'account', 'potential', 'economics', 'expression', 'ecosystems', 'health', 'prices', 'connections', 'stages', 'macroeconomic', 'unemployment', 'develop', 'regulatory', 'pathways', 'economic', 'approaches', 'occurs', 'establishing', 'ensure', 'biochemical', 'matter', 'genetic', 'individual', 'quantum', 'complex', 'mechanics', 'computers', 'influence', 'global', 'factors', 'treatments', 'models', 'efforts', 'uncertainty', 'energy', 'advantages', 'forces', 'waveparticle', 'psychology', 'principle', 'build', 'social', 'economies', 'equilibrium', 'inflation', 'economists', 'various', 'longterm'

In [2]:
# Count word frequencies for each paragraph
word_counts = {field: Counter(words) for field, words in preprocessed_paragraphs.items()}

# Calculate relative frequency distribution
relative_frequencies = {}
for field, counts in word_counts.items():
    total_words = sum(counts.values())
    relative_frequencies[field] = {word: count / total_words for word, count in counts.items()}

print("Relative Frequencies:", relative_frequencies)

Relative Frequencies: {'Physics': {'quantum': 0.08823529411764706, 'mechanics': 0.029411764705882353, 'describes': 0.029411764705882353, 'behavior': 0.029411764705882353, 'matter': 0.029411764705882353, 'atomic': 0.029411764705882353, 'scale': 0.029411764705882353, 'uncertainty': 0.029411764705882353, 'principle': 0.029411764705882353, 'waveparticle': 0.029411764705882353, 'duality': 0.029411764705882353, 'challenge': 0.029411764705882353, 'understanding': 0.058823529411764705, 'reality': 0.029411764705882353, 'recent': 0.029411764705882353, 'experiments': 0.029411764705882353, 'computers': 0.029411764705882353, 'demonstrated': 0.029411764705882353, 'potential': 0.029411764705882353, 'advantages': 0.029411764705882353, 'classical': 0.029411764705882353, 'computing': 0.029411764705882353, 'methods': 0.029411764705882353, 'researchers': 0.029411764705882353, 'continue': 0.029411764705882353, 'debate': 0.029411764705882353, 'interpretations': 0.029411764705882353, 'theory': 0.029411764705

In [3]:
# Estimate parameters of a probability distribution for each field
def calculate_log_likelihood(text, field):
    words = preprocess(text)
    log_likelihood = 0
    for word in words:
        if word in relative_frequencies[field]:
            log_likelihood += math.log(relative_frequencies[field][word])
        else:
            log_likelihood += math.log(1e-10)  # Small probability for unseen words
    return log_likelihood

# Calculate log-likelihood for each paragraph
log_likelihoods = {}
for field in paragraphs:
    log_likelihoods[field] = calculate_log_likelihood(paragraphs[field], field)

print("Log-Likelihoods:", log_likelihoods)

Log-Likelihoods: {'Physics': -115.21412660982531, 'Economics': -138.22827406960258, 'Biology': -129.00668178441995, 'Psychology': -135.45568534736282}


In [4]:
# Unclassified paragraph
unclassified_paragraph = "Neural networks model relationships between variables through layers of connected nodes. Each connection has a weight that is adjusted during training to reduce errors. Learning algorithms optimize these weights based on patterns in the data."

# Calculate likelihood for each field
likelihoods = {}
for field in paragraphs:
    likelihoods[field] = calculate_log_likelihood(unclassified_paragraph, field)

# Determine the most probable classification
most_probable_field = max(likelihoods, key=likelihoods.get)

# Identify key words influencing the classification
key_words = [word for word in preprocess(unclassified_paragraph) if word in relative_frequencies[most_probable_field]]

print("Likelihoods:", likelihoods)
print("Most Probable Field:", most_probable_field)
print("Key Words:", key_words)

Likelihoods: {'Physics': -506.56872045869017, 'Economics': -487.1804556884761, 'Biology': -506.56872045869017, 'Psychology': -467.792190918262}
Most Probable Field: Psychology
Key Words: ['neural', 'learning']
