In [10]:
import numpy as np
import pandas as pd
import re
import math

In [11]:
stop_words = ["the", "and", "of""other", "such", "some", "any", "only", "also",'it','a']

In [12]:
def load_file(filename):
    f = open(filename,"r")
    return f.read()

def process_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]','',text)
    words = text.split()
    words_after_process = []
    for word in words:
        if word not in stop_words:
           words_after_process.append(word)
    return words_after_process
           
    
    
def build_vocab(filename):
    vocab = set()
    f = load_file(filename)
    words = process_text(f)
    vocab.update(words)
    return vocab

In [13]:
def word_frequencies(filename,vocab):
    word_counts = {}
    text = load_file(filename)
    words = process_text(text)
    for word in vocab:
        count = 0
        if word in words:
            count = words.count(word)
            word_counts[word] = count
        else:
            return
    return word_counts
    

In [14]:
def cal_rel_prob(word_counts):
    rel_freq = {}
    for word,count in word_counts.items():
        total_words = len(word)
        rel_freq[word] = count/total_words
    return rel_freq


In [15]:
text = load_file('technoloy.txt')
text = process_text(text)
vocab = build_vocab('technoloy.txt')
freq = word_frequencies('technoloy.txt',vocab)
print(freq)
prob_tech = cal_rel_prob(freq)
print(prob_tech)


{'new': 1, 'alike': 1, 'interactions': 1, 'businesses': 1, 'artificial': 1, 'industries': 1, 'with': 1, 'intelligence': 1, 'transforming': 1, 'natural': 1, 'redefining': 1, 'technology': 1, 'in': 1, 'from': 1, 'selfdriving': 1, 'language': 1, 'machine': 1, 'unlocking': 1, 'to': 1, 'learning': 1, 'aipowered': 1, 'processing': 1, 'automation': 1, 'for': 1, 'chatbots': 1, 'cars': 1, 'humancomputer': 1, 'advancements': 1, 'possibilities': 1, 'consumers': 1, 'efficiency': 1, 'improving': 1, 'is': 2, 'rapidly': 1}
{'new': 0.3333333333333333, 'alike': 0.2, 'interactions': 0.08333333333333333, 'businesses': 0.1, 'artificial': 0.1, 'industries': 0.1, 'with': 0.25, 'intelligence': 0.08333333333333333, 'transforming': 0.08333333333333333, 'natural': 0.14285714285714285, 'redefining': 0.1, 'technology': 0.1, 'in': 0.5, 'from': 0.25, 'selfdriving': 0.09090909090909091, 'language': 0.125, 'machine': 0.14285714285714285, 'unlocking': 0.1111111111111111, 'to': 0.5, 'learning': 0.125, 'aipowered': 0.11

In [16]:
text = load_file('health.txt')
text = process_text(text)
vocab = build_vocab('health.txt')
freq = word_frequencies('health.txt',vocab)
print(freq)
prob_health = cal_rel_prob(freq)
print(prob_health)

{'heart': 1, 'diet': 1, 'reduce': 1, 'mental': 1, 'chronic': 1, 'leading': 1, 'longer': 1, 'activity': 1, 'maintaining': 1, 'play': 1, 'in': 1, 'crucial': 1, 'can': 1, 'overall': 1, 'balanced': 1, 'active': 1, 'to': 1, 'show': 1, 'enhances': 1, 'physical': 2, 'health': 1, 'studies': 1, 'while': 1, 'more': 1, 'wellbeing': 1, 'diabetes': 1, 'as': 1, 'exercise': 1, 'risk': 1, 'proper': 1, 'even': 1, 'diseases': 1, 'role': 1, 'of': 1, 'life': 1, 'conditions': 1, 'nutrition': 1, 'that': 1, 'moderate': 1, 'regular': 1}
{'heart': 0.2, 'diet': 0.25, 'reduce': 0.16666666666666666, 'mental': 0.16666666666666666, 'chronic': 0.14285714285714285, 'leading': 0.14285714285714285, 'longer': 0.16666666666666666, 'activity': 0.125, 'maintaining': 0.09090909090909091, 'play': 0.25, 'in': 0.5, 'crucial': 0.14285714285714285, 'can': 0.3333333333333333, 'overall': 0.14285714285714285, 'balanced': 0.125, 'active': 0.16666666666666666, 'to': 0.5, 'show': 0.25, 'enhances': 0.125, 'physical': 0.25, 'health': 0.

In [17]:
text = load_file('politics.txt')
text = process_text(text)
vocab = build_vocab('politics.txt')
freq = word_frequencies('politics.txt',vocab)
print(freq)
prob_polit = cal_rel_prob(freq)
print(prob_polit)

{'remain': 1, 'private': 1, 'divided': 1, 'intervention': 1, 'shape': 1, 'cuts': 1, 'on': 1, 'financial': 1, 'programs': 1, 'outcome': 1, 'discussions': 1, 'increased': 1, 'heated': 1, 'boost': 1, 'deregulation': 1, 'years': 1, 'policies': 1, 'advocate': 1, 'economic': 1, 'tax': 2, 'social': 1, 'in': 1, 'nations': 1, 'sector': 1, 'to': 3, 'over': 1, 'government': 1, 'others': 1, 'push': 1, 'will': 1, 'public': 1, 'while': 1, 'come': 1, 'for': 3, 'debate': 1, 'growth': 1, 'support': 1, 'landscape': 1, 'of': 1, 'these': 1, 'reforms': 1, 'lawmakers': 1, 'spending': 1}
{'remain': 0.16666666666666666, 'private': 0.14285714285714285, 'divided': 0.14285714285714285, 'intervention': 0.08333333333333333, 'shape': 0.2, 'cuts': 0.25, 'on': 0.5, 'financial': 0.1111111111111111, 'programs': 0.125, 'outcome': 0.14285714285714285, 'discussions': 0.09090909090909091, 'increased': 0.1111111111111111, 'heated': 0.16666666666666666, 'boost': 0.2, 'deregulation': 0.08333333333333333, 'years': 0.2, 'polici

In [18]:
text = load_file('entertainement.txt')
text = process_text(text)
vocab = build_vocab('entertainement.txt')
freq = word_frequencies('entertainement.txt',vocab)
print(freq)
prob_entr = cal_rel_prob(freq)
print(prob_entr)

{'compelling': 1, 'worldwide': 1, 'media': 1, 'with': 1, 'captivating': 1, 'power': 1, 'breaking': 1, 'blockbuster': 1, 'sparked': 1, 'social': 1, 'in': 1, 'has': 2, 'by': 1, 'latest': 1, 'box': 1, 'taken': 1, 'film': 1, 'conversations': 1, 'cast': 1, 'storyline': 1, 'movie': 1, 'across': 1, 'office': 1, 'visuals': 1, 'starstudded': 1, 'again': 1, 'storytelling': 1, 'cinema': 1, 'once': 1, 'storm': 1, 'breathtaking': 1, 'records': 1, 'proving': 1, 'audiences': 1, 'of': 1}
{'compelling': 0.1, 'worldwide': 0.1111111111111111, 'media': 0.2, 'with': 0.25, 'captivating': 0.09090909090909091, 'power': 0.2, 'breaking': 0.125, 'blockbuster': 0.09090909090909091, 'sparked': 0.14285714285714285, 'social': 0.16666666666666666, 'in': 0.5, 'has': 0.6666666666666666, 'by': 0.5, 'latest': 0.16666666666666666, 'box': 0.3333333333333333, 'taken': 0.2, 'film': 0.25, 'conversations': 0.07692307692307693, 'cast': 0.25, 'storyline': 0.1111111111111111, 'movie': 0.2, 'across': 0.16666666666666666, 'office':

In [42]:
def calculate_likelihood(filename, domain_vocab):
    text = load_file(filename)
    unknown_words = process_text(text)
    match_count = 0
    for word in unknown_words:
        if word in domain_vocab:
            match_count += 1
    return match_count / len(unknown_words)

In [43]:
print(calculate_likelihood('unknown.txt', prob_health))
print(calculate_likelihood('unknown.txt', prob_polit))
print(calculate_likelihood('unknown.txt', prob_entr))
print(calculate_likelihood('unknown.txt', prob_tech))

0.3170731707317073
0.04878048780487805
0.07317073170731707
0.024390243902439025


In [44]:
print(calculate_likelihood('unknown2.txt', prob_tech))
print(calculate_likelihood('unknown2.txt', prob_health))
print(calculate_likelihood('unknown2.txt', prob_polit))
print(calculate_likelihood('unknown2.txt', prob_entr))

0.04878048780487805
1.0
0.0975609756097561
0.04878048780487805
