In [118]:
import re
import os
import numpy as np
from collections import Counter
from math import log

In [119]:
stop_words = set([
    "the", "and", "of", "to", "in", "a", "is", "that", "it", "with", "as", "for", "on", "are", "this", "by", "be", "have", "has", "or", "at", "an", "from", "their", "which", "these", "those", "was", "were", "been", "being", "through", "during", "its", "how", "into", "over", "across", "each", "more", "other", "such", "some", "any", "only", "also", "when", "than", "but", "not", "they", "we", "our", "us", "you", "your", "he", "she", "his", "her", "them", "its", "my", "mine", "your", "yours", "our", "ours", "their", "theirs"
])

In [120]:
def read_files(filename):
    with open(filename,'r') as f:
        return f.read()


In [121]:
text = read_files('Physics.txt')

In [122]:
def process_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]','',text)
    words = text.split()
    words_after_process = []
    for word in words:
       if word not in stop_words:
           words_after_process.append(word)
    return words_after_process
    

In [123]:
def build_vocab(filename):
    vocab = set()
    f = read_files(filename)
    words = process_text(f)
    vocab.update(words)
    return vocab

In [124]:
def word_frequencies(filename,vocab):
    word_counts = {}
    text = read_files(filename)
    words = process_text(text)
    for word in vocab:
        count = 0
        if word in words:
            count = words.count(word)
            word_counts[word] = count
        else:
            return
    return word_counts
    

In [125]:
def cal_rel_prob(word_counts):
    rel_freq = {}
    for word,count in word_counts.items():
        total_words = len(word)
        rel_freq[word] = count/total_words
    return rel_freq


In [138]:
def calc_log_likelihood(filename,rel_prob):
    text = read_files(filename)
    file_words = process_text(text)
    likelihoods = {}
    rel_words = []
    for word, prob in rel_prob.items():
        likelihood = 0
        rel_words.append(word)
        for word in file_words:
            if word in rel_words:
                likelihood += log(rel_prob.get(prob,1e-6))
                likelihoods[word] = likelihood
    return likelihoods
    

In [148]:
def classify_file(filename,rel_probs):
    for rel_prob in rel_probs:
        likelihoods = calc_log_likelihood(filename,rel_prob)
    return filename, max(likelihoods,key=likelihoods.get), likelihoods

In [128]:
files = ['Physics.txt','Biology.txt','Economics.txt','Psychology.txt']

In [149]:
rel_probs = []
for file in files:
    word_count = word_frequencies(file,build_vocab(file))
    rel_prob = cal_rel_prob(word_count)
    rel_probs.append(rel_prob)


In [150]:
classify_file('unknown.txt',rel_probs)

('unknown.txt', 'complex', {'complex': -13.815510557964274})

In [137]:
word_count = word_frequencies('Physics.txt',build_vocab('Physics.txt'))
rel_prob = cal_rel_prob(word_count)
classify_file('unknown.txt',rel_prob)

('evolution',
 {'evolution': -13.815510557964274,
  'shaped': -27.631021115928547,
  'diversity': -41.44653167389282,
  'species': -55.262042231857095,
  'natural': -69.07755278982137,
  'selection': -82.89306334778564,
  'millions': -96.70857390574992,
  'years': -110.52408446371419,
  'cellular': -124.33959502167846,
  'processes': -138.15510557964274,
  'convert': -151.970616137607,
  'nutrients': -165.78612669557128,
  'energy': -179.60163725353556,
  'complex': -193.41714781149983,
  'biochemical': -207.2326583694641,
  'pathways': -221.04816892742838,
  'genetic': -234.86367948539265,
  'information': -248.67919004335693,
  'flows': -262.4947006013212,
  'dna': -276.3102111592855,
  'rna': -290.12572171724975,
  'proteins': -303.941232275214,
  'regulatory': -317.7567428331783,
  'mechanisms': -331.57225339114257,
  'controlling': -345.38776394910684,
  'gene': -359.2032745070711,
  'expression': -373.0187850650354,
  'conservation': -386.83429562299966,
  'efforts': -400.6498061