In [1]:
import os
import pandas as pd
from collections import defaultdict, Counter

In [2]:
def back_to_list(s):
    """Parse the string form of a list of tokens back into a list of strings.

    The input is expected to look like ``"['sapporo', 'breweries']"``: square
    brackets, single quotes and spaces are stripped, and the remainder is
    split on commas.

    Args:
        s: String representation of a token list.

    Returns:
        list[str]: The tokens, in order. An empty serialized list (``"[]"``)
        yields ``[]``.

    Note:
        Only single quotes are stripped; double quotes are left in the tokens.
        Spaces are removed everywhere, including inside a token.
    """
    stripped = s.replace('[', '').replace(']', '').replace('\'', '').replace(' ', '')
    # "".split(',') is [''], so an empty list would otherwise come back as a
    # one-element list holding an empty token. Drop empty tokens so they do
    # not leak into the vocabulary or the headline-length statistics.
    return [token for token in stripped.split(',') if token]

def load_csv(filename):
    """Read ``../data/<filename>`` and decode its ``body`` and ``title`` columns.

    Both columns are stored in the CSV as stringified lists; each cell is
    converted back to a Python list with ``back_to_list``.

    Args:
        filename: Name of the CSV file inside the ``../data`` directory.

    Returns:
        pandas.DataFrame: The loaded frame with ``body`` and ``title`` holding
        lists of tokens.
    """
    csv_path = os.path.join('..', 'data', filename)
    frame = pd.read_csv(csv_path)
    frame.body = frame.body.apply(back_to_list)
    frame.title = frame.title.apply(back_to_list)
    return frame

def generate_vocab(df, include_headline=True, include_body=False):
    """Collect the distinct tokens appearing in the headlines and/or bodies.

    Args:
        df: DataFrame whose ``title`` and ``body`` columns hold lists of tokens.
        include_headline: When True (default), add every token from ``df.title``.
        include_body: When True, add every token from ``df.body``.

    Returns:
        list: The distinct tokens. The order comes from iterating a ``set``
        and is therefore not guaranteed to be stable between runs.

    Side effects:
        Prints the vocabulary size.
    """
    vocab = set()
    # Honour include_headline: previously the flag was accepted but ignored,
    # so titles were always added even when the caller asked for body-only.
    if include_headline:
        for row in df.title:
            vocab.update(row)
    if include_body:
        for row in df.body:
            vocab.update(row)
    vocab = list(vocab)
    print(f'Vocab Size: {len(vocab)}')
    return vocab

def p_w_H_given_D(df, vocab):
    """Estimate, per word, how often it is in the headline given it is in the body.

    For each word in ``vocab`` the result is::

        (1 + #articles with the word in both body and title)
        ---------------------------------------------------
               (1 + #articles with the word in the body)

    Both counts start at 1, so a word that never occurs in any body gets 1.0.

    Args:
        df: DataFrame whose ``title`` and ``body`` columns hold lists of tokens.
        vocab: Iterable of words to score.

    Returns:
        dict: Maps each word in ``vocab`` to its ratio (a float).
    """
    # Denominator and numerator counts, both starting at 1.
    p_w_in_D = dict.fromkeys(vocab, 1)
    p_w_in_HD = dict.fromkeys(vocab, 1)
    # Single pass over the articles with set lookups, instead of re-scanning
    # every row (via iterrows) once per vocabulary word.
    for title, body in zip(df.title, df.body):
        title_words = set(title)
        # set(body) so an article counts at most once per word, matching a
        # plain membership test on the body.
        for word in set(body):
            if word in p_w_in_D:
                p_w_in_D[word] += 1
                if word in title_words:
                    p_w_in_HD[word] += 1
    return {word: p_w_in_HD[word] / p_w_in_D[word] for word in vocab}

def length_prob(df):
    """Return the empirical distribution of headline lengths.

    Args:
        df: DataFrame whose ``title`` column holds lists of tokens.

    Returns:
        dict: Maps each observed headline length (token count) to the fraction
        of headlines with that length.
    """
    headline_total = len(df.title)
    tally = Counter(map(len, df.title))
    return {n_tokens: tally[n_tokens] / headline_total for n_tokens in tally}

def freq_to_prob(whole, item):
    """Replace the count table at ``whole[item]`` with relative frequencies.

    Args:
        whole: Mapping from a key to a dict of counts; modified in place.
        item: Key in ``whole`` whose counts are normalised.

    Returns:
        None. ``whole[item]`` is rebound to a new dict whose values are each
        count divided by the sum of all counts for that key.
    """
    counts = whole[item]
    denominator = sum(counts.values())
    normalised = {}
    for key, count in counts.items():
        normalised[key] = count / denominator
    whole[item] = normalised
        
def bigramLM(df):
    """Build a bigram transition-probability table from the headlines.

    Each headline in ``df.title`` is read as a token sequence framed by the
    markers ``'BOS'`` (before the first token) and ``'EOS'`` (after the last).
    Transition counts are gathered per previous word and then normalised so
    the probabilities for each previous word sum to 1.

    Args:
        df: DataFrame whose ``title`` column holds lists of tokens.

    Returns:
        pandas.DataFrame: Rows are indexed by the previous word and columns by
        the next word; unseen transitions are filled with 0.
    """
    counts = defaultdict(dict)
    for line in df.title:
        # Start every headline at BOS up front. Lazily swapping a falsy
        # prev_word for BOS inside the loop made an empty headline record a
        # None -> EOS transition, and made a falsy token (e.g. '') reset the
        # context to BOS in the middle of a headline.
        prev_word = 'BOS'
        for word in line:
            counts[prev_word][word] = counts[prev_word].get(word, 0) + 1
            prev_word = word
        counts[prev_word]['EOS'] = counts[prev_word].get('EOS', 0) + 1

    # Turn each previous word's follower counts into relative frequencies.
    transition = {}
    for prev_word, followers in counts.items():
        total = sum(followers.values())
        transition[prev_word] = {word: count / total for word, count in followers.items()}

    # return transposed dataframe so that each row index represent the prev_words
    return pd.DataFrame(transition).T.fillna(0)

### Sample run on the first two rows of the training set (`train[:2]`)

In [3]:
# Load the training CSV and keep only its first two rows for this sample run.
df = load_csv('reuters_train.csv')[:2]

In [4]:
# Build the vocabulary with the default flags (include_body is False, so body tokens are not added).
vocab = generate_vocab(df)

Vocab Size: 15


In [5]:
# Per-word ratio: (1 + articles with the word in body and title) / (1 + articles with the word in body).
p_w_H_given_D(df, vocab)

{'issues': 1.0,
 'sfr': 1.0,
 'sapporo': 1.0,
 'breweries': 1.0,
 'five': 1.0,
 'zenith': 1.0,
 'labs': 1.0,
 'zen': 1.0,
 'qtr': 1.0,
 'will': 1.0,
 'loss': 1.0,
 'year': 0.6666666666666666,
 'notes': 1.0,
 '4th': 1.0,
 'report': 1.0}

In [6]:
# Fraction of headlines having each observed length (length measured in tokens).
length_prob(df)

{7: 0.5, 8: 0.5}

In [7]:
# Bigram transition table over the headlines: rows are previous words, columns are next words.
bigramLM(df)

Unnamed: 0,sapporo,zenith,breweries,issues,five,year,sfr,notes,EOS,labs,zen,will,report,4th,qtr,loss
BOS,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sapporo,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
breweries,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
issues,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
five,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
year,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sfr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
notes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zenith,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
labs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
