In [1]:
import os
import pandas as pd
from collections import defaultdict, Counter
import numpy as np
from tqdm import tqdm

In [2]:
np.random.seed(2021)

In [10]:
def back_to_list(s):
    end = s.replace('[', '').replace(']', '').replace('\'', '').replace(' ', '').split(',')
    if '' in end:
        end.remove('')
    return end


def generate_vocab(df, include_headline=True, include_body=False):
    vocab = set()
    if include_headline:
        for row in df.title:
            vocab = vocab.union(set(row))
    if include_body:
        for row in df.body:
            vocab = vocab.union(set(row)) 
    #vocab.remove('')
    vocab = list(vocab)
    print(f'Vocab Size: {len(vocab)}')
    return vocab

### Sample run

In [63]:
#df = pd.read_csv('newshead/data/train.csv')[:500]
df = pd.read_csv('reuters/reuters_train.csv')[:1000]

In [64]:
df.rename(columns = {'text':'body'},inplace=True)
df

Unnamed: 0,body,title
0,"['sapporo', 'breweries', 'ltd', 'of', 'japan',...","['sapporo', 'breweries', 'issues', 'five', 'ye..."
1,"['zenith', 'laboratories', 'inc', 'said', 'the...","['zenith', 'labs', 'zen', 'will', 'report', '4..."
2,"['usx', 'corp', 'said', 'its', 'uss', 'divisio...","['usx', 'x', 'inland', 'iad', 'in', 'steel', '..."
3,"['dst', 'systems', 'inc', 'said', 'its', 'pres...","['dst', 'systems', 'dsts', 'president', 'dies']"
4,"['iraq', 'said', 'today', 'its', 'troops', 'ha...","['iraq', 'says', 'iran', 'offensive', 'on', 's..."
...,...,...
995,"['east', 'and', 'west', 'on', 'monday', 'decid...","['east', 'west', 'approve', 'nuclear', 'fusion..."
996,"['us', 'house', 'ways', 'and', 'means', 'chair...","['rostenkowski', 'questions', 'securities', 't..."
997,"['the', 'soviet', 'union', 'has', 'agreed', 't...","['ussr', 'to', 'cut', 'coal', 'price', 'for', ..."
998,"['shr', 'one', 'ct', 'vs', 'three', 'cts', 'ne...","['enzo', 'biochem', 'inc', 'enzo', '2nd', 'qtr..."


In [65]:
def load_csv(df):
    df.body = df.body.apply(lambda x: back_to_list(x))
    df.title = df.title.apply(lambda x: back_to_list(x))
    return df
df = load_csv(df)
df.head()

Unnamed: 0,body,title
0,"[sapporo, breweries, ltd, of, japan, is, issui...","[sapporo, breweries, issues, five, year, sfr, ..."
1,"[zenith, laboratories, inc, said, the, company...","[zenith, labs, zen, will, report, 4th, qtr, loss]"
2,"[usx, corp, said, its, uss, division, and, inl...","[usx, x, inland, iad, in, steel, coil, venture]"
3,"[dst, systems, inc, said, its, president, and,...","[dst, systems, dsts, president, dies]"
4,"[iraq, said, today, its, troops, had, killed, ...","[iraq, says, iran, offensive, on, southern, fr..."


In [66]:
df.title[0]

['sapporo', 'breweries', 'issues', 'five', 'year', 'sfr', 'notes']

In [67]:
vocabH = generate_vocab(df,include_body=False)

Vocab Size: 2849


In [68]:
vocabHD = generate_vocab(df,include_headline=True, include_body=True)

Vocab Size: 13530


In [69]:
#Create vocab indexer
vocab_lookupHD = {i:v for i,v in enumerate(vocabHD)}
vocab_indexerHD = {v:i for i,v in enumerate(vocabHD)}

In [70]:
#encode in training - where all are known words
def encode(df, vocab_indexer, set_D = True):
    df.title = df.title.apply(lambda x: [vocab_indexer[i] for i in x])
    df.body = df.body.apply(lambda x: [vocab_indexer[i] for i in x])
    df['body_full'] = df.body
    if set_D:
        df.body = df.body.apply(lambda x: set(x))
    return df

In [71]:
encoded_df = encode(df, vocab_indexerHD)
encoded_df

Unnamed: 0,body,title,body_full
0,"{13060, 9478, 10248, 13455, 12943, 7057, 4506,...","[11587, 13455, 1111, 4980, 8928, 10534, 3137]","[11587, 13455, 11969, 13060, 7914, 7057, 12943..."
1,"{6666, 5646, 3092, 11290, 2592, 7712, 545, 617...","[8145, 4331, 1887, 9002, 5185, 3843, 7100, 5495]","[8145, 3547, 4019, 4175, 9478, 3321, 9002, 518..."
2,"{4353, 13060, 9478, 10381, 4366, 399, 5646, 13...","[6995, 6290, 1240, 12940, 10174, 5670, 182, 10...","[6995, 11606, 4175, 12477, 794, 6090, 7971, 12..."
3,"{12289, 13060, 9478, 11014, 12809, 4506, 1563,...","[8012, 7350, 4093, 9980, 13089]","[8012, 7350, 4019, 4175, 12477, 9980, 7971, 11..."
4,"{2434, 13060, 12036, 9478, 652, 8718, 13328, 1...","[11416, 11786, 8948, 453, 10391, 12496, 7516, 5]","[11416, 4175, 2655, 12477, 3436, 4432, 7530, 2..."
...,...,...,...
995,"{7687, 8, 10765, 9234, 2068, 1556, 1047, 4121,...","[13446, 1165, 4234, 9676, 9577, 10522, 2305]","[13446, 7971, 1165, 10391, 10872, 11635, 5502,..."
996,"{13060, 9478, 2694, 4363, 10381, 5133, 2450, 9...","[2047, 11976, 3238, 3701]","[105, 12622, 4462, 7971, 5308, 10228, 3432, 20..."
997,"{10880, 9472, 2946, 7428, 13060, 9478, 6412, 1...","[4628, 5502, 6362, 9398, 4925, 6328, 7428, 6451]","[9478, 2946, 7570, 10381, 5460, 5502, 6362, 12..."
998,"{4102, 8716, 4506, 6179, 677, 7338, 12080, 568...","[13199, 905, 4019, 13199, 6137, 7100, 12139, 8...","[1333, 5245, 10069, 5685, 7223, 6179, 1272, 41..."


In [72]:
#Compare body and body_full
len(encoded_df.body[1]), len(encoded_df.body_full[1])

(78, 110)

In [96]:
#Compute p_w_H_given_D:
Hn = len(vocabHD)
LM = np.zeros((Hn, Hn)) #LM #transition_matrix
HD = np.zeros((Hn, Hn))    #H|D #emission matrix
pi = np.zeros(Hn)

def train(D, D_full, H, Hn):
        """
        Method that fits the model on training data and compute transition and emission matrices
        @param X (D): array-like with dimension [# of examples, # of length]
        @param z (H): array-like with dimension [# of examples, # of length]
        """
        
        pi = np.zeros(Hn)+1
        wHD = np.zeros(Hn)+1
        wD = np.zeros(Hn)+1
        LM = np.zeros((Hn, Hn))+1e-10
        # Find pi and LM
        
        for i in range(len(H)): #loop through the headline
            for j in range(len(H[i])): #loop through each word in the headline
                # update initial state probability
                if j==0:
                    pi[H[i][j]] += 1
                # update transition matrix
                if j < len(H[i])-1:
                    LM[H[i][j], H[i][j+1]] += 1
                    
        for i in tqdm(range(len(D_full))): #loop through the document
            for j in range(len(D_full[i])): #loop through each word in the document
                # update initial state probability
                if j==0:
                    pi[D_full[i][j]] += 1
                # update transition matrix
                if j < len(D_full[i])-1:
                    LM[D_full[i][j], D_full[i][j+1]] += 1
                    
        # Find p(w in H| w in D)
        for i in tqdm(range(Hn)): #loop through the headline vocab
            for j in range(len(D)): #loop through each article 
                if i in D[j]: #now a set
                    wD[i]+=1
                    if i in H[j]:
                        wHD[i]+=1              
              
        # Compute probability
        # Initial vector
        #pi = np.where(pi ==0.0, 1., pi)
        pi /= np.sum(pi)
        
        # Transition matrix or LM model
        #LM = np.where(LM ==0.0, 1., LM) #can't do this
        row_sums_LM = np.sum(LM, axis=1)
        LM = LM/ row_sums_LM[:, np.newaxis]
        
        # p(w in H | w in D):
        wHD = wHD / wD
        
        # Find length:
        Hlen = [len(h) for h in H]
        Hlen_count = Counter(Hlen)
        max_len = max(Hlen_count.keys())
        Hlen_dist = {leng: count for leng, count in Hlen_count.items()}
        pHlen = np.zeros(max_len)
        for i in range(max_len):
            if i in Hlen_dist:
                pHlen[i] = Hlen_dist[i]
        pHlen+= 1e-10  
        pHlen/=max_len
        
        return wHD, pi, LM, pHlen


In [97]:
wHD, pi, LM, pHlen = train(encoded_df.body,encoded_df['body_full'], encoded_df.title, Hn)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 278.84it/s]
100%|███████████████████████████████████████████████████████████████████████████| 13530/13530 [01:36<00:00, 139.98it/s]


In [98]:
vocab_indexerHD['millions']

10590

In [99]:
vocab_lookupHD[10590]

'millions'

In [100]:
[vocab_lookupHD[i] for i in np.where(LM[10590,:] == np.max(LM[10590,:]))[0]]

['dart', 'in', 'of']

In [101]:
# Viterbi algorithm

def viterbi_headline_gen(vocab, D, Hn, pi, LM, wHD, pHlen, n, vocab_lookup):
        """
        Use Viterbi to pre+1e-10dict a headline (w in H sequence) for each document
        H = vocab
        D = test document
        LM = transition_matrix #Hn x Hn
        wHD = emission_matrix #Hn x n
        Hn = number of all words in training (H+D)
        """
        #check input
        print('Length of vocab: ',len(vocab))
        print('Length of test document vocab: ',len(D))
        print('Max length of headline: ', n)
        
        # init holders        

        # Forward
        # Initialize
        H = vocab
        V = np.zeros((n, Hn))
        best_state = np.zeros(n)
        best_states = np.zeros((n, Hn))
        z = []
        
        #V[0,:] = wHD * pi
        for i in range(n): #n = length      
            for j in D: #loop through each word in document (because we only care about those in D)
                # Use log form
                p_wHD = wHD[j]
                
                if i==0:                
                    V[i][j] = p_wHD #+np.log(pi[j]) #initiate p(first position = j) * p(w in H | w in D)
                    best_states[i][j] = int(np.argmax(p_wHD)) 
                else:
                    p_wLM = V[i-1,:] * LM[:,j]
                    V[i][j] = p_wHD * np.max(p_wLM) #p_wHD
                    #print('j', j, int(np.argmax(p_wLM)))
                    best_states[i][j] = int(np.argmax(p_wLM)) 
                #print('Time:', i, vocab_lookup[j], V[i][j])   
                
            #best_state[i] = int(np.argmax(V[i,:]))
               

        #z is the best headline generated for each len -> need to figure out how to compare among the z(s)
        #Backward
        #print('final state: ',np.argmax(V[-1,:]))
        z.append(np.argmax(V[-1,:]))
        for i in reversed(range(1,n)):
            z_prev = z[-1]
            #print('z_prev', z_prev, int(best_states[i][z_prev]))
            z.append(int(best_states[i][z_prev]))
        
        z.reverse()
        #h_candidate[n] = best_state
        return z , V

In [102]:
#test
Dtest = encoded_df['body_full']
Htest = encoded_df.title
dtest = Dtest[0]
htest = Htest[0]

In [103]:
#check dtest
dtest_word = [vocab_lookupHD[i] for i in dtest]
dtest_title = [vocab_lookupHD[i] for i in htest]

In [104]:
dtest_title

['sapporo', 'breweries', 'issues', 'five', 'year', 'sfr', 'notes']

In [109]:
test_candidate, V = viterbi_headline_gen(vocabHD, dtest, Hn, pi, LM, wHD, pHlen, 6, vocab_lookupHD)

Length of vocab:  13530
Length of test document vocab:  44
Max length of headline:  6


In [110]:
#deccode
print('Correct is:')
print(dtest_title)
print('Predicted is:')
headline = [vocab_lookupHD[s] for s in test_candidate]
print(headline)


Correct is:
['sapporo', 'breweries', 'issues', 'five', 'year', 'sfr', 'notes']
Predicted is:
['sapporo', 'breweries', 'ltd', 'and', '10014', 'issue']
