In [79]:
import os
import pandas as pd
from collections import defaultdict, Counter
import numpy as np
from tqdm import tqdm

In [81]:
np.random.seed(2021)

In [82]:
def back_to_list(s):
    end = s.replace('[', '').replace(']', '').replace('\'', '').replace(' ', '').split(',')
    if '' in end:
        end.remove('')
    return end


def generate_vocab(df, include_headline=True, include_body=False):
    vocab = set()
    if include_headline:
        for row in df.title:
            vocab = vocab.union(set(row))
    if include_body:
        for row in df.body:
            vocab = vocab.union(set(row)) 
    #vocab.remove('')
    vocab = list(vocab)
    print(f'Vocab Size: {len(vocab)}')
    return vocab

### Sample run

In [83]:
#df = pd.read_csv('newshead/data/train.csv')[:100]
df = pd.read_csv('reuters/reuters_train.csv')[:1000]

In [84]:
df.rename(columns = {'text':'body'},inplace=True)
df

Unnamed: 0,body,title
0,"['sapporo', 'breweries', 'ltd', 'of', 'japan',...","['sapporo', 'breweries', 'issues', 'five', 'ye..."
1,"['zenith', 'laboratories', 'inc', 'said', 'the...","['zenith', 'labs', 'zen', 'will', 'report', '4..."
2,"['usx', 'corp', 'said', 'its', 'uss', 'divisio...","['usx', 'x', 'inland', 'iad', 'in', 'steel', '..."
3,"['dst', 'systems', 'inc', 'said', 'its', 'pres...","['dst', 'systems', 'dsts', 'president', 'dies']"
4,"['iraq', 'said', 'today', 'its', 'troops', 'ha...","['iraq', 'says', 'iran', 'offensive', 'on', 's..."
...,...,...
995,"['east', 'and', 'west', 'on', 'monday', 'decid...","['east', 'west', 'approve', 'nuclear', 'fusion..."
996,"['us', 'house', 'ways', 'and', 'means', 'chair...","['rostenkowski', 'questions', 'securities', 't..."
997,"['the', 'soviet', 'union', 'has', 'agreed', 't...","['ussr', 'to', 'cut', 'coal', 'price', 'for', ..."
998,"['shr', 'one', 'ct', 'vs', 'three', 'cts', 'ne...","['enzo', 'biochem', 'inc', 'enzo', '2nd', 'qtr..."


In [85]:
def load_csv(df):
    df.body = df.body.apply(lambda x: back_to_list(x))
    df.title = df.title.apply(lambda x: back_to_list(x))
    return df
df = load_csv(df)
df.head()

Unnamed: 0,body,title
0,"[sapporo, breweries, ltd, of, japan, is, issui...","[sapporo, breweries, issues, five, year, sfr, ..."
1,"[zenith, laboratories, inc, said, the, company...","[zenith, labs, zen, will, report, 4th, qtr, loss]"
2,"[usx, corp, said, its, uss, division, and, inl...","[usx, x, inland, iad, in, steel, coil, venture]"
3,"[dst, systems, inc, said, its, president, and,...","[dst, systems, dsts, president, dies]"
4,"[iraq, said, today, its, troops, had, killed, ...","[iraq, says, iran, offensive, on, southern, fr..."


In [86]:
df.title[0]

['sapporo', 'breweries', 'issues', 'five', 'year', 'sfr', 'notes']

In [87]:
vocabH = generate_vocab(df,include_body=False)

Vocab Size: 2849


In [88]:
vocabHD = generate_vocab(df,include_headline=True, include_body=True)

Vocab Size: 13530


In [89]:
#Create vocab indexer
vocab_lookupHD = {i:v for i,v in enumerate(vocabHD)}
vocab_indexerHD = {v:i for i,v in enumerate(vocabHD)}

In [90]:
#encode in training - where all are known words
def encode(df, vocab_indexer, set_D = True):
    df.title = df.title.apply(lambda x: [vocab_indexer[i] for i in x])
    df.body = df.body.apply(lambda x: [vocab_indexer[i] for i in x])
    df['body_full'] = df.body
    if set_D:
        df.body = df.body.apply(lambda x: set(x))
    return df

In [91]:
encoded_df = encode(df, vocab_indexerHD)
encoded_df

Unnamed: 0,body,title,body_full
0,"{2816, 4623, 5264, 10385, 1433, 12580, 7078, 9...","[317, 7078, 1265, 12000, 5264, 5159, 12860]","[317, 7078, 4705, 1341, 4205, 941, 7726, 79, 4..."
1,"{4623, 9237, 1046, 8727, 9755, 9254, 8234, 102...","[9070, 9887, 13394, 5424, 7124, 2711, 9367, 4194]","[9070, 8406, 3319, 1111, 12539, 9794, 5424, 71..."
2,"{1666, 8962, 8073, 12425, 7692, 8332, 9614, 12...","[8073, 5020, 856, 5299, 9074, 3039, 8949, 2369]","[8073, 9599, 1111, 8301, 9688, 7018, 10291, 85..."
3,"{2816, 11906, 12676, 517, 7943, 4488, 2442, 11...","[9876, 8818, 3174, 2442, 9994]","[9876, 8818, 3319, 1111, 8301, 2442, 10291, 86..."
4,"{2565, 10501, 7943, 8460, 9997, 8846, 911, 948...","[5996, 2506, 13099, 3666, 5529, 1252, 7203, 12...","[5996, 1111, 911, 8301, 11737, 2590, 90, 2402,..."
...,...,...,...
995,"{3584, 2049, 7686, 5638, 10760, 3083, 1549, 10...","[1948, 1605, 4614, 1816, 7016, 3883, 12989]","[1948, 10291, 1605, 5529, 12693, 13068, 3063, ..."
996,"{2432, 2816, 6403, 11528, 10121, 1162, 3083, 3...","[12483, 4711, 12449, 7914]","[6389, 4372, 1325, 10291, 9082, 12712, 8260, 1..."
997,"{2816, 2561, 5504, 12928, 7172, 8067, 12038, 8...","[4289, 3063, 1179, 2561, 3132, 10136, 10721, 1...","[12539, 2331, 8586, 10066, 9749, 3063, 1179, 8..."
998,"{3586, 7043, 6410, 6411, 4623, 7572, 5781, 386...","[2555, 4433, 3319, 2555, 894, 9367, 4753, 5480...","[8880, 2968, 9167, 2624, 12257, 5320, 6087, 63..."


In [92]:
#Compute p_w_H_given_D:
Hn = len(vocabHD)
LM = np.zeros((Hn, Hn)) #LM #transition_matrix
HD = np.zeros((Hn, Hn))    #H|D #emission matrix
pi = np.zeros(Hn)

def train(D, D_full, H, Hn):
        """
        Method that fits the model on training data and compute transition and emission matrices
        @param X (D): array-like with dimension [# of examples, # of length]
        @param z (H): array-like with dimension [# of examples, # of length]
        """
        
        pi = np.zeros(Hn)
        wHD = np.zeros(Hn)+1
        wD = np.zeros(Hn)+1
        LM = np.zeros((Hn, Hn))
        # Find pi and LM
        
        for i in range(len(H)): #loop through the headline
            for j in range(len(H[i])): #loop through each word in the headline
                # update initial state probability
                if j==0:
                    pi[H[i][j]] += 1
                # update transition matrix
                if j < len(H[i])-1:
                    LM[H[i][j], H[i][j+1]] += 1
                    
        for i in tqdm(range(len(D_full))): #loop through the document
            for j in range(len(D_full[i])): #loop through each word in the document
                # update initial state probability
                if j==0:
                    pi[D_full[i][j]] += 1
                # update transition matrix
                if j < len(D_full[i])-1:
                    LM[D_full[i][j], D_full[i][j+1]] += 1
                    
        # Find p(w in H| w in D)
        for i in tqdm(range(Hn)): #loop through the headline vocab
            for j in range(len(D)): #loop through each article 
                if i in D[j]: #now a set
                    wD[i]+=1
                    if i in H[j]:
                        wHD[i]+=1              
              
        # Compute probability
        # Initial vector
        pi = np.where(pi ==0.0, 1., pi)
        pi /= np.sum(pi)
        
        # Transition matrix or LM model
        LM = np.where(LM ==0.0, 1., LM)
        row_sums_LM = np.sum(LM, axis=1)
        LM = LM/ row_sums_LM[:, np.newaxis]
        
        # p(w in H | w in D):
        wHD = wHD / wD
        
        # Find length:
        Hlen = [len(h) for h in H]
        Hlen_count = Counter(Hlen)
        max_len = max(Hlen_count.keys())
        Hlen_dist = {leng: count for leng, count in Hlen_count.items()}
        pHlen = np.zeros(max_len)
        for i in range(max_len):
            if i in Hlen_dist:
                pHlen[i] = Hlen_dist[i]
        pHlen+= 1e-10  
        pHlen/=max_len
        
        return wHD, pi, LM, pHlen


In [93]:
wHD, pi, LM, pHlen = train(encoded_df.body,encoded_df['body_full'], encoded_df.title, Hn)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 854.03it/s]
100%|███████████████████████████████████████████████████████████████████████████| 13530/13530 [00:39<00:00, 342.70it/s]


In [97]:
vocab_indexerHD['millions']

5233

In [98]:
vocab_lookupHD[5233]

'millions'

In [104]:
l = np.where(LM[5233,:] == np.max(LM[5233,:]))
l[:3]

(array([    0,     1,     2, ..., 13527, 13528, 13529], dtype=int64),)

In [None]:
[vocab_lookupHD[i] for i in 

In [70]:
# Viterbi algorithm

def viterbi_headline_gen(vocab, D, Hn, pi, LM, wHD, pHlen, n, vocab_lookup):
        """
        Use Viterbi to pre+1e-10dict a headline (w in H sequence) for each document
        H = vocab
        D = test document
        LM = transition_matrix #Hn x Hn
        wHD = emission_matrix #Hn x n
        Hn = number of all words in training (H+D)
        """
        #check input
        print('Length of vocab: ',len(vocab))
        print('Length of test document vocab: ',len(D))
        print('Max length of headline: ', n)
        
        # init holders        

        # Forward
        # Initialize
        H = vocab
        V = np.zeros((n, Hn))
        best_state = np.zeros(n)
        best_states = np.zeros((n, Hn))
        z = []
        
        #V[0,:] = wHD * pi
        for i in range(n): #n = length      
            for j in D: #loop through each word in document (because we only care about those in D)
                # Use log form
                p_wHD = wHD[j]
                
                if i==0:                
                    V[i][j] = p_wHD #+np.log(pi[j]) #initiate p(first position = j) * p(w in H | w in D)
                    best_states[i][j] = int(np.argmax(p_wHD)) 
                else:
                    p_wLM = V[i-1,:] * LM[:,j]
                    V[i][j] = p_wHD * np.max(p_wLM) #p_wHD
                    #print('j', j, int(np.argmax(p_wLM)))
                    best_states[i][j] = int(np.argmax(p_wLM)) 
                print('Time:', i, vocab_lookup[j], V[i][j])   
                
            #best_state[i] = int(np.argmax(V[i,:]))
               

        #z is the best headline generated for each len -> need to figure out how to compare among the z(s)
        #Backward
        print('final state: ',np.argmax(V[-1,:]))
        z.append(np.argmax(V[-1,:]))
        for i in reversed(range(1,n)):
            z_prev = z[-1]
            print('z_prev', z_prev, int(best_states[i][z_prev]))
            z.append(int(best_states[i][z_prev]))
        
        z.reverse()
        #h_candidate[n] = best_state
        return z , V

In [71]:
#test
Dtest = encoded_df['body_full']
Htest = encoded_df.title
dtest = Dtest[0]
htest = Htest[0]

In [74]:
#check dtest
dtest_word = [vocab_lookupHD[i] for i in dtest]
dtest_title = [vocab_lookupHD[i] for i in htest]

In [75]:
dtest_title

['sapporo', 'breweries', 'issues', 'five', 'year', 'sfr', 'notes']

In [76]:
test_candidate, V = viterbi_headline_gen(vocabHD, dtest, Hn, pi, LM, wHD, pHlen, 6, vocab_lookupHD)

Length of vocab:  13530
Length of test document vocab:  44
Max length of headline:  6
Time: 0 sapporo 1.0
Time: 0 breweries 1.0
Time: 0 ltd 0.03333333333333333
Time: 0 of 0.056372549019607844
Time: 0 japan 0.38181818181818183
Time: 0 is 0.007246376811594203
Time: 0 issuing 0.08
Time: 0 100 0.09090909090909091
Time: 0 mln 0.08908685968819599
Time: 0 swiss 0.3125
Time: 0 francs 0.07407407407407407
Time: 0 of 0.056372549019607844
Time: 0 five 0.04672897196261682
Time: 0 year 0.04633204633204633
Time: 0 notes 0.35294117647058826
Time: 0 with 0.044854881266490766
Time: 0 a 0.004273504273504274
Time: 0 458 0.3333333333333333
Time: 0 pct 0.08387096774193549
Time: 0 coupon 0.08333333333333333
Time: 0 and 0.03537414965986395
Time: 0 10014 0.5
Time: 0 issue 0.19117647058823528
Time: 0 price 0.056074766355140186
Time: 0 lead 0.044444444444444446
Time: 0 manager 0.02702702702702703
Time: 0 swiss 0.3125
Time: 0 bank 0.24615384615384617
Time: 0 corp 0.027149321266968326
Time: 0 said 0.005
Time: 0 th

In [59]:
#deccode
print('Correct is:')
print(dtest_title)
print('Predicted is:')
headline = [vocab_lookupHD[s] for s in test_candidate]
print(headline)


Correct is:
['sapporo', 'breweries', 'issues', 'five', 'year', 'sfr', 'notes']
Predicted is:
['bank', 'of', 'the', 'bank', 'of', 'japan']
