In [1]:
# Imports necessary libraries
import torch
import os
from typing import List, Tuple
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib as mpl
from nltk import word_tokenize, sent_tokenize
import nltk
# nltk.download('averaged_perceptron_tagger')

import transformers
import random

import torch.nn as nn
import torch.optim as optim

from pathlib import Path
from tqdm.notebook import tqdm, trange
from typing import Dict, List, Set, Tuple

from torch.nn import init
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from transformers import AdamW

from transformers import RobertaConfig, RobertaModel, RobertaTokenizer

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.floa

Makes paths to training, development, and test data

In [2]:
# Creates paths to data
train_path = os.path.join(os.getcwd(), "Data", "train")
dev_path = os.path.join(os.getcwd(), "Data", "dev")
test_path = os.path.join(os.getcwd(), "Data", "test")

Credits: Tokenizer code was mostly made by Professor Claire Cardie from Cornell University

Tokenizer function
The following function read the files at the given paths and create two lists for each path. The first list contains a list of list of strings (one list of strings for each example), and the second one contains a list of list of ints (on list of ints for each example).

In [3]:
def read_txt(fname):
    with open(fname, encoding='utf-8') as open_article:
        lines = open_article.read()
    return lines

def read_labels(labels : str) -> List[Tuple[int, int]]:
    "processing of labels file"
    labels = labels.split("\n")[:-1]
    labels = [tuple(map(int, l.split("\t")[-2:])) for l in labels]
    return labels

def sort_and_merge_labels(labels : List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    "sort labels, necessary for later splitting"
    if len(labels) == 0:
        return labels
    labels = list(sorted(labels, key = lambda t: t[0]))
    # merge
    curr = labels[0]
    merged = []
    for l in labels[1:]:
        # if distinct, add
        if l[0] > curr[1]:
            merged.append(curr)
            curr = l
        # else merge
        else:
            curr = (curr[0], max(curr[1], l[1]))
    merged.append(curr)
    return merged

def split_with_labels(labels : List[Tuple[int, int]], article : str) -> Tuple[List[str], List[int]]:
    "split text into segments based upon labels"
    if len(labels) == 0:
        return [article], [0]
    segments = []
    binary_class = []
    start = 0
    for l_start, l_end in labels:
        std_seg = article[start:l_start]
        prop_seg = article[l_start:l_end]
        segments.append(std_seg)
        binary_class.append(0)
        segments.append(prop_seg)
        binary_class.append(1)
        start = l_end
    last_seg = article[start:]
    segments.append(last_seg)
    binary_class.append(0)
    return segments, binary_class

def remove_newline_fix_punc_seg(segments):
    "preprocessing necessary for tokenization to be consistent"
    segments = [s.replace("\n", " ").replace(".", " .") for s in segments]
    return segments

def remove_newline_fix_punc_art(article):
    "preprocessing necessary for tokenization to be consistent"
    article = article.replace("\n", " ").replace(".", " .")
    return article

def get_toks(input):
    output = []
    for toks in [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(input)]:
        output += toks
    return output

# This is the function you may need to call
def tokenize_article(article_file):
    "calls all functions above and perform sanity checks"
    article = read_txt(article_file)
    article = remove_newline_fix_punc_art(article)
    art_toks = get_toks(article)
    return art_toks

# This is the function you may need to call
def master_tokenizer(article_file, labels_file):
    "calls all functions above and perform sanity checks"
    # read and get labels
    article = read_txt(article_file)
    labels = read_txt(labels_file)
    labels = read_labels(labels)
    labels = sort_and_merge_labels(labels)
    segments, binary_class = split_with_labels(labels, article)
    article = remove_newline_fix_punc_art(article)
    segments = remove_newline_fix_punc_seg(segments)
    # sanity check
    reconstructed = ""
    for seg, lab in zip(segments, binary_class):
        reconstructed += seg
    assert reconstructed == article
    # tokenize
    seg_toks = []
    new_labels = []
    for seg, label in zip(segments, binary_class):
        new_toks = get_toks(seg)
        seg_toks += new_toks
        new_labels += [label for _ in range(len(new_toks))]
        # sanity check
    art_toks = get_toks(article)
    sanity = True
    if len(art_toks) != len(seg_toks):
        sanity = False
    for i, (at, st, lab) in enumerate(zip(art_toks, seg_toks, new_labels)):
        if at != st:
            sanity = False
            break
    return seg_toks, new_labels, sanity

The next section uses the functions above to read the training and development data and save them as 4 variables: tokenized_train_articles, tokenized_train_labels, tokenized_dev_articles, tokenized_dev_labels

In [4]:
# list -- file names of each article, sorted alphabetically
train_articles_list = []
dev_articles_list = []

# list -- file names of each corresponding labels file, sorted alphabetically
train_labels_list = []
dev_labels_list = []

for file_name in os.listdir(train_path):
    if file_name[-11:] == '.labels.tsv':
        train_labels_list.append(file_name)
    elif file_name != 'test.task-SLC.labels' and file_name != '.article999001621.labels.tsv.swo':
        train_articles_list.append(file_name)

for file_name in os.listdir(dev_path):
    if file_name[-11:] == '.labels.tsv':
        dev_labels_list.append(file_name)
    else:
        dev_articles_list.append(file_name)

train_articles_list = sorted(train_articles_list)
train_labels_list = sorted(train_labels_list)
dev_articles_list = sorted(dev_articles_list)
dev_labels_list = sorted(dev_labels_list)

# list of lists -- each list is a tokenized article
tokenized_train_articles = []
# list of lists -- each list is the labels corresponding to the article 
tokenized_train_labels = []

# Creates a two lists: one containing each article as a list of strings, and one
# containing the corresponding labels as a list of ints
for i in range(len(train_articles_list)):
    article_file = os.path.join(train_path, train_articles_list[i])
    labels_file = os.path.join(train_path, train_labels_list[i])
    art, lab, _ = master_tokenizer(article_file, labels_file)
    tokenized_train_articles.append(art)
    tokenized_train_labels.append(lab)

# list of lists -- each list is a tokenized article
tokenized_dev_articles = []
# list of lists -- each list is the labels corresponding to the article 
tokenized_dev_labels = []

# Creates a two lists: one containing each article as a list of strings, and one
# containing the corresponding labels as a list of ints
for i in range(len(dev_articles_list)):
    article_file = os.path.join(dev_path, dev_articles_list[i])
    labels_file = os.path.join(dev_path, dev_labels_list[i])
    art, lab, _ = master_tokenizer(article_file, labels_file)
    tokenized_dev_articles.append(art)
    tokenized_dev_labels.append(lab)

In [5]:
# Takes in a set of binary tags and changes it to a set of BIO tags by changing every 1 that is not a leading 1 to a 2.
def binary_tags_to_BIO(binary_labels):
    for idx in range(1,len(binary_labels)):
        if binary_labels[idx] == 1 and (binary_labels[idx-1] != 0):
            binary_labels[idx] = 2
    return binary_labels

# Adds beginning of article and end of article labels
def add_boa_and_eoa_tags(binary_labels, boa_label=3, eoa_label=4):
    binary_labels.insert(0, boa_label)
    binary_labels.append(eoa_label)
    return binary_labels

In [6]:
tokenized_train_labels = list(map(add_boa_and_eoa_tags, list(map(binary_tags_to_BIO, tokenized_train_labels))))
tokenized_dev_labels = list(map(add_boa_and_eoa_tags, list(map(binary_tags_to_BIO, tokenized_dev_labels))))

Makes trainloader and validation loader for the model

In [7]:
def get_word_to_idx(examples):
    word_to_idx = {}
    cur_idx = 1
    for example in examples:
        example_words = list(set(example))
        for word in example_words:
            if word not in word_to_idx.keys():
                word_to_idx[word] = cur_idx
                cur_idx += 1
    word_to_idx['<unk>'] = cur_idx
    return word_to_idx

def vectorize_article(word_to_idx, lst, max_length):
    result = torch.zeros(1,max_length)
    vocab = word_to_idx.keys()
    replace_unknowns = lambda word : word if word in vocab else '<unk>'
    lst = list(map(replace_unknowns, lst))
    for idx, word in enumerate(lst):
        result[0,idx] = word_to_idx[word]
    return result

def vectorize_label(labels, max_length):
    result = torch.zeros(1,max_length) - 1
    for idx, word in enumerate(labels):
        result[0,idx] = labels[idx]
        
    return result

def vectorize_example(word_to_idx, example, max_length):
    article = example[0]
    labels = example[1]
    
    return vectorize_article(word_to_idx, article, max_length), vectorize_label(labels, max_length)


In [8]:
word_to_idx = get_word_to_idx(tokenized_train_articles)
idx_to_word = {value:key for (key, value) in word_to_idx.items()}
max_length = max(list(map(len, tokenized_train_labels)))

vectorized_train_data = [vectorize_example(word_to_idx, ex, max_length) for ex in zip(tokenized_train_articles, tokenized_train_labels)]
vectorized_dev_data = [vectorize_example(word_to_idx, ex, max_length) for ex in zip(tokenized_dev_articles, tokenized_dev_labels)]
print(vectorized_train_data[5][0].size())

torch.Size([1, 8862])


In [9]:
class ModelDataset(Dataset):
    """ModelDataset is a torch dataset to interact with the propoganda data.

    :param data: The un-vectorized dataset with input and expected output values
    :type data: List[Tuple[List[String], List[Int]]]
    """
    def __init__(self, data):
        self.X = torch.cat([X for X, _ in data], dim=0)
        self.Y = torch.cat([Y for _, Y in data], dim=0)
        self.len = len(data)
    
    def __len__(self):
        """__len__ returns the number of samples in the dataset.

        :returns: number of samples in dataset
        :rtype: int
        """
        return self.len
    
    def __getitem__(self, index):
        """__getitem__ returns the inputs, outputs pair for a given index

        :param index: index within dataset to return
        :type index: int
        :returns: A tuple (x, y) where x is model inputs (pair of string-lists)
                    and y is a bool and int-list pair
        :rtype: Tuple[Tuple[List[String], List[String]], Tuple[Bool, List[int]]]
        """
        return self.X[index], self.Y[index]

def get_data_loaders(train, val, batch_size=8):
    """
    """
    # First we create the dataset given our train and validation lists
    dataset = ModelDataset(train+val)
    
    # Then, we create a list of indices for all samples in the dataset
    train_indices = [i for i in range(len(train))]
    val_indices = [i for i in range(len(train), len(train) + len(val))]
    
    # Now we define samplers and loaders for train and val
    train_sampler = SubsetRandomSampler(train_indices)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    
    val_sampler = SubsetRandomSampler(val_indices)
    val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

    return train_loader, val_loader

In [10]:
train_loader, val_loader = get_data_loaders(vectorized_train_data, vectorized_dev_data)

Fine tunes a RoBERTa network using the testing data

In [11]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [12]:
class RoBERTa_CRF_Model_1(nn.Module):
    def __init__(self, num_classes, tag_to_idx):
        super().__init__()
        
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-large", add_prefix_space=True, model_max_length=512)
        self.roberta = RobertaModel.from_pretrained('roberta-large')
        self.encoding_to_probs = nn.Linear(1024, num_classes)
        self.dropout = nn.Dropout(p=0.1)
        
        self.num_classes = num_classes
        self.tag_to_idx = tag_to_idx
        
        self.transitions = nn.Parameter(torch.randn(self.num_classes, self.num_classes))

        self.transitions.data[self.tag_to_idx['START_TAG'], :] = -10000
        self.transitions.data[:, self.tag_to_idx['END_TAG']] = -10000
        
        
    def _forward_alg(self, roberta_output):
        
        init_alphas = torch.full((1, self.num_classes), -10000.).to('cuda')
        
        init_alphas[0][self.tag_to_idx['START_TAG']] = 0.
        
        forward_var = init_alphas
        
         # Iterate through the sentence
        for feat in roberta_output:
            
            alphas_t = []  # The forward tensors at this timestep
            
            for next_tag in range(self.num_classes):
                
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.num_classes)
                
                trans_score = self.transitions[next_tag].view(1, -1)
                
                next_tag_var = forward_var + trans_score + emit_score
                
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
                
            forward_var = torch.cat(alphas_t).view(1, -1)
            
        terminal_var = forward_var + self.transitions[self.tag_to_idx['END_TAG']]
        alpha = log_sum_exp(terminal_var)
        return alpha
        
    def _get_roberta_probabilities(self, article):
        tokenized_article = self.tokenizer(article, is_split_into_words=True, return_tensors='pt')
        _ , article_length = tokenized_article['input_ids'].size()
        input_ids = (tokenized_article['input_ids']).to('cuda')
        if article_length > 512:
            indices = torch.tensor([i for i in range(512)])
            encodings = self.roberta(torch.index_select(input_ids, 1,indices.to('cuda'))).last_hidden_state
            print(encodings.size())
            n = 512
            i=0
            while n < article_length:
                i += 1
                print('Iteration: ' + str(i))
                old_n = n
                n = min(n+512, article_length)
                print(old_n)
                print(n)
                indices = torch.tensor([i for i in range(old_n, n)]).to('cuda')
                print(torch.index_select(input_ids, 1, indices).size())
                cur_encodings = self.roberta(torch.index_select(input_ids, 1,indices)).last_hidden_state
                print(cur_encodings.size())
                print(encodings.size())
                encodings = torch.cat((encodings, cur_encodings), 1)
                print(encodings.size())
        else:
            encodings = self.roberta(**tokenized_article).last_hidden_state
        
        probabilities = self.encoding_to_probs(encodings)
        probabilities = self.dropout(probabilities)
        
        return probabilities.reshape((article_length, self.num_classes))
    
    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_idx['START_TAG']], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_idx['END_TAG'], tags[-1]]
        return score
        
    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.num_classes), -10000.)
        init_vvars[0][self.tag_to_idx['START_TAG']] = 0

        forward_var = init_vvars
        
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.num_classes):
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
                
                
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        terminal_var = forward_var + self.transitions[self.tag_to_idx['END_TAG']]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
            
        start = best_path.pop()
        best_path.reverse()
        return path_score, best_path
        
    def neg_log_likelihood(self, sentences, tag_sequences):
        first = True
        for (sentence, tags) in zip(sentences, tag_sequences):
            probs = self._get_roberta_probabilities(sentence)
            forward_score = self._forward_alg(probs)
            gold_score = self._score_sentence(probs, tags)
            if first:
                loss = torch.sum(forward_score - gold_score)
                first = False
            else:
                lost += torch.sum(forward_score - gold_score)
        return loss
        
    def forward(self, articles):
        
        results = []
        for article in articles:
            roberta_probs = self._get_roberta_probabilities(article)

            score, tag_seq = self._viterbi_decode(roberta_probs)
            results.append((score, tag_seq))
        return results
    
    def load_model(self, save_path):
        self.load_state_dict(torch.load(save_path))
        
    def save_model(self, save_path):
        torch.save(self.state_dict(), save_path)

In [13]:
def train_epoch(model, train_data, optimizer):
    model.train()
    total = 0
    loss = 0
    correct = 0
    
    for (input_batch, expected_out) in tqdm(train_loader, leave=False, desc="Training Batches"):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        
        model.zero_grad()
        inputs = [""]*len(input_batch)
        outputs = [torch.empty(1)]*len(input_batch)
        for idx, seq in enumerate(input_batch):
            last_index = int((seq == 0).nonzero(as_tuple=True)[0][0])
            inputs[idx] = " ".join(list(map( lambda x : idx_to_word[x], seq[:last_index].tolist())))

        
        for idx, seq in enumerate(expected_out):
            try:
                last_index = int((seq == -1).nonzero(as_tuple=True)[0][0])
            except:
                print(last_index)
            outputs[idx] = seq[:last_index]
        
        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(inputs, outputs)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

def evaluation(model, val_loader, optimizer):
    model.eval()
    total = 0
    loss = 0
    correct = 0
    
    for (input_batch, expected_out) in tqdm(train_loader, leave=False, desc="Training Batches"):

        sentence_in = prepare_sequence(input_batch, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
        
        loss += model.neg_log_likelihood(sentence_in, expected_out)
            
    # Return loss
    return loss

def train_and_evaluate(number_of_epochs, model, train_loader, val_loader):
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    train_accuracies = []
    train_losses = []
    val_losses = []
    for epoch in trange(number_of_epochs, desc="Epochs"):
        train_acc, train_loss = train_epoch(model, train_loader, optimizer)
        val_loss = evaluation(model, val_loader, optimizer)
        train_accuracies.append(train_acc)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print("Epoch : "+str(epoch+1)+" | Train loss: "+str(train_loss)+" | Val loss: "+str(val_loss))
    return train_accuracies, train_losses, val_losses

In [14]:
tag_to_idx = {'B': 1, 'I': 2, 'O': 0, 'START_TAG': 3, 'END_TAG': 4}
model = RoBERTa_CRF_Model_1(5, tag_to_idx).to('cuda')

In [15]:
train_and_evaluate(3, model, train_loader, val_loader)

HBox(children=(IntProgress(value=0, description='Epochs', max=3, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Training Batches', max=37, style=ProgressStyle(description_wi…

Token indices sequence length is longer than the specified maximum sequence length for this model (4960 > 512). Running this sequence through the model will result in indexing errors


torch.Size([1, 512, 1024])
Iteration: 1
512
1024
torch.Size([1, 512])


RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 6.00 GiB total capacity; 4.66 GiB already allocated; 14.44 MiB free; 4.68 GiB reserved in total by PyTorch)

In [None]:
articles = ['Hello, my name is Bob', 'Hello Sammy, my name is Samuel']
print(articles)
print()
probs = model._get_roberta_probabilities(articles[0])
print(probs)
prediction = model.forward(articles)
print(prediction)
# [0, 1437, 0, 2] '<s>'
# [0, 1437, 2, 2] '</s>'
# [0, 1437, 0, 1437, 2, 2] '<s> </s>'
# [0, 1437, 0, 20920, 1437, 2, 2] '<s> Hello </s>'
# [0, 1437, 0, 20920, 6, 127, 766, 16, 3045, 1437, 2, 2] '<s> Hello, my name is Bob </s>'

Creates functions that take in a sentence and returns a list of dictionaries, one for each token, containing features for each token

In [None]:
import torchtext
glove25 = torchtext.vocab.GloVe(name='6B', dim=50)

In [None]:
# Creates a dictionary containing the features for the token at index in a given
# article. This does not add part-of-speech tags because doing that on a token-
# by-token basis was too innefficient. Instead, part-of-speech tags were added
# in the next function
def extractFeatureForToken(index, article):
    global glove25
    word = article[index]
    embedding = glove25[word]
    dic = {}
    for i in range(len(embedding)):
        dic[str(i)] = float(embedding[i])
#     maxIndex = len(article)-1
#     if (index-4 >= 0):
#         prevWord4 = article[index-4]
#     else:
#         prevWord4 = ""

#     if (index-3 >= 0):
#         prevWord3 = article[index-3]
#     else:
#         prevWord3 = ""

#     if (index-2 >= 0):
#         prevWord2 = article[index-2]
#     else:
#         prevWord2 = ""

#     if (index-1 >= 0):
#         prevWord1 = article[index-1]
#     else:
#         prevWord1 = ""

#     if (index+1 <= maxIndex):
#         nextWord1 = article[index+1]
#     else:
#         nextWord1 = ""

#     if (index+2 <= maxIndex):
#         nextWord2 = article[index+2]
#     else:
#         nextWord2 = ""

#     if (index+3 <= maxIndex):
#         nextWord3 = article[index+3]
#     else:
#         nextWord3 = ""

#     if (index+4 <= maxIndex):
#         nextWord4 = article[index+4]
#     else:
#         nextWord4 = ""
  
#    dic = {
#         "prevWord4" : prevWord4,
#         "prevWord3" : prevWord3,
#         "prevWord2" : prevWord2,
#         "prevWord1" : prevWord1,
#         "word" : article[index],
#         "nextWord1" : nextWord1,
#         "nextWord2" : nextWord2,
#         "nextWord3" : nextWord3,
#         "nextWord4" : nextWord4
#     }
    return dic

# Extracts a feature dictionary for each token in an article and returns a list
# of all dictionaries. 
def extractFeaturesForArticle(article):
    indices = list(range(len(article)))
    max_index = len(article)-1

    pos = nltk.pos_tag(article)
    featurelist = list(map(lambda x: extractFeatureForToken(x, article),indices))
    for i in indices:
#         phrase = ""
#         if i-2 >=0:
#             featurelist[i]["prevWord2POS"] = pos[i-2][1]
#             phrase += (pos[i-2][1])
#         else:
#             featurelist[i]["prevWord2POS"] = ""
#         if i-1 >=0:
#             featurelist[i]["prevWord1POS"] = pos[i-1][1]
#             phrase += (pos[i-1][1])
#         else:
#             featurelist[i]["prevWord1POS"] = ""
#             phrase += (pos[i][1])
#         if i+1 <= max_index:
#             featurelist[i]["nextWord1POS"] = pos[i+1][1]
#             phrase += (pos[i+1][1])
#         else:
#             featurelist[i]["nextWord1POS"] = ""
#         if i+2 <= max_index:
#             featurelist[i]["nextWord2POS"] = pos[i+2][1]
#             phrase += (pos[i+2][1])
#         else:
#             featurelist[i]["nextWord2POS"] = ""
#         featurelist[i]["phrase"] = phrase
        featurelist[i]["pos"] = pos[i][1]
    return featurelist

# Extracts all feature dictionaries from a list of articles and returns a list
# containing all dictionaries
def extractFeaturesForAllArticles(articles, labels):

    func = lambda x : extractFeaturesForArticle(articles[x])
    listOfLists = list(map(func, list(range(len(articles)))))
    finalList = []
    for lst in listOfLists:
        finalList += lst
    return finalList

# Flattens a list of lists into a single long list
def flattenLabels(list_of_lists):
    long_list = []
    for lst in list_of_lists:
        long_list += lst
    return long_list

# Takes in a list of articles and a list of list of labels and returns a list
# of pair containing feature dictionaries and labels for each token
def master_data_formatter(articles, labels):
    feature_list  = extractFeaturesForAllArticles(articles, labels)
    label_list = flattenLabels(labels)
    return [(feature_list[i], label_list[i]) for i in range(len(label_list))]

In [None]:
# Formats the data using the helper function
training_data = master_data_formatter(tokenized_train_articles, tokenized_train_labels)

print(training_data[0])

Creates a maximum entropy classifier using the list of training feature vectors and the list of training labels

In [None]:
# Creates and trains the classifier
classifier = MaxentClassifier.train(training_data, max_iter=2)

Creates a Maximum Entropy Markov Model (MEMM) using the viterbi algorithm, viterbi_start function, viterbi_next function, and 
the classifier as the output function.

In [None]:
print(classifier.show_most_informative_features(10))

In [None]:
features = []

def memm_viterbi_output(token_idx, sent, state):
    global features
    global classifier
    
    return max(10**(-99),classifier.prob_classify(features[token_idx]).prob(state))

def useMEMM(article):
    global features
    features = extractFeaturesForArticle(article)
    return (viterbi(article, [0,1], viterbi_next, viterbi_start, memm_viterbi_output))[0]

In [None]:
predictions = list(map(useMEMM, tokenized_dev_articles))

In [None]:
# Takes in a list of lists of predicted labels and a list of lists of true
# labels, and computed the micro f1 score of the predicitons
def f1(predictions, truth):
    tp = 0
    fp = 0
    fn = 0
    
    for article_idx in range(len(predictions)):
        predicted_labels = predictions[article_idx]
        true_labels = truth[article_idx]
        
        for label_idx in range(len(predicted_labels)):
            if (predicted_labels[label_idx] == 1) and (true_labels[label_idx] == 1):
                tp += 1
            elif (predicted_labels[label_idx] == 1):
                fp += 1
            elif (true_labels[label_idx] == 1):
                fn += 1
    
    # Prints statement when about to divide by 0
    if (tp + fp == 0):
        print('Did not make any positive guesses')
    elif (tp + fn == 0):
        print('No postive values in the true labels')
        
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    
    f1 = 2*precision*recall/(precision+recall)
    return f1

In [None]:
print(f1(predictions, tokenized_dev_labels))

In [None]:
# Gets the average accuracry between the number of 

In [None]:
# print("Accuracy of viterbi: " + str(weighted_accuracy(valid_labels, predictions)))

#for k in range(100):
#  classifier = MaxentClassifier.train(training_data, max_iter=k)
#  predictions = list(map(useMEMM, valid_articles))
#  print("Accuracy of viterbi after " + str(k) + " iterations: " + str(weighted_accuracy(valid_labels, predictions)))

# ERROR ANALYSIS: MEMM (ARTICLE 0)

#print("Correct labels:")
#print(tokenized_dev_labels[1])
#print("Output labels from Viterbi:")
#print(predictions[1])
#count_correct_labels = 0
#count_incorrect_labels = 0
#incorrect_labels = []
#for i in range(len(tokenized_dev_labels[0])):
#    if tokenized_dev_labels[0][i] == predictions[0][i]:
#        count_correct_labels += 1
#    else:
#        count_incorrect_labels += 1
#        incorrect_labels.append("\t" + str(tokenized_dev_labels[0][i]) + " was changed to " + str(predictions[0][i]))

#print("\nNumber of correct labels: %d" % count_correct_labels)
#print("Number of incorrect labels: %d" % count_incorrect_labels)
#print("Incorrectly labelled:")
#for x in incorrect_labels:
#    print(x)
n = 0
for prediction in predictions:
    for label in prediction:
        if label == 0:
            n += 1
print(n)