## <h1>Epic</h1>
## <h2>NLP Interactive Fiction with GPT2</h2>
### <h3>Environment Setup</h3>

In [1]:
import ipywidgets
from IPython import display
import os
import re
import glob
from tqdm import tqdm
import codecs
from chardet import detect
import pprint
from tqdm import tqdm_notebook

!apt install -qq enchant
!pip install pyenchant
from enchant.checker import SpellChecker
from enchant.tokenize import EmailFilter, URLFilter

import torch

!pip install transformers
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import GPT2Model, GPT2Config
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import pipeline

display.clear_output()
from google.colab import drive
drive.mount('/content/drive')
display.clear_output()
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("\nRunning on device: ", str(device).upper())

if gpu_info.find('failed') >= 0 and ram_gb < 30:
  print('\nSelect the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
  print('\nTo enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  try:
      if gpu_info.find('failed') < 0:
         print(gpu_info)
  except:
    display.clear_output()
  finally:
    print('\nYour runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
    print('You are using a high-RAM runtime!')



Running on device:  CUDA
Tue Dec  1 05:26:00 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    26W / 250W |     10MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------------------------------

### <h3>Data Processing</h3>
#### <h4>Data Cleaning</h4>
<ul>
    <li>Convert to UTF8 format</li>
    <li>Remove Page numbers</li>
    <li>Remove Publication Information</li>
    <li>Check for spelling errors</li>
</ul>

Example usage:
```
all_text_files = get_all_txt_files()

harry_potter_texts = merge_texts(get_files_in_data_folder("Harry_Potter"))
print("Cleaned Harry Potter Text:\n\n\n" + harry_potter_texts[:2000]+"....\n\n")

stephen_king_texts = merge_texts(get_files_by_author("Stephen_King"))
print("Cleaned Stephen Text:\n\n\n" + stephen_king_texts[:2000]+"....\n\n")

horror_movie_transcripts = merge_texts(get_files_in_data_folder("Horror_Movie_Transcripts"))
print("Cleaned Horror Movie Transcripts:\n\n\n" + horror_movie_transcripts[:2000]+"....\n\n")

public_domain_texts = merge_texts(get_files_in_data_folder("Public_Domain_Horror_Novels"))
print("Cleaned Public Domain Horror Novels:\n\n\n" + public_domain_texts[:2000]+"....\n\n")
```

Combination function to merge text files, convert them to UTF8 and last then into train, test, and validation files:
```
get_train_test_validation(save_output_file('Cleaned_UTF8/', 
                          'merged_Stephen_King.txt', 
                          merge_texts(get_files_by_author("Stephen_King"))))
```

In [None]:
literary_tokens = ["characterization", "character", "setting", 
                   "exposition", "climax", "resolution", "plot", 
                   "context", "action", "weapon", 
                   "danger", "death", "suspense", 
                   "emotion", "surprise", "problem", "conflict", 
                   "perspective", "transition", "relief", "metaphor", 
                   "flashback"]

subgenre_tokens = ['Vampire', 'Ghost', 'Horror', 'Comedic Horror', 'Murder', 
                   'Werewolf', 'Apocalypse','Haunted House', 'Witch', 'Hell', 
                   'Alien', 'Gore', 'Monster']

subgenre_token = {'Vampire': '[VAMPIRE]', 'Ghost' : '[GHOST]', 'Horror' : '[HORROR]', 'Comedic Horror' : '[COMEDIC HORROR]', 'Murder' : '[MURDER]', 'Werewolf' : '[WEREWOLF]', 'Apocalypse' : '[APOCALYPSE]',
              'Haunted House' : '[HAUNTED HOUSE]', 'Witch' : '[WITCH]', 'Hell' : '[HELL]', 'Alien' : '[ALIEN]', 'Gore' : '[GORE]', 'Monster' : '[MONSTER]'}

author_list = ['Brian Evenson', 'M R James', 'Ambrose Bierce', 'Elliott O Donnell', 'Joseph Sheridan Le Fanu',
               'Edgar Allan Poe', 'Bram Stoker', 'Algernon Blackwood', 
               'Madeline Gobbo', 'Nnedi Okorafor', 'Sofia Samatar', 'Franz Kafka', 
               'Laird Barron', 'Nathan Ballingrud', 'Nellie Bly', 'William Hope Hodsgon',
               'Kelly Link', 'Arthur Machen', 'George Sylvester Viereck', 'Robert Chambers', 
               'John Meade Falkner', 'Ann Radcliffe', 'Howard Lovecraft', 'Robert Louis Stevenson',
               'Edith Birkhead', 'Jeff Vandermeer', 'Henry James', 'John William Polidori', 'W Bob Holland',
               'Prest and Rymer', 'Oliver Onions', 'Stephen King', 'Clive Barker']

literary_threshold = 0.95
subgenre_threshold = 0.25


# get file encoding type
def get_encoding_type(file):
    with open(file, 'rb') as f:
        rawdata = f.read()
    return detect(rawdata)['encoding']


def correctTxtEncoding(filename, encoding_to='UTF-8'):
    from_codec = get_encoding_type(filename)
    temp_filename = filename[:-4]+"temp.txt"
    try:
        with open(filename, 'r', encoding=from_codec) as fr:
            with open(temp_filename, 'w', encoding=encoding_to) as fw:
                for line in fr:
                    fw.write(line[:-1]+'\r\n')
        os.remove(filename) # remove old encoding file
        os.rename(temp_filename, filename) # rename new encoding
    except UnicodeDecodeError:
        print('Decode Error')
    except UnicodeEncodeError:
        print('Encode Error')



def add_tokens_to_text(text, author_token, generator):

    punc_pattern = "[!.?]" #looking for all punctuation in our text
    index = 0 #used to keep track of the start of a new sentence
    skip_iter = False
    tokenized_text = "" #used to store tokenized text
    tokens = "" #used to store tokens (ensure generator is not being fed tokens)

    #check if the file is a movie transcript
    is_book = True
    if author_token == '[Movie]':
        is_book = False
    
    if is_book:        
        #search for valid quotations in text and store their locations
        dialogue_pattern = r'"(?:(?:(?!(?<!\\)").)*)[.?!,]"'
        dialogue_locations = []
        for match in re.finditer(dialogue_pattern, text):
            s = match.start()
            e = match.end()
            dialogue_locations.append((s, e))

    sentence_count = 0
    for match in re.finditer(punc_pattern, text):
        punc_i = match.end() #get index of next punctuation

        #check if punctuation found was used in an honorific
        pat_obj = re.compile('(Mr)|(Mrs)|(Dr)|(Ms)|(Sr)|(Jr)|(Mt)', re.IGNORECASE)
        honorific_found = pat_obj.search(text[punc_i-4: punc_i])
        if honorific_found:
            continue
        
        if is_book:
            #if punctuation found is in between a set of quote, skip iteration       
            for s, e in dialogue_locations:
                if (s <= punc_i) and (punc_i <= e):
                    skip_iter = True
                    break
            if skip_iter == True:
                skip_iter = False
                continue 

        #define new line of text to tokenize
        line = text[index:punc_i]
        
        #zero-shot classifier needs valid sequences
        if len(line) == 0:
            continue
            
        sentence_count += 1
        if sentence_count < 3:
            continue
        
        index = punc_i+1 #update index for beginning of next sequence

        #classify the lines according to literary tokens
        literary_generator = generator(line, literary_tokens, multi_class=True)
        for i, score in enumerate(literary_generator['scores']):
            if score > literary_threshold:
                tokens += "["+ literary_generator['labels'][i].upper() + "]"
        #classify the lines according to literary tokens
        subgenre_generator = generator(line, subgenre_tokens, multi_class=False)
        if subgenre_generator['scores'][0] > subgenre_threshold:
            tokens += subgenre_token[subgenre_generator['labels'][0]]
        if is_book:
            line = insertTokenRegex(line, r'"(?:(?:(?!(?<!\\)").)*)"', "[DIALOGUE]", beginning_of_match=True)
        else:
            dialogue_pattern = r'[A-Z]+:'
            for match in re.finditer(dialogue_pattern, line):
                s = match.start()
                line = line[:s] + '[DIALOGUE]' + line[s:]
        line = tokens + line
        line = insertTokenRegex(line, r'((?i)Chapter\s+[0-9MDCLXVI]*.*)', "[CHAPTER]", beginning_of_match=True)
        line = insertAuthorToken(line, author_token)
        tokenized_text += line + "\n"
        tokens = ""
        sentence_count = 0
      
    return tokenized_text
    
def merge_texts(texts, author_token=None):
    merged_text_cleaned = ''
    find_author = False
    tokenizer = AutoTokenizer.from_pretrained("joeddav/xlm-roberta-large-xnli")
    model = AutoModelForSequenceClassification.from_pretrained("joeddav/xlm-roberta-large-xnli")
    generator = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)
    if author_token == None:
      find_author = True
    for text in tqdm(texts):
        new_text = ''
        correctTxtEncoding(text)
        with open(text,'r',encoding="utf8") as f:
            for line in f:
                new_text += line[:-1]+'\n'
            if find_author:
                for author in author_list:
                    if text.replace("_", " ").find(author) != -1:
                        author_token = "["+ author.upper() + "]"
        new_text_cleaned = clean_text(new_text)
        tokenized_text = add_tokens_to_text(new_text_cleaned, author_token, generator)
        merged_text_cleaned += tokenized_text
    return merged_text_cleaned

def get_all_txt_files():
    path = "/content/drive/My Drive/CSCI 470 Project/Epic/Data/" +'\**\*.txt';
    files = glob.glob(path, recursive=True)
    return files

def get_files_in_data_folder(folder):
    path = "/content/drive/My Drive/CSCI 470 Project/Epic/Data/" + folder + '/*.txt';
    files = glob.glob(path, recursive=True)
    return files

def get_files_by_author(author):
    path = "/content/drive/My Drive/CSCI 470 Project/Epic/Data/" + '\**\*_' + author + '.txt';
    files = glob.glob(path, recursive=True)
    return files

def get_train_test_validation(txt_file, train=0.70, test=0.20, val=0.10):
    train_doc =[]
    test_doc = []
    val_doc =[]
    with open(txt_file,'r', encoding='UTF-8') as f:
        file_input=f.readlines()

    count = 0
    for cnt, line in enumerate(file_input):
            if cnt <= len(file_input)*train:
                train_doc.append(line)
            elif (cnt > len(file_input)*train and cnt < len(file_input)*(train+test)):
                test_doc.append(line)
            else:
                val_doc.append(line)

    ## Write to file
    f = open(txt_file[:-4]+'_train.txt', "w+", encoding='UTF-8')
    count = 0
    for line in train_doc:
        count=count+1
        f.write(str(line))
        f.write("\n")  
    f.close()
    print("Training lines:\t",count)
    
    ## Write to file
    f = open(txt_file[:-4]+'_test.txt', "w+", encoding='UTF-8')
    count = 0
    for line in train_doc:
        count=count+1
        f.write(str(line))
        f.write("\n")  
    f.close()
    print("Testing lines:\t",count)

    ## Write to file
    f = open(txt_file[:-4]+'_val.txt',"w+", encoding='UTF-8')
    count = 0
    for line in val_doc:
        count=count+1
        f.write(str(line))
        f.write("\n")

    f.close()
    print("Validation lines:\t",count)
    
def save_output_file(path_from_output, file_name, data):
    path = "/content/drive/My Drive/CSCI 470 Project/Epic/Data/" + path_from_output + file_name;
    with open(path, "w", encoding='UTF-8') as file:
        file.write(data)
    print("File saved at:\t", path)
    return path

def checkSpelling(text):
    chkr = SpellChecker("en_US",filters=[EmailFilter,URLFilter])
    chkr.set_text(text)
    """
    for err in chkr:
        print("ERROR:", err.word)
    """

def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def insertTokenRegex(text, regex, token, 
                     beginning_of_match=False, 
                     beginning_of_line=False,
                     end_of_match=False, 
                     end_of_line=False):
    matches=re.findall(regex,text)
    for match in matches:
        index = text.find(match)
        if beginning_of_match:
            text = text[:index] + token + text[index:]
        if (end_of_match):
            text = text[:index+1] + match + token + text[len(match)+index+1:]
    if (len(matches) != 0):
        if (end_of_line):
            text = text + token
        if (beginning_of_line):
            text = token + text
    return text

def insertAuthorToken(text, token):
    return token + text

def clean_text(string):
    pattern = '(page|PAGE|Page)(\s+\|\s+)([0-9]+)(.*)$'
    output_cleaned = re.sub('\s$', '', string, flags=re.MULTILINE)
    p=re.compile(pattern,re.MULTILINE)
    output_cleaned = p.sub(" ",output_cleaned)
    output_cleaned = output_cleaned.replace('“','"').replace('”','"')
    return output_cleaned 

def find_num_sentences(text):
    num_sentences = 0

    #find and store locations of quotations within text 
    dialogue_locations = []
    dialogue_pattern = r'"(?:(?:(?!(?<!\\)").)*)[.?!,]"'
    for match in re.finditer(dialogue_pattern, text):
        s = match.start()
        e = match.end()
        dialogue_locations.append((s, e))
        

    for match in re.finditer("[!.?]", text):
        skip_punc = False
        punc_i = match.end() #get index of next punctuation

        #check if punctuation found was used in an honorific
        pat_obj = re.compile('(Mr)|(Mrs)|(Dr)|(Ms)|(Sr)|(Jr)|(Mt)', re.IGNORECASE)
        if pat_obj.search(text[punc_i-4: punc_i]):
            continue
        
        #if punctuation found is in between a set of quotes, skip iteration       
        for s, e in dialogue_locations:
            if (s <= punc_i) and (punc_i <= e):
                skip_punc = True
                break
        if skip_punc == True:
            skip_punc = False
            continue 
        num_sentences += 1

    return num_sentences

def add_tokens_to_text(line, author_token, generator, movie=False):
    punc_pattern = '[.!?]'
    tokens = ''

    #classify the lines according to literary tokens
    literary_generator = generator(line, literary_tokens, multi_class=True)
    for i, score in enumerate(literary_generator['scores']):
        if score > literary_threshold:
            tokens += "["+ literary_generator['labels'][i].upper() + "]"
        else: break
    
    #classify the lines according to subgenre tokens
    subgenre_generator = generator(line, subgenre_tokens, multi_class=False)
    if subgenre_generator['scores'][0] > subgenre_threshold:
        tokens += subgenre_token[subgenre_generator['labels'][0]]
    
    if movie:
        dialogue_pattern = r'[A-Z]+:'
        for match in re.finditer(dialogue_pattern, line):
            s = match.start()
            line = line[:s] + '[DIALOGUE]' + line[s:]
    else: 
        line = insertTokenRegex(line, r'"(?:(?:(?!(?<!\\)").)*)"', "[DIALOGUE]", beginning_of_match=True)

    line = tokens + line
    line = insertTokenRegex(line, r'((?i)Chapter\s+[0-9MDCLXVI]*.*)', "[CHAPTER]", beginning_of_match=True)
    if author_token != None: line = insertAuthorToken(line, author_token)

    return line + "\n"

def clean_files_sequential(file_paths):
    print("Loading Zero Shot Tokenization Model Pipeline\n"+"="*80)
    
    tokenizer = AutoTokenizer.from_pretrained("joeddav/xlm-roberta-large-xnli")
    model = AutoModelForSequenceClassification.from_pretrained("joeddav/xlm-roberta-large-xnli")
    generator = pipeline("zero-shot-classification", device=0, model=model, tokenizer=tokenizer)
    display.clear_output()
    
    output_path = "/content/drive/My Drive/CSCI 470 Project/Epic/Data/Cleaned_Text/"
    movie_input_path = "/content/drive/My Drive/CSCI 470 Project/Epic/Data/Horror_Movie_Transcripts"
    print("Input files: ")
    for txt in file_paths:
        print(txt)
    print("Output files: ")
    for txt in file_paths:
        print(output_path+txt.split("/")[-1])

    for i, txt_file in enumerate(file_paths):
        display.clear_output(wait=True)
        output_path_name = txt_file.split("/")[-1]
        print("Working on Cleaning "+ output_path_name)
        print(str(i)+"/"+str(len(file_paths)) + "\tFiles Completed")
        """
        Check to see if file already exists to continue cleaning for previous spot.
        """
        start_line = 0
        author_token = None
        files = glob.glob(output_path+"*"+txt_file.split("/")[-1])
        if len(files) > 0: 
            write_mode = "a"
            with open(files[0],'r',encoding="utf8") as clean_reader:
                start_line = len(clean_reader.readlines())
        else:
            """
            File doesn't already exist so setup the cleaning
            """
            write_mode = "w" # Overwrite current file
            correctTxtEncoding(txt_file)
            files = glob.glob(movie_input_path+txt_file.split("/")[-1])
            if len(files) > 0: author_token = '[MOVIE]'
            else:
                for author in author_list:
                    if txt_file.replace("_", " ").find(author) != -1:
                        author_token = "["+ author.upper() + "]"
                        break    
        
        """
        Start cleaning of THIS txt file
        """
        with open(txt_file,'r', encoding="utf8") as original_reader:
            with open(output_path+output_path_name, write_mode, encoding="utf8") as clean_writer:
                full_txt = original_reader.readlines()
                full_length = len(full_txt)
                with tqdm(total=full_length, position=start_line, leave=True) as pbar:
                    if author_token == '[MOVIE]':
                        for line in tqdm(range(start_line, full_length), position=start_line, leave=True):
                            pbar.update()
                            new_text_cleaned = clean_text(full_txt[line][:-1])+'\n'
                            tokenized_txt = add_tokens_to_text(new_text_cleaned, author_token, generator, movie=True)
                            clean_writer.write(tokenized_txt)
                    else:
                        sequence = ""
                        for line in tqdm(range(start_line, full_length), position=start_line, leave=True):
                            pbar.update()
                            new_text_cleaned = clean_text(full_txt[line][:-1]+" ")
                            sequence += new_text_cleaned
                            if find_num_sentences(sequence) > 3:
                                tokenized_txt = add_tokens_to_text(sequence, author_token, generator)
                                clean_writer.write(tokenized_txt)
                                sequence = ""


    display.clear_output()
    print("FINISHED CLEANING\n"+"="*80)
    print("Output files: ")
    for txt in file_paths:
        print("\t"+output_path+txt.split("/")[-1])

In [None]:
clean_files_sequential(get_files_in_data_folder("Stephen_King"))
clean_files_sequential(get_files_in_data_folder("Horror_Movie_Transcripts"))
clean_files_sequential(get_files_in_data_folder("Public_Domain_Horror_Novels"))
clean_files_sequential(get_files_in_data_folder("Harry_Potter"))
clean_files_sequential(get_files_in_data_folder("Clive_Barker"))
clean_files_sequential(get_files_in_data_folder("Mary_Shelly"))

#### <h4>Adding Special Tokens</h4>

In [None]:
subgenre_token = {'Vampire': '[VAMPIRE]', 'Ghost' : '[GHOST]', 'Horror' : '[HORROR]', 'Comedic Horror' : '[COMEDIC HORROR]', 'Murder' : '[MURDER]', 'Werewolf' : '[WEREWOLF]', 'Apocalypse' : '[APOCALYPSE]',
              'Haunted House' : '[HAUNTED HOUSE]', 'Witch' : '[WITCH]', 'Hell' : '[HELL]', 'Alien' : '[ALIEN]', 'Gore' : '[GORE]', 'Monster' : '[MONSTER]'}


author_token = {'Clive Barker' : '[CLIVE BARKER]', 'J. K. Rowling' : '[J.K. ROWLING]', 'Stephen King' : '[STEPHEN KING]', 'Théophile Gautier' : '[THEOPHILE GAUTIER]', 
               'James H. Hyslop' : '[JAMES H HYSLOP]', 'Lord Edward Bulwer-Lytton' : '[LORD EDWARD BULWER-LYTTON]', 'A. T. Quiller-Couch' : '[A. T. QUILLER-COUCH]', 
               'Mrs. Margaret Oliphant' : '[MRS. MARGARET OLIPHANT]', 'Ernest Theodor Amadeus Hoffmann' : '[ERNEST THEODOR AMADEUS HOFFMAN]', 'Erckmann-Chatrian' : '[ERCKMANN-CHATRAIN]', 
               'Fiona Macleod' : '[FIONA MACLEOD]', 'Amelia B. Edwards' : '[AMELIA B. EDWARDS]', 'H. B. Marryatt' : '[H. B. MARRYATT]', 'Thomas Hardy' : '[THOMAS HARDY]', 
               'Montague Rhodes James' : '[MONTAGUE RHODES JAMES]', 'Fitz-James O\'Brien' : '[FITZ-JAMES O\'BRIEN', 'James Stephen' : '[JAMES STEPHEN]', 'Alfred Lord Tennyson' : '[ALFRED LORD TENNYSON]',
               'Amelia Edwards' : '[AMELIA EDWARDS]', 'Edward Bulwer-Lytton' : '[EDWARD BULWER-LYTTON]', 'Erckmann Chatrian' : '[ERCKMANN CHATRIAN]', 'Latifa al-Zayya' : '[LATIFA AL-ZAYYA]',
               'M. R. James' : '[M. R. JAMES]', 'Paul Brandis' : '[PAUL BRANDIS]', 'Brain Evenson' : '[BRAIN EVENSON]', 'Elliott O\'Donnell' : '[ELLIOTT O\'DONNELL]', 
               'Joseph, Sheridan Le Fanu' : '[JOSEPH, SHERIDAN LE FANU]', 'Edgar Allan Poe' : '[EDGAR ALLEN POE]', 'Bram Stoker' : '[BRAM STOKER]', 'Algernon Blackwood' :'[ALGERNON BLACKWOOD]',
               'Miles Klee' : '[MILES KLEE]', 'Nnedi Okorador' : '[NNEDI OKORADOR]', 'Sofia Samatar' : '[SOFIA SAMATAR]', 'Franz Kafka' : '[FRANZ KAFKA]', 'Laird Barron' : '[LAIRD BARRON]',
               'Nathan Ballingrud' : '[NATHAN BALLINGRUD]', 'Nellie Bly' : '[NELLIE BLY]', 'William Hop Hodgson' : '[WILLIAM HOP HODGSON]', 'Ambrose Bierce' : '[AMBROSE BIERCE]',
               'Kelly Link' : '[KELLY LINK]', 'Arthur Machen' : '[ARTHUR MACHEN]', 'George Sylvester Viereck' : '[GEORGE SYLVESTER VIERECK]', 'Robert Chambers' : '[ROBERT CHAMBERS]',
               'John Meade Falkner' : '[JOHN MEADE FALKNER]', 'Ann Radcliffe' : '[ANN RADCLIFFE]', 'Howard Lovecraft' : '[HOWARD LOVECRAFT]', 'Louis Stevenson' : '[LOUIS STEVENSON]',
               'Edith Birkhead' : '[EDITH BIRKHEAD]', 'Jeff Vandermeer' : '[JEFF VANDERMEER]', 'Henry James' : '[HENRY JAMES]', 'John William Polidori' : '[JOHN WILLIAM POLIDORI]',
               'Bob Holland' : '[BOB HOLLAND]', 'Oliver Onions' : '[OLIVER ONIONS]'}


author_list = ['Clive Barker', 'J. K. Rowling', 'Stephen King', 'Théophile Gautier', 
               'James H. Hyslop', 'Lord Edward Bulwer-Lytton', 'A. T. Quiller-Couch', 
               'Mrs. Margaret Oliphant', 'Ernest Theodor Amadeus Hoffmann', 'Erckmann-Chatrian', 
               'Fiona Macleod', 'Amelia B. Edwards', 'H. B. Marryatt', 'Thomas Hardy', 
               'Montague Rhodes James', 'Fitz-James O\'Brien', 'James Stephen', 'Alfred Lord Tennyson',
               'Amelia Edwards', 'Edward Bulwer-Lytton', 'Erckmann Chatrian', 'Latifa al-Zayya',
               'M R James', 'Paul Brandis', ' Brain Evenson', 'Elliott O Donnell', 
               'Joseph, Sheridan Le Fanu', 'Edgar Allan Poe', 'Bram Stoker', 'Algernon Blackwood',
               'Miles Klee', 'Nnedi Okorador', 'Sofia Samatar', 'Franz Kafka', 'Laird Barron',
               'Nathan Ballingrud', 'Nellie Bly', 'William Hop Hodgson', 'Ambrose Bierce',
               'Kelly Link', 'Arthur Machen', 'George Sylvester Viereck', 'Robert Chambers',
               'John Meade Falkner', 'Ann Radcliffe', 'Howard Lovecraft', 'Louis Stevenson',
               'Edith Birkhead', 'Jeff Vandermeer', 'Henry James', 'John William Polidori',
               'Bob Holland', 'Oliver Onions', 'Brian Evenson', 'Prest and Rymer', 'W Bob Holland',
               'Robert Louis Stevenson', 'Howard Lovecraft', 'Hope Hodsgon']

In [None]:
# tokenizer.add_special_tokens({'pad_token': '[PAD]', 'sep_token': '[SEP]'})

import re


AUTHOR_LIST = ['Clive Barker', 'J. K. Rowling', 'Stephen King', 'Théophile Gautier', 
               'James H. Hyslop', 'Lord Edward Bulwer-Lytton', 'A. T. Quiller-Couch', 
               'Mrs. Margaret Oliphant', 'Ernest Theodor Amadeus Hoffmann', 'Erckmann-Chatrian', 
               'Fiona Macleod', 'Amelia B. Edwards', 'H. B. Marryatt', 'Thomas Hardy', 
               'Montague Rhodes James', 'Fitz-James O\'Brien', 'James Stephen', 'Alfred Lord Tennyson',
               'Amelia Edwards', 'Edward Bulwer-Lytton', 'Erckmann Chatrian', 'Latifa al-Zayya',
               'M. R. James', 'Paul Brandis', ' Brain Evenson', 'Elliott O\'Donnell', 
               'Joseph, Sheridan Le Fanu', 'Edgar Allan Poe', 'Bram Stoker', 'Algernon Blackwood',
               'Miles Klee', 'Nnedi Okorador', 'Sofia Samatar', 'Franz Kafka', 'Laird Barron',
               'Nathan Ballingrud', 'Nellie Bly', 'William Hop Hodgson', 'Ambrose Bierce',
               'Kelly Link', 'Arthur Machen', 'George Sylvester Viereck', 'Robert Chambers',
               'John Meade Falkner', 'Ann Radcliffe', 'Howard Lovecraft', 'Louis Stevenson',
               'Edith Birkhead', 'Jeff Vandermeer', 'Henry James', 'John William Polidori',
               'Bob Holland', 'Oliver Onions']
AUTHOR_LIST.sort()

GENRE_LIST = ['Vampire', 'Ghost', 'Horror', 'Comedic Horror', 'Murder', 'Werewolf', 'Apocalypse',
              'Haunted House', 'Witch', 'Hell', 'Alien', 'Gore', 'Monster']
GENRE_LIST.sort()


def insertTokenRegex(text, regex, token, 
                     beginning_of_match=False, 
                     beginning_of_line=False,
                     end_of_match=False, 
                     end_of_line=False):
    matches=re.findall(regex,text)
    for match in matches:
        index = text.find(match)
        if beginning_of_match:
            text = text[:index] + token + text[index:]
        if (end_of_match):
            text = text[:index] + match + token + text[len(match)+index+1:]
    if (len(matches) != 0):
        if (end_of_line):
            text = text + token
        if (beginning_of_line):
            text = token + text
    return text

def insertAuthorToken(text, token):
    return token + text

##### <h5>Dialogue Special Token</h5>

Usage: addDialogueTokens(text_to_add_token_to, token)

In [None]:
insertTokenRegex('Regex should return "String 1" or "String 2" or "String3" and "\\"double quoted string\\"" ', r'"(?:(?:(?!(?<!\\)").)*)"', "[DIALOGUE]", beginning_of_match=True)

"SWALES:  They won't take nothing, Dr. Seward.  I can't do. SEWARD:  Mrs. Callaway, where is my daughter? CALLAWAY:  Miss Lucy's upstairs in the house, sir, looking after that friend of hers."

##### <h5>Chapter Special Token</h5>


In [None]:
insertTokenRegex("Chapter 1 AFTER THE FLOOD (1957)", r'(Chapter\s+[0-9MDCLXVI]*)', "[CHAPTER]", beginning_of_match=True)

'[CHAPTER]Chapter 1 AFTER THE FLOOD (1957)'

##### <h5>Introduction Sentence to Paragraph Special Token</h5>


In [None]:
insertTokenRegex("Chapter 1 AFTER THE FLOOD (1957)", r'(Chapter\s+[0-9]*)', "[ISC]",end_of_line=True)


'Chapter 1 AFTER THE FLOOD (1957)[ISC]'

##### <h5>Author Special Token</h5>


In [None]:
insertAuthorToken("Chapter 1 AFTER THE FLOOD (1957)", "[STEPHEN_KING]")

'[STEPHEN_KING]Chapter 1 AFTER THE FLOOD (1957)'

##### <h5>Setting Special Token with BERT NER</h5>

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

label_list = [
    "O",       # Outside of a named entity
    "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
    "I-MISC",  # Miscellaneous entity
    "B-PER",   # Beginning of a person's name right after another person's name
    "I-PER",   # Person's name
    "B-ORG",   # Beginning of an organisation right after another organisation
    "I-ORG",   # Organisation
    "B-LOC",   # Beginning of a location right after another location
    "I-LOC"    # Location
]

sequence = "No, I'm gettin it, Billy Bob Jo, George called back at once. He rubbed at his arms, trying to make the guilty gooseflesh disappear and be smooth skin again. I just stopped to get a drink of water."

generator = pipeline('ner',model=model, tokenizer=tokenizer, grouped_entities=True)
for ner in generator(sequence):
    print(ner)

{'entity_group': 'I-PER', 'score': 0.9041999379793803, 'word': 'Billy Bob Jo'}
{'entity_group': 'I-PER', 'score': 0.9958179593086243, 'word': 'George'}


In [None]:
!pwd

/content


##### <h5>Zero-Shot Token Classification</h5>

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("joeddav/xlm-roberta-large-xnli")

model = AutoModelForSequenceClassification.from_pretrained("joeddav/xlm-roberta-large-xnli")

generator = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)
sequence_to_classify = "He reached the kitchen and swept the door shut behind him. It banged gustily. He leaned back against it with his eyes closed, sweat popped out on his arms and forehead, the box of paraffin gripped tightly in one hand."
candidate_labels = ["characterization", "character", "setting", "exposition", "climax", "resolution", "plot", "context", "action", "dialogue"]
generator("My way lay by the Great East Anglian line as far as Clayborough station, where I was to be met by one of the Dumbleton carriages and conveyed across the remaining nine miles of country.", candidate_labels, multi_class=True)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=734.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




ValueError: ignored

In [None]:
tokenizer = AutoTokenizer.from_pretrained("joeddav/xlm-roberta-large-xnli")

model = AutoModelForSequenceClassification.from_pretrained("joeddav/xlm-roberta-large-xnli")

generator = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)
sequence_to_classify = "The vampire reached the kitchen and swept the door shut behind him. It banged gustily. He leaned back against it wanted to die, blood popped out on his arms and forehead, the box of paraffin gripped tightly in one hand."
candidate_labels = ['Vampire', 'Ghost', 'Horror', 'Comedic Horror', 'Murder', 'Werewolf', 'Apocalypse',
              'Haunted House', 'Witch', 'Hell', 'Alien', 'Gore', 'Monster']
generator(sequence_to_classify, candidate_labels, multi_class=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=734.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2243942751.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'labels': ['Vampire',
  'Werewolf',
  'Monster',
  'Horror',
  'Ghost',
  'Murder',
  'Haunted House',
  'Hell',
  'Apocalypse',
  'Witch',
  'Comedic Horror',
  'Gore',
  'Alien'],
 'scores': [0.998860239982605,
  0.9866320490837097,
  0.9737526178359985,
  0.9712046980857849,
  0.9492768049240112,
  0.9465053081512451,
  0.7096695303916931,
  0.10715451091527939,
  0.04057566821575165,
  0.015528492629528046,
  0.014323946088552475,
  0.011541456915438175,
  0.0011715054279193282],
 'sequence': 'The vampire reached the kitchen and swept the door shut behind him. It banged gustily. He leaned back against it wanted to die, blood popped out on his arms and forehead, the box of paraffin gripped tightly in one hand.'}

##### <h5>1-5 Sentiment Analysis</h5>

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")


sequence = "I hate you"
generator = pipeline("sentiment-analysis",model=model, tokenizer=tokenizer, return_all_scores=True)
for ner in generator(sequence):
    print(ner)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=953.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=39.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=669491321.0, style=ProgressStyle(descri…


[{'label': '1 star', 'score': 0.6346072554588318}, {'label': '2 stars', 'score': 0.1560467928647995}, {'label': '3 stars', 'score': 0.05133006349205971}, {'label': '4 stars', 'score': 0.04053521901369095}, {'label': '5 stars', 'score': 0.11748071014881134}]


In [None]:
generator(sequence)

[{'label': '1 star', 'score': 0.6346072554588318},
 {'label': '2 stars', 'score': 0.1560468077659607},
 {'label': '3 stars', 'score': 0.05133005604147911},
 {'label': '4 stars', 'score': 0.04053521156311035},
 {'label': '5 stars', 'score': 0.11748065799474716}]

In [None]:
generator(sequence)[0]

[{'label': '1 star', 'score': 0.6346072554588318},
 {'label': '2 stars', 'score': 0.1560468077659607},
 {'label': '3 stars', 'score': 0.05133005604147911},
 {'label': '4 stars', 'score': 0.04053521156311035},
 {'label': '5 stars', 'score': 0.11748065799474716}]

In [4]:
print("Analyzing sequence:", sequence)
for d in generator(sequence)[0]:
    print(f"{d['label']} has a score of {d['score']}")

Analyzing sequence: I hate you
1 star has a score of 0.6346072554588318
2 stars has a score of 0.1560467928647995
3 stars has a score of 0.05133006349205971
4 stars has a score of 0.04053521901369095
5 stars has a score of 0.11748071014881134


In [None]:
sentiment = ""
max_score = 0
for d in generator(sequence)[0]:
    if d['score'] > max_score:
        max_score = d['score']
        sentiment = d['label']
print(sentiment, max_score)
int(sentiment.split(" ")[0])

1 star 0.6346072554588318


1

In [None]:
import numpy as np
thing = np.array([("str1", 5), ("str2", 10)])
index = np.argmax((thing[:,1]))
print(thing[index,0])

str1


In [None]:
thing[:,1]

array(['5', '10'], dtype='<U4')

### <h3>Model Setup</h3>



In [None]:
# Initializing a GPT2 configuration
configuration = GPT2Config()
# Initializing a model from the configuration
model = GPT2Model(configuration)
# Accessing the model configuration
configuration = model.config
print(configuration)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2-medium')
# # Download model and configuration from S3 and cache.
model = AutoModelWithLMHead.from_pretrained(parent_directory+checkpoint_path, pad_token_id=tokenizer.eos_token_id)

train_path = parent_directory + 'Data/Cleaned_UTF8/merged_Stephen_King_train.txt';
test_path = parent_directory + 'Data/Cleaned_UTF8/merged_Stephen_King_test.txt';
output_path = parent_directory + 'Models/gpt2_medium_Stephen_King/continued_training'
print("Train Path:\t", train_path)
print("Test Path:\t", test_path)
print("Output Directory Path:\t", output_path)

#### <h4>Loading Data</h4>

In [None]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_path = parent_directory + 'Data/Cleaned_UTF8/merged_Stephen_King_train.txt';
test_path = parent_directory + 'Data/Cleaned_UTF8/merged_Stephen_King_test.txt';
output_path = parent_directory + 'Models/gpt2_medium_Stephen_King/continued_training'
print("Train Path:\t", train_path)
print("Test Path:\t", test_path)
print("Output Directory Path:\t", output_path)
train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)

### Visualization of GPT2

In [None]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
from transformers import GPT2Tokenizer, GPT2Model
from bertviz import head_view


def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo


In [None]:
model_version = 'gpt2'
model = GPT2Model.from_pretrained(model_version, output_attentions=True)
tokenizer = GPT2Tokenizer.from_pretrained(model_version)

text = "The quick brown fox jumps over the lazy dogs."
inputs = tokenizer.encode_plus(text, return_tensors='pt', add_special_tokens=True)
input_ids = inputs['input_ids']
attention = model(input_ids)[-1]
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()
head_view(attention, tokens)

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### <h4>Training Setup</h4>

The parameters used here are explained as follows:
Output_dir is the name of the folder where the model weights are stored.
* Model_type is the name of the model. In our case we are training on the gpt-2 architecture, we use ‘gpt-2’.
* Model_name_or_path is where we define the model size to be used.(’gpt2’ for small, ‘gpt2-medium’ for a medium model and ‘gpt2-large’ for a large model)
* Do_train is essentially a flag which we define to train the model.
train_data_file is used to specify the training file name.
* Do_eval is a flag which we define whether to evaluate the model or not, if we don’t define this, there would not be a perplexity score calculated.
* Eval_data_file is used to specify the test file name.
* gradient_accumulation_steps is a parameter used to define the number of updates steps to accumulate before performing a backward/update pass.
* Overwrite_output_dir is a parameter which when specified overwrites the output directory with new weights.
* block_size is a parameter according to which the training dataset will be truncated in block of this size for training.
* Per_gpu_train_batch_size is the batch size per GPU/CPU for training.
* Save steps — allows you to periodically save weights before the final set of weights
* num_epochs — Determines how many epochs are run.

In [None]:
training_args = TrainingArguments(
    run_name="Stephen_King_medium_560_continued",
    output_dir=output_path, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory, set to true if continuing training
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=7,  # batch size for evaluation
    eval_steps = 20, # Number of update steps between two evaluations.
    save_steps = 100, # after # steps model is saved
    warmup_steps=20,# number of warmup steps for learning rate scheduler
    evaluation_strategy="steps",
    logging_steps = 20,
    logging_dir=parent_directory+"Models/gpt2_medium_Stephen_King/logs",
    do_train=True,
    do_predict=True,
    save_total_limit=3 #Only save the last 3 models
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

### <h3>Training/Finetuning</h3>

In [None]:
try:
    trainer.train()

finally:
    print("Exiting Training")
    trainer.save_model()
    # drive.flush_and_unmount()
    print('All changes made in this colab session should now be visible in Drive.')

### <h3>Outputting from the model</h3>
#### <h4>Using Pipelines</h4>

In [None]:
from transformers import pipeline

generator = pipeline('text-generation',model=output_path+"/checkpoint-560", tokenizer=tokenizer ,config={'max_length':800})

result = generator('Input Prompt')[0]['generated_text']

#### <h4>Using the Model Object</h4>

In [None]:
input_context = 'Input Prompt'
bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context

sample_outputs = model.generate(
    input_ids,
    do_sample=True, 
    max_length=100, 
    top_k=50, 
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

### <h3>Analyzing Text</h3>

#### <h4>Contradiction, Sentiment, Entailment with Roberta Model</h4>

In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

contradiction_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")

contradiction_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")

contradiction_generator = pipeline("sentiment-analysis",model=contradiction_model, tokenizer=contradiction_tokenizer, return_all_scores=True)

entailment_sequence = "The dog went to the store every day. It never bought anything."
print("Sequence:", entailment_sequence)
for d in contradiction_generator(entailment_sequence)[0]:
    print(f"{d['label']} has a score of {d['score']}")

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Sequence: The dog went to the store every day. It never bought anything.
CONTRADICTION has a score of 0.6192459464073181
NEUTRAL has a score of 0.37710049748420715
ENTAILMENT has a score of 0.0036535500548779964


#### <h4>WordNet Model</h4>
#### See if similar user input matches a special token
<hr>
<a href="https://wordnet.princeton.edu/">WordNet</a> is a lexical knowledge base that encodes a ton of useful information about how words relate to each other. NLTK provides a Python API to WordNet.

<hr>

##### Word Senses

Words with multiple meanings are called _polysemous_ words.  An example of a polysemous word is the word _bug_ which can mean 
1. an insect
2. a virus or microbe that makes you sick
3. an error in your computer program
4. a covert listening device
5. (verb) to annoy/bother
6. (verb) to wiretap

WordNet oraganizes word senses into a structure called _synsets_. Each word can have multiple synsets, and each synset represents a different meaning of the word.

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

def get_senses(word):
  """Returns a list of word senses (WordNet synsets) for a word"""
  word_senses = wn.synsets(word)
  return word_senses

def get_definition(word_sense):
  return word_sense.definition()

def get_synonyms(word_sense):
  synonyms = []
  for lemma in word_sense.lemmas():
    synonym = lemma.name().replace('_', ' ')
    synonyms.append(synonym)
  return synonyms

#Here are the word senses for "bug". We can see what their distinct meanings are 
#by getting their definitions or their synonyms from WordNet.
word_senses = get_senses("bug")
for i, word_sense in enumerate(word_senses):
  print("\nSense %d: %s" % (i, word_sense.name()))
  print("Definition: ", get_definition(word_sense))
  print("Synonyms: ", get_synonyms(word_sense))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

Sense 0: bug.n.01
Definition:  general term for any insect or similar creeping or crawling invertebrate
Synonyms:  ['bug']

Sense 1: bug.n.02
Definition:  a fault or defect in a computer program, system, or machine
Synonyms:  ['bug', 'glitch']

Sense 2: bug.n.03
Definition:  a small hidden microphone; for listening secretly
Synonyms:  ['bug']

Sense 3: hemipterous_insect.n.01
Definition:  insects with sucking mouthparts and forewings thickened and leathery at the base; usually show incomplete metamorphosis
Synonyms:  ['hemipterous insect', 'bug', 'hemipteran', 'hemipteron']

Sense 4: microbe.n.01
Definition:  a minute life form (especially a disease-causing bacterium); the term is not in technical use
Synonyms:  ['microbe', 'bug', 'germ']

Sense 5: tease.v.01
Definition:  ann

##### Wordnet Synset Hierarchy
###### Hypernyms / Hyponyms

In addition to representing word senses, WordNet also organizes words hierachically. For example, _red_ is a specific kind of _color_, or _microbe_ is a kind of _organism_.  These are example of _hyponym_ relationships.  If X is-a Y then X is a hyponym of Y, and Y is a hypernym of X. So _red_ is a hyponym of _color_ and _color_ is a hypernym of _red_.

In WordNet, each word sense (synset) has its own distinct hypernyms and hyponyms. 

In [None]:
hyper = lambda s: s.hypernyms()
hypo = lambda s: s.hyponyms()

def get_hypernyms(word_sense, depth=5):
  return list(word_sense.closure(hyper, depth=depth))

def get_hyponyms(word_sense, depth=5):
  return list(word_sense.closure(hypo, depth=depth))

word_senses = get_senses("bug")
for i, word_sense in enumerate(word_senses):
  # The synset names include a word from the set of synonyms, 
  # plus a part of speech (n for noun, v for verb), and 
  # the number of the sense (sense 01 is the most common sense).
  print("\nSense %d: %s (%s)" % (i, word_sense.name(), get_definition(word_sense)))
  print("Hypernyms:")
  hypernyms = word_sense.hypernyms()
  while len(hypernyms) >0:
    print("%s\tis a\t%s" % (word_sense.name(), hypernyms[0].name()))
    word_sense = hypernyms[0]
    hypernyms = word_sense.hypernyms()


Sense 0: bug.n.01 (general term for any insect or similar creeping or crawling invertebrate)
Hypernyms:
bug.n.01	is a	insect.n.01
insect.n.01	is a	arthropod.n.01
arthropod.n.01	is a	invertebrate.n.01
invertebrate.n.01	is a	animal.n.01
animal.n.01	is a	organism.n.01
organism.n.01	is a	living_thing.n.01
living_thing.n.01	is a	whole.n.02
whole.n.02	is a	object.n.01
object.n.01	is a	physical_entity.n.01
physical_entity.n.01	is a	entity.n.01

Sense 1: bug.n.02 (a fault or defect in a computer program, system, or machine)
Hypernyms:
bug.n.02	is a	defect.n.03
defect.n.03	is a	imperfection.n.01
imperfection.n.01	is a	state.n.02
state.n.02	is a	attribute.n.02
attribute.n.02	is a	abstraction.n.06
abstraction.n.06	is a	entity.n.01

Sense 2: bug.n.03 (a small hidden microphone; for listening secretly)
Hypernyms:
bug.n.03	is a	microphone.n.01
microphone.n.01	is a	electro-acoustic_transducer.n.01
electro-acoustic_transducer.n.01	is a	transducer.n.01
transducer.n.01	is a	electrical_device.n.01
elect

##### Manually Anotating Senses / Hypernyms / Hyponyms

In [None]:
def annotate_synsets(sentences):
  """This function queries WordNet for each word in a list of sentences,
     and asks the user to input a number corresponding to the synset."""

  word_senses = {}
  # Cached selections maps from word string to the previous
  # selection for this word (an integer)
  cached_selections = {}

  for i, sent in enumerate(sentences):
    words = word_tokenize(sent.lower())

    for word in words:
      sysnsets = wn.synsets(word)
      if len(sysnsets) != 0:
        selection = select_synset(sent, word, sysnsets, cached_selections)
        if selection != None:
          cached_selections[word] = selection
          if selection < len(sysnsets):
            s = sysnsets[selection]
            word_senses[word] = s.name()
  return word_senses


def select_synset(sent, word, sysnsets, cached_selections):
  """Ask the user to select which sense of the word  
     is being used in this sentence."""
  print(sent)
  print(word.upper())

  prev_selection = -1
  if word in cached_selections:
    prev_selection = cached_selections[word]

  for choice, s in enumerate(sysnsets):
    if choice == prev_selection:
      print("*** ", end = '')
    print("%d) %s - %s" % (choice, s.name(), s.definition()))

  choice += 1
  if choice == prev_selection:
    print("*** ", end = '')
  print("%d None of these." % choice)

  selection = -1
  while selection == -1:
    try:
      user_input = input(">")
      if user_input.strip() == 'x':
        # The user can press 'x' to exit.
        return None
      if user_input.strip() == '' and prev_selection > -1:
        # The user can press retrun to confirm the previous selection.
        return prev_selection
      selection = int(user_input)
    except:
      selection = -1
    if selection < 0 or selection > len(sysnsets):
      print("Please select a number between 0-%d, or type 'x' to exit" % len(sysnsets))
      if prev_selection > -1:
        print("You can also press return to confirm the previous selection (marked by ***).")
    else:
      return selection


def confirm_hyponyms(word, sysnset, do_hypernyms_instead=False):
  """Ask the user to confirm which of the hyponyms are applicable 
     for this sentence."""
  print(word.upper())

  confirmed = []
  if do_hypernyms_instead:
    unconfirmed = sysnset.hypernyms()
  else:
    unconfirmed = sysnset.hyponyms()

  while len(unconfirmed) > 0:
    s = unconfirmed.pop(0)
    print("Is %s an appropriate substitute for %s? (y/n)" % (s.name(), word))
    print("It means:", s.definition())
    print("Synonyms are:", get_synonyms(s))
    user_input = ''
    while user_input == '':
      user_input = input(">")
      user_input = user_input.strip()
      if user_input == 'y' or user_input == 'yes':
        confirmed.append(s.name())
        if do_hypernyms_instead:
          unconfirmed.extend(s.hypernyms())
        else:
          unconfirmed.extend(s.hyponyms())
        
      elif user_input == 'n' or user_input == 'no':
        pass
      elif user_input == 'x':
        # The user can press 'x' to exit.
        return confirmed
      else:
        print("Please type 'yes' or 'no' or 'x' to stop confirming for this word")
        user_input = ''
  return confirmed

# Save your annotations to a file, so that you can submit them with your homework.
def save_to_drive(word_senses, confirmed_hyponyms, confirmed_hypernyms):
  import json

  output_file = '/content/drive/My Drive/word-sense-annotations.json'
  output_json = {}
  output_json['senses'] = word_senses
  output_json['hyponyms'] = confirmed_hyponyms
  output_json['hypernyms'] = confirmed_hypernyms

  with open(output_file, 'w') as write_file:
    write_file.write(json.dumps(output_json, sort_keys=True, indent=4))
    write_file.write('\n')



commands = [
	'wear crown',
	'smell rose',
	'eat fish',
	'light lamp',
	'give fish to troll',
	'propose to the princess',
	'go north',
]

word_senses = annotate_synsets(commands)
confirmed_hyponyms = {}
confirmed_hypernyms = {}
for word in word_senses:
  print("First, pick the word sense for the word '%s'" % word)
  print("==============")
  word_sense = wn.synset(word_senses[word])
  print("\nNext, pick which hypernyms of %s we should allow players to use." % word_sense.name())
  print("==============")
  confirmed_hypernyms[word] = confirm_hyponyms(word, word_sense, do_hypernyms_instead=True)
  print("\Finally, pick which hyponyms of %s we should allow players to use." % word_sense.name())
  print("==============")  
  confirmed_hyponyms[word] = confirm_hyponyms(word, word_sense)


print("You're done annotating!  Save your annotation to your Google drive.")
print("You need to paste in a confirmation code to allow Colab to have access.")
print("We'll create a file called 'word-sense-annotations.json' for you to turn in.")
print("==============")
save_to_drive(word_senses, confirmed_hyponyms, confirmed_hypernyms)

for word in word_senses:
  print('\n', word.upper())
  word_sense = wn.synset(word_senses[word])
  print('Synonyms:\t', get_synonyms(word_sense))
  print('Hypernyms:', )
  for hypernym in confirmed_hypernyms[word]:
    print('\t', get_synonyms(wn.synset(hypernym)))

  print('Hyponyms:', )
  hyponyms = confirmed_hyponyms[word]
  for hyponym in hyponyms:
    print('\t', get_synonyms(wn.synset(hyponym)))
    

###### Showing Manually Added Tokens

In [None]:
import itertools #We're using the product method from itertools

def get_alternatives(word, word_senses, confirmed_hypernyms, confirmed_hyponyms):
  """Create a list of good alternatives for a word by listing out the synonyms
    for its word sense, and for its hyponyms and hypernyms."""
  alternatives = []
  if not word in word_senses:
    alternatives.append(word)
    return alternatives
  word_sense = wn.synset(word_senses[word])
  alternatives.extend(get_synonyms(word_sense))
  for hypernym in confirmed_hypernyms[word]:
    alternatives.extend(get_synonyms(wn.synset(hypernym)))
  for hyponym in confirmed_hyponyms[word]:
    alternatives.extend(get_synonyms(wn.synset(hyponym)))
  return alternatives

def enumerate_alternatives(sentence, word_senses, confirmed_hypernyms, confirmed_hyponyms):
  """Enumerate all of the sentenes that can result by taking any combination of
     the alternates for each word in the sentence."""
  words = word_tokenize(sentence.lower())
  # a list of lists
  alternatives_per_word = []
  for word in words:
    alternatives = get_alternatives(word, word_senses, confirmed_hypernyms, confirmed_hyponyms)
    alternatives_per_word.append(alternatives)
  
  alternative_to_original = {}
  # all combinations of a list of lists
  for words in list(itertools.product(*alternatives_per_word)):
    alt_sent = " ".join(words)
    alternative_to_original[alt_sent] = sentence
  return alternative_to_original


# alternative_commands is a dictionary that maps 
# the new commands onto the original ones.
alternative_commands = {}
for command in commands:
  alternative_commands.update(enumerate_alternatives(command, 
                                                     word_senses, 
                                                     confirmed_hypernyms, 
                                                     confirmed_hyponyms))

for alt_sent in alternative_commands:
  print("%s ==> %s" % (alt_sent, alternative_commands[alt_sent]))
print("Congratulations you can now handle %d commands instead of just %d!" % 
      (len(alternative_commands.keys()), len(commands)))

#### PyMagnitude For Token Similarity

In [None]:
# Install Magnitude on Google Colab
! echo "Installing Magnitude.... (please wait, can take a while)"
! (curl https://raw.githubusercontent.com/plasticityai/magnitude/master/install-colab.sh | /bin/bash 1>/dev/null 2>/dev/null)
! echo "Done installing Magnitude."
!wget http://magnitude.plasticity.ai/glove/heavy/glove.6B.300d.magnitude
#!wget http://magnitude.plasticity.ai/word2vec/heavy/GoogleNews-vectors-negative300.magnitude

from pymagnitude import *
vectors = Magnitude("glove.6B.300d.magnitude")
print("The number of words with vector representations in this file is %s." % len(vectors))
#vectors = Magnitude("GoogleNews-vectors-negative300.magnitude")

def construct_sentence_vector(command, vectors):
  sentence_vector = np.zeros(shape=(vectors.dim,))
  for word in command.split():
    word_vector = vectors.query(word)
    # TODO - Do something
  return sentence_vector

def find_most_similar_command(user_command, known_commands, vectors):
  # TODO - Do something
  return known_commands[0]



Installing Magnitude.... (please wait, can take a while)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   137  100   137    0     0    494      0 --:--:-- --:--:-- --:--:--   494
Done installing Magnitude.
--2020-11-27 01:40:59--  http://magnitude.plasticity.ai/glove/heavy/glove.6B.300d.magnitude
Resolving magnitude.plasticity.ai (magnitude.plasticity.ai)... 52.216.110.74
Connecting to magnitude.plasticity.ai (magnitude.plasticity.ai)|52.216.110.74|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1384890368 (1.3G) [binary/octet-stream]
Saving to: ‘glove.6B.300d.magnitude’


2020-11-27 01:41:29 (43.6 MB/s) - ‘glove.6B.300d.magnitude’ saved [1384890368/1384890368]

The number of words with vector representations in this file is 400000.


##### Similarity

In [None]:
print("Trolls to Ogres Similarity: ", vectors.similarity("trolls", "ogres"))
print("Trolls to Princess Similarity: ",vectors.similarity("trolls", "princesses"))
print("Princes to Princesses Similarity: ",vectors.similarity("princes", "princesses"))

# Most similar to word in list of words
vectors.most_similar_to_given("troll", ["princess", "prince", "ogre", "knight"]) 

# Top Similar words
vectors.most_similar_approx("trolls", topn = 20)

# Finding Most similar Command
construct_sentence_vector("get fish", vectors)

##### Solving Word Analogy Problems
Famously, word2vec was shown to to be able solve many word analogy problems like "***man*** is to ***king*** as ***woman*** is to **-----**".  It does this by performing some vector arithmetic.   We take the vector for *king*, subtract the vector for *man*, and then add the vector for *woman*:<p>+ *king* <p>- *man*<p>+ *woman*<p>The result is a vector.  To figure out what word is closest to it, we find the most similar word vectors to the vector that resulted from our arithmetic. 

Magnitude allows us to do this in the following way:





In [None]:
vectors.most_similar(positive = ["king", "woman"], negative = ["man"])

#### Allen NLP Dependency Parsing

In [None]:
!git clone https://github.com/allenai/allennlp.git
%cd allennlp
!pip install -e .
!pip install -r dev-requirements.txt
!pip install allennlp-models
%cd ..

def verb_object_pairs(sentence):
  print('Sentence: ')
  print(sentence)

  prediction = predictor.predict(sentence=sentence)

  words = prediction['words']
  pred_dependencies = prediction['predicted_dependencies']
  pred_heads = prediction['predicted_heads']

  pairs = []
  for i in range(len(words)):
    if pred_dependencies[i] == 'dobj':
      verb =  words[pred_heads[i]-1] # -1 is bc head indices are one-indexed
      direct_object = words[i]
      pairs.append((verb, direct_object))
  return pairs

Cloning into 'allennlp'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 33511 (delta 11), reused 8 (delta 2), pack-reused 33477[K
Receiving objects: 100% (33511/33511), 70.66 MiB | 16.20 MiB/s, done.
Resolving deltas: 100% (25056/25056), done.
/content/allennlp
Obtaining file:///content/allennlp
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting boto3<2.0,>=1.14
[?25l  Downloading https://files.pythonhosted.org/packages/cc/a8/b5037dc144e458b3574c085d891b85ab2035b63ab946b5c91c23f2dfc1c6/boto3-1.16.4-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 2.7MB/s 
Collecting transformers<3.5,>=3.1
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers

Collecting allennlp-models
[?25l  Downloading https://files.pythonhosted.org/packages/8a/b4/1f322e890a834e349ecc6543501dc3196f8cdd183117cae383385ed08665/allennlp_models-1.1.0-py3-none-any.whl (322kB)
[K     |████████████████████████████████| 327kB 2.6MB/s 
[?25hCollecting py-rouge==1.1
[?25l  Downloading https://files.pythonhosted.org/packages/9c/1d/0bdbaf559fb7afe32308ebc84a2028600988212d7eb7fb9f69c4e829e4a0/py_rouge-1.1-py3-none-any.whl (56kB)
[K     |████████████████████████████████| 61kB 4.9MB/s 
[?25hCollecting word2number>=1.1
  Downloading https://files.pythonhosted.org/packages/4a/29/a31940c848521f0725f0df6b25dca8917f13a2025b0e8fcbe5d0457e45e6/word2number-1.1.zip
Collecting allennlp==1.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/a5/95/d1d606fff85b537ba6dd133ed998ab62bf0c950feb6df2d101c0ec804ca6/allennlp-1.1.0-py3-none-any.whl (485kB)
[K     |████████████████████████████████| 491kB 8.5MB/s 
[?25hCollecting conllu==4.1
  Downloading https://files.pytho

In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.structured_prediction
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz")
predictor.predict(
  sentence="If I bring 10 dollars tomorrow, can you buy me lunch?"
)

downloading: 100%|##########| 73220444/73220444 [00:02<00:00, 27321702.19B/s]
Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.
  normalized_arc_logits.unsqueeze(1) + normalized_pairwise_head_logits


{'arc_loss': 0.379052996635437,
 'hierplane_tree': {'linkToPosition': {'acomp': 'right',
   'ccomp': 'right',
   'csubj': 'left',
   'csubjpass': 'left',
   'dobj': 'right',
   'iobj': 'right',
   'nsubj': 'left',
   'nsubjpass': 'left',
   'pcomp': 'right',
   'pobj': 'right',
   'xcomp': 'right'},
  'nodeTypeToStyle': {'acomp': ['color2'],
   'advcl': ['color3'],
   'advmod': ['color4'],
   'amod': ['color4'],
   'appos': ['color4'],
   'aux': ['color3'],
   'cc': ['color3'],
   'ccomp': ['color2'],
   'conj': ['color3'],
   'cop': ['color3'],
   'csubj': ['color1'],
   'csubjpass': ['color1'],
   'dep': ['color5', 'strong'],
   'det': ['color3'],
   'discourse': ['color3'],
   'dobj': ['color2'],
   'expletive': ['color3'],
   'infmod': ['color4'],
   'iobj': ['color2'],
   'mark': ['color2'],
   'mod': ['color4'],
   'neg': ['color0'],
   'nn': ['color4'],
   'npadvmod': ['color4'],
   'nsubj': ['color1'],
   'nsubjpass': ['color1'],
   'number': ['color3'],
   'pcomp': ['color2'],

In [None]:
print(verb_object_pairs("Take the apple from the table and eat it."))
print(verb_object_pairs("Taunt the dragon before slaying him with my sword."))

Sentence: 
Take the apple from the table and eat it.
[('Take', 'apple')]
Sentence: 
Taunt the dragon before slaying him with my sword.
[]


#### Allen NLP Coreference Resolution
You may have noticed in the previous section that we end up with verb-object pairs where the object is a pronoun.

Pronouns are words that refer to an entity that has already been mentioned in the text or is a participant in the conversation.

In English, pronouns are:

<div>
<img src="https://live.staticflickr.com/626/31598952693_017b53571c_c.jpg" width="500"/>
</div>

Since the commands in your text-adventure game are all in [inperative form](https://grammar.collinsdictionary.com/easy-learning/the-imperative), you will really only need to deal with pronouns being used as direct objects (the left column above).

You can use a coreference resolution algorithm to resolve the "it" in `Take the apple from the table and eat it.` or the "him" in `"Taunt the dragon before slaying him with my sword.`.

## Challenges with Coreference Resolution
Play around with AllenNLP's coreference resolution demo [here](https://demo.allennlp.org/coreference-resolution).

You'll notice that the system is far from perfect. AllenNLP predicts that the "it" is actually the table. This is a result of the inherent ambiguity in English language. There are a couple ways you can try to deal with this in your game.

1. Use auxiliary linguistic information (word embeddings perhaps) to figure out which entity is more likely being referenced.
2. Incorporate the coreference resolution algorithm's likely mistakes into the gameplay experience, adding humor. For example:

```
THE ROOM CONTAINS A SINGLE WOODEN TABLE. THERE IS A SHINY RED APPLE SITTING ON IT.
> Take the apple from the table and eat it.
YOU PUT THE APPLE INTO YOUR INVENTORY. YOU ATTEMPT TO TAKE A BITE OUT OF THE TABLE...OUCH! THAT HURT YOUR TEETH!
> Eat the apple.
THE APPLE TASTES DELICIOUS. HOWEVER, YOU SUDDENLY START TO FEEL VERY SLEEPY.
```


In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz")
predictor.predict(
  document="The woman reading a newspaper sat on the bench with her dog."
)


def coreference_resolution(text):
  print(text)
  prediction = predictor.predict(document=text)
  print(prediction)
  clusters = prediction['clusters']
  words = prediction['document']
  for cluster in clusters:
    entity_indices, pronoun_indices = cluster

    entity_str = words[entity_indices[0]:entity_indices[1]+1]
    pronoun_str = words[pronoun_indices[0]:pronoun_indices[1]+1]
    print('"%s" references "%s"' % (pronoun_str, entity_str))


downloading: 100%|##########| 1345947288/1345947288 [00:24<00:00, 54572658.98B/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=414.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665132540.0, style=ProgressStyle(descri…




Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
coreference_resolution("Take the apple from the table and eat it.")
coreference_resolution("John takes the apple from the table, and he eats it.")
coreference_resolution("Take the apple from the table and eat it. John likes to eat apples.")
coreference_resolution("Taunt the dragon before slaying him.")


Take the apple from the table and eat it.
{'top_spans': [[1, 2], [4, 5], [7, 7], [8, 8]], 'antecedent_indices': [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]], 'predicted_antecedents': [-1, -1, -1, 0], 'document': ['Take', 'the', 'apple', 'from', 'the', 'table', 'and', 'eat', 'it', '.'], 'clusters': [[[1, 2], [8, 8]]]}
"['it']" references "['the', 'apple']"
John takes the apple from the table, and he eats it.
{'top_spans': [[0, 0], [2, 3], [5, 6], [9, 9], [11, 11]], 'antecedent_indices': [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], 'predicted_antecedents': [-1, -1, -1, 0, 1], 'document': ['John', 'takes', 'the', 'apple', 'from', 'the', 'table', ',', 'and', 'he', 'eats', 'it', '.'], 'clusters': [[[0, 0], [9, 9]], [[2, 3], [11, 11]]]}
"['he']" references "['John']"
"['it']" references "['the', 'apple']"
Take the apple from the table and eat it. John likes to eat apples.
{'top_spans': [[1, 2], [4, 5], [8, 8], [10, 10], [11, 11], [13, 13]

#### Predicting Word Concreteness ***NOT WORKING YET***
Concreteness is a measure of how readily the concerpt repreesented by a word can be seen, smelled, heard, or felt. 

If a concept can be readily perceived by the senses then is is very concrete. If a concept cannot be perceived, then it is the opposite of concrete--abstract.

It's possible from a word's embedding to prdict how concrete the word is. 

In [None]:
!wget -N http://crr.ugent.be/papers/Concreteness_ratings_Brysbaert_et_al_BRM.txt

import csv
from tqdm import tqdm
from zlib import crc32

import sklearn
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor

import scipy.stats

# from smart_open import open

import gensim.downloader as api
text8_path = api.load('text8', return_path=True)
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

# path = get_tmpfile("word2vec.model")

# model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
# model.save("word2vec.model")

from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import Text8Corpus

# Using params from Word2Vec_FastText_Comparison
params = {
    'alpha': 0.05,
    'size': 100,
    'window': 5,
    'iter': 5,
    'min_count': 5,
    'sample': 1e-4,
    'sg': 1,
    'hs': 0,
    'negative': 5
}
# model = Word2Vec(Text8Corpus(text8_path), **params)
# print(model)

model = Word2Vec(**params)
print(model)


def read_in_data(file_path, word2vec):
  words = []
  concs = []
  embs = []
  with open(file_path) as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t',)
    for row in tqdm(reader):
      conc = float(row['Conc.M'])
      word = row['Word']
      if conc != 0:
        # 0 means there was not enough interannotator agreement for them to
        # include the score.

        word = word.replace(' ', '-').lower()
        if word in word2vec:
          # For now, skip words not in the embedding file. 
          embs.append(word2vec[word])
          words.append(word)
          concs.append(conc)
  return words, concs, embs

def floathash(b):
  return float(crc32(b.encode('utf-8')) & 0xffffffff) / 2**32

def create_split(words, concs, embs, train_prob = 0.9):
  val_words = []
  val_concs = []
  val_embs = []

  train_words = []
  train_concs = []
  train_embs = []

  for word, conc, emb in tqdm(zip(words, concs, embs)):
    if floathash(word) <= train_prob:
      train_words.append(word)
      train_concs.append(conc)
      train_embs.append(emb)
    else:
      val_words.append(word)
      val_concs.append(conc)
      val_embs.append(emb)
  return train_words, train_concs, train_embs, val_words, val_concs, val_embs 

def crush_scores(scores):
  """Turn 1-5 scores to 0-1 scale."""
  return [(s - 1) / 4.0 for s in scores]

def train_model(train_embs, train_concs, val_embs, val_concs, method='linear', normalize=False):
  print('Training with method %s, %s' % (method, '[0,1]' if normalize else '[1,4]'))
  if normalize:
    val_concs = crush_scores(val_concs)
    train_concs = crush_scores(train_concs)    
  if method == 'linear':
    model = LinearRegression()
  elif method == '2mlp':
    model = MLPRegressor(hidden_layer_sizes=[64,32])
  else:
    raise ValueError('Unsupported method')

  model = model.fit(train_embs, train_concs)
  print('Train correlation: ')
  print(scipy.stats.pearsonr(model.predict(train_embs), train_concs))
  
  print('Val correlation: ')
  print(scipy.stats.pearsonr(model.predict(val_embs), val_concs))
  
  print('')
  return model

# words, concs, embs = read_in_data('Concreteness_ratings_Brysbaert_et_al_BRM.txt', model)

# train_words, train_concs, train_embs, val_words, val_concs, val_embs = create_split(words, concs, embs, 0.95)
# print('Train set size: %d' % len(train_words))
# print('Val set size: %d' % len(val_words))

# model = train_model(train_embs, train_concs, val_embs, val_concs, '2mlp', True)


In [None]:
# # Predictions for words not in train set
# print('archetype' in train_words)
# print(model.predict([model['archetype'])]))

# print('pigtailed' in train_words)
# print(model.predict([model['pigtailed'])]))

# print('determination' in train_words)
# print(model.predict([model['determination']]))

# print('whirlpool' in train_words)
# print(model.predict([model['whirlpool']]))

#### Bert Contexutal Word Embeddings
One issue with word embeddings is that they don't handle ambiguity. If I say the word "bat", do you picture baseball or a cute flying mammal?  Word2vec would end up picking a vector somewhere in between the two.

Contextual word embeddings are word embeddings that vary based on the context in which a word is being used.

Consider the following sentences.
```
1) The bat comes out at night to eat mosquitoes.
2) The swallow flitted from branch to branch, eating mosquitoes.
3) The player dropped the bat and sprinted past first base.
```

With contextual word embeddings, the embedding of "bat" in (1) will end up being close to the embedding for "swallow" in  (2) than the embedding of "bat" in (3).

BERT is a neural network trained to produce one embedding per token in the input


In [None]:
import torch
import tensorflow as tf
import numpy as np
from transformers import BertModel, BertConfig
from scipy.spatial.distance import cosine

tokenizer = BertConfig.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_tokens_and_embeddings(text):
  inputs_ids = tokenizer.encode(text)
  input_ids = torch.tensor(inputs_ids).unsqueeze(0)  # Batch size 1

  token_embeddings, merged_embedding = model(input_ids)

  # Remove the embeddings in the first and last positions
  # which are the [CLS] and [SEP] tokens.
  token_embeddings = token_embeddings.squeeze()[1:-1, :]
  return token_embeddings.detach().numpy()

def token_indexes_for_word(tokens, word):
  """Returns the token indexes corresponding to the specified word."""
  ids = tokenizer.convert_tokens_to_ids(tokens)

  word_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
  word_len = len(word_ids)

  for i in range(len(tokens) - word_len):
    if np.all(np.equal(ids[i:(i+word_len)], word_ids)):
      return list(range(i, i+word_len))
  return None



AttributeError: ignored

##### Compute Word Embedding

In [None]:
# Since BERT uses a subword vocabulary can take up multiple tokens.
# This can be seen in the word "mosquitoes" in the following sentence.
sentence = "The bat comes out at night to eat mosquitoes."
embeddings = get_tokens_and_embeddings(sentence)
tokens = tokenizer.tokenize(sentence)
mosquitoes_indices = token_indexes_for_word(tokens, "mosquitoes")
print(sentence)
print(tokens)
print("'mosquitoes' is in token positions: %s" % str(mosquitoes_indices))

# For 'mosquitoes' and other multi-token words, a single embedding for the word
# can be computed by simply taking the embedding of the first token of the word.
# Another option is to take the mean over all of the constituent token
# embeddings.
mosquitoes_embedding = embeddings[mosquitoes_indices[0], :]
alternative_mosquitoes_embedding = np.mean(embeddings[mosquitoes_indices, :], axis=0)
print(mosquitoes_embedding.shape)
print(alternative_mosquitoes_embedding.shape)

The bat comes out at night to eat mosquitoes.
['the', 'bat', 'comes', 'out', 'at', 'night', 'to', 'eat', 'mosquito', '##es', '.']
'mosquitoes' is in token positions: [8, 9]
(768,)
(768,)


##### Comparing Contextual Word Embeddings

In [None]:
sentence = "The bat comes out at night to eat mosquitoes."
embeddings = get_tokens_and_embeddings(sentence)
animalbat_index = token_indexes_for_word(tokens, "bat")[0]
animalbat_embedding = embeddings[animalbat_index, :]

sentence = "The swallow flitted from branch to branch, eating mosquitoes."
embeddings = get_tokens_and_embeddings(sentence)
swallow_index = token_indexes_for_word(tokens, "bat")[0]
swallow_embedding = embeddings[swallow_index, :]

sentence = "The player dropped the bat and sprinted past first base."
embeddings = get_tokens_and_embeddings(sentence)
baseballbat_index = token_indexes_for_word(tokens, "bat")[0]
baseballbat_embedding = embeddings[baseballbat_index, :]

print('Distance between a swallow and an animal bat: %f' %
      cosine(animalbat_embedding, swallow_embedding))
print('Distance between an animal bat and a baseball bat: %f' %
      cosine(animalbat_embedding, baseballbat_embedding))
print('Distance between a swallow and a baseball bat: %f' %
      cosine(swallow_embedding, baseballbat_embedding))

Distance between a swallow and an animal bat: 0.346092
Distance between an animal bat and a baseball bat: 0.666941
Distance between a swallow and a baseball bat: 0.706544


### Analyzing Output
#### ROCStories
The [ROCStories task](https://cs.rochester.edu/nlp/rocstories/) involves predicting which sentence best ends a short story. The stories look something like this:

**Story**
```
Dorothy's cat was pregnant.
She didn't know how it happened.
She convinced the family to keep the kittens.
It wound up having 7 kittens.
```
**Candidate Ending 1**
```
Dorothy made sure to buy lots of cat food.
```
**Candidate Ending 2**
```
Dorothy went to the pet store and bought a new hamster.
```

The bad ending sentences are designed to be on topic but clearly incorrect to a human. Despite Ending 2 mentioning a pet store, you should have quickly guessed that Ending 1 is the correct one.

The tricky part about ROCStories is that the training set only contains 5-sentence stories with good ending sentences.
However, at test time you see two possible 5th sentences and need to classify which is better.
You can read up on the dataset and how it was collected in the [paper introducing the dataset](https://www.aclweb.org/anthology/N16-1098.pdf).


In [None]:
### Download the data
%mkdir rocstories_data
!wget -nc -O rocstories_data/train2017.csv https://docs.google.com/spreadsheets/d/1emH8KL8NVCCumZc2oMu-3YqRWddD3AqZEHvNqMdfgKA/export?format=csv
!wget -nc -O rocstories_data/valid2018.csv https://docs.google.com/spreadsheets/d/1F9vtluzD3kZOn7ULKyMQZfoRnSRzRnnaePyswkRqIdY/export?format=csv
!wget -nc -O rocstories_data/valid2016.csv https://docs.google.com/spreadsheets/d/1FkdPMd7ZEw_Z38AsFSTzgXeiJoLdLyXY_0B_0JIJIbw/export?format=csv
!wget -nc -O rocstories_data/test2016.csv  https://docs.google.com/spreadsheets/d/11tfmMQeifqP-Elh74gi2NELp0rx9JMMjnQ_oyGKqCEg/export?format=csv

import IPython
import csv
import numpy as np
# IPython.display.clear_output()  # Clear the stdout/


mkdir: cannot create directory ‘rocstories_data’: File exists
File ‘rocstories_data/train2017.csv’ already there; not retrieving.
File ‘rocstories_data/valid2018.csv’ already there; not retrieving.
File ‘rocstories_data/valid2016.csv’ already there; not retrieving.
File ‘rocstories_data/test2016.csv’ already there; not retrieving.


In [None]:

def read_rocstories_valid_csv(path):
  examples = []
  with open(path) as f:
    reader = csv.DictReader(f)
    for line in reader:
      context = [line['InputSentence1'], line['InputSentence2'],
                 line['InputSentence3'], line['InputSentence4']]
      option_0 = line['RandomFifthSentenceQuiz1']
      option_1 = line['RandomFifthSentenceQuiz2']
      label = int(line['AnswerRightEnding']) - 1
      examples.append({'context': context, 
                       'options': [option_0, option_1],
                       'label': label})
  return examples

def read_rocstories_train_csv(path):
  examples = []
  with open(path) as f:
    reader = csv.DictReader(f)
    for line in reader:
      story = [line['sentence1'], line['sentence2'],
               line['sentence3'], line['sentence4'],
               line['sentence5']]
      examples.append({'story': story})
  return examples

# Computes an accuracy given the data dictionary and a list of [0, 1] predictions.

def compute_accuracy(data, predictions):
  ground_truth = np.array([ex['label'] for ex in data])
  predictions = np.array(predictions)
  assert len(ground_truth) == len(predictions)

  return np.sum(np.equal(ground_truth, predictions)) / float(len(ground_truth))

def predict_based_on_sentiment(data):
  """Returns a list with one value per example in data.

  List values should either be 0 or 1 indicating which ending is predicted.
  """
  #### TODO: YOUR IMPLEMENTATION HERE ####
  predictions = [0] * len(data)
  return predictions




train_data = read_rocstories_train_csv('/content/rocstories_data/train2017.csv')
valid_2016_data = read_rocstories_valid_csv('/content/rocstories_data/valid2016.csv')
valid_2018_data = read_rocstories_valid_csv('/content/rocstories_data/valid2018.csv')
test_2016_data = read_rocstories_valid_csv('/content/rocstories_data/test2016.csv')

predictions_valid_2016 = predict_based_on_sentiment(valid_2016_data)
print('\n2016 validation accuracy: ' )
print(compute_accuracy(valid_2016_data, predictions_valid_2016))

predictions_valid_2018 = predict_based_on_sentiment(valid_2018_data)
print('\n2018 validation accuracy: ' )
print(compute_accuracy(valid_2018_data, predictions_valid_2018))

predictions_test_2016 = predict_based_on_sentiment(test_2016_data)
print('\n2016 test accuracy: ' )
print(compute_accuracy(test_2016_data, predictions_test_2016))




2016 validation accuracy: 
0.5141635489043292

2018 validation accuracy: 
0.5111394016549968

2016 test accuracy: 
0.51309460181721


##### ROC Stories with Bert Embedding

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import pickle

def load_bert():
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  model = BertModel.from_pretrained('bert-base-uncased')
  return model, tokenizer

def bert_embedding(text):
  inputs_ids = TOKENIZER.encode(text)
  input_ids = torch.tensor(inputs_ids).unsqueeze(0)  # Batch size 1

  _, merged_embedding = BERT_MODEL(input_ids)
  return merged_embedding.detach().numpy()
  
def get_train_embeddings(data):
  """Computes embeddings for each example in the provided train set."""
  context_embeddings = []
  ending_embeddings = []
  print('Starting')
  # for example in tqdm(data, desc='Computing BERT embeddings '):
  for idx, example in enumerate(data):
    if idx % 20 == 0:
      print('{}/{}'.format(idx+1, len(data)))
      print(' '.join(example['story']))
    context_embedding = bert_embedding(' '.join(example['story'][:4]))
    ending_embedding = bert_embedding(example['story'][4])

    context_embeddings.append(context_embedding)
    ending_embeddings.append(ending_embedding)
  context_embeddings = np.concatenate(context_embeddings, axis=0)
  ending_embeddings = np.concatenate(ending_embeddings, axis=0)
  return context_embeddings, ending_embeddings

def get_valid_embeddings(data):
  """Computes embeddings for each example in the provided validation set."""
  context_embeddings = []
  ending_0_embeddings = []
  ending_1_embeddings = []
  for example in tqdm(data, desc='Computing BERT embeddings '):
    context_embedding = bert_embedding(' '.join(example['context'][:4]))
    ending_0_embedding = bert_embedding(example['options'][0])
    ending_1_embedding = bert_embedding(example['options'][1])

    context_embeddings.append(context_embedding)
    ending_0_embeddings.append(ending_0_embedding)
    ending_1_embeddings.append(ending_1_embedding)

  context_embeddings = np.concatenate(context_embeddings, axis=0)
  ending_0_embeddings = np.concatenate(ending_0_embeddings, axis=0)
  ending_1_embeddings = np.concatenate(ending_1_embeddings, axis=0)
  return context_embeddings, ending_0_embeddings, ending_1_embeddings

# These are the lines I used to generate BERT embeddings. Since, they are slow
# to compute, we've provided the outputs as .pkl files.
# BERT_MODEL, TOKENIZER = load_bert()
# train_context_embs, train_ending_embs = get_train_embeddings(train_data)
# valid_2016_context_embs, valid_2016_ending_0_embs, valid_2016_ending_1_embs = get_valid_embeddings(valid_2016_data)
# valid_2018_context_embs, valid_2018_ending_0_embs, valid_2018_ending_1_embs = get_valid_embeddings(valid_2018_data)
# test_2016_context_embs, test_2016_ending_0_embs, test_2016_ending_1_embs = get_valid_embeddings(test_2018_data)

!gsutil cp gs://cis700_shared_data/rocstories_data/rocstories_train.pkl /content/rocstories_train.pkl
with open('/content/rocstories_train.pkl', 'rb') as f:
  data = pickle.load(f)
  train_context_embs = data['contexts']
  train_ending_embs = data['endings']

!gsutil cp gs://cis700_shared_data/rocstories_data/rocstories_valid_2016.pkl /content/rocstories_valid_2016.pkl
with open('/content/rocstories_valid_2016.pkl', 'rb') as f:
  data = pickle.load(f)
  valid_2016_context_embs = data['contexts']
  valid_2016_ending_0_embs = data['endings_0']
  valid_2016_ending_1_embs = data['endings_1']

!gsutil cp gs://cis700_shared_data/rocstories_data/rocstories_valid_2018.pkl /content/rocstories_valid_2018.pkl
with open('/content/rocstories_valid_2018.pkl', 'rb') as f:
  data = pickle.load(f)
  valid_2018_context_embs = data['contexts']
  valid_2018_ending_0_embs = data['endings_0']
  valid_2018_ending_1_embs = data['endings_1']

!gsutil cp gs://cis700_shared_data/rocstories_data/rocstories_test_2016.pkl /content/rocstories_test_2016.pkl
with open('/content/rocstories_test_2016.pkl', 'rb') as f:
  data = pickle.load(f)
  test_2016_context_embs = data['contexts']
  test_2016_ending_0_embs = data['endings_0']
  test_2016_ending_1_embs = data['endings_1']

Copying gs://cis700_shared_data/rocstories_data/rocstories_train.pkl...
/ [1 files][308.6 MiB/308.6 MiB]                                                
Operation completed over 1 objects/308.6 MiB.                                    
Copying gs://cis700_shared_data/rocstories_data/rocstories_valid_2016.pkl...
\ [1 files][ 16.4 MiB/ 16.4 MiB]                                                
Operation completed over 1 objects/16.4 MiB.                                     
Copying gs://cis700_shared_data/rocstories_data/rocstories_valid_2018.pkl...
\ [1 files][ 13.8 MiB/ 13.8 MiB]                                                
Operation completed over 1 objects/13.8 MiB.                                     
Copying gs://cis700_shared_data/rocstories_data/rocstories_test_2016.pkl...
\ [1 files][ 16.4 MiB/ 16.4 MiB]                                                
Operation completed over 1 objects/16.4 MiB.                                     


###### Bert Embedding Training on Train Set

In [None]:
import tensorflow as tf
import random

def get_batch(batch_size, num_candidates):
  """Returns a single training batch.
  
  Returns:
  batch_inputs: [batch_size, embedding_size] matrix of context embeddings.
  batch_candidates: [num_candidates, embedding_size] matrix of embeddings of 
    candidate 5th sentence embeddings. The groundtruth 5th sentence for the ith
    example in batch_inputs is in the ith row of batch_candidates.
  labels: [batch_size] For each example in batch_inputs, the index of the true
    5th sentence in batch_candidates.
  """
  if num_candidates < batch_size:
    raise ValueError(
        'At minimum the number of candidates is at least all of the other 5th '
        'sentences in the batch.')
    
  batch_inputs = []
  batch_candidates = []
  batch_labels = []
  for i in range(batch_size):
    rand_ex_index = random.randint(0, train_context_embs.shape[0]-1)
    batch_inputs.append(train_context_embs[rand_ex_index, :])
    batch_candidates.append(train_ending_embs[rand_ex_index, :])
    # The true next embedding is in the ith position in the candidates
    batch_labels.append(i)

  # Increase the number of "distractor" candidates to num_candidates.
  for i in range(num_candidates - batch_size):
    rand_ex_index = random.randint(0, train_context_embs.shape[0]-1)
    batch_candidates.append(train_ending_embs[rand_ex_index, :])

  batch_inputs = np.stack(batch_inputs, axis=0)
  batch_candidates = np.stack(batch_candidates, axis=0)
  return batch_inputs, batch_candidates, batch_labels

def predict_based_on_bert_classifier(
    context_embs, ending_0_embs, ending_1_embs, model):
  """Returns a list of predictions based on model."""
  predicted_embs = model(context_embs)
  
  predictions = []
  for idx in range(predicted_embs.shape[0]):
    pred_emb = predicted_embs[idx, :]
    score_0 = np.dot(pred_emb, ending_0_embs[idx, :])
    score_1 = np.dot(pred_emb, ending_1_embs[idx, :])
    predictions.append(score_0 < score_1)
  return predictions
  
def get_model():
  """Returns a Keras model.
  The model should input a [batch_size, embedding_size] tensor and output a new
  [batch_size, embedding_size] tensor. At it's simplest, it could just be a
  single dense layer. You should experiment with adding layers, changing the
  activation function, or otherwise modifying the architecture defined below.
  See:
  https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense
  
  """

  # This is an example of a very simple network consisting of a single nonlinear
  # layer followed by a linear projection back to the BERT embedding size.
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(512, activation="relu"))
  model.add(tf.keras.layers.Dense(768, activation="linear"))
  
  return model

#### HYPERPARAMETERS ####
NUM_TRAIN_STEPS = 10000  # How many step to train for.
BATCH_SIZE = 32  # Number of examples used in step of training.
NUM_CANDIDATES = 50  # Number of candidate 5th sentences classifier must decide between.
LEARNING_RATE = 0.001  # Learning rate.
# If your loss is barely going down, learning rate might be too small.
# If your loss is jumping around, it might be too big.

# You may experiment with other optimizers or loss functions if you'd like.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model = get_model()

# Iterate over the batches of a dataset.
for train_step in range(NUM_TRAIN_STEPS):
  with tf.GradientTape() as tape:
    batch_inputs, batch_candidates, batch_labels = get_batch(BATCH_SIZE, NUM_CANDIDATES)

    # Predicted 5th sentence embedding for each batch position/
    outputs = model(batch_inputs)
    # The logits will be batch_size * num_candidates, giving a score for each
    # candidate 5th sentence. We'd like the true 5th sentence to have the
    # highest score.
    logits = tf.matmul(outputs, batch_candidates, transpose_b=True)
    # Loss value for this minibatch
    loss_value = loss_fn(batch_labels, logits)

  grads = tape.gradient(loss_value, model.trainable_weights)
  optimizer.apply_gradients(zip(grads, model.trainable_weights))

  if train_step % 100 == 0:
    print('Step {}, batch_train_loss={}'.format(train_step, loss_value))
  if train_step % 1000 == 0:
    predictions_2016 = predict_based_on_bert_classifier(valid_2016_context_embs, valid_2016_ending_0_embs, valid_2016_ending_1_embs,model)
    predictions_2018 = predict_based_on_bert_classifier(valid_2018_context_embs, valid_2018_ending_0_embs, valid_2018_ending_1_embs,model)
    
    print('2016 validation accuracy: {}'.format(compute_accuracy(valid_2016_data, predictions_2016)))
    print('2018 validation accuracy: {}'.format(compute_accuracy(valid_2018_data, predictions_2018)))

predictions_2016 = predict_based_on_bert_classifier(
    valid_2016_context_embs, valid_2016_ending_0_embs, valid_2016_ending_1_embs,
    model)
print('\n2016 validation accuracy: ' )
print(compute_accuracy(valid_2016_data, predictions_2016))

predictions_2018 = predict_based_on_bert_classifier(
    valid_2018_context_embs, valid_2018_ending_0_embs, valid_2018_ending_1_embs,
    model)
print('\n2018 validation accuracy: ' )
print(compute_accuracy(valid_2018_data, predictions_2018))

predictions_2016 = predict_based_on_bert_classifier(
    test_2016_context_embs, test_2016_ending_0_embs, test_2016_ending_1_embs,
    model)
print('\n2016 test accuracy: ' )
print(compute_accuracy(test_2016_data, predictions_2016))

Step 0, batch_train_loss=4.469831466674805
2016 validation accuracy: 0.5093532870122929
2018 validation accuracy: 0.513049013367282
Step 100, batch_train_loss=3.732597589492798
Step 200, batch_train_loss=3.5529496669769287
Step 300, batch_train_loss=3.290818691253662
Step 400, batch_train_loss=3.2141976356506348
Step 500, batch_train_loss=3.2065553665161133
Step 600, batch_train_loss=3.223421573638916
Step 700, batch_train_loss=3.055365800857544
Step 800, batch_train_loss=3.145728588104248
Step 900, batch_train_loss=2.8109350204467773
Step 1000, batch_train_loss=2.7897844314575195
2016 validation accuracy: 0.5686798503474078
2018 validation accuracy: 0.5716104392106939
Step 1100, batch_train_loss=2.8699021339416504
Step 1200, batch_train_loss=2.9516353607177734
Step 1300, batch_train_loss=2.82285213470459
Step 1400, batch_train_loss=3.1685118675231934
Step 1500, batch_train_loss=2.8744866847991943
Step 1600, batch_train_loss=2.9216463565826416
Step 1700, batch_train_loss=2.733722686767

###### Bert Embedding Training on Validation Set

In [None]:
def get_batch_from_valid(batch_size, inputs, labels):
  """Returns a single training batch extracted form the validation set.

  Inputs:
  batch_size: The batch size.
  inputs: [dataset_size, 2*embedding_size] matrix of all inputs in the training
    set.
  labels: [dataset_size] for each example, 0 if example has the incorrect ending
    embedding, 1 if it has the correct ending embedding.
  
  Returns:
  batch_inputs: [batch_size, 2*embedding_size] matrix of embeddings (each
    embedding is a context embedding concatenated with an ending embedding).
  labels: [batch_size] For each example in batch_inputs, contains either 0 or 1,
    indicating whether the 5th ending is the correct one.
  """
  batch_inputs = []
  batch_labels = []
  for i in range(batch_size):
    rand_ex_index = random.randint(0, inputs.shape[0]-1)    
    batch_inputs.append(inputs[rand_ex_index, :])
    batch_labels.append(labels[rand_ex_index])
    
  batch_inputs = np.stack(batch_inputs, axis=0)
  return batch_inputs, batch_labels

# Each input example consists of a context_embedding concatenated with an ending embedding.
def build_dataset():
  """Builds a dataset out of the validation set examples.

  Each example in valid_2016 and valid_2018 becomes two exampes in this new 
  dataset:
  * one where ending_0's embedding is concatenated to the context embedding
  * one where ending_1's embedding is concatenated to the context embedding

  The label for each example is 1 if the correct ending's embedding is present,
  0 if the incorrect ending's embedding is present.

  Returns:
  all_inputs: [new_dataset_size, embedding_size*2]
  all_labels: [new_dataset_size]
  """
  inputs_2016 = tf.concat(
      [tf.concat([valid_2016_context_embs, valid_2016_ending_0_embs], axis=-1),
      tf.concat([valid_2016_context_embs, valid_2016_ending_1_embs], axis=-1)], axis=0)
  labels = [ex['label'] for ex in valid_2016_data]
  labels_2016 = labels + [1 - label for label in labels]

  inputs_2018 = tf.concat(
      [tf.concat([valid_2018_context_embs, valid_2018_ending_0_embs], axis=-1),
      tf.concat([valid_2018_context_embs, valid_2018_ending_1_embs], axis=-1)], axis=0)
  labels = [ex['label'] for ex in valid_2018_data]
  labels_2018 = labels + [1 - label for label in labels]

  all_inputs = tf.concat([inputs_2016, inputs_2018], axis=0)
  all_labels = labels_2016 + labels_2018

  return all_inputs, all_labels

def predict_based_on_bert_binary_classifier(
    context_embs, ending_0_embs, ending_1_embs, model):
  """Returns a list of predictions based on binary classification model."""
  scores_ending_0 = model(tf.concat([context_embs, ending_0_embs], -1))
  scores_ending_1 = model(tf.concat([context_embs, ending_1_embs], -1))
  predictions = tf.greater(scores_ending_0, scores_ending_1)[:, 1]
  return predictions

def get_binary_classifier():
  """Returns a Keras model.
  The model should input a [batch_size, 2*embedding_size] tensor and output a
  [batch_size, 2] tensor. The final final dimension needs to be 2 because we are
  doing binary classification.
  
  You should experiment with modifying the architecture below.
  See:
  https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense
  
  """

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(512, activation="relu"))
  model.add(tf.keras.layers.Dense(2, activation="linear"))
  
  return model

NUM_TRAIN_STEPS = 20000  # How many step to train for.
BATCH_SIZE = 32  # Number of examples used in step of training.
LEARNING_RATE = 0.001  # Learning rate.

NUM_TRAIN_EXAMPLES = 5000 # How many examples from the valid set to use for training.
# The remainder will be placed into a new valid set.

# You should with varying NUM_TRAIN_EXAMPLES. If it is larger, you will train a 
# better model, but you will have fewer examples available your validation set
# for tuning other hyperparameters.
all_inputs, all_labels = build_dataset()
train_inputs = all_inputs[:NUM_TRAIN_EXAMPLES, :]
train_labels = all_labels[:NUM_TRAIN_EXAMPLES]
valid_inputs = all_inputs[NUM_TRAIN_EXAMPLES:, :]
valid_labels = all_labels[NUM_TRAIN_EXAMPLES:]

# You may experiment with other optimizers or loss functions if you'd like.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model_2 = get_binary_classifier()

# Iterate over the batches of a dataset.
for train_step in range(NUM_TRAIN_STEPS):
  with tf.GradientTape() as tape:
    batch_inputs, batch_labels = get_batch_from_valid(
        BATCH_SIZE, train_inputs, train_labels)

    logits = model_2(batch_inputs)
    loss_value = loss_fn(batch_labels, logits)

  grads = tape.gradient(loss_value, model_2.trainable_weights)
  optimizer.apply_gradients(zip(grads, model_2.trainable_weights))

  if train_step % 100 == 0:
    batch_acc = sum(tf.equal(batch_labels, tf.argmax(logits, axis=-1)).numpy()) / BATCH_SIZE
    print('Step {0}, batch_loss={1:.5f}, batch_acc={2:.3f}'.format(
        train_step, loss_value, batch_acc))
  if train_step % 1000 == 0:
    valid_logits = model_2(valid_inputs)
    num_correct = sum(tf.equal(valid_labels, tf.argmax(valid_logits, axis=-1)).numpy())
    print('Validation accuracy: {0:.3f}'.format(num_correct / len(valid_labels)))

# We can no longer fairly evaluate on the 2016 and 2018 validation sets since
# they've been used for training. Instead, we only evaluate on the 2016 test set.

predictions_2016 = predict_based_on_bert_binary_classifier(
    test_2016_context_embs, test_2016_ending_0_embs, test_2016_ending_1_embs,
    model_2)
print('\n2016 test accuracy: ' )
print(compute_accuracy(test_2016_data, predictions_2016))


Step 0, batch_loss=0.72578, batch_acc=0.500
Validation accuracy: 0.486
Step 100, batch_loss=0.72898, batch_acc=0.438
Step 200, batch_loss=0.59476, batch_acc=0.688
Step 300, batch_loss=0.68226, batch_acc=0.531
Step 400, batch_loss=0.70704, batch_acc=0.500
Step 500, batch_loss=0.85098, batch_acc=0.406
Step 600, batch_loss=0.64028, batch_acc=0.656
Step 700, batch_loss=0.61611, batch_acc=0.812
Step 800, batch_loss=0.69246, batch_acc=0.469
Step 900, batch_loss=0.60957, batch_acc=0.688
Step 1000, batch_loss=0.69638, batch_acc=0.531
Validation accuracy: 0.517
Step 1100, batch_loss=0.69869, batch_acc=0.469
Step 1200, batch_loss=0.68641, batch_acc=0.594
Step 1300, batch_loss=0.67161, batch_acc=0.594
Step 1400, batch_loss=0.67791, batch_acc=0.562
Step 1500, batch_loss=0.65010, batch_acc=0.688
Step 1600, batch_loss=0.64370, batch_acc=0.688
Step 1700, batch_loss=0.65998, batch_acc=0.656
Step 1800, batch_loss=0.74999, batch_acc=0.406
Step 1900, batch_loss=0.64760, batch_acc=0.656
Step 2000, batch_l

#### Cloze Stories Test
The  cloze  task  (Taylor,  1953)  is  used  to  evaluatea human (or a system) for language understandingby  deleting  a  random  word  from  a  sentence  andhaving  a  human  fill  in  the  blank.    We  introduce‘Story  Cloze  Test’,  in  which  a  system  is  given  afour-sentence ‘context’ and two alternative endingsto the story,  called ‘right ending’ and ‘wrong end-ing’.  Hence, in this test the fifth sentence is blank.Then the system’s task is to choose the right end-ing. The ‘right ending’ can be viewed as ‘entailing’hypothesis in a classic Recognizing Textual Entail-ment (RTE) framework (Giampiccolo et al., 2007),and ‘wrong’ ending can be seen as the ’contradict-ing’ hypothesis. 

(4) (PDF) A Corpus and Cloze Evaluation for Deeper Understanding of Commonsense Stories. Available from: https://www.researchgate.net/publication/305342142_A_Corpus_and_Cloze_Evaluation_for_Deeper_Understanding_of_Commonsense_Stories [accessed Oct 24 2020].

## Processing User Input
### Steps


1.   Create UI
 * Select Author Style (single author, list of authors, all): Checkbox with all options
 * Select the genere/s: Checkbox with all the options
 *   Character Init
 *   Start story button (Clears the setup UI and displays generation loop UI)
 *   Dropdown for type of Input (Direct Line, Summarize, Question, Continue)
 *  Text area for user input; Merge with below?
 *  Text area with editable text. This will be the output and context for the next round. The user should be able to edit the text the AI has written before submitting it for the next generation cycle.
 *  Toggle/Checkbox to have produced content read allowed
 * Range Slider for length of text to generate on each run




2.   Analyze Input
 * Sentiment Analysis
 * Match input words to special token words



#### UI

In [None]:
%%html
<style>
.mytext {
    font-size: 20px;
    color: red;
}
.widget-select > select {background-color: red;}

.box-style {
    background-color: white;
}

.widget-box, .widget-vbox { 
   box-sizing: border-box; 
   background-color: white;
   margin: 0; 
   overflow: auto; 
 } 
</style>


In [None]:
from ipywidgets import *
from IPython import *

AUTHOR_LIST = ['Clive Barker', 'J. K. Rowling', 'Stephen King', 'Théophile Gautier', 
               'James H. Hyslop', 'Lord Edward Bulwer-Lytton', 'A. T. Quiller-Couch', 
               'Mrs. Margaret Oliphant', 'Ernest Theodor Amadeus Hoffmann', 'Erckmann-Chatrian', 
               'Fiona Macleod', 'Amelia B. Edwards', 'H. B. Marryatt', 'Thomas Hardy', 
               'Montague Rhodes James', 'Fitz-James O\'Brien', 'James Stephen', 'Alfred Lord Tennyson',
               'Amelia Edwards', 'Edward Bulwer-Lytton', 'Erckmann Chatrian', 'Latifa al-Zayya',
               'M. R. James', 'Paul Brandis', ' Brain Evenson', 'Elliott O\'Donnell', 
               'Joseph, Sheridan Le Fanu', 'Edgar Allan Poe', 'Bram Stoker', 'Algernon Blackwood',
               'Miles Klee', 'Nnedi Okorador', 'Sofia Samatar', 'Franz Kafka', 'Laird Barron',
               'Nathan Ballingrud', 'Nellie Bly', 'William Hop Hodgson', 'Ambrose Bierce',
               'Kelly Link', 'Arthur Machen', 'George Sylvester Viereck', 'Robert Chambers',
               'John Meade Falkner', 'Ann Radcliffe', 'Howard Lovecraft', 'Louis Stevenson',
               'Edith Birkhead', 'Jeff Vandermeer', 'Henry James', 'John William Polidori',
               'Bob Holland', 'Oliver Onions']
AUTHOR_LIST.sort()

GENRE_LIST = ['Vampire', 'Ghost', 'Horror', 'Comedic Horror', 'Murder', 'Werewolf', 'Apocalypse',
              'Haunted House', 'Witch', 'Hell', 'Alien', 'Gore', 'Monster']
GENRE_LIST.sort()

box_layout = Layout(display='inline-flex',
                    flex_flow='row wrap',
                    align_items='stretch',
                    border='solid',
                    width='100%',
                    justify_content='space-between'
                    )

author_checkboxes=[]
for author in AUTHOR_LIST:
    author_checkboxes.append(
        widgets.Checkbox(
        value=False,
        description='<span style="font-size:1rem">'+author+'</span>',
        disabled=False,
        indent=False,
        layout=Layout(width="30%")
    ))
    author_checkboxes[-1].add_class("mytext")
author_checkboxes = HBox(author_checkboxes, layout=box_layout)
author_checkboxes.add_class("box-style")

genre_checkboxes=[]
for genre in GENRE_LIST:
    genre_checkboxes.append(
        widgets.Checkbox(
        value=False,
        description='<span style="font-size:1rem">'+genre+'</span>',
        disabled=False,
        indent=False,
        layout=Layout(width="30%")
    ))
genre_checkboxes = HBox(genre_checkboxes, layout=box_layout)

start_btn = Button(description='Start Story!',layout=Layout(width='20%', padding="0.25rem 1rem 1rem"))
start_btn.style.button_color = 'rgba(52, 235, 67,1)'
appLayout = VBox([author_checkboxes, genre_checkboxes, start_btn])
appLayout.add_class("box-style")
appLayout



VBox(children=(HBox(children=(Checkbox(value=False, description='<span style="font-size:1rem"> Brain Evenson</…

In [None]:
input_type = widgets.Combobox(
    placeholder='Input Action',
    options=['Ask a Question', 'Summarize', 'Direct Line Addition', 'Continue'],
    description='User Input: ',
    ensure_option=True,
    disabled=False
)

input_area = widgets.Textarea(
    value='',
    placeholder='Type something',
    description='Your Story: ',
    disabled=False
)

generate_btn = Button(description='Generate!',
           layout=Layout(width='20%', height='1rem'))


generation_layout = HBox([input_type, input_area, generate_btn])
generation_layout

NameError: ignored

#### Character Generator and Text to Speech

In [None]:
from transformers.pipelines import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
!pip install gTTS
from gtts import gTTS
from IPython.display import Audio
from IPython.display import clear_output

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")

# Character Generator
characterGenerator = pipeline("summarization", model=model, tokenizer=tokenizer)
characterDescription = "Carson is a CS student who likes to code. He spends his free time out in nature with his friends."
# characterDescription = input("Write a small summary about your character:")
summary = characterGenerator(characterDescription, min_length=len(characterDescription))[0]['summary_text']
clear_output()
print(summary)

# Text to speech
gTTS(summary).save('1.wav')
sound_file = '1.wav'
Audio(sound_file, autoplay=False)

ModuleNotFoundError: ignored

### User Speech to Text

In [None]:
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install pyaudio ffmpeg-python
!pip install SpeechRecognition

"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg
import speech_recognition as SR
import scipy

AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""

def get_audio():
  display(HTML(AUDIO_HTML))
  data = eval_js("data")
  binary = b64decode(data.split(',')[1])
  
  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  
  riff_chunk_size = len(output) - 8
  # Break up the chunk size into four bytes, held in b.
  q = riff_chunk_size
  b = []
  for i in range(4):
      q, r = divmod(q, 256)
      b.append(r)

  # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.
  riff = output[:4] + bytes(b) + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr


r = SR.Recognizer()
audio, sr = get_audio()
scipy.io.wavfile.write('recording.wav', sr, audio)

recording = SR.AudioFile('recording.wav')
with recording as source:
    r.adjust_for_ambient_noise(source)
    audio = r.listen(source)

print("Converting Speech to Text...")

try:
    print("You said: " + r.recognize_google(audio))
except Exception as e:
    print("Couldn't recognize what you said")

   

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libportaudio2 is already the newest version (19.6.0-1).
libportaudiocpp0 is already the newest version (19.6.0-1).
portaudio19-dev is already the newest version (19.6.0-1).
libasound2-dev is already the newest version (1.1.3-5ubuntu0.5).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 21 not upgraded.


Converting Speech to Text...
You said: I was at their last night with my family


# Frontend/Backend Development

# Flask
Google Colab provides a VM(virtual machine) so we cannot access the localhost(all it does it route it to our local machine’s localhost) as we do on our local machine when running a local web server. What we can do is expose it to a public URL using ngrok. Here comes the Python library flask-ngrok.



---


Package the app for the host platform:

npm run package (output binaries will be stored in /dist)

In [None]:
!pip install flask-ngrok
from flask_ngrok import run_with_ngrok
from flask import Flask
app = Flask(__name__)
run_with_ngrok(app)   #starts ngrok when the app is run
@app.route("/")
def home():
    return "<h1>Running Flask on Google Colab!</h1>"

# Application is running on http://_________.ngrok.io  
app.run()

Collecting flask-ngrok
  Downloading https://files.pythonhosted.org/packages/af/6c/f54cb686ad1129e27d125d182f90f52b32f284e6c8df58c1bae54fa1adbc/flask_ngrok-0.0.25-py3-none-any.whl
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://6387d2ea86d1.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [19/Nov/2020 17:40:53] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Nov/2020 17:40:53] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [19/Nov/2020 17:40:58] "[37mGET / HTTP/1.1[0m" 200 -


In [None]:
from flask import Flask, render_template
import os
import sys
from flask import Blueprint





class BaseConfig:
    SECRET_KEY = os.getenv('SECRET_KEY', 'REPLACE ME')


class DevelopmentConfig(BaseConfig):
    DEBUG = True


class TestingConfig(BaseConfig):
    DEBUG = True


class ProductionConfig(BaseConfig):
    pass

def init_extensions(app: Flask):
    # use .init_app() on your extensions to register them on
    # the Flask instance
    pass


def get_root_dir_abs_path() -> str:
    """
    Get the absolute path to the root directory of the application.
    """
    # Check if the application runs in a bundled executable from PyInstaller.
    # When executed, the bundled executable get's unpacked into the temporary directory sys._MEIPASS.
    # See also: https://pyinstaller.readthedocs.io/en/stable/runtime-information.html#using-file
    return getattr(sys, "_MEIPASS", os.path.abspath(os.path.dirname(__file__)))


def create_app(config_object_name) -> Flask:
    """
    :param config_object_name: The python path of the config object.
                               E.g. appname.settings.ProdConfig
    """

    root_dir_abs_path = get_root_dir_abs_path()

    # Initialize the core application
    app = Flask(
        __name__,
        instance_relative_config=False,
        static_folder=os.path.join(root_dir_abs_path, "static"),
        template_folder=os.path.join(root_dir_abs_path, "templates"),
    )
    app.config.from_object(config_object_name)

    # Initialize Plugins at startup using init_app()
    init_extensions(app)
    home_blueprint = Blueprint("home", __name__)
    with app.app_context():
        # Register Blueprints
        app.register_blueprint(home_blueprint, url_prefix="/")

        @app.errorhandler(404)
        def page_not_found(error):
            return render_template("page/errors/404.html", title="Page Not Found"), 404

        return app


@home.route("/")
def homepage():
    """
    Render the homepage template on the / route
    """
    return render_template("page/home/index.html", title="Welcome")


@home.route("/dashboard")
def dashboard():
    """
    Render the dashboard template on the /dashboard route
    """
    for a
    author selection = "<div><label>Author><input><div>"
    
    return render_template("page/home/dashboard.html", title="Dashboard", author_selection)


application = create_app("config.DevelopmentConfig")
application.run(host="0.0.0.0", port=4040)