In [2]:
import openai
import pandas as pd
import numpy as np
import re
import nltk

In [4]:
def get_book_blocks(book_content, block_size=512):
    
    tokenized_book = nltk.word_tokenize(book_content)
    book_blocks = [" ".join(tokenized_book[i:i+block_size]) for i in range(0, len(tokenized_book), block_size)]
    
    return book_blocks

def clean_book_blocks(book_blocks):
    
    # remove empty blocks
    book_blocks = [block for block in book_blocks if block.strip() != ""]
    
    # remove blocks with less than 20 tokens
    book_blocks = [block for block in book_blocks if len(nltk.word_tokenize(block)) > 20]
    
    # remove special characters such as \n, \t, \r
    book_blocks = [block.replace("\n", " ").replace("\t", " ").replace("\r", " ") for block in book_blocks]
    
    # remove multiple spaces
    book_blocks = [re.sub(' +', ' ', block) for block in book_blocks]
    
    # make lowercase
    book_blocks = [block.lower() for block in book_blocks]
    
    # remove special utf-8 characters such as \xa0, and â\x80\x99, and \x9d etc
    book_blocks = [re.sub(r'[^\x00-\x7f]',r'', block) for block in book_blocks]
    
    return book_blocks

def block_word_count(book_blocks):
    
    word_count = [len(re.findall(r'\w+', block)) for block in book_blocks]
    
    return word_count

def block_token_count(book_blocks):
    
    token_size = [len(block.split()) for block in book_blocks]
    
    return token_size

def build_dataset(author_name, author_bio, book_name, book_id, book_content) -> pd.DataFrame:
    
    df = pd.DataFrame({"Author_Name": [author_name], 
                       "Author_Bio": [author_bio], 
                       "Book_Name": [book_name], 
                       "Book_ID": [book_id], 
                       "Book_Content": [book_content]})
    
    return df

In [6]:
with open("metadata/Antoine de Saint-Exupery.txt", "r") as f:
    author = f.read()
    
with open("metadata/El Principito.txt", "r", encoding='UTF8') as f:
    book = f.read()

author_name = "Antoine de Saint-Exupéry"
author_bio = author
book_name = "El Principito"
book_id = 9999
book_content = book

train_data = build_dataset(author_name, author_bio, book_name, book_id, book_content)

book_blocks = get_book_blocks(book_content, block_size=2000)
cleaned_book_blocks = clean_book_blocks(book_blocks)

train_data["Book_Blocks"] = [cleaned_book_blocks]
train_data["Word_Count"] = [block_word_count(cleaned_book_blocks)]
train_data["Token_Count"] = [block_token_count(cleaned_book_blocks)]

In [7]:
train_data

Unnamed: 0,Author_Name,Author_Bio,Book_Name,Book_ID,Book_Content,Book_Blocks,Word_Count,Token_Count
0,Antoine de Saint-Exupéry,"Antoine Marie Jean-Baptiste Roger, comte de Sa...",El Principito,9999,el principito antoine de saintexupery a leon w...,[el principito antoine de saintexupery a leon ...,"[1997, 1999, 1999, 1996, 1999, 2000, 994]","[2000, 2000, 2000, 2000, 2000, 2000, 994]"


In [8]:
def summarize_book(book_content, 
                   block_size=2000, 
                   max_tokens=200, 
                   temperature=0.5, 
                   top_p=1.0, 
                   frequency_penalty=0.0, 
                   presence_penalty=0.0):
    
    book_blocks = get_book_blocks(book_content)
    cleaned_book_blocks = clean_book_blocks(book_blocks)
    
    book_summary = ""
    
    for block in cleaned_book_blocks:
        
        response = openai.Completion.create(
          engine="text-davinci-003",
          prompt=f"Summarize the following chunk of a book and keep the essence of the author. Keep in mind that there are multiple chunks and they are being fed sequentially: {block}",
          max_tokens=max_tokens,
          temperature=temperature,
          top_p=top_p,
          frequency_penalty=frequency_penalty,
          presence_penalty=presence_penalty,
          stop=["\n", " #"]
        )
        
        book_summary += response.choices[0].text

    
    return book_summary

In [9]:
book_summary = summarize_book(book_content)

In [10]:
train_data["Book_Summary"] = book_summary

In [11]:
train_data

Unnamed: 0,Author_Name,Author_Bio,Book_Name,Book_ID,Book_Content,Book_Blocks,Word_Count,Token_Count,Book_Summary
0,Antoine de Saint-Exupéry,"Antoine Marie Jean-Baptiste Roger, comte de Sa...",El Principito,9999,el principito antoine de saintexupery a leon w...,[el principito antoine de saintexupery a leon ...,"[1997, 1999, 1999, 1996, 1999, 2000, 994]","[2000, 2000, 2000, 2000, 2000, 2000, 994]",de la serpiente boa un cordero.un existen otr...


In [19]:
# add second book summary to train data
demo_data = pd.read_csv("train_data.csv")

# replace first row of demo data with train data
demo_data['Book_Summary'] = [0,0]

principito_blocks = get_book_blocks(book_content, block_size=2000)
principito_cleaned_blocks = clean_book_blocks(principito_blocks)

demo_data["Book_Blocks"][1] = principito_cleaned_blocks

demo_data["Word_Count"][1] = block_word_count(principito_cleaned_blocks)
demo_data["Token_Count"][1] = block_token_count(principito_cleaned_blocks)

demo_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data["Book_Blocks"][1] = principito_cleaned_blocks
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data["Word_Count"][1] = block_word_count(principito_cleaned_blocks)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data["Token_Count"][1] = block_token_count(principito_cleaned_blocks)


Unnamed: 0,Author_Name,Author_Bio,Book_Name,Book_ID,Book_Content,Book_Blocks,Word_Count,Token_Count,Book_Summary
0,Ernest Hemingway,Ernest Miller Hemingway was an American noveli...,The sun also rises,67138,The Project Gutenberg eBook of The Sun Also Ri...,['the project gutenberg ebook of the sun also ...,"[459, 463, 463, 490, 490, 542, 512, 503, 514, ...","[512, 512, 512, 512, 512, 512, 512, 512, 512, ...",0
1,Antoine de Saint-Exupery,"Antoine Marie Jean-Baptiste Roger, comte de Sa...",El Principito,9999,el principito antoine de saintexupery a leon w...,[el principito antoine de saintexupery a leon ...,"[1997, 1999, 1999, 1996, 1999, 2000, 994]","[2000, 2000, 2000, 2000, 2000, 2000, 994]",0


In [20]:
demo_data['Book_Summary'][1] = summarize_book(demo_data['Book_Content'][1])
demo_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['Book_Summary'][1] = summarize_book(demo_data['Book_Content'][1])


Unnamed: 0,Author_Name,Author_Bio,Book_Name,Book_ID,Book_Content,Book_Blocks,Word_Count,Token_Count,Book_Summary
0,Ernest Hemingway,Ernest Miller Hemingway was an American noveli...,The sun also rises,67138,The Project Gutenberg eBook of The Sun Also Ri...,['the project gutenberg ebook of the sun also ...,"[459, 463, 463, 490, 490, 542, 512, 503, 514, ...","[512, 512, 512, 512, 512, 512, 512, 512, 512, ...",0
1,Antoine de Saint-Exupery,"Antoine Marie Jean-Baptiste Roger, comte de Sa...",El Principito,9999,el principito antoine de saintexupery a leon w...,[el principito antoine de saintexupery a leon ...,"[1997, 1999, 1999, 1996, 1999, 2000, 994]","[2000, 2000, 2000, 2000, 2000, 2000, 994]",de mis aviones y el dijo esta mejor pero se v...


In [21]:
demo_data['Book_Summary'][0] = summarize_book(demo_data['Book_Content'][0])
demo_data

Unnamed: 0,Author_Name,Author_Bio,Book_Name,Book_ID,Book_Content,Book_Blocks,Word_Count,Token_Count,Book_Summary
0,Ernest Hemingway,Ernest Miller Hemingway was an American noveli...,The sun also rises,67138,The Project Gutenberg eBook of The Sun Also Ri...,['the project gutenberg ebook of the sun also ...,"[459, 463, 463, 490, 490, 542, 512, 503, 514, ...","[512, 512, 512, 512, 512, 512, 512, 512, 512, ...","so good , in fact , that spider kelly said he..."
1,Antoine de Saint-Exupery,"Antoine Marie Jean-Baptiste Roger, comte de Sa...",El Principito,9999,el principito antoine de saintexupery a leon w...,[el principito antoine de saintexupery a leon ...,"[1997, 1999, 1999, 1996, 1999, 2000, 994]","[2000, 2000, 2000, 2000, 2000, 2000, 994]",de mis aviones y el dijo esta mejor pero se v...


In [22]:
demo_data.to_csv("train_data.csv", index=False)

In [11]:
prompt = f"Based on the following author biography {author_bio} and book summary {book_summary}" \
       + f"Adopt the personality of the author and be capable of answering questions about the book. " \
       + f"For example, if the question is 'What is the name of the book?', the answer should be the name of the book. " \
       + f"Another example is 'What is the name of the author?', the answer should be the name of the author. " \
       + f"Another example is 'What is the book about?', the answer should be the summary of the book. " \
       + f"Finally, other examples can be more specific about the book's content. " \
       + "The important thing is to converse as if you are the author of the book. " \
       + "[Question]" + "what was your motivation behind it? Please answer in spanish" + "[Answer]" + " "

In [12]:
completions = openai.Completion.create(engine="text-davinci-003", 
                                        prompt=prompt, 
                                        max_tokens=384, 
                                        n=1,
                                        stop=None,
                                        temperature=0.5)
response = completions.choices[0].text
response

' Mi motivación detrás de este libro fue contar la historia de un niño que viaja por el universo y descubre la verdadera significancia de la vida y los tesoros más preciosos que no se ven ni se tocan.'

In [23]:
demo = pd.read_csv("train_data.csv")

In [43]:
# convert Word_Count and Token_Count to list of integers
sum(map(int, demo['Word_Count'][1].replace('[', '').replace(']', '').split(', ')))

12984