# Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import nltk
import re

In [3]:
with open("metadata/Ernest Hemingway.txt", "r") as f:
    author = f.read()
    
with open("metadata/The sun also rises-67138.txt", "r", encoding='UTF8') as f:
    book = f.read()
    

In [4]:
def build_dataset(author_name, author_bio, book_name, book_id, book_content) -> pd.DataFrame:
    
    df = pd.DataFrame({"Author_Name": [author_name], 
                       "Author_Bio": [author_bio], 
                       "Book_Name": [book_name], 
                       "Book_ID": [book_id], 
                       "Book_Content": [book_content]})
    
    return df

In [5]:
author_name = "Ernest Hemingway"
author_bio = author
book_name = "The sun also rises"
book_id = 67138
book_content = book

train_data = build_dataset(author_name, author_bio, book_name, book_id, book_content)
train_data

Unnamed: 0,Author_Name,Author_Bio,Book_Name,Book_ID,Book_Content
0,Ernest Hemingway,Ernest Miller Hemingway was an American noveli...,The sun also rises,67138,The Project Gutenberg eBook of The Sun Also Ri...


### Clean book content

In [6]:
def get_book_blocks(book_content, block_size=512):
    
    tokenized_book = nltk.word_tokenize(book_content)
    book_blocks = [" ".join(tokenized_book[i:i+block_size]) for i in range(0, len(tokenized_book), block_size)]
    
    return book_blocks

def clean_book_blocks(book_blocks):
    
    # remove empty blocks
    book_blocks = [block for block in book_blocks if block.strip() != ""]
    
    # remove blocks with less than 20 tokens
    book_blocks = [block for block in book_blocks if len(nltk.word_tokenize(block)) > 20]
    
    # remove special characters such as \n, \t, \r
    book_blocks = [block.replace("\n", " ").replace("\t", " ").replace("\r", " ") for block in book_blocks]
    
    # remove multiple spaces
    book_blocks = [re.sub(' +', ' ', block) for block in book_blocks]
    
    # make lowercase
    book_blocks = [block.lower() for block in book_blocks]
    
    return book_blocks

def block_word_count(book_blocks):
    
    word_count = [len(re.findall(r'\w+', block)) for block in book_blocks]
    
    return word_count

def block_token_count(book_blocks):
    
    token_size = [len(block.split()) for block in book_blocks]
    
    return token_size

In [7]:
book_blocks = get_book_blocks(book_content)
cleaned_book_blocks = clean_book_blocks(book_blocks)

train_data["Book_Blocks"] = [cleaned_book_blocks]
train_data["Word_Count"] = [block_word_count(cleaned_book_blocks)]
train_data["Token_Count"] = [block_token_count(cleaned_book_blocks)]

In [8]:
train_data

Unnamed: 0,Author_Name,Author_Bio,Book_Name,Book_ID,Book_Content,Book_Blocks,Word_Count,Token_Count
0,Ernest Hemingway,Ernest Miller Hemingway was an American noveli...,The sun also rises,67138,The Project Gutenberg eBook of The Sun Also Ri...,[the project gutenberg ebook of the sun also r...,"[459, 463, 463, 490, 490, 542, 512, 503, 514, ...","[512, 512, 512, 512, 512, 512, 512, 512, 512, ..."


In [9]:
with open("metadata/Antoine de Saint-Exupery.txt", "r") as f:
    author = f.read()
    
with open("metadata/El Principito.txt", "r", encoding='UTF8') as f:
    book = f.read()
    
author_name = "Antoine de Saint-Exupery"
author_bio = author
book_name = "El Principito"
book_id = 9999
book_content = book

book_blocks = get_book_blocks(book_content)
cleaned_book_blocks = clean_book_blocks(book_blocks)

principito_word_count = block_word_count(cleaned_book_blocks)
principito_token_count = block_token_count(cleaned_book_blocks)

# add this data as new rows to the train_data dataframe
train_data = train_data.append(build_dataset(author_name, author_bio, book_name, book_id, book_content), ignore_index=True)

train_data

  train_data = train_data.append(build_dataset(author_name, author_bio, book_name, book_id, book_content), ignore_index=True)


Unnamed: 0,Author_Name,Author_Bio,Book_Name,Book_ID,Book_Content,Book_Blocks,Word_Count,Token_Count
0,Ernest Hemingway,Ernest Miller Hemingway was an American noveli...,The sun also rises,67138,The Project Gutenberg eBook of The Sun Also Ri...,[the project gutenberg ebook of the sun also r...,"[459, 463, 463, 490, 490, 542, 512, 503, 514, ...","[512, 512, 512, 512, 512, 512, 512, 512, 512, ..."
1,Antoine de Saint-Exupery,"Antoine Marie Jean-Baptiste Roger, comte de Sa...",El Principito,9999,el principito antoine de saintexupery a leon w...,,,


In [11]:
train_data.to_csv("train_data.csv", index=False)