## Preprocess all Project Gutenberg books and store a selected subset: some famous authors plus some randomly sampled authors

In [None]:

import pandas as pd
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from tqdm import tqdm
import gc
# Load the raw Project Gutenberg dump. The code below relies on the
# 'Language', 'Link', 'Author', 'Title' and 'Text' columns being present.
df = pd.read_csv('gutenberg_books.csv')

print(df.columns)
print(df.shape)
print(df.iloc[0])

# Keep English-language books only and drop the columns that are not used again.
eng_df = df[df['Language'] == 'en']
eng_df = eng_df.drop(columns=['Link', 'Language'])
print(eng_df.shape)
print(eng_df.head(2))

# Drop rows without an author or a text BEFORE casting to str: on object
# columns `astype(str)` turns a missing value into the literal string 'nan',
# which would then pass the placeholder-author filter below as if it were a
# real name.
# NOTE(review): removing these rows changes the length of `eng_df`, so the
# seeded `eng_df.sample(20, random_state=46)` further down may draw different
# books than before.
eng_df = eng_df.dropna(subset=['Author', 'Text'])
eng_df['Author'] = eng_df['Author'].astype(str)
eng_df['Text'] = eng_df['Text'].astype(str)
eng_df['Title'] = eng_df['Title'].astype(str)
eng_df = eng_df.drop_duplicates(subset=['Author', 'Title'], keep='first')
print(eng_df.shape)

# Authors that are placeholders rather than people (exact match).
placeholder_authors = [
    'Unknown', 'Anonymous', '', 'Unknown Author', 'Unknown Authors',
    'Various', 'Various Authors', 'Anonymous Authors',
]
# Case-insensitive substrings that mark an institutional author.
# NOTE(review): these are plain substring tests, so "State" already covers
# "United States" and also matches inside longer words; switch to
# word-boundary patterns if that is not intended.
institutional_keywords = [
    'United States', 'Library of Congress', 'University', 'Government',
    'State', 'Press', 'Society', 'Association', 'Institute', 'Commission',
    'Board', 'Foundation', 'Center', 'Council',
]
is_institutional = eng_df['Author'].str.contains(
    '|'.join(institutional_keywords), case=False, na=False
)
is_placeholder = eng_df['Author'].isin(placeholder_authors)
eng_df = eng_df[~is_institutional & ~is_placeholder]
print(eng_df.shape)

# Drop collected works and multi-part editions (case-insensitive substring
# match on the title).
# NOTE(review): "Vol" and "Part" also match inside longer words
# (e.g. "Revolution", "Party"); confirm that this over-filtering is acceptable.
title_keywords = ['Works of', 'Volume', 'Vol', 'Part']
eng_df = eng_df[~eng_df['Title'].str.contains('|'.join(title_keywords), case=False, na=False)]

print(eng_df.shape)


## Get middle of the text
print('get middle of the text')
print('removing the first and last parts \n cleaning the text but keeping punctuation and stopwords etc \n removing the first 10% of the text and the last 10% of the text to capture the middle')
import re
def reorder_author_name(author):
    """Convert a catalogue-style "Last, First" name into "First Last".

    Surrounding whitespace and trailing commas are removed first. Only the
    first comma is used as the separator, so everything after it is moved in
    front of the part before it. Names without a comma are returned stripped
    but otherwise unchanged.

    Args:
        author: Author name as a string.

    Returns:
        The reordered name, without leading or trailing whitespace.
    """
    cleaned = author.strip().rstrip(',')
    if ',' not in cleaned:
        return cleaned.strip()
    family, given = (part.strip() for part in cleaned.split(',', 1))
    # Join only the non-empty halves so an input such as ", Jane" does not
    # come back with stray padding around the name.
    return ' '.join(part for part in (given, family) if part)
# Rewrite every author from "Last, First" to "First Last"; the cast guards
# against non-string values reaching reorder_author_name.
eng_df['Author'] = eng_df['Author'].astype(str).apply(reorder_author_name)



In [None]:

# Authors whose books are always kept. Matching below is an exact string
# comparison against the reordered 'Author' column.
# NOTE(review): confirm these spellings against the data (e.g. 'J.R.R Tolkien').
special_authors = [
    'William Shakespeare', 'Jane Austen', 'Ernest Hemingway', 'Charles Dickens',
    'F. Scott Fitzgerald', 'Mark Twain', 'Oscar Wilde', 'Edgar Allan Poe', 'Mary Shelley', 'George Orwell',
    'Virginia Woolf', 'Miguel de Cervantes', 'Herman Melville', 'J.R.R Tolkien', 'Howard Pyle',
]
william_df = eng_df[eng_df['Author'] == 'William Shakespeare']
jane_austen_df = eng_df[eng_df['Author'] == 'Jane Austen']
ernest_hemmingway_df = eng_df[eng_df['Author'] == 'Ernest Hemingway']
charles_dickens_df = eng_df[eng_df['Author'] == 'Charles Dickens']
rest_df = eng_df[~eng_df['Author'].isin(special_authors)]

# Draw the random authors from `rest_df`: the draw is filtered through
# `rest_df` right after, so sampling from all of `eng_df` would waste picks on
# special authors. The size is capped so a small dataset does not raise.
# NOTE(review): the draw is unseeded and therefore differs between runs.
candidate_authors = rest_df['Author'].unique()
selected_authors = np.random.choice(
    candidate_authors, size=min(10, len(candidate_authors)), replace=False
)
selected_authors_df = rest_df[rest_df['Author'].isin(selected_authors)]
special_authors_df = eng_df[eng_df['Author'].isin(special_authors)]

print(william_df.shape)
print(jane_austen_df.shape)
print(ernest_hemmingway_df.shape)
print(charles_dickens_df.shape)
print(selected_authors_df.shape)
print(special_authors_df.shape)
# A bare expression in the middle of a cell is not displayed, so print it.
print(special_authors_df['Author'].value_counts())
## Either getting only from special authors or from selected authors or selected special authors
#total_df = pd.concat([william_df.iloc[:10], jane_austen_df, ernest_hemmingway_df.iloc[:10], charles_dickens_df.iloc[:10]])
total_df = special_authors_df
print(total_df.shape)
total_df = total_df.drop_duplicates(subset=['Title'])
print(total_df.shape)
total_df = total_df.drop_duplicates(subset=['Text'])
print(total_df.shape)


print(total_df.loc[total_df['Author'] == 'Edgar Allan Poe', 'Title'].tolist())
# Remove the Poe anthology BEFORE capping at three books per author; doing it
# afterwards could leave Poe with fewer than three books.
total_df = total_df[total_df['Title'] != 'First Project Gutenberg Collection of Edgar Allan Poe']
total_df = total_df.groupby('Author').head(3).reset_index(drop=True)
total_df.loc[:, ['Author', 'Title']]


In [None]:

def extract_main_text(text):
    """Strip Project Gutenberg boilerplate, clean the text and trim both ends.

    Steps, in order:
      1. If both the "*** START OF THE PROJECT GUTENBERG ... ***" and
         "*** END OF THE PROJECT GUTENBERG ... ***" markers are found, keep
         only what lies between them.
      2. Remove "Produced by ..." and "End of the Project Gutenberg ..." lines.
      3. Remove URLs, @mentions, '#', and single-line spans inside (), [] or {}.
      4. Replace underscores and runs of two or more dashes with a space.
      5. Drop every character that is not a word character, whitespace or
         one of . , ! ? - ' " ; :
      6. Collapse all whitespace (including newlines) to single spaces.
      7. Drop the first and last 10% of the words.

    Args:
        text: Raw book text.

    Returns:
        The cleaned middle 80% of the text as a single-line string. Texts
        shorter than 10 words are returned in full after cleaning.
    """
    # Find the start and end markers
    start_match = re.search(r"\*\*\* *START OF THE PROJECT GUTENBERG.*?\*\*\*", text, re.IGNORECASE)
    end_match = re.search(r"\*\*\* *END OF THE PROJECT GUTENBERG.*?\*\*\*", text, re.IGNORECASE)

    # Extract the main content if both markers are found
    if start_match and end_match:
        text = text[start_match.end():end_match.start()].strip()
    text = re.sub(r'(Produced by.*?)(\n|$)', '', text, flags=re.DOTALL)
    text = re.sub(r'End of the Project Gutenberg.*?(\n|$)', '', text, flags=re.DOTALL)
    text = text.strip()

    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    text = re.sub(r'\@\w+|\#', '', text)
    # Remove text inside (), [], and {}
    text = re.sub(r'\[.*?\]|\(.*?\)|\{.*?\}', '', text)
    text = text.replace('_', ' ')  # underscores mark emphasis in the source text
    text = re.sub(r'-{2,}', ' ', text)
    text = re.sub(r'[^\w\s\.\,\!\?\-\'\"\;\:\n]', '', text)  # keep common punctuation
    # All whitespace is flattened here, so no separate newline handling is needed.
    text = re.sub(r'\s+', ' ', text).strip()

    words = text.split()
    cutoff_index = int(len(words) * 0.1)
    # Use an explicit end index: `words[c:-c]` is empty when c == 0, which
    # silently wiped every text shorter than 10 words.
    return ' '.join(words[cutoff_index:len(words) - cutoff_index])

# Second pass of case-insensitive title filtering on the selected books:
# titles starting with "works of" or containing "volumes" are dropped.
unwanted_title = total_df['Title'].str.contains(r'^works of|volumes', case=False, na=False)
total_df = total_df[~unwanted_title].drop_duplicates(subset=['Title'])
print(total_df.shape)
total_df = total_df[~total_df['Title'].str.contains("United States", case=False, na=False)]
print(total_df.shape)
total_df = total_df.drop_duplicates(subset=['Author', 'Title'])
# Apply the function to the 'Text' column
total_df['Text'] = total_df['Text'].apply(extract_main_text)
total_df = total_df.drop_duplicates(subset=['Text'])
print(total_df.shape)
total_df = total_df.dropna(subset=['Text', 'Author'])
# `dropna` cannot catch a book whose cleaned text came back empty (an empty
# string is not a missing value), so filter those rows out explicitly.
total_df = total_df[total_df['Text'].str.strip() != '']
print(total_df.shape)
# NOTE(review): a fresh tokenizer is loaded again further down before any
# chunking happens, so the special tokens added here do not reach it.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"additional_special_tokens": ["<SEP>"], "pad_token": "<PAD>"})

# Create dataset


import re
def middle_of_text_by_sentence(text, start_fraction=0.3, end_fraction=0.7):
    """Return the middle portion of *text*, measured in sentences.

    The text is split after every full stop that is followed by whitespace.
    Sentences from index ``int(start_fraction * n)`` up to (not including)
    ``int(end_fraction * n)`` are kept, where ``n`` is the sentence count.
    With the defaults this drops the first 30% and the last 30%.

    Args:
        text: Text to trim.
        start_fraction: Fraction of sentences to skip at the beginning.
        end_fraction: Fraction of sentences at which to stop.

    Returns:
        The kept sentences joined by single spaces. With the defaults, a text
        that splits into a single sentence yields an empty string.
    """
    # Split only on '.', so '!' and '?' do not end a sentence here.
    sentences = re.split(r'(?<=[.])\s+', text.strip())
    n = len(sentences)
    start = int(start_fraction * n)
    end = int(end_fraction * n)
    return ' '.join(sentences[start:end]).strip()
# Keep only the central sentences of every cleaned book, then expose the
# result as `df_` with the trimmed text under the usual 'Text' column name.
total_df['Text_'] = total_df['Text'].apply(middle_of_text_by_sentence)
total_df = total_df.drop(columns=['Text'])
df_ = total_df.rename(columns={'Text_': 'Text'})
# Length of the trimmed text in characters.
df_['book_length'] = df_['Text'].map(len)

df_.head(4)


from transformers import GPT2Tokenizer

# Fresh GPT-2 tokenizer; the end-of-sequence token doubles as padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# Window length and stride (half a window) used by tokenize_and_chunk.
block_size = 128  # GPT-style block size
step_size = block_size // 2
def tokenize_and_chunk(row, tokenizer=tokenizer, block_size=block_size, step_size=step_size):
    """Tokenise one book and cut it into overlapping fixed-size windows.

    The defaults are bound to the module-level ``tokenizer``, ``block_size``
    and ``step_size`` at the moment this function is defined.

    Args:
        row: Record providing 'Author', 'Text' and 'Title'.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=False)``; the
            first sequence of its 'input_ids' entry is used.
        block_size: Number of token ids per window.
        step_size: Offset between the starts of consecutive windows.

    Returns:
        A list of ``(window, author, title)`` tuples; empty when the text has
        ``block_size`` tokens or fewer.
    """
    author, text, title = row['Author'], row['Text'], row['Title']
    encoded = tokenizer(text, return_tensors='pt', truncation=False)
    token_ids = encoded['input_ids'][0]

    # NOTE(review): the stop value excludes a window starting exactly at
    # len(token_ids) - block_size; confirm that dropping it is intended.
    windows = []
    for start in range(0, len(token_ids) - block_size, step_size):
        windows.append((token_ids[start:start + block_size], author, title))
    return windows
print('tokenizing and chunking...')
# One list of (window, author, title) tuples per book.
author_text = df_[['Author', 'Text', 'Title']].apply(tokenize_and_chunk, axis=1)
author_text_list = author_text.tolist()
print(step_size)
# Total number of token ids over all windows (overlapping tokens count twice).
total_tokens = sum(
    len(window)
    for book_chunks in author_text_list
    for window, _author, _title in book_chunks
)
print(total_tokens)

df_.head(40)
#df_.to_csv('data/datasets/selected_dataset_small.csv', index=False)
## Either getting only from special authors or from selected authors or selected special authors
#total_df = pd.concat([william_df.iloc[:10], jane_austen_df, ernest_hemmingway_df.iloc[:10], charles_dickens_df.iloc[:10]])


In [None]:

# Seeded draw of 20 books from the filtered English corpus, followed by
# de-duplication on title and then on text; the shape is printed after each step.
total_df = eng_df.sample(n=20, random_state=46)
print(total_df.shape)
for _dedup_key in ('Title', 'Text'):
    total_df = total_df.drop_duplicates(subset=[_dedup_key])
    print(total_df.shape)
total_df.head(25)

def extract_main_text(text):
    """Strip Project Gutenberg boilerplate, clean the text and trim both ends.

    Steps, in order:
      1. If both the "*** START OF THE PROJECT GUTENBERG ... ***" and
         "*** END OF THE PROJECT GUTENBERG ... ***" markers are found, keep
         only what lies between them.
      2. Remove "Produced by ..." and "End of the Project Gutenberg ..." lines.
      3. Remove URLs, @mentions, '#', and single-line spans inside (), [] or {}.
      4. Replace underscores and runs of two or more dashes with a space.
      5. Drop every character that is not a word character, whitespace or
         one of . , ! ? - ' " ; :
      6. Collapse all whitespace (including newlines) to single spaces.
      7. Drop the first and last 10% of the words.

    Args:
        text: Raw book text.

    Returns:
        The cleaned middle 80% of the text as a single-line string. Texts
        shorter than 10 words are returned in full after cleaning.
    """
    # Find the start and end markers
    start_match = re.search(r"\*\*\* *START OF THE PROJECT GUTENBERG.*?\*\*\*", text, re.IGNORECASE)
    end_match = re.search(r"\*\*\* *END OF THE PROJECT GUTENBERG.*?\*\*\*", text, re.IGNORECASE)

    # Extract the main content if both markers are found
    if start_match and end_match:
        text = text[start_match.end():end_match.start()].strip()
    text = re.sub(r'(Produced by.*?)(\n|$)', '', text, flags=re.DOTALL)
    text = re.sub(r'End of the Project Gutenberg.*?(\n|$)', '', text, flags=re.DOTALL)
    text = text.strip()

    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    text = re.sub(r'\@\w+|\#', '', text)
    # Remove text inside (), [], and {}
    text = re.sub(r'\[.*?\]|\(.*?\)|\{.*?\}', '', text)
    text = text.replace('_', ' ')  # underscores mark emphasis in the source text
    text = re.sub(r'-{2,}', ' ', text)
    text = re.sub(r'[^\w\s\.\,\!\?\-\'\"\;\:\n]', '', text)  # keep common punctuation
    # All whitespace is flattened here, so no separate newline handling is needed.
    text = re.sub(r'\s+', ' ', text).strip()

    words = text.split()
    # Calculate the 10% cutoff
    cutoff_index = int(len(words) * 0.1)
    # Use an explicit end index: `words[c:-c]` is empty when c == 0, which
    # silently wiped every text shorter than 10 words.
    return ' '.join(words[cutoff_index:len(words) - cutoff_index])

# Case-insensitive title filtering on the sampled books: titles starting with
# "works of" or containing "volumes" are dropped.
unwanted_title = total_df['Title'].str.contains(r'^works of|volumes', case=False, na=False)
total_df = total_df[~unwanted_title].drop_duplicates(subset=['Title'])
print(total_df.shape)
total_df = total_df[~total_df['Title'].str.contains("United States", case=False, na=False)]
print(total_df.shape)
total_df = total_df.drop_duplicates(subset=['Author', 'Title'])
# Apply the function to the 'Text' column
total_df['Text'] = total_df['Text'].apply(extract_main_text)
total_df = total_df.drop_duplicates(subset=['Text'])
print(total_df.shape)
total_df = total_df.dropna(subset=['Text', 'Author'])
# `dropna` cannot catch a book whose cleaned text came back empty (an empty
# string is not a missing value), so filter those rows out explicitly.
total_df = total_df[total_df['Text'].str.strip() != '']
print(total_df.shape)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


In [None]:

# Create dataset


import re
def middle_of_text_by_sentence(text, start_fraction=0.3, end_fraction=0.7):
    """Return the middle portion of *text*, measured in sentences.

    The text is split after every full stop that is followed by whitespace.
    Sentences from index ``int(start_fraction * n)`` up to (not including)
    ``int(end_fraction * n)`` are kept, where ``n`` is the sentence count.
    With the defaults this drops the first 30% and the last 30%.

    Args:
        text: Text to trim.
        start_fraction: Fraction of sentences to skip at the beginning.
        end_fraction: Fraction of sentences at which to stop.

    Returns:
        The kept sentences joined by single spaces. With the defaults, a text
        that splits into a single sentence yields an empty string.
    """
    # Split only on '.', so '!' and '?' do not end a sentence here.
    sentences = re.split(r'(?<=[.])\s+', text.strip())
    n = len(sentences)
    start = int(start_fraction * n)
    end = int(end_fraction * n)
    return ' '.join(sentences[start:end]).strip()
# Keep only the central sentences of every sampled book and expose the result
# as `df_random`, with the trimmed text under the usual 'Text' column name.
total_df['Text_'] = total_df['Text'].apply(middle_of_text_by_sentence)
total_df = total_df.drop(columns=['Text'])
df_random = total_df.rename(columns={'Text_': 'Text'})
# Length of the trimmed text in characters.
df_random['book_length'] = df_random['Text'].map(len)

# Hand-picked exclusions from the seeded sample.
df_random = df_random[~df_random['Author'].isin(['William Chaffers', 'Morris Jastrow'])]
df_random.head(25)
# Fresh GPT-2 tokenizer; the end-of-sequence token doubles as padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # required for compatibility
# Window length and stride (half a window) used by tokenize_and_chunk.
block_size = 128  # GPT-style block size
step_size = block_size // 2
def tokenize_and_chunk(row, tokenizer=tokenizer, block_size=block_size, step_size=step_size):
    """Tokenise one book and cut it into overlapping fixed-size windows.

    The defaults are bound to the module-level ``tokenizer``, ``block_size``
    and ``step_size`` at the moment this function is defined.

    Args:
        row: Record providing 'Author', 'Text' and 'Title'.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=False)``; the
            first sequence of its 'input_ids' entry is used.
        block_size: Number of token ids per window.
        step_size: Offset between the starts of consecutive windows.

    Returns:
        A list of ``(window, author, title)`` tuples; empty when the text has
        ``block_size`` tokens or fewer.
    """
    author, text, title = row['Author'], row['Text'], row['Title']
    encoded = tokenizer(text, return_tensors='pt', truncation=False)
    token_ids = encoded['input_ids'][0]

    # NOTE(review): the stop value excludes a window starting exactly at
    # len(token_ids) - block_size; confirm that dropping it is intended.
    window_starts = range(0, len(token_ids) - block_size, step_size)
    return [
        (token_ids[start:start + block_size], author, title)
        for start in window_starts
    ]
print('tokenizing and chunking...')
# One list of (window, author, title) tuples per sampled book.
author_text = df_random[['Author', 'Text', 'Title']].apply(tokenize_and_chunk, axis=1)
author_text_list = author_text.tolist()
print(step_size)
# Total number of token ids over all windows (overlapping tokens count twice).
total_tokens = sum(
    len(window)
    for book_chunks in author_text_list
    for window, _author, _title in book_chunks
)
print(total_tokens)
# Stack the special-author books on top of the randomly sampled ones.
combined_df = pd.concat([df_, df_random])

# Fresh GPT-2 tokenizer; the end-of-sequence token doubles as padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # required for compatibility
block_size = 128  # GPT-style block size
step_size = block_size // 2
def tokenize_and_chunk(row, tokenizer=tokenizer, block_size=block_size, step_size=step_size):
    """Tokenise one book and cut it into overlapping fixed-size windows.

    The defaults are bound to the module-level ``tokenizer``, ``block_size``
    and ``step_size`` at the moment this function is defined.

    Args:
        row: Record providing 'Author', 'Text' and 'Title'.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=False)``; the
            first sequence of its 'input_ids' entry is used.
        block_size: Number of token ids per window.
        step_size: Offset between the starts of consecutive windows.

    Returns:
        A list of ``(window, author, title)`` tuples; empty when the text has
        ``block_size`` tokens or fewer.
    """
    author, text, title = row['Author'], row['Text'], row['Title']
    token_ids = tokenizer(text, return_tensors='pt', truncation=False)['input_ids'][0]

    # NOTE(review): the stop value excludes a window starting exactly at
    # len(token_ids) - block_size; confirm that dropping it is intended.
    windows = []
    start = 0
    stop = len(token_ids) - block_size
    while start < stop:
        windows.append((token_ids[start:start + block_size], author, title))
        start += step_size
    return windows
print('tokenizing and chunking...')
# One list of (window, author, title) tuples per book in the combined set.
author_text = combined_df[['Author', 'Text', 'Title']].apply(tokenize_and_chunk, axis=1)
author_text_list = author_text.tolist()
print(step_size)
# Total number of token ids over all windows (overlapping tokens count twice).
total_tokens = sum(
    len(window)
    for book_chunks in author_text_list
    for window, _author, _title in book_chunks
)
print(total_tokens)
# The concatenation kept the index labels of both source frames; renumber them.
combined_df = combined_df.reset_index(drop=True)



In [None]:

# Preview, then persist the combined dataset without the index column.
combined_df.head(45)
combined_df.to_csv(path_or_buf='selected_dataset_mixed.csv', index=False)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows of combined_df for evaluation; the fixed
# random_state keeps the split the same between runs.
train_df, test_df = train_test_split(combined_df, test_size=0.1, random_state=42)
def tokenize_and_chunk(row, tokenizer=tokenizer, block_size=block_size, step_size=step_size):
    """Cut one book's text into overlapping fixed-size character windows.

    NOTE(review): despite the name, ``tokenizer`` is not used here and the
    windows are slices of the raw string, so ``block_size`` and ``step_size``
    count characters, not tokens. Confirm that this is intended.

    Args:
        row: Record providing 'Author', 'Text' and 'Title'.
        tokenizer: Unused.
        block_size: Number of characters per window.
        step_size: Offset between the starts of consecutive windows.

    Returns:
        Three parallel lists ``(text_chunks, title_chunks, author_chunks)``:
        the windows, and the book's title and author repeated once per window.
    """
    author, text, title = row['Author'], row['Text'], row['Title']
    window_starts = range(0, len(text) - block_size, step_size)
    text_chunks = [text[start:start + block_size] for start in window_starts]
    title_chunks = [title] * len(text_chunks)
    author_chunks = [author] * len(text_chunks)
    return text_chunks, title_chunks, author_chunks

def _chunk_frame(frame):
    """Chunk every row of *frame* and flatten the results into three parallel lists."""
    texts, titles, authors = [], [], []
    for _, row in frame.iterrows():
        row_texts, row_titles, row_authors = tokenize_and_chunk(row)
        texts.extend(row_texts)
        titles.extend(row_titles)
        authors.extend(row_authors)
    return texts, titles, authors


# `DataFrame.apply(..., axis=1)` yields one (texts, titles, authors) tuple per
# row, so its result cannot be unpacked into three names and is not JSON
# serialisable; collect plain flat lists instead.
train_data_np, train_data_titles, train_data_authors = _chunk_frame(train_df)
eval_data_np, eval_data_titles, eval_data_authors = _chunk_frame(test_df)
selected_dataset = {
    'train_data_np': train_data_np,
    'train_data_titles': train_data_titles,
    'train_data_authors': train_data_authors,
    'eval_data_np': eval_data_np,
    'eval_data_titles': eval_data_titles,
    'eval_data_authors': eval_data_authors
}
import json
with open('selected_dataset_mixed.json', 'w') as f:
    json.dump(selected_dataset, f)


In [None]:
import pandas as pd
# Reload the CSV written earlier as a sanity check. Note that this rebinds
# `df`, which previously held the raw Gutenberg table.
df = pd.read_csv('selected_dataset_mixed.csv')
df.head(40)