## Generating Science Fiction Dataset
This is the script I used to generate the dataset for <a href=''>my unsupervised learning capstone project</a>. Some works are divided into chapters marked by Roman numerals and newlines, but the works are not all consistent, so I manually added Roman numeral dividers to some of the documents to make this script work. I chose Roman numerals because some documents are already organized this way, and because they provide a simple way to create a unique and easy-to-match string ('\n[numeral]\n') while avoiding confusion with other parts of the text.

In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt

In [2]:
# Map each author to the text files of their works. The paths point into a
# local directory layout ('./<author>/<title>.txt'), so the script cannot be
# run without a copy of those files.
authors = {
    name: ['./' + name + '/' + title + '.txt' for title in titles]
    for name, titles in [
        ('asimov', ('youth', 'nuclear_energy_3')),
        ('bradbury', ('a_little_journey', 'futuria')),
        ('dick', ('mrspaceship', 'variableman')),
        ('herbert', ('haystack', 'old_rambling_house')),
        ('pohl', ('skysearch', 'tunnel')),
        ('reynolds', ('off_course', 'spaceman_spree')),
        ('rockwell', ('space_pirates', 'venus_revolt')),
        ('verne', ('20000leagues', 'moon_journey')),
        ('vonnegut', ('2BR02B', 'trip_up_yonder')),
        ('wells', ('discovery_future', 'moon_men')),
    ]
}

In [3]:
# Short title of every work: the file path with the leading './<author>/'
# directory and the '.txt' suffix stripped, in author order.
documents = [
    path.split('./' + name + '/')[1].split('.txt')[0]
    for name, paths in authors.items()
    for path in paths
]

In [4]:
# Year associated with each title. Every document starts at 0 so that a
# title without an explicit entry below is still present in the mapping.
date = dict.fromkeys(documents, 0)
date.update({
    'discovery_future': 1913,
    'moon_men': 1901,
    'youth': 1952,
    'nuclear_energy_3': 1972,
    'a_little_journey': 1951,
    'futuria': 1940,
    'mrspaceship': 1953,
    'variableman': 1953,
    'haystack': 1959,
    'old_rambling_house': 1958,
    'skysearch': 1954,
    'tunnel': 1955,
    'off_course': 1954,
    'spaceman_spree': 1963,
    'space_pirates': 1953,
    'venus_revolt': 1954,
    '20000leagues': 1870,
    'moon_journey': 1865,
    '2BR02B': 1962,
    'trip_up_yonder': 1953,
})

In [5]:
# Fiction flag per title: 0 for the two titles listed here, 1 for every
# other document.
fiction = {
    title: int(title not in ('nuclear_energy_3', 'discovery_future'))
    for title in documents
}

In [6]:
# Number of chapters to extract from each title. Titles default to 5; the
# entries below override that default where a work is split differently.
num = dict.fromkeys(documents, 5)
num.update({
    '2BR02B': 2,
    'trip_up_yonder': 4,
    'a_little_journey': 3,
    'futuria': 7,
    'off_course': 3,
    'spaceman_spree': 7,
    'old_rambling_house': 3,
    'nuclear_energy_3': 6,
    'haystack': 7,
    'moon_men': 5,
    'discovery_future': 6,
    '20000leagues': 6,
    'tunnel': 6,
})

In [7]:
def get_text(document):
    """Return the part of a document between the Project Gutenberg markers.

    The text is split on the start marker and the piece immediately after
    its first occurrence is kept; that piece is then truncated at the first
    end marker. The remainder of the start-marker line is part of the result
    because only the marker prefix itself is removed.

    Raises:
        IndexError: if the start marker does not occur in ``document``.
    """
    start_marker = '*** START OF THIS PROJECT GUTENBERG'
    end_marker = '*** END OF THIS PROJECT GUTENBERG'
    return document.split(start_marker)[1].split(end_marker)[0]

In [8]:
def roman_numeral(number):
    """Return the Roman-numeral string for an integer.

    Standard subtractive notation is used (IV, IX, XL, XC, CD, CM), so the
    result is the conventional numeral for values from 1 to 3999; larger
    values simply repeat 'M'. Zero and negative inputs yield an empty
    string.

    Args:
        number: Integer to convert.

    Returns:
        The numeral as an upper-case string, e.g. 14 -> 'XIV', 19 -> 'XIX'.
    """
    if number <= 0:
        return ''
    # Value/symbol pairs in descending order. The subtractive forms are
    # listed explicitly so one greedy pass yields the canonical numeral
    # with every symbol in the right position (e.g. 'XIX', not 'IXX').
    symbols = (
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    )
    pieces = []
    remaining = number
    for value, symbol in symbols:
        count, remaining = divmod(remaining, value)
        pieces.append(symbol * count)
    return ''.join(pieces)

def get_chapters(document, n_chapters=10):
    """Split a document into chapters delimited by Roman-numeral lines.

    Chapter k is located by splitting the whole document on the marker
    '\\n<numeral k>\\n' and taking the piece right after its first
    occurrence; that piece is then cut at the first marker for chapter
    k + 1. When no k + 1 marker follows, the chapter runs to the end of the
    piece.

    Args:
        document: Full text containing the chapter markers.
        n_chapters: Number of chapters to extract, starting from chapter I.

    Returns:
        A list of ``n_chapters`` strings, one per chapter, in order.

    Raises:
        IndexError: if the marker for one of the requested chapters does
            not occur in ``document``.
    """
    result = []
    for number in range(1, n_chapters + 1):
        marker = '\n' + roman_numeral(number) + '\n'
        pieces = document.split(marker)
        if len(pieces) < 2:
            # Same exception type as the bare index lookup would raise, but
            # with enough context to find the document that needs a marker.
            raise IndexError(
                'chapter marker %r (chapter %d) not found in document'
                % (marker, number)
            )
        next_marker = '\n' + roman_numeral(number + 1) + '\n'
        result.append(pieces[1].split(next_marker)[0])
    return result
        

In [9]:
# Build one row per chapter of every work. Rows are collected in a plain
# list and turned into a DataFrame once at the end; this avoids re-copying
# the whole frame on every pd.concat call and concatenating onto an empty
# frame.
rows = []
for auth, paths in authors.items():
    for path in paths:
        # Title is the file name without the author directory or '.txt'.
        tit = path.split('./' + auth + '/')[1].split('.txt')[0]
        # Context manager so the file handle is closed after reading.
        with open(path) as handle:
            work = handle.read()
        for chapter in get_chapters(work, n_chapters=num[tit]):
            rows.append({
                'text': chapter,
                'title': tit,
                'author': auth,
                'date': date[tit],
                'fiction': fiction[tit],
            })

# Passing `columns` fixes the column order and keeps the frame well-formed
# even when no rows were collected.
df = pd.DataFrame(rows, columns=['text', 'title', 'author', 'date', 'fiction'])

df.head()

Unnamed: 0,text,title,author,date,fiction
0,\n Red and Slim found the two strange littl...,youth,asimov,1952,1
1,\nThe Astronomer entered the dining room with ...,youth,asimov,1952,1
2,\nThe swaying had come to a halt and it was da...,youth,asimov,1952,1
3,\nThe Merchant was awake too and his steady sc...,youth,asimov,1952,1
4,"\nThe Astronomer said, ""You think the noise wa...",youth,asimov,1952,1


In [10]:
# Write the chapter table to disk without the row index column.
df.to_csv(path_or_buf='chapters.csv', index=False)