In [14]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine as ce
from sqlalchemy import inspect
from pathlib import Path

In [15]:
# Location of the SQLite database that holds the corpus tables.
DATA = Path("books.db")

# SQLite creates a brand-new, empty database when the file does not exist, so a
# wrong path would only surface later as an obscure SQL error. Fail early instead.
if not DATA.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DATA.resolve()}")

engine = ce("sqlite:///" + str(DATA))

# Print the available table names as a quick sanity check of the connection.
inspector = inspect(engine)
print(inspector.get_table_names())

# Load each table used by this notebook into its own DataFrame.
books_df = pd.read_sql("books", con=engine)
author_df = pd.read_sql("authors", con=engine)
book_file_df = pd.read_sql("book_file", con=engine)
text_file_df = pd.read_sql("text_files", con=engine)

['authors', 'book_file', 'book_original', 'books', 'text_files']


In [16]:
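# One-off export of the raw authors table, left disabled.
# NOTE(review): this writes 'author_data' (no extension), while the next cell
# reads 'author_data.csv' -- confirm which filename is intended.
# author_df.to_csv('author_data', index=False)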

### Author

In [17]:
# Load the author table from CSV.
# Dataset note: a value of 10000 in `born` or `death` is a sentinel meaning the
# year is either undefined, in B.C., or in early A.D. (before 1000 AD) -- it is
# not a real year.
author_cleaned_df = pd.read_csv('author_data.csv')

# Validate the key columns up front: `death` drives the period filters in the
# following cells, and `index` appears to carry the author id (TODO confirm).
_missing_cols = {"index", "death"} - set(author_cleaned_df.columns)
assert not _missing_cols, f"author_data.csv is missing columns: {sorted(_missing_cols)}"

author_cleaned_df

Unnamed: 0,index,author,born,death
0,0,Adam Ferguson,1723,1816
1,1,Adam Smith,1723,1790
2,2,Aeschylus,525,10000
3,3,Aesop,10000,10000
4,4,Alan Seeger,1888,1916
...,...,...,...,...
287,287,William S.gilbert,1836,1911
288,288,William Shakespeare,1564,1616
289,289,William Thackeray,1811,1863
290,290,Xenophon,10000,10000


In [18]:
# Distribution of authors by death year -- first bucket: "before 1000 AD".
# This selects rows whose death year is the 10000 sentinel, so the bucket also
# contains authors whose dates are undefined, not only ancient ones.
author_before_1000 = author_cleaned_df.query("death == 10000")
print(author_before_1000.shape)
author_before_1000

(16, 4)


Unnamed: 0,index,author,born,death
2,2,Aeschylus,525,10000
3,3,Aesop,10000,10000
24,24,Aristophanes,450,10000
25,25,Aristotle,384,10000
81,81,Euripides,480,10000
127,127,Hippocrates,460,10000
129,129,Homer,10000,10000
200,200,Nan,10000,10000
212,212,P. Cornelius Tacitus,56,10000
216,216,Plato,10000,10000


In [19]:
# Authors whose death year lies in [1000, 1300).
author_1000_1300 = author_cleaned_df.query("1000 <= death < 1300")
print(author_1000_1300.shape)
author_1000_1300

(1, 4)


Unnamed: 0,index,author,born,death
209,209,Omar Khayam,1048,1122


In [20]:
# Authors whose death year lies in [1300, 1500).
author_1300_1500 = author_cleaned_df.query("1300 <= death < 1500")
print(author_1300_1500.shape)
author_1300_1500

(3, 4)


Unnamed: 0,index,author,born,death
52,52,Dante Alighieri,1265,1321
107,107,Giovanni Boccaccio,1313,1375
254,254,Thomas Malory,1405,1471


In [21]:
# Authors whose death year lies in [1500, 1600).
author_1500_1600 = author_cleaned_df.query("1500 <= death < 1600")
print(author_1500_1600.shape)
author_1500_1600

(7, 4)


Unnamed: 0,index,author,born,death
49,49,Christopher Marlowe,1564,1593
86,86,Francis Rabelais,1494,1553
182,182,Ludovico Ariosto,1474,1533
191,191,Martin Luther,1483,1546
202,202,Nicolo Machiavelli,1469,1527
255,255,Thomas More,1478,1535
264,264,Torquato Tasso,1544,1595


In [22]:
# Authors whose death year lies in [1600, 1700).
author_1600_1700 = author_cleaned_df.query("1600 <= death < 1700")
print(author_1600_1700.shape)
author_1600_1700

(13, 4)


Unnamed: 0,index,author,born,death
31,31,Benedict De Spinoza,1632,1677
66,66,Edward Misseldon,1608,1654
85,85,Francis Bacon,1561,1626
153,153,John Bunyan,1628,1688
161,161,John Milton,1608,1674
186,186,Madame De La Fayette,1643,1693
196,196,Matthew Hale,1609,1676
198,198,Miguel De Cervantes,1547,1616
223,223,Richard Lovelace,1618,1657
251,251,Thomas Hobbes,1588,1679


In [23]:
# Authors whose death year lies in [1700, 1800).
author_1700_1800 = author_cleaned_df.query("1700 <= death < 1800")
print(author_1700_1800.shape)
author_1700_1800

(21, 4)


Unnamed: 0,index,author,born,death
1,1,Adam Smith,1723,1790
21,21,Anne-robert-jacques Turgot,1727,1781
33,33,Benjamin Franklin,1706,1790
51,51,Daniel Defoe,1661,1731
54,54,David Hume,1711,1776
64,64,Edward Gibbon,1737,1794
119,119,Henry Fielding,1707,1754
131,131,Horace Walpole,1717,1797
140,140,Jacob Vanderlint,10000,1740
141,141,James Boswell,1740,1795


In [24]:
# Authors whose death year lies in [1800, 1900).
author_1800_1900 = author_cleaned_df.query("1800 <= death < 1900")
print(author_1800_1900.shape)
author_1800_1900

(96, 4)


Unnamed: 0,index,author,born,death
0,0,Adam Ferguson,1723,1816
6,6,Alexander Hamilton,1755,1804
7,7,Alexandre Dumas,1802,1870
9,9,Alfred Tennyson,1809,1892
17,17,Andrew Steinmetz,1816,1877
...,...,...,...,...
278,278,William Blake,1757,1827
282,282,William Godwin,1756,1836
283,283,William Hickling Prescott,1796,1859
285,285,William Morris,1834,1896


In [25]:
# Authors who died in 1900 or later ("1900 ~ now").
# The 10000 sentinel is excluded explicitly rather than capping the range at
# 2000, so an author who died in 2000 or later is not silently left out of
# every bucket.
author_after_1900 = author_cleaned_df.query("death >= 1900 and death != 10000")
print(author_after_1900.shape)
author_after_1900

(134, 4)


Unnamed: 0,index,author,born,death
4,4,Alan Seeger,1888,1916
5,5,Alexander H.japp,1839,1905
8,8,Alfred Marshall,1842,1924
10,10,Alice Meynell,1847,1924
11,11,Ambrose Bierce,1842,1914
...,...,...,...,...
279,279,William Butler Yeats,1865,1939
281,281,William Dean Howells,1837,1920
284,284,William James,1842,1910
287,287,William S.gilbert,1836,1911


### Book

In [26]:
# Books by authors in `author_before_1000` (death year stored as the 10000 sentinel).
# Match on the authors' "index" column instead of the DataFrame row labels:
# `.index` on a DataFrame is the row-label Index, never the column named "index".
# NOTE(review): assumes the "index" column is the id that `author_id` refers to -- confirm.
book_before_1000 = books_df[books_df["author_id"].isin(author_before_1000["index"])]
print(book_before_1000.shape)
book_before_1000

(114, 4)


Unnamed: 0,book_id,bookname,author_id,cate1
54,54,Aesop's Fables,3,Translated Works
55,55,Agamemnon,2,Translated Works
56,56,Agesilaus,290,Academic Readings
58,58,Ajax,243,Translated Works
59,59,Alcestis,81,Translated Works
...,...,...,...,...
975,975,The Trojan Women,81,Translated Works
998,998,The Wasps,24,Translated Works
1013,1013,Theaetetus,216,Academic Readings
1021,1021,Timaeu,216,Academic Readings


In [27]:
# Books by authors in `author_1000_1300`.
# Match on the authors' "index" column instead of the DataFrame row labels:
# `.index` on a DataFrame is the row-label Index, never the column named "index".
# NOTE(review): assumes the "index" column is the id that `author_id` refers to -- confirm.
book_1000_1300 = books_df[books_df["author_id"].isin(author_1000_1300["index"])]
print(book_1000_1300.shape)
book_1000_1300

(1, 4)


Unnamed: 0,book_id,bookname,author_id,cate1
610,610,Rubaiyat Of Omar Khayyam,209,Translated Works


In [28]:
# Books by authors in `author_1300_1500`.
# Match on the authors' "index" column instead of the DataFrame row labels:
# `.index` on a DataFrame is the row-label Index, never the column named "index".
# NOTE(review): assumes the "index" column is the id that `author_id` refers to -- confirm.
book_1300_1500 = books_df[books_df["author_id"].isin(author_1300_1500["index"])]
print(book_1300_1500.shape)
book_1300_1500

(4, 4)


Unnamed: 0,book_id,bookname,author_id,cate1
262,262,Index,254,English Literature
310,310,Le Mort D'arthur,254,English Literature
739,739,The Decameron,107,Translated Works
743,743,The Divine Comedy,52,Translated Works


In [29]:
# Books by authors in `author_1500_1600`.
# Match on the authors' "index" column instead of the DataFrame row labels:
# `.index` on a DataFrame is the row-label Index, never the column named "index".
# NOTE(review): assumes the "index" column is the id that `author_id` refers to -- confirm.
book_1500_1600 = books_df[books_df["author_id"].isin(author_1500_1600["index"])]
print(book_1500_1600.shape)
book_1500_1600

(13, 4)


Unnamed: 0,book_id,bookname,author_id,cate1
83,83,An Open Letter On Translating,191,Academic Readings
140,140,Commentary On The Epistle To The Galatians,191,Academic Readings
163,163,Dr. Faustus,49,English Literature
210,210,Gargantua And Pantagruel,86,Translated Works
278,278,Jerusalem Delivered,264,Translated Works
354,354,Martin Luther's 95 Theses,191,Academic Readings
356,356,Massacre At Paris,49,English Literature
441,441,Orlando Furioso,182,Translated Works
657,657,"Tamburlaine The Great, Pt 1",49,English Literature
658,658,"Tamburlaine The Great,pt 2",49,English Literature


In [30]:
# Books by authors in `author_1600_1700`.
# Match on the authors' "index" column instead of the DataFrame row labels:
# `.index` on a DataFrame is the row-label Index, never the column named "index".
# NOTE(review): assumes the "index" column is the id that `author_id` refers to -- confirm.
book_1600_1700 = books_df[books_df["author_id"].isin(author_1600_1700["index"])]
print(book_1600_1700.shape)
book_1600_1700

(70, 4)


Unnamed: 0,book_id,bookname,author_id,cate1
24,24,A Lover's Complaint,288,English Literature
26,26,A Midsummer Night's Dream,288,English Literature
41,41,A Theologico-political Treatise [part I],31,Academic Readings
42,42,A Theologico-political Treatise [part Ii],31,Academic Readings
43,43,A Theologico-political Treatise [part Iii],31,Academic Readings
...,...,...,...,...
1003,1003,The Winter's Tale,288,English Literature
1022,1022,Titus Andronicus,288,English Literature
1036,1036,Treatise On Taxes And Contributions,286,Academic Readings
1038,1038,Twelfth Night,288,English Literature


In [31]:
# Books by authors in `author_1700_1800`.
# Match on the authors' "index" column instead of the DataFrame row labels:
# `.index` on a DataFrame is the row-label Index, never the column named "index".
# NOTE(review): assumes the "index" column is the id that `author_id` refers to -- confirm.
book_1700_1800 = books_df[books_df["author_id"].isin(author_1700_1800["index"])]
print(book_1700_1800.shape)
book_1700_1800

(40, 4)


Unnamed: 0,book_id,bookname,author_id,cate1
20,20,A Journal Of The Plague Year,51,English Literature
27,27,A Modest Proposal,166,English Literature
35,35,A Sentimental Journey,178,English Literature
72,72,An Essay Concerning Human Understanding,159,Academic Readings
79,79,An Inquiry Into The Principles Of Political Ec...,145,Academic Readings
142,142,Concerning Civil Government,159,Academic Readings
183,183,Essay On The Nature Of Commerce In General,222,Academic Readings
188,188,"Essays, Moral And Literary",54,Academic Readings
209,209,From This World To The Next,119,English Literature
226,226,Gulliver' S Travels,166,English Literature


In [32]:
# Books by authors in `author_1800_1900`.
# Match on the authors' "index" column instead of the DataFrame row labels:
# `.index` on a DataFrame is the row-label Index, never the column named "index".
# NOTE(review): assumes the "index" column is the id that `author_id` refers to -- confirm.
book_1800_1900 = books_df[books_df["author_id"].isin(author_1800_1900["index"])]
print(book_1800_1900.shape)
book_1800_1900

(410, 4)


Unnamed: 0,book_id,bookname,author_id,cate1
4,4,A Child's Garden Of Verses,226,English Literature
5,5,A Child's History Of England,42,English Literature
6,6,A Christmas Carol,42,English Literature
11,11,A Daughter Of Eve,130,Translated Works
12,12,A Distinguished Provincial At Paris,130,Translated Works
...,...,...,...,...
1072,1072,Weir Of Hermiston,226,English Literature
1078,1078,"Wieland,or The Transformation",39,English Literature
1081,1081,Wives And Daughters,72,English Literature
1084,1084,Wuthering Heights,75,English Literature


In [33]:
# Books by authors in `author_after_1900`.
# Match on the authors' "index" column instead of the DataFrame row labels:
# `.index` on a DataFrame is the row-label Index, never the column named "index".
# NOTE(review): assumes the "index" column is the id that `author_id` refers to -- confirm.
book_after_1900 = books_df[books_df["author_id"].isin(author_after_1900["index"])]
print(book_after_1900.shape)
book_after_1900

(434, 4)


Unnamed: 0,book_id,bookname,author_id,cate1
0,0,'twixt Land & Sea,167,English Literature
1,1,"20,000 Leagues Under The Sea",169,Translated Works
2,2,50 Bab Ballads,287,English Literature
3,3,A Cathedral Courtship,172,English Literature
7,7,A Collection Of Ballads,16,English Literature
...,...,...,...,...
1079,1079,"Winesburg,ohio",239,English Literature
1080,1080,Within The Tides,167,English Literature
1082,1082,Woman And Labour,207,English Literature
1083,1083,Women In Love,50,English Literature


### Book Files

In [34]:
# Preview of the book_file table as loaded from the database.
book_file_df

Unnamed: 0,index,file_id,book_id,chapter
0,0,0,943,Nan
1,1,1,54,Nan
2,2,2,681,25-khalifah
3,3,3,681,Entertainments
4,4,4,681,02-the Fishmerman And The Jinni
...,...,...,...,...
10166,10166,10130,106,Volumeone-part01
10167,10167,10131,106,Volumeone-part03
10168,10168,10132,106,Volumeone-part02
10169,10169,10133,106,Volumeone-part05


In [35]:
# Book-file rows whose `book_id` belongs to a book in `book_before_1000`.
book_file_before_1000 = book_file_df.loc[
    book_file_df["book_id"].isin(book_before_1000["book_id"])
]
print(book_file_before_1000.shape)
book_file_before_1000

(314, 4)


Unnamed: 0,index,file_id,book_id,chapter
0,0,0,943,Nan
1,1,1,54,Nan
2,2,2,681,25-khalifah
3,3,3,681,Entertainments
4,4,4,681,02-the Fishmerman And The Jinni
...,...,...,...,...
10126,10126,8645,915,Book06
10127,10127,8646,915,Book07
10128,10128,8647,915,Book05
10129,10129,8648,915,Book10


In [36]:
# Book-file rows whose `book_id` belongs to a book in `book_1000_1300`.
book_file_1000_1300 = book_file_df.loc[
    book_file_df["book_id"].isin(book_1000_1300["book_id"])
]
print(book_file_1000_1300.shape)
book_file_1000_1300

(1, 4)


Unnamed: 0,index,file_id,book_id,chapter
133,133,133,610,Nan


In [37]:
# Book-file rows whose `book_id` belongs to a book in `book_1300_1500`.
book_file_1300_1500 = book_file_df.loc[
    book_file_df["book_id"].isin(book_1300_1500["book_id"])
]
print(book_file_1300_1500.shape)
book_file_1300_1500

(40, 4)


Unnamed: 0,index,file_id,book_id,chapter
415,415,415,262,Nan
1186,1186,1171,743,Index
1187,1187,1172,743,Paradiso
1188,1188,1173,743,Inferno
1189,1189,1174,743,Purgatorio
2216,2216,1921,739,Day07
2217,2217,1922,739,Day06
2218,2218,1923,739,Day10
2219,2219,1924,739,Day04
2220,2220,1925,739,Day05


In [38]:
# Book-file rows whose `book_id` belongs to a book in `book_1500_1600`.
book_file_1500_1600 = book_file_df.loc[
    book_file_df["book_id"].isin(book_1500_1600["book_id"])
]
print(book_file_1500_1600.shape)
book_file_1500_1600

(96, 4)


Unnamed: 0,index,file_id,book_id,chapter
432,432,432,356,Nan
433,433,433,658,Nan
434,434,434,657,Nan
435,435,435,817,Nan
436,436,436,163,Nan
...,...,...,...,...
9872,9872,8391,1052,More2-2
9873,9873,8392,1052,More2-6
9874,9874,8393,1052,More2-7
9875,9875,8394,1052,More2-5


In [39]:
# Book-file rows whose `book_id` belongs to a book in `book_1600_1700`.
book_file_1600_1700 = book_file_df.loc[
    book_file_df["book_id"].isin(book_1600_1700["book_id"])
]
print(book_file_1600_1700.shape)
book_file_1600_1700

(302, 4)


Unnamed: 0,index,file_id,book_id,chapter
70,70,70,898,Nan
236,236,236,856,Nan
237,237,237,946,Nan
238,238,238,100,Nan
239,239,239,980,Nan
...,...,...,...,...
9773,9773,8371,319,Introduction
9774,9774,8372,319,Part01
9775,9775,8373,319,Part03
9776,9776,8374,319,Part02


In [40]:
# Book-file rows whose `book_id` belongs to a book in `book_1700_1800`.
book_file_1700_1800 = book_file_df.loc[
    book_file_df["book_id"].isin(book_1700_1800["book_id"])
]
print(book_file_1700_1800.shape)
book_file_1700_1800

(340, 4)


Unnamed: 0,index,file_id,book_id,chapter
152,152,152,623,Nan
216,216,216,27,Nan
218,218,218,35,Nan
298,298,298,209,Nan
299,299,299,282,Nan
...,...,...,...,...
10086,10086,8605,72,Introduction
10087,10087,8606,72,Book01
10088,10088,8607,72,Book03
10089,10089,8608,72,Book02


In [41]:
# Book-file rows whose `book_id` belongs to a book in `book_1800_1900`.
book_file_1800_1900 = book_file_df.loc[
    book_file_df["book_id"].isin(book_1800_1900["book_id"])
]
print(book_file_1800_1900.shape)
book_file_1800_1900

(5089, 4)


Unnamed: 0,index,file_id,book_id,chapter
35,35,35,887,Nan
36,36,36,198,Nan
67,67,67,235,Nan
68,68,68,614,Nan
69,69,69,36,Nan
...,...,...,...,...
10101,10101,8620,651,Chapter08
10102,10102,8621,651,Chapter05
10103,10103,8622,651,Chapter02
10104,10104,8623,651,Introduction


In [42]:
# Book-file rows whose `book_id` belongs to a book in `book_after_1900`.
book_file_after_1900 = book_file_df.loc[
    book_file_df["book_id"].isin(book_after_1900["book_id"])
]
print(book_file_after_1900.shape)
book_file_after_1900

(3988, 4)


Unnamed: 0,index,file_id,book_id,chapter
37,37,37,858,Nan
38,38,38,1018,Nan
39,39,39,128,Nan
40,40,40,9,Nan
41,41,41,197,Nan
...,...,...,...,...
10166,10166,10130,106,Volumeone-part01
10167,10167,10131,106,Volumeone-part03
10168,10168,10132,106,Volumeone-part02
10169,10169,10133,106,Volumeone-part05


In [43]:
# Write each per-period book-file subset to its own CSV in the working
# directory, without the DataFrame row labels.
for _csv_name, _subset in [
    ('book_file_before_1000.csv', book_file_before_1000),
    ('book_file_1000_1300.csv', book_file_1000_1300),
    ('book_file_1300_1500.csv', book_file_1300_1500),
    ('book_file_1500_1600.csv', book_file_1500_1600),
    ('book_file_1600_1700.csv', book_file_1600_1700),
    ('book_file_1700_1800.csv', book_file_1700_1800),
    ('book_file_1800_1900.csv', book_file_1800_1900),
    ('book_file_after_1900.csv', book_file_after_1900),
]:
    _subset.to_csv(_csv_name, index=False)

In [44]:
# Preview of the text_files table as loaded from the database.
text_file_df

Unnamed: 0,index,fmt,text
0,0,txt,\n\n\n\n\n\n\n\nThe Song of Roland\n\n\n\n\n\n...
1,1,txt,\n\n\n\n\n\n\n\n\n\n\t\tAesop's Fables \n\n\n\...
2,2,txt,\n\n\n\n\n\n\n\n\n\n KHALIFAH THE...
3,3,txt,\n\n\n\n\n\n\n\n\n\n ...
4,4,txt,\n\n\n\n\n\n\n\n\n\n THE FISHE...
...,...,...,...
10166,10166,txt,\n\n\n\n\n\n\n\nIII\n\n\n\n\n\n\n\n\n\n\n\nCha...
10167,10167,txt,\n\n\n\n\n\n\n\nII\n\n\n\n\n\n\n\n\n\n\n\nOn w...
10168,10168,txt,\n\n\n\n\n\n\n\nBook Fifth\n\n\n\n\n\n\n\n\n\n...
10169,10169,txt,\n\n\n\n\n\n\n\nII\n\n\n\n\n\n\n\n\n\n\n\nStre...


### Text Files

In [47]:
# Texts belonging to the files in `book_file_before_1000`.
# Filter on the "index" column explicitly: the attribute `text_file_df.index`
# is the DataFrame's row-label Index, not that column.
# NOTE(review): assumes `file_id` refers to the text_files "index" column -- confirm.
text_file_before_1000 = text_file_df[text_file_df["index"].isin(book_file_before_1000["file_id"])]
print(text_file_before_1000.shape)
text_file_before_1000

(314, 3)


Unnamed: 0,index,fmt,text
0,0,txt,\n\n\n\n\n\n\n\nThe Song of Roland\n\n\n\n\n\n...
1,1,txt,\n\n\n\n\n\n\n\n\n\n\t\tAesop's Fables \n\n\n\...
2,2,txt,\n\n\n\n\n\n\n\n\n\n KHALIFAH THE...
3,3,txt,\n\n\n\n\n\n\n\n\n\n ...
4,4,txt,\n\n\n\n\n\n\n\n\n\n THE FISHE...
...,...,...,...
8645,8645,txt,\n\n\n\n\n\n\n\n\n\n BOOK VI\n\n\n\n\n\n\n\n ...
8646,8646,txt,\n\n\n\n\n\n\n\n\n\n\n\n BOOK VII\n\n\n\n\n\n...
8647,8647,txt,\n\n\n\n\n\n\n\n\n\n BOOK V\n\n\n\n\n\n\n\n ...
8648,8648,txt,\n\n\n\n\n\n\n\n\n\n BOOK X\n\n\n\n\n\n\n\n ...


In [48]:
# Texts belonging to the files in `book_file_1000_1300`.
# Filter on the "index" column explicitly: the attribute `text_file_df.index`
# is the DataFrame's row-label Index, not that column.
# NOTE(review): assumes `file_id` refers to the text_files "index" column -- confirm.
text_file_1000_1300 = text_file_df[text_file_df["index"].isin(book_file_1000_1300["file_id"])]
print(text_file_1000_1300.shape)
text_file_1000_1300

(1, 3)


Unnamed: 0,index,fmt,text
133,133,txt,"\n\n\n\n\n\n\n\n\n\nRubaiyat of Omar Khayyam, ..."


In [49]:
# Texts belonging to the files in `book_file_1300_1500`.
# Filter on the "index" column explicitly: the attribute `text_file_df.index`
# is the DataFrame's row-label Index, not that column.
# NOTE(review): assumes `file_id` refers to the text_files "index" column -- confirm.
text_file_1300_1500 = text_file_df[text_file_df["index"].isin(book_file_1300_1500["file_id"])]
print(text_file_1300_1500.shape)
text_file_1300_1500

(40, 3)


Unnamed: 0,index,fmt,text
415,415,html,"<HTML>\n\n<HEAD>\n\n <META HTTP-EQUIV=""Conte..."
1171,1171,html,"<HTML>\n\n<HEAD>\n\n <META HTTP-EQUIV=""Conte..."
1172,1172,txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe Divine Comedy\...
1173,1173,txt,\n\n\n\n\n\n\n\n\n\nThe Divine Comedy \n\n\n\n...
1174,1174,txt,\n\n\n\n\n\n\n\n\n\n\n\nThe Divine Comedy\n\n\...
1921,1921,txt,\n\n\n\n\n\n\n\n\n\n\n\n TH...
1922,1922,txt,\n\n\n\n\n\n\n\n\n\n\n\n T...
1923,1923,txt,\n\n\n\n\n\n\n\n\n\n\n\n THE IN...
1924,1924,txt,\n\n\n\n\n\n\n\n\n\n ...
1925,1925,txt,\n\n\n\n\n\n\n\n\n\n THE IN...


In [50]:
# Texts belonging to the files in `book_file_1500_1600`.
# Filter on the "index" column explicitly: the attribute `text_file_df.index`
# is the DataFrame's row-label Index, not that column.
# NOTE(review): assumes `file_id` refers to the text_files "index" column -- confirm.
text_file_1500_1600 = text_file_df[text_file_df["index"].isin(book_file_1500_1600["file_id"])]
print(text_file_1500_1600.shape)
text_file_1500_1600

(96, 3)


Unnamed: 0,index,fmt,text
432,432,txt,\n\n\n\n\n\n\n\n\n\nMassacre at Paris\n\n\n\n\...
433,433,txt,"\n\n\n\n\n\n\n\n\n\nTamburlaine the Great, Par..."
434,434,txt,"\n\n\n\n\n\n\n\n\n\nTamburlaine the Great, Par..."
435,435,txt,\n\n\n\n\n\n\n\nThe Jew of Malta\n\n\n\n\n\n\n...
436,436,txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\nDr. Faustus\n\n\n\...
...,...,...,...
8391,8391,txt,"\n\n\n\n\n\n\n\n\n BOOK II: OF THEIR TOWNS, ..."
8392,8392,txt,\n\n\n\n\n\n\n\n\n BOOK II: OF THE TRAVELLIN...
8393,8393,txt,"\n\n\n\n\n\n\n\n\n BOOK II: OF THEIR SLAVES,..."
8394,8394,txt,\n\n\n\n\n\n\n\n\n BOOK II: OF THEIR TRAFFIC...


In [51]:
# Texts belonging to the files in `book_file_1600_1700`.
# Filter on the "index" column explicitly: the attribute `text_file_df.index`
# is the DataFrame's row-label Index, not that column.
# NOTE(review): assumes `file_id` refers to the text_files "index" column -- confirm.
text_file_1600_1700 = text_file_df[text_file_df["index"].isin(book_file_1600_1700["file_id"])]
print(text_file_1600_1700.shape)
text_file_1600_1700

(302, 3)


Unnamed: 0,index,fmt,text
70,70,txt,\n\n\n\n\n\n\n\nThe Princess of Cleves\n\n\n\n...
236,236,txt,\n\n\n\n\n\n\n\n\n\n ...
237,237,txt,\n\n\n\n\n\n\n\n\n\n ...
238,238,txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\n ...
239,239,txt,\n\n\n\n\n\n\n\n\n\n ...
...,...,...,...
8819,8819,txt,\n\n\n\n CHAPTER X\n\n\n\n OF THE PLEASANT D...
8820,8820,txt,\n\n\n\n CHAPTER VI\n\n\n\n OF THE DIVERTING...
8821,8821,txt,\n\n\n\n CHAPTER XII\n\n\n\n OF WHAT A GOATH...
8822,8822,txt,\n\n\n\n CHAPTER XIII\n\n\n\n IN WHICH IS EN...


In [53]:
# Texts belonging to the files in `book_file_1700_1800`.
# Filter on the "index" column explicitly: the attribute `text_file_df.index`
# is the DataFrame's row-label Index, not that column.
# NOTE(review): assumes `file_id` refers to the text_files "index" column -- confirm.
text_file_1700_1800 = text_file_df[text_file_df["index"].isin(book_file_1700_1800["file_id"])]
print(text_file_1700_1800.shape)
text_file_1700_1800

(340, 3)


Unnamed: 0,index,fmt,text
152,152,txt,\n\n\n\n\n\nShe Stoops to Conquer\n\n\n\n\n\n\...
216,216,txt,\n\n\n\n\n\n\n\nA Modest Proposal\n\n\n\n\n\n\...
218,218,txt,\n\n\n\n\n\n\n\nA Sentimental Journey\n\n\n\n\...
298,298,txt,\n\n\n\n\n\n\n\n\n\nFrom This World to the Nex...
299,299,txt,\n\n\n\n\n\n\n\n\n\nJournal of A Voyage to Lis...
...,...,...,...
9290,9290,txt,\n\n\n\n\n\n\n\n\n\n\n\nPART I. A VOYAGE TO L...
9291,9291,txt,\n\n\n\n\n\n\n\n\n\nCHAPTER V.\n\n\n\n\n\n\n\n...
9292,9292,txt,\n\n\n\n\n\n\n\n\n\nCHAPTER IV.\n\n\n\n\n\n\n\...
9293,9293,txt,\n\n\n\n\n\n\n\n\n\n\n\nCHAPTER VI.\n\n\n\n\n\...


In [52]:
# Texts belonging to the files in `book_file_1800_1900`.
# Filter on the "index" column explicitly: the attribute `text_file_df.index`
# is the DataFrame's row-label Index, not that column.
# NOTE(review): assumes `file_id` refers to the text_files "index" column -- confirm.
text_file_1800_1900 = text_file_df[text_file_df["index"].isin(book_file_1800_1900["file_id"])]
print(text_file_1800_1900.shape)
text_file_1800_1900

(5089, 3)


Unnamed: 0,index,fmt,text
35,35,txt,\n\n\n\n\n\n\n\n\n\nThe Poems of Goethe\n\n\n\...
36,36,txt,\n\n\n\n\t\t\tFaust \n\n\n\n\t\tby Johann W. ...
67,67,txt,\n\n\n\n\n\n\n\nHerodias\n\n\n\n\n\n\n\nby Gus...
68,68,txt,\n\n\n\n\n\n\n\nSalammbo\n\n\n\n\n\n\n\nby Gus...
69,69,txt,\n\n\n\n\n\n\n\nA Simple Soul\n\n\n\n\n\n\n\nb...
...,...,...,...
10120,10120,txt,The Principles of Political Economy\n\n\nby Jo...
10121,10121,txt,The Principles of Political Economy\n\n\nby Jo...
10122,10122,txt,The Principles of Political Economy\n\n\nby Jo...
10123,10123,txt,The Principles of Political Economy\n\n\nby Jo...


In [54]:
# Texts belonging to the files in `book_file_after_1900`.
# Filter on the "index" column explicitly: the attribute `text_file_df.index`
# is the DataFrame's row-label Index, not that column.
# NOTE(review): assumes `file_id` refers to the text_files "index" column -- confirm.
text_file_after_1900 = text_file_df[text_file_df["index"].isin(book_file_after_1900["file_id"])]
print(text_file_after_1900.shape)
text_file_after_1900

(3988, 3)


Unnamed: 0,index,fmt,text
37,37,txt,\n\n\n\n\n\n\n\n\n\nTHE MILLER'S DAUGHTER\n\n\...
38,38,txt,\n\n\n\n\n\n\n\n\n\nTHE DEATH OF OLIVIER BECAI...
39,39,txt,\n\n\n\n\n\n\n\n\n\n\n\nCAPTAIN BURLE\n\n\n\n\...
40,40,txt,\n\n\n\n\n\n\n A Conf...
41,41,txt,\n\n\n\n\n\n\n\n\n\nFather Sergius\n\n\n\n\n\n...
...,...,...,...
10166,10166,txt,\n\n\n\n\n\n\n\nIII\n\n\n\n\n\n\n\n\n\n\n\nCha...
10167,10167,txt,\n\n\n\n\n\n\n\nII\n\n\n\n\n\n\n\n\n\n\n\nOn w...
10168,10168,txt,\n\n\n\n\n\n\n\nBook Fifth\n\n\n\n\n\n\n\n\n\n...
10169,10169,txt,\n\n\n\n\n\n\n\nII\n\n\n\n\n\n\n\n\n\n\n\nStre...


In [55]:
# Write each per-period text-file subset to its own CSV in the working
# directory, without the DataFrame row labels.
for _csv_name, _subset in [
    ('text_file_before_1000.csv', text_file_before_1000),
    ('text_file_1000_1300.csv', text_file_1000_1300),
    ('text_file_1300_1500.csv', text_file_1300_1500),
    ('text_file_1500_1600.csv', text_file_1500_1600),
    ('text_file_1600_1700.csv', text_file_1600_1700),
    ('text_file_1700_1800.csv', text_file_1700_1800),
    ('text_file_1800_1900.csv', text_file_1800_1900),
    ('text_file_after_1900.csv', text_file_after_1900),
]:
    _subset.to_csv(_csv_name, index=False)