# Gutenberg dataset: word counts, TF-IDF & Similarity Vectors
Demonstrates the following scikit-learn components:
<ul>
    <li>CountVectorizer</li>
    <li>TfidfTransformer</li>
    <li>Normalizer</li>
    <li>Pipeline</li>
    <li>cosine_similarity</li>    
</ul>

## Load data

In [1]:
import numpy as np
import zipfile

books_file_name = 'Gutenberg.zip'
max_books_number = np.inf  # upper bound on the number of books to load; infinity loads all of them

authors = []
book_names = []
content = []

# Book files are expected to be named '<author>___<title>.txt' (optionally inside a directory).
with zipfile.ZipFile(books_file_name, "r") as in_file:  # open zip file
    for file_name in in_file.namelist():  # iterate through all entries in the zip file
        if len(content) >= max_books_number:
            break  # the limit applies to books actually loaded, not to archive entries seen
        if not file_name.endswith('.txt'):
            continue
        # rsplit (unlike rindex) also copes with members stored at the archive root
        base_name = file_name.rsplit('/', 1)[-1]
        book_elements = base_name.split('___')  # get author name & book title from the file name
        if len(book_elements) != 2:
            continue  # name does not follow the '<author>___<title>.txt' pattern
        author, title_with_ext = book_elements
        authors.append(author)
        book_names.append(title_with_ext[:title_with_ext.rindex('.')])  # drop the '.txt' extension
        # undecodable bytes are dropped rather than aborting the whole load
        content.append(in_file.read(file_name).decode('utf-8', 'ignore'))
print('read {} books by {} authors'.format(len(book_names), len(np.unique(authors))))

read 3036 books by 142 authors


## TF-IDF

<h6>Skip from demo</h6>

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts per book; English stop words are excluded from the vocabulary.
vect_model = CountVectorizer(stop_words='english')
word_counts = vect_model.fit_transform(content)  # sparse document-term matrix: one row per book

In [16]:
# Display the matrix repr (shape, dtype and number of stored elements).
word_counts

<3036x476408 sparse matrix of type '<class 'numpy.float64'>'
	with 17806763 stored elements in Compressed Sparse Row format>

In [4]:
# Density of the matrix: stored (non-zero) entries divided by the total number of cells.
# `nnz` is read from the matrix itself so the figure stays correct when the corpus changes.
word_counts.nnz / (word_counts.shape[0] * word_counts.shape[1])

0.012311307123759621

In [5]:
#!pip install tabulate

In [6]:
from IPython.display import HTML, display
import tabulate
# Hand-computed TF, DF, IDF and TF-IDF for a few (term, book) pairs, rendered as an HTML table.
table = [['Author', 'Book', 'Term', 'TF', 'DF', 'IDF', 'TF-IDF']]

book_indeces = [18, 25, 311]
terms = ['candle', 'king', 'hebrew', 'death']

number_of_documents = word_counts.shape[0]

for term in terms:
    term_id = vect_model.vocabulary_.get(term)
    if term_id is None:
        # Stop words and words absent from the corpus have no column to index.
        print('term {!r} is not in the vocabulary - skipped'.format(term))
        continue
    # DF: number of books whose count for this term is non-zero
    df = word_counts[:, term_id].count_nonzero()
    # IDF as log10(N / DF); assumes word_counts is the matrix vect_model was fitted on,
    # in which case every vocabulary term has DF > 0
    idf = np.log10(number_of_documents / df)
    for book_index in book_indeces:
        tf = word_counts[book_index, term_id]
        table.append([authors[book_index], book_names[book_index], term, tf, df, idf, tf * idf])
display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1,2,3,4,5,6
Author,Book,Term,TF,DF,IDF,TF-IDF
Ambrose Bierce,The Damned Thing,candle,6,1410,0.3330826545680629,1.9984959274083773
Andrew Lang,A Short History of Scotland,candle,0,1410,0.3330826545680629,0.0
Charles Dickens,Hunted Down,candle,0,1410,0.3330826545680629,0.0
Ambrose Bierce,The Damned Thing,king,0,2232,0.13360757695790157,0.0
Andrew Lang,A Short History of Scotland,king,214,2232,0.13360757695790157,28.592021468990936
Charles Dickens,Hunted Down,king,0,2232,0.13360757695790157,0.0
Ambrose Bierce,The Damned Thing,hebrew,0,505,0.7790103891047814,0.0
Andrew Lang,A Short History of Scotland,hebrew,4,505,0.7790103891047814,3.1160415564191255
Charles Dickens,Hunted Down,hebrew,1,505,0.7790103891047814,0.7790103891047814


## Construct and train model

<h6>skip from demo</h6>

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

# Three-stage text model: raw counts -> TF-IDF weighting -> L2-normalised rows.
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer(norm='l2')),
    ])
# NOTE: this rebinds `word_counts` (raw counts in the earlier cell) to the pipeline output;
# the cells below rely on that name.
word_counts = pipeline.fit_transform(content)

vect_step = pipeline.named_steps['vect']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on old installations.
if hasattr(vect_step, 'get_feature_names_out'):
    vocab = np.asarray(vect_step.get_feature_names_out())  # get list of words
else:
    vocab = np.array(vect_step.get_feature_names())  # get list of words

## Display model results

<h6>skip from demo</h6>

In [8]:
from IPython.display import HTML, display
import tabulate
top_terms_count = 15  # number of highest-weighted terms listed per book

# Header row: three identifying columns followed by alternating term/value columns.
column_names = ['index', 'author', 'book']
for rank in range(top_terms_count):
    column_names += ['term_{}'.format(rank), 'value_{}'.format(rank)]

table = [column_names]
for book_index in range(word_counts.shape[0]):
    # Column ids of this book's largest weights, in descending order of weight.
    negated_row = (-word_counts[book_index]).toarray()
    order = negated_row.argsort().flatten()[:top_terms_count]
    terms = vocab[order]
    term_values = word_counts[book_index, order].toarray().flatten()

    row = [book_index, authors[book_index], book_names[book_index]]
    for rank in range(top_terms_count):
        row += [terms[rank], term_values[rank]]
    table.append(row)
display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
index,author,book,term_0,value_0,term_1,value_1,term_2,value_2,term_3,value_3,term_4,value_4,term_5,value_5,term_6,value_6,term_7,value_7,term_8,value_8,term_9,value_9,term_10,value_10,term_11,value_11,term_12,value_12,term_13,value_13,term_14,value_14
0,Alexander Pope,The Rape of the Lock and Other Poems,pope,0.6573657284356688,belinda,0.16218317743386504,heav,0.12954342875232128,ev,0.11956618933510252,poem,0.11835373427162642,poet,0.11416928690626084,essay,0.11350400375186982,man,0.10822218446362229,th,0.10786164332439392,arbuthnot,0.09709034805606433,rs,0.0961902468599021,rape,0.09434534538098105,ry,0.09302283287244584,hervey,0.08535658202921222,thro,0.08190661879281051
1,Alexander Pope,"The Works of Alexander Pope, Volume 1",footnote,0.5555059671550552,pope,0.5308800896635788,curll,0.156246347203662,wakefield,0.13688478517987232,dryden,0.1302586342243329,warton,0.1287232378713182,warburton,0.1008472994142614,pastorals,0.09574279608489315,wycherley,0.09556719993379764,chaucer,0.08896808883513758,letters,0.08747411266322867,ry,0.08712565711760478,ecl,0.08365279735023187,ev,0.07881855315149983,bowles,0.07726769665104685
2,Alfred Russel Wallace,Contributions to the Theory of Natural Selection,species,0.3909963542739217,papilio,0.1739084503472553,birds,0.1589458489478906,papilionidæ,0.13325058305128715,nidification,0.1330907274846597,mimicry,0.1274561625316291,_p,0.12046383439790988,animals,0.11831473612954788,selection,0.11693784449830269,insects,0.11374085269058838,celebes,0.11048066052757867,colour,0.10984464843087201,heliconidæ,0.10432041940318863,genus,0.09953958224753912,form,0.09654548733885755
3,Alfred Russel Wallace,Is Mars Habitable?,mars,0.46569967685620584,lowell,0.41853466381490056,canals,0.2702621012435497,temperature,0.26202350068744806,surface,0.19761711875193452,planet,0.16577526197822443,radiation,0.14893386464496344,heat,0.1480591259513544,mr,0.12062798400369185,atmosphere,0.11150180960467057,temperatures,0.10264575092313555,vapour,0.10247722307482303,water,0.10194694650983742,earth,0.08593445354438158,polar,0.08353523065632139
4,Alfred Russel Wallace,Island Life,species,0.3323666367511654,genera,0.26693563772140994,islands,0.2448513551344504,flora,0.1632505526483673,zealand,0.1629671712836435,madagascar,0.1432518401353176,_var,0.14003144613206156,glacial,0.13686210303832608,fauna,0.13625969856067663,excentricity,0.1209138477727639,peculiar,0.11965121449725191,birds,0.11689940583815028,plants,0.11625232283012249,distribution,0.11610667260186512,australia,0.11462464212195439
5,Alfred Russel Wallace,"The Malay Archipelago, Volume 1",java,0.28130950103727875,celebes,0.2278762875758464,species,0.1921885787781182,borneo,0.1889272315734323,malay,0.1841669566792233,timor,0.17676384924107716,islands,0.17234316689874338,lombock,0.16535661099730445,sumatra,0.1448047935497486,mias,0.13779717583108705,dyaks,0.13227846667117332,rajah,0.1320105635632571,archipelago,0.11118903151518349,birds,0.10935046596741559,island,0.1006382095586369
6,Alfred Russel Wallace,"The Malay Archipelago, Volume 2",aru,0.27929860619356595,ternate,0.1945564323957136,ceram,0.19436690791256003,papuan,0.17109222514136718,malay,0.16429812013929393,islands,0.16373809835101383,guinea,0.1533786785282984,birds,0.14814460896257917,species,0.14769794843998682,gilolo,0.1292333698381473,moluccas,0.127516209262384,batchian,0.12301720650058953,dorey,0.12144613144998871,waigiou,0.1176686323049117,sago,0.11606445001045727
7,Ambrose Bierce,A Cynic Looks at Life,life,0.1817529837045641,thou,0.1730061152927458,man,0.16425347559312145,civilization,0.12980723127885477,know,0.11566098963017714,good,0.1148260880872634,penalty,0.10932795810603554,death,0.1039015655196591,woman,0.09894022504304723,god,0.09822623972722024,world,0.09790747154790909,women,0.09186629194635804,flammarion,0.09013797607553604,civilized,0.08984991460881489,men,0.08760573842920337
8,Ambrose Bierce,"A Son of the Gods, and A Horseman in the Sky",druse,0.42430232911816007,bierce,0.20087724519998126,skirmishers,0.14371816971714593,horseman,0.13416622018586058,horse,0.12495344567981116,enemy,0.12028199239250965,rifle,0.11829086032560822,tra,0.1084058350298486,line,0.10190583997471725,crest,0.10156832545999371,man,0.10121665677413158,commander,0.09796613008799389,cliff,0.09587080961298822,carter,0.09578974548767742,away,0.09038610588729858


## Search for a book by phrase

In [9]:
def books_by_phrase(model,word_counts,search_query,top_n=15):
    """Rank books by cosine similarity to a free-text search query.

    Parameters
    ----------
    model : object with a ``transform(list_of_str)`` method
        Fitted model that maps text to the same feature space as ``word_counts``.
    word_counts : matrix, one row per book
        Book vectors to compare the query against.
    search_query : str
        Text to search for.
    top_n : int, optional
        Number of best matches to return (default 15).

    Returns
    -------
    similarity_order : numpy.ndarray
        Row indices of at most ``top_n`` books, most similar first.
    similarities : numpy.ndarray
        Similarity of the query to every book, indexed by book row.
    search_query : str
        The query, returned unchanged.
    """
    from sklearn.metrics.pairwise import cosine_similarity
    # convert search string into a vector using trained model (a single-row matrix)
    query_matrix = model.transform([search_query])
    # compute similarity of the search string to each book; result has one column, hence flatten
    similarities = cosine_similarity(word_counts, query_matrix).flatten()
    # negate so that argsort's ascending order yields the most similar books first
    similarity_order = (-similarities).argsort().flatten()[:top_n]
    return similarity_order, similarities, search_query


In [10]:
from IPython.display import HTML, display
import tabulate

# Rank every book against an example query and render the best matches as an HTML table.
similarity_order, similarities, search_query = books_by_phrase(pipeline, word_counts, 'war fighting')
print('Search: {}'.format(search_query))

table = [['#', 'Author', 'Book', 'Similarity']]
table.extend(
    [rank, authors[book_id], book_names[book_id], similarities[book_id]]
    for rank, book_id in enumerate(similarity_order)
)
display(HTML(tabulate.tabulate(table, tablefmt='html')))

Search: war fighting


0,1,2,3
#,Author,Book,Similarity
0,Herbert George Wells,War and the Future,0.2863289931211813
1,Abraham Lincoln,Lincoln's Second Inaugural Address,0.23414988146724086
2,Herbert George Wells,What is Coming?,0.20509503039627658
3,Winston Churchill,An Essay On The American Contribution And The Democratic Idea,0.15534795004140514
4,Herbert George Wells,In The Fourth Year,0.1481594322425235
5,Edmund Burke,"The Works of the Right Honourable Edmund Burke, Vol. 05 (of 12)",0.14510441429305215
6,Winston Churchill,A Traveller in War-Time,0.14438665019656297
7,Herman Melville,White Jacket,0.13824318736529212
8,John Stuart Mill,The Contest in America,0.11756485170579577


In [11]:
def print_similar_books(pipeline,word_counts,authors,book_names,search_query):
    """Print the query and display an HTML table of the books that best match it.

    The ranking comes from ``books_by_phrase(pipeline, word_counts, search_query)``.
    Each table row holds the rank (starting at 0), the author, the book title and
    the similarity score, looked up in ``authors`` / ``book_names`` by book index.
    """
    from IPython.display import HTML, display
    import tabulate

    ranked_ids, scores, query = books_by_phrase(pipeline, word_counts, search_query)
    print('Search: {}'.format(query))

    header = ['#', 'Author', 'Book', 'Similarity']
    rows = [
        [rank, authors[book_id], book_names[book_id], scores[book_id]]
        for rank, book_id in enumerate(ranked_ids)
    ]
    display(HTML(tabulate.tabulate([header] + rows, tablefmt='html')))

In [12]:
# Example query: list the books ranked most similar to the phrase 'ancient egypt'.
print_similar_books(pipeline,word_counts,authors,book_names,'ancient egypt')

Search: ancient egypt


0,1,2,3
#,Author,Book,Similarity
0,Henry Rider Haggard,Moon of Israel,0.12972513646887246
1,Henry Rider Haggard,The Ancient Allan,0.11767520768817864
2,Charles Kingsley,The Gospel of the Pentateuch,0.10334551139802409
3,Herbert George Wells,A Short History of the World,0.09917230163484164
4,Henry Rider Haggard,Morning Star,0.09264723972883739
5,Henry Rider Haggard,Cleopatra,0.09209156539817485
6,Charlotte Mary Yonge,The Chosen People,0.09176265918946656
7,Jacob Abbott,Cleopatra,0.08692287937091871
8,Andrew Lang,"Myth, Ritual And Religion, Vol. 2 (of 2)",0.07220997950663849


In [13]:
# Example single-word query: a proper name.
print_similar_books(pipeline,word_counts,authors,book_names,'napoleon')

Search: napoleon


0,1,2,3
#,Author,Book,Similarity
0,George Bernard Shaw,The Man of Destiny,0.5956064477906537
1,Thomas Hardy,The Dynasts,0.4160908567601338
2,William Makepeace Thackeray,The Second Funeral of Napoleon,0.2008900289750264
3,G K Chesterton,The Crimes of England,0.1407308253067979
4,Elizabeth Barrett Browning,The Letters of Elizabeth Barrett Browning (2 of 2),0.11021908998689281
5,Sir Arthur Conan Doyle,Uncle Bernac,0.07984642767188722
6,Ralph Waldo Emerson,Representative Men,0.07851829028888328
7,George Alfred Henty,Through Russian Snows,0.07681647703235125
8,Sir Arthur Conan Doyle,Through the Magic Door,0.055461138015455894


In [14]:
# Example three-word query; all words are transformed together into a single query vector.
print_similar_books(pipeline,word_counts,authors,book_names,'politics revolution society')

Search: politics revolution society


0,1,2,3
#,Author,Book,Similarity
0,Edmund Burke,"The Works of the Right Honourable Edmund Burke, Vol. 04 (of 12)",0.12985253071654268
1,John Stuart Mill,Socialism,0.11527666405902308
2,Daniel Defoe,"An Answer to a Question that Nobody thinks of, viz., But what if the Queen should Die?",0.10763331984331073
3,John Morley,"Critical Miscellanies, Volume 3, Essay 8, France in the Eighteenth Century",0.10474443568386532
4,Bertrand Russell,The Practice and Theory of Bolshevism,0.10085773199849041
5,Edmund Burke,Selections from the Speeches and Writings of Edmund Burke,0.09579529391756486
6,John Morley,Studies in Literature,0.09280093007408849
7,John Stuart Mill,On Liberty,0.08677001076171999
8,John Morley,On Compromise,0.08366302180264625


In [15]:
# Example query on an everyday topic.
print_similar_books(pipeline,word_counts,authors,book_names,'food cooking')

Search: food cooking


0,1,2,3
#,Author,Book,Similarity
0,Lewis Carroll,Feeding the Mind,0.05578203145568527
1,Thomas Robert Malthus,An Essay on the Principle of Population,0.0530331959942741
2,James Otis,Richard of Jamestown,0.04705107567078652
3,Jack London,The People of the Abyss,0.0431555594109479
4,Sir Francis Galton,The Art of Travel,0.04204379877672076
5,P B Shelley,A Vindication of Natural Diet,0.03750145398116882
6,Herbert George Wells,The Food of the Gods and How It Came to Earth,0.035578622772523515
7,John Galsworthy,Another Sheaf,0.03537026558594059
8,Jonathan Swift,A Modest Proposal,0.03307225506738709
