**Content-based book recommendation system** that determines which books are close to each other based on how similar the topics they discuss are. The dataset consists of books written by Charles Darwin.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import glob
import re, os
import pickle
from gensim.models import TfidfModel
from gensim import similarities
from gensim import corpora
from scipy.cluster import hierarchy

%matplotlib inline

In [2]:
# Directory holding one plain-text file per book
folder = "datasets_books/"

# Collect every .txt file in the folder, in alphabetical order
files = sorted(glob.glob(os.path.join(folder, '*.txt')))

files

['datasets_books\\Autobiography.txt',
 'datasets_books\\CoralReefs.txt',
 'datasets_books\\DescentofMan.txt',
 'datasets_books\\DifferentFormsofFlowers.txt',
 'datasets_books\\EffectsCrossSelfFertilization.txt',
 'datasets_books\\ExpressionofEmotionManAnimals.txt',
 'datasets_books\\FormationVegetableMould.txt',
 'datasets_books\\FoundationsOriginofSpecies.txt',
 'datasets_books\\GeologicalObservationsSouthAmerica.txt',
 'datasets_books\\InsectivorousPlants.txt',
 'datasets_books\\LifeandLettersVol1.txt',
 'datasets_books\\LifeandLettersVol2.txt',
 'datasets_books\\MonographCirripedia.txt',
 'datasets_books\\MonographCirripediaVol2.txt',
 'datasets_books\\MovementClimbingPlants.txt',
 'datasets_books\\OriginofSpecies.txt',
 'datasets_books\\PowerMovementPlants.txt',
 'datasets_books\\VariationPlantsAnimalsDomestication.txt',
 'datasets_books\\VolcanicIslands.txt',
 'datasets_books\\VoyageBeagle.txt']

Basic pre-processing 

In [3]:
txts = []
titles = []

for n in files:
    # The context manager closes each file handle as soon as it has been read.
    # 'utf-8-sig' decodes UTF-8 and skips a leading byte-order mark if present.
    with open(n, encoding='utf-8-sig') as f:
        raw_text = f.read()
    # Replace every run of non-alphanumeric characters (and underscores) with a
    # single space; the raw string avoids the invalid '\W' escape warning
    data = re.sub(r'[\W_]+', ' ', raw_text)
    # Store the texts and titles of the books
    titles.append(os.path.basename(n).replace('.txt', ''))
    txts.append(data)

# Number of characters in each cleaned book
[len(t) for t in txts]

[123231,
 496068,
 1776539,
 617088,
 913713,
 624232,
 335920,
 523021,
 797401,
 901406,
 1047518,
 1010643,
 767492,
 1660866,
 298319,
 916267,
 1093567,
 1043499,
 341447,
 1149574]

In [4]:
# Finding Darwin's most famous book: "On the Origin of Species" for analysis.
# list.index raises ValueError when the title is missing, instead of leaving
# `ori` undefined (or stale from an earlier run of the kernel).
ori = titles.index("OriginofSpecies")
print(ori)

15


Tokenizing the corpus. <br/>
Transforming each text into a list of the individual words.

In [6]:
# Words that are dropped from every book before modelling
stoplist = set('for a of the and to in to be which some is at that we i who whom show via may my our might as well'.split())

# Lower-case each book, then break it into whitespace-separated tokens
txts_lower_case = list(map(str.lower, txts))
txts_split = list(map(str.split, txts_lower_case))

# Keep only the tokens that are not stop words
texts = [
    [token for token in tokens if token not in stoplist]
    for tokens in txts_split
]

# First ten remaining tokens of "On the Origin of Species"
texts[ori][:10]

['on',
 'origin',
 'species',
 'but',
 'with',
 'regard',
 'material',
 'world',
 'can',
 'least']

Stemming (loading the pre-computed stemmed tokens from a pickle file)

In [11]:
# Load the pre-computed stemmed tokens
# (presumably one token list per book, in the same order as `files` - confirm).
# NOTE: pickle.load can execute arbitrary code; only load this file if it comes
# from a trusted source.
with open("datasets_books/texts_stem.p", "rb") as stem_file:
    texts_stem = pickle.load(stem_file)

# First ten stemmed tokens of "On the Origin of Species"
texts_stem[ori][:10]

Building a bag-of-words model

In [None]:
# Build the token <-> id mapping over all stemmed books
dictionary = corpora.Dictionary(texts_stem)

# Convert every book into its bag-of-words representation
bows = list(map(dictionary.doc2bow, texts_stem))

# First five bag-of-words entries of "On the Origin of Species"
bows[ori][:5]

Most common words of a given book

In [None]:
# Bag-of-words of "On the Origin of Species" as a table, with the token text
# looked up from the dictionary and the most frequent tokens first
df_bow_origin = (
    pd.DataFrame(bows[ori], columns=['index', 'occurrences'])
    .assign(token=lambda frame: frame['index'].map(lambda token_id: dictionary[token_id]))
    .sort_values('occurrences', ascending=False)
)

df_bow_origin.head(10)

Build a tf-idf model

In [None]:
# Fit a tf-idf model on the bag-of-words corpus
model = TfidfModel(bows)

# Show only the first few (token id, score) pairs for "On the Origin of Species"
# instead of dumping the whole vector, matching the bag-of-words preview
model[bows[ori]][:5]

In [None]:
# tf-idf scores of "On the Origin of Species" as a table, with the token text
# looked up from the dictionary and the highest-scoring tokens first
df_tfidf = (
    pd.DataFrame(model[bows[ori]], columns=['id', 'score'])
    .assign(token=lambda frame: frame['id'].map(lambda token_id: dictionary[token_id]))
    .sort_values('score', ascending=False)
)

df_tfidf.head(10)

Compute similarity between texts

In [None]:
# Similarity index over the tf-idf representation of every book
sims = similarities.MatrixSimilarity(model[bows])

# Book-by-book similarity table, labelled with the book titles on both axes
sim_df = pd.DataFrame(list(sims), index=titles, columns=titles)

sim_df

The book most similar to "On the Origin of Species"

In [None]:
# Similarity of every book to "On the Origin of Species"
# (the book itself is included in this column)
v = sim_df['OriginofSpecies']

# Ascending sort puts the most similar books at the top of the horizontal bar chart
v_sorted = v.sort_values()

# A Series plot takes no x/y column names, so the bars are drawn directly on an
# explicit Axes; the labels are then set on that same Axes
fig, ax = plt.subplots()
v_sorted.plot.barh(ax=ax, rot=0)

ax.set_xlabel("Score")
ax.set_ylabel("Book")
ax.set_title("Similarity");

Which books have similar content?

In [None]:
# Computing the clusters from the similarity matrix,
# using the Ward variance minimization algorithm.
# The already computed similarity table is passed as a plain array, rather than
# the gensim index object, so no similarity queries are re-run and linkage gets
# an explicit 2-D numeric input.
Z = hierarchy.linkage(sim_df.values, 'ward')

# Plain list of titles as leaf labels; the trailing semicolon hides the
# dictionary that dendrogram returns
hierarchy.dendrogram(Z, leaf_font_size=8, labels=list(sim_df.index), orientation='left');