### Importing necessary dependencies

In [1]:
import pandas as pd
import numpy as np
import math

### Reading the dataset and removing unnamed columns

In [2]:
df = pd.read_csv('books_with_blurbs.csv')

df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

df.dropna(inplace=True)

df

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o..."
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea..."
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...
3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,Historians and inquisitive laymen alike love t...
4,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994,River City Pub,This highly praised first novel by fiction wri...
...,...,...,...,...,...,...
57505,451458877,Tainted Trail,Wen Spencer,2002,Roc,"Ukiah Oregon, half-man and half-alien raised b..."
57506,399148736,Twelve Mile Limit,Randy Wayne White,2002,Penguin Putnam,"On a Friday in early November, four people hea..."
57507,399148841,The Man With the Red Tattoo (James Bond 007),Raymond Benson,2002,Putnam Publishing Group,On a quiet late-night flight from Tokyo to Lon...
57508,553578979,"Iron Fist (Star Wars: X-Wing Series, Book 6)",Aaron Allston,1998,Bantam,They are the Rebel Alliance's ultimate strike ...


### Cleaning Blurb Text

In [3]:
import re

def clean_text(text):
    text = text.lower()
    text = text.strip()
    text = text.encode('ascii', 'ignore').decode()
    text = ' '.join(text.split())
    res = re.sub(r'[^\w\s]', '', text) 
    return res

df['summary'] = df['Blurb'].apply(clean_text)
# df['summary'] = df['Author'].apply(clean_text).str.cat(df['Blurb'].apply(clean_text), sep =" ") 
df = df.head(15000)

df

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,summary
0,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o...",here for the first time in paperback is an out...
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"The fascinating, true story of the world's dea...",the fascinating true story of the worlds deadl...
2,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,Winnie and Helen have kept each others worst s...,winnie and helen have kept each others worst s...
3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,Historians and inquisitive laymen alike love t...,historians and inquisitive laymen alike love t...
4,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994,River City Pub,This highly praised first novel by fiction wri...,this highly praised first novel by fiction wri...
...,...,...,...,...,...,...,...
14995,441240941,Flood,Richard Martin Stern,1981,Ace Books,The forces of greed confront the forces of nat...,the forces of greed confront the forces of nat...
14996,373074646,"Castle Of Dreams (Silhouette Intimate Moments,...",Maura Seger,1992,Silhouette,"BURIED TREASURE..., Held hostage by terrorists...",buried treasure held hostage by terrorists bri...
14997,446360074,Rage of Angels,Sidney Sheldon,1988,Springer-Verlag,"SHELDON'S MOST MEMORABLE HEROINE YET ...,JENNI...",sheldons most memorable heroine yet jennifer ...
14998,373706413,Forbidden (Women Who Dare) (Harlequin Superrom...,Ellen James,1995,Harlequin,"has said to hell with security, her ex-lover ...",has said to hell with security her exlover and...


### Vectorizing text content using TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')


#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['summary'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(15000, 98482)

### Calculating Similarity Score from vectorized input

In [5]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


### Creating an Index on Book Titles

In [6]:
indices = pd.Series(df.index, index=df['Title']).drop_duplicates()

### Building a recommender based on cosine similarity

In [7]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]

    movie_indices = [i[0] for i in sim_scores]

    return sim_scores

### Testing the results

In [8]:
# Scores without considering author details
scores = get_recommendations('Goodbye to the Buttermilk Sky')

print("Printing similarity.....")

for score in scores:
    print('{name} - {sc}'.format(name=df['Title'].iloc[score[0]], sc=score[1]))

Printing similarity.....
Crazy in Alabama - 0.12915328943155102
The Art &amp - 0.12656561397553184
To the Edge of the Sky: A Story of Love, Betrayal, Suffering, and the Strength of Human Courage - 0.1019376123912428
Arena - 0.10012447120654527
The Women on the Porch (Southern Classics Series) - 0.09670360324021388
Memorias de Una Vaca - 0.0958978115982818
The Carousel - 0.09531858622787225
Bears on Wheels (Bright &amp - 0.09113426228857655
Ten Apples Up on Top! (Bright &amp - 0.09113426228857655
Once upon a More Enlightened Time: More Politically Correct Bedtime Stories - 0.08973260467952754


### Checking the similarity of book read and recommended book

In [9]:
predicted = df.iloc[scores[0][0]]
print('Actual summary - ')
actual = df.iloc[indices['Goodbye to the Buttermilk Sky']]
print(actual['summary'])
print("Predicted...")
print(predicted['summary'])

Actual summary - 
this highly praised first novel by fiction writer julia oliver is the story of one young womans struggle with fidelity and identity in depressionera rural alabama a beautifully narrated novel of time and place goodbye to the buttermilk sky recreates a southern summer when the depression and the boll weevil turned hopes to dust with the extraordinary talent to make the reader see the ball canning jars on the kitchen table hear the clicks on the party line and feel the bittersweet moments of 20yearold callie tatums first experiences with adult desire oliver portrays a young wifes increasingly dangerous infidelity with cinematic precision and palpable suspense soon with only her housekeeper as a confidant callie breaks societys rules about race and class as well as her marriage vows the result is a chain of events that will lead to tragedy and a womans stunning decision about love passion and the future of her lifeoriginally published in cloth in 1994 goodbye to the butt