In [None]:
# importing pandas and numpy 
import numpy as np
import pandas as pd

In [None]:
# Load the book metadata from books.csv (a relative path, so it is resolved
# against the current working directory) into a DataFrame
books = pd.read_csv("books.csv")

In [None]:

# Preview the first row to see which columns the CSV provides
books.head(1)

In [None]:
# Keep only the columns that the rest of the notebook works with
useful_columns = ['wikipedia_id', 'title', 'author', 'genre', 'summary']
books = books[useful_columns]
books.info()
# Hold on to the author names as loaded: the 'author' column is rewritten
# further down, and this Series is attached to the exported frame later on
temp = books['author']

In [None]:
# Drop every row that has a NULL value in any column.
# The null counts are printed explicitly: an expression that is not the last
# line of a cell is evaluated but never displayed.
print(books.isnull().sum())
# The index is intentionally NOT reset here: the author Series saved in `temp`
# keeps the original row labels and is re-attached later by label.
books = books.dropna()
print(books.isnull().sum())
books.info()

In [None]:
# https://docs.python.org/3/library/ast.html
import ast

In [None]:
# Parses the string form of a dict or list literal and returns its entries as a list
def clean(obj):
    """Parse a Python-literal string and return its entries as a list.

    The string is parsed with ``ast.literal_eval`` (literals only, no code
    execution).

    Parameters
    ----------
    obj : str
        Text of a dict, list or tuple literal, e.g. ``'{"k": "Fiction"}'``.

    Returns
    -------
    list
        The dict's values in insertion order, or the elements of the
        list/tuple.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``obj`` is not a valid literal.
    TypeError
        If the literal is neither a dict nor a list/tuple.
    """
    parsed = ast.literal_eval(obj)
    if isinstance(parsed, dict):
        return list(parsed.values())
    if isinstance(parsed, (list, tuple)):
        # Previously a list literal failed because it was indexed like a dict.
        return list(parsed)
    raise TypeError(
        f"expected a dict or list literal, got {type(parsed).__name__}"
    )

In [None]:
# Replace every raw genre string with the list that clean() returns for it.
# NOTE: this cell is not re-runnable as-is; a second run would hand clean()
# the already-parsed lists instead of strings.
books['genre'] = books['genre'].apply(clean)

In [None]:
# Summary: split on whitespace into a list of words. The resulting tokens cannot
# contain spaces, so no extra space-stripping pass over them is needed.
books['summary'] = books['summary'].apply(lambda text: text.split())
# Author: remove the spaces so the whole name becomes one token
# ("Jane Doe" -> ["JaneDoe"]); split() wraps it in a list (empty list for a blank name).
books['author'] = books['author'].apply(lambda name: name.replace(" ", "").split())
# Genre: remove the spaces inside each entry so a multi-word genre stays one token
books['genre'] = books['genre'].apply(lambda genres: [genre.replace(" ", "") for genre in genres])

In [None]:
# Build one lower-cased token list per book: genre entries first, then the author
# token, then the summary words
books['tags'] = (books['genre'] + books['author'] + books['summary']).apply(
    lambda tokens: [token.lower() for token in tokens]
)

In [None]:
# New frame holding only the columns the recommender needs.
# .copy() makes it independent of `books`, so the column assignments made on
# `dframe` later do not raise SettingWithCopyWarning.
dframe = books[['wikipedia_id', 'title', 'tags']].copy()

In [None]:
# Turn each list of tokens into one space-separated string.
# assign() returns a new frame instead of mutating `dframe` in place, so no
# SettingWithCopyWarning can be triggered by this step.
dframe = dframe.assign(tags=dframe['tags'].apply(" ".join))
dframe.head()

In [None]:
# scikit-learn: https://scikit-learn.org/stable/index.html   (Simple and efficient tools for predictive data analysis)
# NLTK: https://www.nltk.org/    (NLTK is a leading platform for building Python programs to work with human language data)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

# Stemmer instance used by stem().
# The CountVectorizer is instantiated in the vectorisation cell, right where it
# is fitted, so it is not created a second time here.
ps = PorterStemmer()

In [None]:
# Stems every word of a string. Porter stemmer docs: https://www.nltk.org/api/nltk.stem.porter.html#module-nltk.stem.porter
def stem(text):
    """Return ``text`` with each whitespace-separated word passed through ``ps.stem``.

    The stemmed words are joined back together with single spaces, so runs of
    whitespace in the input collapse to one space.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Stem every word of the tags. assign() returns a new frame instead of mutating
# `dframe` in place, so no SettingWithCopyWarning can be triggered by this step.
dframe = dframe.assign(tags=dframe['tags'].apply(stem))

In [None]:
# Apply count vectorisation to the tags.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(dframe['tags']).toarray()
# Printed explicitly: an expression that is not the last line of a cell is never displayed.
print(vectors.shape)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate get_feature_names_out().
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Show a sample instead of dumping the whole vocabulary into the cell output.
list(feature_names[:20])

In [None]:
# Function which gives cosines of all points with respect to all other points. More on it : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the rows of `vectors`. Per the scikit-learn
# docs linked above the result is a 2-D array (one row and one column per input
# row), so similarity[i] is addressed by row POSITION, not by DataFrame index label.
similarity = cosine_similarity(vectors) 

In [None]:
# Quick sanity-check helper: prints the closest matches for one title
def recommend(title, top_n=5):
    """Print the titles of the ``top_n`` books most similar to ``title``.

    Reads the notebook-level ``dframe`` (titles) and ``similarity`` (scores).
    Rows of ``similarity`` are addressed by position, so the book is located by
    its position in ``dframe`` rather than by its index label. The two differ
    whenever the index is not 0..n-1, e.g. after rows were dropped without
    resetting the index.

    Parameters
    ----------
    title : str
        Exact title to look up; the first matching row is used.
    top_n : int, default 5
        Number of titles to print.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If no row of ``dframe`` has that title.
    """
    positions = np.flatnonzero((dframe['title'] == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"no book titled {title!r} in dframe")
    book_position = int(positions[0])

    scores = similarity[book_position]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    # Drop the queried book by position instead of assuming it is ranked first;
    # with tied scores it is not guaranteed to be.
    closest = [pos for pos, _ in ranked if pos != book_position][:top_n]

    titles = dframe['title']
    for pos in closest:
        print(titles.iloc[pos])

    return

In [None]:
# The pickle module implements binary protocols for serializing and de-serializing a Python object structure.
import pickle

In [None]:
# Frame that gets pickled below: id, title and the author name as loaded.
# `temp` still carries the original row labels, so it is attached (aligned by
# label) BEFORE the index is reset. The reset makes row labels equal row
# positions, which is what the positional indices returned by extract() refer to.
# assign() also avoids writing into a slice of `dframe` (SettingWithCopyWarning).
book_info = (
    dframe[['wikipedia_id', 'title']]
    .assign(author=temp)
    .reset_index(drop=True)
)

In [None]:
# Returns, for every row of the similarity matrix, the positions of its top recommendations
def extract(similarity, top_n=5):
    """Return the positions of the ``top_n`` most similar items for every row.

    Parameters
    ----------
    similarity : sequence of sequences of numbers
        Similarity matrix; ``similarity[i][j]`` is the score between item ``i``
        and item ``j``.
    top_n : int, default 5
        Number of recommendations kept per item.

    Returns
    -------
    list of list of int
        ``result[i]`` holds the positions of the items most similar to item
        ``i``, best first, and never contains ``i`` itself.
    """
    main = []
    for row_position, scores in enumerate(similarity):
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
        # Exclude the item by position rather than by slicing off rank 0: with
        # tied scores (e.g. identical tags) or an all-zero row the item itself is
        # not guaranteed to be ranked first.
        main.append([pos for pos, _ in ranked if pos != row_position][:top_n])
    return main

In [None]:
# Precompute the recommendation index lists for every row of the similarity matrix
main = extract(similarity)

In [None]:
# https://docs.python.org/3/library/pickle.html
# Context managers guarantee both files are flushed and closed once written.
with open('finbook_dict.pkl', 'wb') as book_file:
    pickle.dump(book_info.to_dict(), book_file)
with open('fin_recommend.pkl', 'wb') as recommend_file:
    pickle.dump(main, recommend_file)