In [14]:
import numpy as np
from scipy.sparse import lil_matrix
import os
import pickle
import csv
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation as LDA
from scipy.spatial import distance

books_file = 'books.csv'
users_ratings = 'books_ratings.csv'


In [15]:
def find_most_similar(vector,matrix):
    return np.argsort(distance.cdist([book_row],books_vector,metric='cosine'))

def get_book_name(books_list,size):
    book_names=[]
    for book in books_list[0,1:size+1]:
        book_names.append(books_df.iloc[book]["original_title"])
    return book_names
    

<h2>Read "Books.csv"</h2>

In [16]:
books_df=pd.read_csv(books_file,index_col='book_id') ## book_id as index
books_df.head()

Unnamed: 0_level_0,authors,original_publication_year,original_title,language_code
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Suzanne Collins,2008.0,The Hunger Games,eng
1,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,eng
2,Stephenie Meyer,2005.0,Twilight,en-US
3,Harper Lee,1960.0,To Kill a Mockingbird,eng
4,F. Scott Fitzgerald,1925.0,The Great Gatsby,eng


<h2>Read "books_ratings.csv"</h2>

In [17]:
users_liked={}
loc_user_id={}
loc_id_user={}
c=0
with open(users_ratings) as file1:
    csv_users=csv.reader(file1)
    next(csv_users)
    for row in csv_users:
        user_id,book_id,rating=row[0],row[1],int(row[2])
        if rating>=4:
            if user_id not in loc_user_id.keys():
                loc_user_id[user_id]=c
                loc_id_user[c]=user_id
                c+=1
            users_liked[user_id]=users_liked.get(user_id,dict())
            users_liked[user_id][book_id]=rating

<h2> Define sparse matrix with local user id as rows and book ids as columns</h2>

In [18]:
sparse_matrix=lil_matrix((c,books_df.shape[0]),dtype='int')
sparse_matrix.shape

(13071, 10000)

<h2>fill the sparse matrix with data</h2>

In [19]:
for user,liked in users_liked.items():
    for book_str,rating in liked.items():
        book=int(book_str)
        sparse_matrix[loc_user_id[user],book]=rating

In [20]:
print("sparcity=",sparse_matrix.nnz/(sparse_matrix.shape[0]*sparse_matrix.shape[1]))

sparcity= 0.005159077346798256


In [25]:
lda = LDA(n_topics=30,max_iter=50)

In [26]:
lda.fit(sparse_matrix)
books_vector=lda.components_.transpose()



In [27]:
for count,book_row in enumerate(books_vector):
    similarity_list=find_most_similar(book_row,books_vector)
    print(books_df.iloc[count]["original_title"])
    print(get_book_name(similarity_list,3))
# #     print(np.argsort(ms))
#     print("---",books_df["original_title"].iloc[count],"---","most similar to:","---",books_df["original_title"].iloc[np.argsort(ms)[0][1]],"---")
    if count==100:
        break

The Hunger Games
['Catching Fire', 'Revenge of the Spellmans', 'The Book Thief']
Harry Potter and the Philosopher's Stone
['Waiting to Exhale', 'Keys to Drawing', "Just Me in the Tub (Mercer Mayer's Little Critter)"]
Twilight
['Confessions of a Shopaholic', 'Jemima J', 'Eclipse']
To Kill a Mockingbird
['Of Mice and Men ', 'Lord of the Flies ', 'The Adventures of Huckleberry Finn']
The Great Gatsby
['The Catcher in the Rye', 'Of Mice and Men ', 'The Adventures of Huckleberry Finn']
The Fault in Our Stars
['Evergreen', ' 絶対彼氏 (Zettai Kareshi) 1', '紳士同盟† 1']
The Hobbit or There and Back Again
[' The Fellowship of the Ring', 'Le Comte de Monte-Cristo', 'The Princess Bride']
The Catcher in the Rye
['The Great Gatsby', 'Animal Farm: A Fairy Story', 'Of Mice and Men ']
Angels & Demons 
['The Firm', 'The Maze Runner', 'The Lost Symbol']
Pride and Prejudice
['Jane Eyre', 'Sense and Sensibility', 'Little Women']
The Kite Runner 
['Memoirs of a Geisha', 'The Secret Life of Bees', 'Divine Secrets 

A Time to Kill
['The Bourne Identity', 'Jurassic Park', 'Mother of Pearl']
Un di Velt Hot Geshvign
['The Color Purple', 'I Know Why the Caged Bird Sings', 'September']
Paper Towns
['Blue Bloods', 'The Princess Diaries', 'The Summoning']
The Princess Bride
['Het Achterhuis: Dagboekbrieven 14 juni 1942 - 1 augustus 1944', 'Fahrenheit 451', 'Le Comte de Monte-Cristo']
The Outsiders
['Number the Stars', 'The True Confessions of Charlotte Doyle', 'Knuffle Bunny:  A Cautionary Tale']
The Maze Runner
['The Firm', 'Lone Survivor: The Eyewitness Account of Operation Redwing and the Lost Heroes of SEAL Team 10', 'My Utmost for His Highest']
Freakonomics: A Rogue Economist Explores the Hidden Side of Everything
['The Tipping Point: How Little Things Can Make a Big Difference', 'Thirteen Moons', 'Into Thin Air: A Personal Account of the Mt. Everest Disaster']
The Secret Garden
['Winter Solstice', 'The Lemonade War', "Grandfather's Journey"]
Cien años de soledad
['Lolita', 'Physics of the Impossibl