In [1]:
import numpy as np
from scipy.sparse import lil_matrix
import os
import pickle
import csv
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation as LDA
from scipy.spatial import distance

books_file = 'books.csv'
users_ratings = 'books_ratings.csv'


In [64]:
def find_most_similar(vector,matrix):
    return np.argsort(distance.cdist([vector],matrix,metric='cosine'))[::-1]

def get_book_name(books_list,size):
    book_names=[]
    for book in books_list[0:size]:
        book_names.append(books_df.iloc[book]["original_title"])
    return book_names
    
def find_similar_books(books_vector,books_list,list_length):
    for row in books_list:
        book=books_vector[row,:]
        similarity_list=find_most_similar(book,books_vector)[0]
        print(books_df.iloc[row]["original_title"])
        print(get_book_name(similarity_list,list_length))

def show_groups(books_vector,list_length,list_of_groups=None):
    books_vector_transpose=books_vector.transpose()
    if list_of_groups==None:
        iteration_range=range(0,n_topics)
    else:
        iteration_range=list_of_groups
    for group in iteration_range:
        print("Group",group)
        print(get_book_name(books_vector_transpose[group].argsort()[::-1],list_length))

def create_new_user(reading_list,model):
    user_matrix=lil_matrix((1,books_df.shape[0]),dtype='int')
    for book in reading_list:
        user_matrix[0,book]=5
    result=model.transform(user_matrix)
    return result[0]

def user_recommended_books(user,books_vector):
    temp_mulp=np.sum(np.multiply(new_user,books_vector),axis=1)
    user_recommendations=books_df[~books_df.original_title.isin(new_user_books)]
    user_recommendations=books_df.loc[:,['authors','original_publication_year','original_title']]
    user_recommendations['user_index']=temp_mulp
    user_recommendations=user_recommendations[user_recommendations.user_index>200]
    return user_recommendations.sort_values('user_index',ascending=False)

def get_recommended_authors(recommended_df):
    authors=recommendation.groupby('authors').mean().sort_values('user_index',ascending=False).index
    return list(authors)

<h2>Read "Books.csv"</h2>

In [3]:
books_df=pd.read_csv(books_file,index_col='book_id')## book_id as index
books_df['authors']=books_df.authors.apply(lambda x: x.split(",")[0])
print(books_df.shape)
books_df.head(10)

(10000, 4)


Unnamed: 0_level_0,authors,original_publication_year,original_title,language_code
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Suzanne Collins,2008.0,The Hunger Games,eng
1,J.K. Rowling,1997.0,Harry Potter and the Philosopher's Stone,eng
2,Stephenie Meyer,2005.0,Twilight,en-US
3,Harper Lee,1960.0,To Kill a Mockingbird,eng
4,F. Scott Fitzgerald,1925.0,The Great Gatsby,eng
5,John Green,2012.0,The Fault in Our Stars,eng
6,J.R.R. Tolkien,1937.0,The Hobbit or There and Back Again,en-US
7,J.D. Salinger,1951.0,The Catcher in the Rye,eng
8,Dan Brown,2000.0,Angels & Demons,en-CA
9,Jane Austen,1813.0,Pride and Prejudice,eng


<h2>Read "books_ratings.csv"</h2>

In [4]:
users_liked={}
loc_user_id={}
loc_id_user={}
c=0
with open(users_ratings) as file1:
    csv_users=csv.reader(file1)
    next(csv_users)
    for row in csv_users:
        user_id,book_id,rating=row[0],row[1],int(row[2])
        if rating>=4:
            if user_id not in loc_user_id.keys():
                loc_user_id[user_id]=c
                loc_id_user[c]=user_id
                c+=1
            users_liked[user_id]=users_liked.get(user_id,dict())
            users_liked[user_id][book_id]=rating

<h2> Define sparse matrix with local user id as rows and book ids as columns</h2>

In [5]:
sparse_matrix=lil_matrix((c,books_df.shape[0]),dtype='int')
sparse_matrix.shape

(13071, 10000)

<h2>fill the sparse matrix with data</h2>

In [6]:
for user,liked in users_liked.items():
    for book_str,rating in liked.items():
        book=int(book_str)
        sparse_matrix[loc_user_id[user],book]=rating

In [7]:
print("sparcity=",sparse_matrix.nnz/(sparse_matrix.shape[0]*sparse_matrix.shape[1]))

sparcity= 0.005159077346798256


<h2>Train the LDA model</h2>

In [8]:
n_topics=100
lda = LDA(n_topics=n_topics,max_iter=50)

In [9]:
lda.fit(sparse_matrix)
books_vector=lda.components_.transpose()
users_vector=lda.transform(sparse_matrix)
books_df["group"]=np.argmax(books_vector,axis=1)



In [10]:
find_similar_books(books_vector,[497,3481,8502,263,1183,120,2343,112,267,159,4671],10)

Война и миръ
['Война и миръ', 'Анна Каренина', 'Dubliners', 'The Magus', 'Madame Bovary', 'A Portrait of the Artist as a Young Man', 'Heart of Darkness', 'Der Steppenwolf: Erzählung', 'Pygmalion and Three Other Plays', 'Four Great Tragedies: Hamlet, Othello, King Lear, Macbeth (Signet Classics)']
Das Schloß
['Das Schloß', "L'Existentialisme est un humanisme ", 'Dog on It', 'Aura', 'Saving Francesca', 'The Snowman', 'Ensaio Sobre a Lucidez', 'A Briefer History of Time', ' سمفونی مردگان', nan]
Buddenbrooks: Verfall einer Familie
['Buddenbrooks: Verfall einer Familie', 'Alexander Hamilton', 'The Fire Next Time', 'The War of Art: Break Through the Blocks and Win Your Inner Creative Battles', 'Mythologies', 'The Johnstown Flood', 'Герой нашего времени', '午後の曳航 [Gogo no eikō]', 'A Distant Mirror: The Calamitous 14th Century', 'Battle Cry of Freedom']
The Sun Also Rises
['The Sun Also Rises', 'Phänomenologie des Geistes', 'Ball Four', 'Записки из Мёртвого дома', 'The Day of the Locust', 'In d

In [11]:
show_groups(books_vector,10)

Group 0
['A Game of Thrones', 'A Clash of Kings', 'A Storm of Swords', 'A Feast for Crows', 'The Name of the Wind', 'Dune', 'I, Robot', '2001: A Space Odyssey', 'Sphere', 'A Dance with Dragons']
Group 1
['Under the Dome', 'The Colour of Magic', 'Night Watch', 'Mort', 'Guards! Guards!', 'Reaper Man', 'Small Gods', 'Going Postal', 'Hogfather', 'Wyrd Sisters']
Group 2
['The Wizard Heir', 'The High Lord', 'The Novice', 'The Last Kingdom', 'Darkest Mercy', "The Magicians' Guild", 'Percy Jackson and the Olympians Boxed Set', 'The Pale Rider', 'Enemy of God', 'The Lords of the North']
Group 3
['Shutter Island', 'Dance Upon The Air', 'Face the Fire (Three Sisters Island, #3)', 'Tell No One', 'Heaven and Earth (Three Sisters Island, #2)', 'The Woods', 'Jewels of the Sun', 'Tears of the Moon', 'Born in Fire ', 'Black Rose']
Group 4
['Angels & Demons ', 'Deception Point', 'Digital Fortress', 'Left to Tell: Discovering God Amidst the Rwandan Holocaust', 'Kane and Abel', 'The Book of Negroes', 'Fiv

['The Perks of Being a Wallflower', "The No. 1 Ladies' Detective Agency ", 'Just Listen', 'This Lullaby', 'Tears of the Giraffe', 'Morality for Beautiful Girls', 'The Kalahari Typing School for Men', 'Crank (Crank, #1)', 'The Full Cupboard of Life', 'Someone Like You']
Group 35
['Ἰλιάς', 'Ὀδύσσεια', 'Inferno', 'Frankenstein; or, The Modern Prometheus', 'American Psycho', 'Paradise Lost', 'La Divina Commedia', 'Tales of Caunterbury', 'Leaves of Grass', 'Οἰδίπους Τύραννος']
Group 36
['Wake', 'Heart-Shaped Box', "Don't Let the Pigeon Drive the Bus", 'Altered Carbon', 'The Snowy Day', '84, Charing Cross Road', 'The Wasp Factory', 'The Player of Games', 'Consider Phlebas', 'Princess of the Midnight Ball']
Group 37
['The Shack: Where Tragedy Confronts Eternity', 'Mere Christianity', 'Same Kind of Different as Me', nan, 'The Sparrow', 'Blue Like Jazz: Nonreligious Thoughts on Christian Spirituality', 'The Black Cauldron', 'The Book of Three', 'The Hiding Place', 'The High King ']
Group 38
['H

['La casa de los espíritus', 'Nickel and Dimed: On (Not) Getting By in America', 'Bird by Bird: Some Instructions on Writing and Life', 'Darkly Dreaming Dexter', 'We Were the Mulvaneys', 'Kitchen Confidential: Adventures in the Culinary Underbelly', 'A Fine Balance', 'The Absolutely True Diary of a Part-Time Indian', 'Black Like Me', 'Eva Luna']
Group 71
['Gone', 'Soulless', 'Among the Betrayed', 'The Last Oracle', 'The Death and Life of Charlie St. Cloud', 'Hunger ', 'When We Were Orphans', 'Among the Brave (Shadow Children, #5)', 'Black Order', 'Among the Enemy (Shadow Children, #6)']
Group 72
['To Kill a Mockingbird', 'The Catcher in the Rye', 'Lord of the Flies ', 'Nineteen Eighty-Four', 'Of Mice and Men ', 'Animal Farm: A Fairy Story', 'The Great Gatsby', 'The Adventures of Huckleberry Finn', 'Where the Sidewalk Ends: The Poems and Drawings of Shel Silverstein', 'Brave New World']
Group 73
['The Things They Carried', 'Invisible Man', 'A Streetcar Named Desire', 'The Glass Menageri

In [12]:
new_user_books=[497,3481,8502,263,1183,120,2343,112,267,159]
books_df.loc[new_user_books]

Unnamed: 0_level_0,authors,original_publication_year,original_title,language_code,group
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
497,Leo Tolstoy,1869.0,Война и миръ,eng,47
3481,Franz Kafka,1926.0,Das Schloß,eng,44
8502,Thomas Mann,1901.0,Buddenbrooks: Verfall einer Familie,eng,66
263,Ernest Hemingway,1926.0,The Sun Also Rises,en-US,10
1183,F. Scott Fitzgerald,1933.0,Tender Is the Night,en-US,53
120,Vladimir Nabokov,1955.0,Lolita,eng,72
2343,Charles Bukowski,1975.0,Factotum,,75
112,Joseph Heller,1961.0,Catch-22,en-US,10
267,Kazuo Ishiguro,2005.0,Never Let Me Go,eng,87
159,Charles Dickens,1860.0,Great Expectations,eng,66


In [99]:
new_user=create_new_user(new_user_books,lda)
np.where(new_user>1/n_topics)

(array([10, 44, 66, 75, 87], dtype=int64),)

In [14]:
recommendation=user_recommended_books(new_user,books_vector)

In [15]:
get_recommended_authors(recommendation)

['Albert Camus',
 'Oscar Wilde',
 'Ernest Hemingway',
 'Milan Kundera',
 'Franz Kafka',
 'Joseph Heller',
 'Fyodor Dostoyevsky',
 'Ken Kesey',
 'Gabriel García Márquez',
 'Jack Kerouac',
 'Joseph Conrad',
 'Miguel de Cervantes Saavedra',
 'Anthony Burgess',
 'Vladimir Nabokov',
 'Mikhail Bulgakov',
 'John Steinbeck',
 'Sylvia Plath',
 'Hermann Hesse',
 'Leo Tolstoy',
 'James Joyce',
 'Herman Melville',
 'John Kennedy Toole',
 'Erich Maria Remarque',
 'Cormac McCarthy',
 'Daniel Defoe',
 'William Faulkner',
 'George Orwell',
 'Charles Dickens',
 'Lewis Carroll',
 'Aldous Huxley',
 'Haruki Murakami',
 'J.D. Salinger',
 'Umberto Eco',
 'Samuel Beckett',
 'Jeffrey Eugenides',
 'Kurt Vonnegut Jr.',
 'F. Scott Fitzgerald',
 'Italo Calvino',
 'Ian McEwan',
 'Michael Chabon',
 'Voltaire',
 'Henry Miller',
 'Jonathan Safran Foer',
 'Kazuo Ishiguro',
 'Arundhati Roy',
 'Gustave Flaubert',
 'Ralph Ellison',
 'Salman Rushdie']

In [16]:
books_df[books_df.authors.str.contains("Ferdina")]

Unnamed: 0_level_0,authors,original_publication_year,original_title,language_code,group
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4671,Louis-Ferdinand Céline,1932.0,Voyage au bout de la nuit,eng,10


In [26]:
1/n_topics
(new_user>(1/n_topics))

array([[False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False,  True, False, False, False, False, False,
        False, False, False,  True, False, False, False, False, False,
        False, False, False, False, False, False,  True, False, False,
        False, False, False, False, False, False, False, False, False,
        False]])