In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
pd.set_option('display.max_rows', 5)

In [2]:
df_users = pd.read_csv("data/user_book_ratings.csv")
df_books = pd.read_csv("data/books_metadata.csv")

In [3]:
df_users.head(20)

Unnamed: 0.1,Unnamed: 0,user_id,book_id,rating
0,0,0,5602347,5
1,1,0,30,5
...,...,...,...,...
18,18,0,3709963,0
19,19,0,249158,4


In [4]:
df_books

Unnamed: 0,book_id,title,series,author,description,genres,pages,publisher,firstPublishDate,awards,setting,coverImg
0,14796360,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,Scholastic Press,,['Locus Award Nominee for Best Young Adult Boo...,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...
1,7743507,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,Scholastic Inc.,06/21/03,['Bram Stoker Award for Works for Young Reader...,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...
...,...,...,...,...,...,...,...,...,...,...,...,...
26762,11115191,Attracted to Fire,,DiAnn Mills (Goodreads Author),Special Agent Meghan Connors' dream of one day...,"['Christian Fiction', 'Christian', 'Suspense',...",416,Tyndale House Publishers,September 16th 2011,['HOLT Medallion by Virginia Romance Writers N...,['West Texas (United States)'],https://i.gr-assets.com/images/S/compressed.ph...
26763,602931,Anasazi,Sense of Truth #2,Emma Michaels,"'Anasazi', sequel to 'The Thirteenth Chime' by...","['Mystery', 'Young Adult']",190,Bokheim Publishing,August 3rd 2011,[],[],https://i.gr-assets.com/images/S/compressed.ph...


In [13]:
df_users = df_users.groupby('user_id')['book_id'].apply(list).reset_index(name='books_list')
df_ratings = df_users.groupby('user_id')['rating'].apply(list).reset_index(name='ratings_list')
df_ratings

Unnamed: 0,user_id,ratings_list
0,0,"[5, 5, 5, 4, 4, 5, 5, 5, 4, 5, 5, 5, 5, 5, 4, ..."
1,1,"[4, 4, 0, 5, 0, 0, 5, 0, 0, 3, 0, 5, 0, 5, 5, ..."
...,...,...
819365,876143,"[4, 5, 4, 4, 5, 5, 5, 4]"
819366,876144,"[0, 0]"


From the above grouping, we can see that we have approx. 820k users in our database.<br/>
df_users now contain all users with, for each user, a list of the books the have rated while df_ratings contain all users with a list of ratings they have given.

In [6]:

def list_converter(number_list): 
    author_list = []
    for i in number_list:
        name = df_books.loc[df_books["book_id"]==i,"author"].to_string(index=False)
        author_list.append(name)
    return author_list


Get authors for all books a given user has read and merge the two dataframes:

In [15]:
df_users["authors"]= df_users.apply(lambda row : list_converter(row["books_list"]),axis=1)
df_users["ratings"]= df_ratings["ratings_list"]
df_users

KeyboardInterrupt: 

In [8]:
# def averages(names, values):
#     # Group the items by name.
#     value_lists = defaultdict(list)
#     for name, value in zip(names, values):
#         value_lists[name].append(value)
    
#     # Take the average of each list.
#     # 9.2.5
#     result = {}
#     mean = sum(values) / len(values)
#     values_std = [(v - mean) for v in values]
#     for name, values in value_lists.items():
#         result[name] = sum(values_std) / float(len(values_std))
#     return result
     
def averages(names, non_stand_values):
    mean = sum(non_stand_values) / len(non_stand_values)
    values = [x - mean for x in non_stand_values]
    
    # Group the items by name.
    value_lists = defaultdict(list)
    for name, value in zip(names, values):
        value_lists[name].append(value)
        
    # Take the average of each list.
    result = {}
    for name, values in value_lists.items():
        result[name] = sum(values) / float(len(values))
    return result

df_users["author_rating"]= df_users.apply(lambda row : averages(row["authors"],row["ratings"]),axis=1)

    
averages(["john","danny","john","john"],[0,5,5,5])

{'john': -0.4166666666666667, 'danny': 1.25}

In [9]:
df_users

Unnamed: 0,user_id,movies_list,authors,ratings,author_rating
0,0,"[5602347, 30, 12528798, 25026517, 835, 2202194...","[Bill Bryson, J.R.R. Tolkien, Frank Herbert, N...","[5, 5, 5, 4, 4, 5, 5, 5, 4, 5, 5, 5, 5, 5, 4, ...","{'Bill Bryson': 0.12264150943396235, 'J.R.R. T..."
1,1,"[9003477, 627206, 395614, 1730369, 91760, 2409...","[Tracy Chevalier (Goodreads Author), Betty Edw...","[4, 4, 0, 5, 0, 0, 5, 0, 0, 3, 0, 5, 0, 5, 5, ...",{'Tracy Chevalier (Goodreads Author)': 0.81999...
...,...,...,...,...,...
14,14,"[17800990, 19009257, 29230876, 30741235, 33079...","[Charlotte Stein (Goodreads Author), Beth Kery...","[5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 0, 5, 0, 0, 0, ...",{'Charlotte Stein (Goodreads Author)': 3.66071...
15,15,"[9926547, 633409, 1277772, 24473485, 13066640,...","[Sara Gruen (Goodreads Author), John Fowles, U...","[3, 0, 3, 3, 3, 2, 0, 4, 4, 0, 3, 0, 5, 0, 4, ...",{'Sara Gruen (Goodreads Author)': 1.0612244897...


In [10]:
df = df_users.drop(columns=['authors', 'ratings'])
df.iloc[0,2]

{'Bill Bryson': 0.12264150943396235,
 'J.R.R. Tolkien': 2.6226415094339623,
 'Frank Herbert': 2.6226415094339623,
 'Neal Stephenson (Goodreads Author)': 1.4226415094339624,
 'Dan Harrington, Bill Robertie': 1.6226415094339623,
 'Donald A. Norman': 2.6226415094339623,
 'Khaled Hosseini (Goodreads Author), Berliani M....': 2.6226415094339623,
 'Steven Pressfield (Goodreads Author)': 0.12264150943396235,
 'David Allen': 1.6226415094339623,
 'Ayn Rand, Leonard Peikoff (Goodreads Author) (A...': 2.6226415094339623,
 'Jane Austen, Anna Quindlen (Introduction)': 2.6226415094339623,
 'David McCullough': 0.12264150943396235,
 'Lance Armstrong, Sally Jenkins': 2.6226415094339623,
 'Malcolm Gladwell': 1.6226415094339623,
 'Clayton M. Christensen, Michael E. Raynor': 1.6226415094339623,
 'Clayton M. Christensen, L.J. Ganser (Narrator)': -2.3773584905660377,
 'H.G. Wells, Greg Bear (Introduction), Carlo Pag...': -2.3773584905660377,
 'Jack Kerouac': 1.6226415094339623,
 'Naomi Klein': -2.3773584905

In [11]:
%%file UP_MapReduce.py
 
from mrjob.job import MRJob
from mrjob.step import MRStep
import pandas as pd

df_book = pd.read_csv("data/books_metadata.csv")

class UP_MapReduce(MRJob): #class that inherits from MRJob 

    def string_to_list(self,s, awards=False):
        if awards:
            s_ = s.strip("[]").split(", ")
            s = []
            for aw in s_:
                s.append(aw[1:-1]) # Chop off the " from front and back, somewhat tideous but works
        else:
            s = s.strip("[]").split("', '")
            # Remove "'" from start of first and end of last item
            s[0] = s[0][1:]
            s[-1] = s[-1][:-1]
        return s

    def get_genre_list(self, bookID):
        return self.string_to_list(df_book[df_book["book_id"]==bookID]["genres"], awards=False)
        
    
    def steps(self):
        #need to define the order of mappers and reducers, so that the functions takes in the right key and value
        #only one mapper and reducer per step, so therefore need several steps
        #do we need the combiner between the mapper and reducer??
        return [ 
            MRStep(mapper=self.mapper_1, reducer=self.reducer_1),
#             MRStep(mapper=self.mapper_2, reducer = self.reducer_2),
#             MRStep(mapper = self.mapper_3, reducer = self.reducer_3),
#             MRStep(mapper = self.mapper_4)
        ]

    
    #key - ignored
    #values - corpus of documents (i.e. the data frame books_metadata)
    def mapper_1(self, _, line):  #each line is a row, i.e. a book 
        if line.find("book_id") == -1:
            list = line.split( ",")
            userID = list[2]
            bookID = list[3]
            rating = list[4]
            genres = self.get_genre_list(bookID)
            for genre in genres:
                yield (userID,genre), rating  # returns key-value pairs ((word, id), 1)

    #key - the key which was yielded by the mapper, i.e. (word, id). (key[0] = word, key[1]=id)
    #values - A generator which yields all values yielded by the mapper which correspond to key, i.e. a list of 1's
    def reducer_1(self, userID_genre, rating): 
        yield (userID_genre[0], userID_genre[1]), average(rating) # returns key-value pairs ((word, id), n)

if __name__ == '__main__':
    UP_MapReduce.run()

Overwriting UP_MapReduce.py


In [12]:
df = pd.read_csv("data/small_user_book_rating.csv")
df.iloc[0,1]

FileNotFoundError: [Errno 2] No such file or directory: 'data/small_user_book_rating.csv'

In [None]:
! python UP_MapReduce.py data/small_user_book_rating.csv > output

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\imad3\AppData\Local\Temp\UP_MapReduce.imad3.20221124.150757.757370
Running step 1 of 1...

Error while reading from C:\Users\imad3\AppData\Local\Temp\UP_MapReduce.imad3.20221124.150757.757370\step\000\mapper\00000\input:

Traceback (most recent call last):
  File "C:\Users\imad3\Desktop\DataScienceFinalProject\UP_MapReduce.py", line 57, in <module>
    UP_MapReduce.run()
  File "C:\Users\imad3\anaconda3\lib\site-packages\mrjob\job.py", line 616, in run
    cls().execute()
  File "C:\Users\imad3\anaconda3\lib\site-packages\mrjob\job.py", line 687, in execute
    self.run_job()
  File "C:\Users\imad3\anaconda3\lib\site-packages\mrjob\job.py", line 636, in run_job
    runner.run()
  File "C:\Users\imad3\anaconda3\lib\site-packages\mrjob\runner.py", line 503, in run
    self._run()
  File "C:\Users\imad3\anaconda3\lib\site-packages\mrjob\sim.py", line 161, in _run
  

In [None]:
small_user_book_rating = df_user.head(1000)

In [None]:
small_user_book_rating.to_csv("data/small_user_book_rating.csv")

In [None]:
df_book = pd.read_csv("data/books_metadata.csv")


def string_to_list(s, awards=False):
        print()
        if awards:
            s_ = s.strip("[]").split(", ")
            s = []
            for aw in s_:
                s.append(aw[1:-1]) # Chop off the " from front and back, somewhat tideous but works
        else:
            s = s.strip("[]").split("', '")
            # Remove "'" from start of first and end of last item
            s[0] = s[0][1:]
            s[-1] = s[-1][:-1]
        return s

    
with open("data/small_user_book_rating.csv", 'r') as file:
    next(file) #Skip header
    while (line := file.readline().rstrip()):
        #Retrieve user, book id and associated rating
        _, _, user_id, book_id, rating = line.split(",")
        
        print(book_id)
        print(df_book[df_book["book_id"]==book_id]["genres"])
        
        break

5602347
Series([], Name: genres, dtype: object)
