In [1]:
import warnings
warnings.filterwarnings('ignore')
from elasticsearch import Elasticsearch, helpers
import pandas as pd
import os
import sys
import logging
import numpy as np
from tqdm import tqdm

data_dir = '../../data/recommendation'

# Loading data

In [2]:
books = pd.read_csv(os.path.join(data_dir, 'Books.csv')).dropna()

# users = pd.read_csv(os.path.join(data_dir, 'Users.csv'))
user_ratings = pd.read_csv(os.path.join(data_dir, 'Ratings.csv'))

In [3]:
books.info()
books.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271354 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271354 non-null  object
 1   Book-Title           271354 non-null  object
 2   Book-Author          271354 non-null  object
 3   Year-Of-Publication  271354 non-null  object
 4   Publisher            271354 non-null  object
 5   Image-URL-S          271354 non-null  object
 6   Image-URL-M          271354 non-null  object
 7   Image-URL-L          271354 non-null  object
dtypes: object(8)
memory usage: 18.6+ MB


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
user_ratings.info()
user_ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
def weighted_rating(v, R, C,m=5):
    w_r =  (v/(v+m))*R + (m/(m+v))*C
    return w_r

In [6]:
%%time
m=5
R = user_ratings[['ISBN', 'Book-Rating']].groupby(['ISBN'], as_index=True).mean()
C = user_ratings['Book-Rating'].mean()
df = pd.concat([R, user_ratings.groupby(['ISBN'], as_index=True).size().to_frame()],axis=1)
df['weighted_average'] = df.apply(lambda row: (row[0]/(row[0]+m))*row['Book-Rating'] + (m/(row[0]+m))*C, axis=1)
df.head()
# isbn = np.unique(user_ratings['ISBN'])
# book_ratings = pd.DataFrame(columns=['ISBN','Book-Rating'])
# book_ratings['ISBN'] = isbn
# book_ratings['Book-Weighted-Rating'] = [weighted_rating(v.loc[i,0], R.loc[i,['Book-Rating']],C) for i in tqdm(isbn)]

# book_ratings.info()
# book_ratings.head()
# sys.exit()

Wall time: 14.2 s


Unnamed: 0_level_0,Book-Rating,0,weighted_average
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
330299891,3.0,2,2.904964
375404120,1.5,2,2.476393
586045007,0.0,1,2.389125
9022906116,3.5,2,3.047822
9032803328,0.0,1,2.389125


# Data analysis and processing

In [7]:
%%time
word = [i.split(' ') for i in books['Book-Title']]
word = [{'_id': idx, 'name': w} for idx,w in enumerate([i for s in word for i in s])]
title = [{'_id': idx,
          'ID_book': row['ISBN'],
          'name': row['Book-Title'], 
          'processed': row['Book-Title'].lower(),
          'author': row['Book-Author'], 
          'year': row['Year-Of-Publication'],
          'Publisher': row['Publisher'],
          'url': row['Image-URL-M'],
          'weighted_ratings':-1 if row['ISBN'] not in df.index else df.loc[row['ISBN'],'weighted_average']} for idx, row in tqdm(books.iterrows(), position=0)]

271354it [00:57, 4731.33it/s]


Wall time: 59 s


# Search engine

In [34]:
class Search():
    def __init__(self, index_name):
        super().__init__()
        self.logger = logging.getLogger(__name__)
        self.__es = Elasticsearch([{'scheme': 'http', 'host':'localhost','port':9200}], basic_auth=('admin', 'es_pw'))
        
        self.__index_name = index_name
        if self.__es.indices.exists(index=self.__index_name):
            self.logger.debug('Deleting existing index ' + self.__index_name)
            self.__es.indices.delete(index=self.__index_name)
        
        self.__es.indices.create(index=self.__index_name)
        self.__es.cluster.health(wait_for_status='yellow')
    
    def index(self, type_name, id_value, content):
        self.logger.debug('index %s/%s : %s', type_name, id_value, content)
        self.__es.index(index=self.__index_name, doc_type=type_name, id=id_value, body=content)
        
    def upload_content_bulk(self, data):
        helpers.bulk(self.__es, data, index=self.__index_name,doc_type='_doc', request_timeout=200)
    
    def map(self, type_name, mapping):
        self.logger.debug('map %s', type_name)
        self.__es.indices.put_mapping(index=self.__index_name, doc_type=type_name, body={type_name: mapping})
    
    def search(self, type_name, query={'match_all': {}}):
        self.logger.debug('search %s : %s', type_name, query)
        return self.__es.search(index=self.__index_name, doc_type=type_name, body={'query': query})
    
    def search_book(self, type_name):
#         query = {"query": {"multi_match": {"query": type_name,}}}
        query = {"query":{"fuzzy" : { "name" : type_name}}}
        self.logger.debug('search %s : %s', type_name, query)
        return self.__es.search(index=self.__index_name, doc_type=type_name, body=query)
    
    def get(self, type_name, id_value):
        self.logger.debug('get %s/%s', type_name, id_value)
        document = self.__es.get(index=self.__index_name, doc_type=type_name, id=id_value)
        self.logger.debug('got document ' + document)
        return document
    
    def delete(self, type_name, id_value):
        self.logger.debug('delete %s/%s', type_name, id_value)
        self.__es.delete(index=self.__index_name, doc_type=type_name, id=id_value)

    def optimize(self):
        """ 
        forcemerge allows removal of deleted documents and reducing the number of segments
        (documents are marked as tombstone [like cassandra] but not purged from the segment's 
        index for performance reasons)
        """
        self.logger.debug('optimize')
        self.__es.forcemerge(self.__index_name)

    @property
    def es(self):
        return self.__es

    def __eq__(self, other):
        return self.__es == other.__es

    def __str__(self):
        return self.__es.__str__()

    def __hash__(self):
        return self.__es.__hash__()

In [23]:
%%time
es_typo = Search(index_name='typo_check')
es_typo.upload_content_bulk(word)

Wall time: 1min 40s


In [24]:
es_typo.search('Clara')

{'took': 304,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

# Book title search

In [35]:
%%time
es_book = Search(index_name='book_name')
es_book.upload_content_bulk(title)

Wall time: 1min 2s


In [36]:
es_book.search_book('Clara Callan')

{'took': 1570,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}