In [1]:
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow_hub as hub
from elasticsearch import Elasticsearch
from neo4j import GraphDatabase

In [2]:
# Load the TensorFlow Hub module stored at the relative path 'model'.
# `embed` is called further down on the title and title+description text columns.
# NOTE(review): presumably a sentence-embedding model; the notebook does not say which one.
embed = hub.load('model')

In [3]:
# Read the book catalogue: the first CSV row is the header and no column becomes the index.
books = pd.read_csv('data/books.csv', index_col=False, header=0)
# Row count first, then a two-row preview as the cell's rich output.
print(books.shape[0])
books.head(2)

6810


Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0


In [7]:
# Replace missing descriptions with an empty string so later string joins never see NaN.
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# the chained in-place form operates on an intermediate object and does not update `books`
# when pandas Copy-on-Write is active (and emits a FutureWarning on pandas 2.x).
books['description'] = books['description'].fillna('')

In [8]:
# Print column dtypes and non-null counts; used here to check which columns still
# contain missing values after the description fill above.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6810 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [10]:
# Neo4j connection settings are read from the environment so credentials do not have to
# live in the notebook.
# NOTE(review): the fallbacks are the values that were previously hard-coded here; they keep
# the notebook runnable unchanged, but the password should be rotated and the fallbacks
# removed once NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD are set in the environment.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://52.66.250.236:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "rpx")
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [11]:
# Cypher: one row per Book node with its id, title and the stored
# `node2vec_8dim_1itr_all` property (returned as `node2vec`).
query="""
MATCH(b:Book) RETURN b.bookId AS bookID, b.title AS title, b.node2vec_8dim_1itr_all AS node2vec
"""
with driver.session(database="bookV1") as session:
    result = session.run(query)
    # Turn each record into a plain dict while the session is still open.
    neo_df = pd.DataFrame(list(map(dict, result)))
neo_df.head(2)

Unnamed: 0,bookID,title,node2vec
0,9780002005883,Gilead,"[-0.08761764317750931, 0.3534783124923706, -0...."
1,9780002261982,Spider's Web,"[-0.7694340348243713, 0.04503050819039345, 0.5..."


In [12]:
# Inner join: keep only books present in both the CSV and the graph,
# matching the CSV's `isbn13` to the graph's `bookID`, and bring over the node2vec column.
df = books.merge(
    neo_df[['bookID', 'node2vec']],
    how='inner',
    left_on='isbn13',
    right_on='bookID',
)
# Disabled: decode node2vec from JSON text, in case it arrives as a string.
# df['node2vec']=[json.loads(row) for row in df['node2vec']]
df.head(2)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,bookID,node2vec
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,9780002005883,"[-0.08761764317750931, 0.3534783124923706, -0...."
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,9780002261982,"[-0.7694340348243713, 0.04503050819039345, 0.5..."


In [15]:
# Preview: per-row concatenation of the text-embedding list and the node2vec list.
# Columns are read by label; integer keys such as x[0] / x[1] on a label-indexed row rely on
# the deprecated positional fallback of Series.__getitem__.
# NOTE: `title_desc_vec` is only created by the embedding cell further down, so running this
# cell before that one raises KeyError. The same concatenation is stored there as
# `title_desc_node2vec`.
df[['title_desc_vec', 'node2vec']].apply(lambda row: row['title_desc_vec'] + row['node2vec'], axis=1)

KeyError: "['title_desc_vec'] not in index"

In [None]:
# Embed the title text with the loaded hub model; store each vector as a plain Python list.
df['title_vec'] = np.array(embed(df['title'])).tolist()

# Text for the combined embedding: "<title>, <description>".
df['title_desc'] = [', '.join(pair) for pair in zip(df['title'], df['description'])]
df['title_desc_vec'] = np.array(embed(df['title_desc'])).tolist()

# Title vector with the page count appended as one extra dimension.
# Built by zipping the columns rather than a row-wise apply with x[0] / x[1], which depends on
# the deprecated positional fallback of Series.__getitem__ on a label-indexed row.
# NOTE(review): books.info() shows nulls in num_pages; any such row that survives the merge
# puts NaN into this vector. Confirm how that should be handled before indexing.
df['title_page_vec'] = [
    title_vec + [float(pages)]
    for title_vec, pages in zip(df['title_vec'], df['num_pages'])
]

# Text embedding followed by the graph (node2vec) embedding, as one list per row.
df['title_desc_node2vec'] = [
    text_vec + graph_vec
    for text_vec, graph_vec in zip(df['title_desc_vec'], df['node2vec'])
]

In [None]:
# Scratch cell: three bare expressions, nothing is assigned.
# Only the last expression (123) would be shown as the cell's output.
123+12345
123
123

In [None]:
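# Disabled date-normalization experiment; `publication_date` is not among the columns loaded above.
# datetime.strptime('9/16/2006', '%m/%d/%Y')  # a four-digit year needs %Y; %y only matches two digits
# df['publication_date']=["{:02d}/{:02d}/{:04d}".format(*map(int, row.split('/'))) for row in df['publication_date']]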

In [None]:
def _nan_to_none(value):
    """Return None for a missing scalar (NaN/None), otherwise the value unchanged.

    NaN is serialized as the bare token `NaN`, which is not valid JSON, so missing
    scalars are sent as null instead.
    """
    return None if pd.isna(value) else value


# Elasticsearch endpoint comes from the environment so credentials need not live in the notebook.
# NOTE(review): the fallback is the URL that was previously hard-coded here; it keeps the
# notebook runnable unchanged, but the credentials should be rotated and the fallback removed
# once ES_URL is set.
ES_URL = os.environ.get("ES_URL", 'http://admin:admin@52.66.250.236:9200/')

# Bulk payload: an action line followed by the document source, for the first 100 rows.
data=[]
for itr in df[0:100].itertuples():
    data.append({"index": { "_index": "books_lexical", "_id": itr.bookID}})
    data.append({
            "bookId": itr.bookID,
            "title": itr.title,
            "title_syn": itr.title,
            "title_desc": itr.title_desc,
            # authors / num_pages / published_year contain nulls according to books.info()
            "authors": _nan_to_none(itr.authors),
            "num_pages": _nan_to_none(itr.num_pages),
            "published_year": _nan_to_none(itr.published_year),
            "title_vec": itr.title_vec,
            "title_desc_vec": itr.title_desc_vec,
            # NOTE(review): contains NaN when num_pages is missing — confirm handling.
            "title_page_vec": itr.title_page_vec,
            "title_desc_node2vec": itr.title_desc_node2vec
        }
    )

es_client = Elasticsearch(hosts=[ES_URL], verify_certs=False)
try:
    res = es_client.bulk(index='books_lexical',body=data, refresh=True, request_timeout=120)
    print(res)
    # A bulk call can succeed overall while individual items fail; surface that explicitly.
    if res['errors']:
        failed = sum(1 for item in res['items'] if 'error' in item.get('index', {}))
        print(f"WARNING: {failed} of {len(res['items'])} bulk items failed")
finally:
    # Always release the connections, even when the bulk request raises.
    es_client.transport.close()

In [None]:
# Elasticsearch smoke test: index nine small documents into the `test` index.
# Documents 1-5 carry a 2-element `my_vector1`; documents 6-9 carry a 4-element `my_vector2`.
test_docs = [
    { "my_vector1": [1.5, 2.5], "price": 12.2 },
    { "my_vector1": [2.5, 3.5], "price": 7.1 },
    { "my_vector1": [3.5, 4.5], "price": 12.9 },
    { "my_vector1": [5.5, 6.5], "price": 1.2 },
    { "my_vector1": [4.5, 5.5], "price": 3.7 },
    { "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 10.3 },
    { "my_vector2": [2.5, 3.5, 5.6, 6.7], "price": 5.5 },
    { "my_vector2": [4.5, 5.5, 6.7, 3.7], "price": 4.4 },
    { "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 8.9 },
]

# Bulk payload: an action line with string ids "1".."9", each followed by its document.
test_idx_data=[]
for doc_id, doc in enumerate(test_docs, start=1):
    test_idx_data.append({ "index": { "_index": "test", "_id": str(doc_id) } })
    test_idx_data.append(doc)

# Elasticsearch endpoint comes from the environment so credentials need not live in the notebook.
# NOTE(review): the fallback is the URL that was previously hard-coded here; rotate the
# credentials and remove the fallback once ES_URL is set.
ES_URL = os.environ.get("ES_URL", 'http://admin:admin@52.66.250.236:9200/')

es_client = Elasticsearch(hosts=[ES_URL], verify_certs=False)
try:
    res = es_client.bulk(index='test',body=test_idx_data, refresh=True, request_timeout=120)
    # A bulk call can succeed overall while individual items fail; surface that explicitly.
    if res['errors']:
        failed = sum(1 for item in res['items'] if 'error' in item.get('index', {}))
        print(f"WARNING: {failed} of {len(res['items'])} bulk items failed")
finally:
    # Always release the connections, even when the bulk request raises.
    es_client.transport.close()