# Bluk insert

다량의 document를 효율적으로 입력하기 위한 방법을 학습합니다.</br>
document를 개개별로 입력하는 것보다 수행속도가 빠르며, 불필요한 오버헤드가 없습니다.

### library 및 변수 셋팅

In [8]:
import sys
import json
import requests
import pandas as pd
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    ComplexField,
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchIndex
)
import os

load_dotenv()

endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY")) if os.getenv("AZURE_SEARCH_ADMIN_KEY") else DefaultAzureCredential()

index_name = "good-books"

# index의 schema를 json 파일에 저장하여 놓았습니다.
index_schema_json_file = "./data/good-books-index.json"

# 데이터 파일
books_url = "https://raw.githubusercontent.com/Azure-Samples/azure-search-sample-data/main/good-books/books.csv"
batch_size = 1000


### Index 생성

In [10]:
# index schema 
with open(index_schema_json_file) as json_file:
    index_schema = json.load(json_file)

# Index 생성
index = SearchIndex(name=index_name, fields=index_schema['fields'], suggesters=index_schema['suggesters'])
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
result = index_client.create_or_update_index(index)
print(f' {result.name} 생성')

 good-books 생성


### bulk insert

[update_documents](https://learn.microsoft.com/ko-kr/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python#azure-search-documents-searchclient-upload-documents) 함수를 사용하여 bulk insert를 수행합니다.

In [14]:
# 데이터 파일 로드 - 10000 rows
df = pd.read_csv(books_url)
df.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780000000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780000000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780000000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780000000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780000000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [16]:
# csv 데이터 -> json으로 변환
books_data_json = json.loads(df.to_json(orient="records"))

batch_array = []
count = 0
batch_counter = 0

search_client = SearchClient(
    endpoint=endpoint,
    credential=credential,
    index_name=index_name)

# 원하는 형태로 변경하기 위해 데이터 변경을 합니다.
for i in books_data_json:
    count += 1
    batch_array.append(
        {
            "id": str(i["book_id"]),
            "goodreads_book_id": int(i["goodreads_book_id"]),
            "best_book_id": int(i["best_book_id"]),
            "work_id": int(i["work_id"]),
            "books_count": i["books_count"] if i["books_count"] else 0,
            "isbn": str(i["isbn"]),
            "isbn13": str(i["isbn13"]),
            "authors": i["authors"].split(",") if i["authors"] else None,
            "original_publication_year": int(i["original_publication_year"])
            if i["original_publication_year"]
            else 0,
            "original_title": i["original_title"],
            "title": i["title"],
            "language_code": i["language_code"],
            "average_rating": int(i["average_rating"])
            if i["average_rating"]
            else 0,
            "ratings_count": int(i["ratings_count"]) if i["ratings_count"] else 0,
            "work_ratings_count": int(i["work_ratings_count"])
            if i["work_ratings_count"]
            else 0,
            "work_text_reviews_count": i["work_text_reviews_count"]
            if i["work_text_reviews_count"]
            else 0,
            "ratings_1": int(i["ratings_1"]) if i["ratings_1"] else 0,
            "ratings_2": int(i["ratings_2"]) if i["ratings_2"] else 0,
            "ratings_3": int(i["ratings_3"]) if i["ratings_3"] else 0,
            "ratings_4": int(i["ratings_4"]) if i["ratings_4"] else 0,
            "ratings_5": int(i["ratings_5"]) if i["ratings_5"] else 0,
            "image_url": i["image_url"],
            "small_image_url": i["small_image_url"],
        }
    )

    # 1000 레코드씩 입력 합니다.
    if count % batch_size == 0:
        search_client.upload_documents(documents=batch_array)
        batch_counter += 1

        print(f"Batch sent! - #{batch_counter}")
        batch_array = []

if len(batch_array) > 0:
    search_client.upload_documents(documents=batch_array)
    batch_counter += 1

    print(f"Final batch sent! - #{batch_counter}")

print("Done!")


Batch sent! - #1
Batch sent! - #2
Batch sent! - #3
Batch sent! - #4
Batch sent! - #5
Batch sent! - #6
Batch sent! - #7
Batch sent! - #8
Batch sent! - #9
Batch sent! - #10
Done!


### 데이터 확인

In [18]:
results =  search_client.search(query_type='simple',
    search_text="*" ,
    top=10,
    include_total_count=True)

# 검색 결과로 HotelName, Description 필드와 함께 검색 score도 표시합니다. 
print ('Total Documents Matching Query:', results.get_count())
for result in results:
    print(result)

Total Documents Matching Query: 10000
{'ratings_5': 10036, 'ratings_2': 733, 'original_title': 'Homage to Catalonia', 'work_text_reviews_count': 1500, 'image_url': 'https://images.gr-assets.com/books/1394868278m/9646.jpg', 'language_code': 'eng', 'isbn': '156421178', 'work_ratings_count': 25881, 'ratings_1': 176, 'books_count': 151, 'authors': ['George Orwell', ' Lionel Trilling'], 'original_publication_year': 1938, 'goodreads_book_id': 9646, 'title': 'Homage to Catalonia', 'work_id': 2566499, 'best_book_id': 9646, 'id': '4004', 'isbn13': '9780000000000.0', 'ratings_4': 10529, 'small_image_url': 'https://images.gr-assets.com/books/1394868278s/9646.jpg', 'ratings_3': 4407, 'average_rating': 4.0, 'ratings_count': 22227, '@search.score': 1.0, '@search.reranker_score': None, '@search.highlights': None, '@search.captions': None}
{'ratings_5': 12300, 'ratings_2': 1867, 'original_title': 'The Girl In The Ice', 'work_text_reviews_count': 2987, 'image_url': 'https://images.gr-assets.com/books/1