In [2]:
corpus_ja = [
    "東京は大阪の東にある",
    "大阪は東京の西にある",
    "京都は大阪の北にある",
    "札幌は東京の北にある",
    "那覇は大阪の南にある"
]

In [3]:
import MeCab

# MeCab の形態素解析器オブジェクトを作成
tagger = MeCab.Tagger()

# MeCab を用いた日本語トークナイザ関数
def mecab_tokenizer(text):
    """ 日本語テキストをトークン化 """
    node = tagger.parseToNode(text)
    tokens = []
    while node:
        if node.surface != "":  # 空白行を除外
            tokens.append(node.surface)
        node = node.next
    return tokens

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF ベクトルを作成
# カスタムトークナイザを使用するため token_pattern=None を指定
tfidf_vectorizer = TfidfVectorizer(tokenizer=mecab_tokenizer, token_pattern=None)
tfidf_vectors = tfidf_vectorizer.fit_transform(corpus_ja)

import pandas as pd

# TF-IDF ベクトルを Pandas DataFrame に変換して表示
tfidf_vectors_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_vectors_df.head()

Unnamed: 0,ある,に,の,は,京都,北,南,大阪,札幌,東,東京,西,那覇
0,0.291391,0.291391,0.291391,0.291391,0.0,0.0,0.0,0.344517,0.0,0.611516,0.409539,0.0,0.0
1,0.291391,0.291391,0.291391,0.291391,0.0,0.0,0.0,0.344517,0.0,0.0,0.409539,0.611516,0.0
2,0.280952,0.280952,0.280952,0.280952,0.589609,0.475693,0.0,0.332176,0.0,0.0,0.0,0.0,0.0
3,0.27476,0.27476,0.27476,0.27476,0.0,0.465209,0.0,0.0,0.576615,0.0,0.386166,0.0,0.0
4,0.265314,0.265314,0.265314,0.265314,0.0,0.0,0.556792,0.313687,0.0,0.0,0.0,0.0,0.556792


In [5]:
query = "大阪は京都の南にある"

# TF-IDF クエリベクトルを作成
query_vector = tfidf_vectorizer.transform([query])

# 各文書ベクトルとクエリベクトル間のスコア (ドット積) を計算
scores = (tfidf_vectors * query_vector.T).toarray()

# 各文書とスコアを DataFrame に変換して降順で表示
scores_df = pd.DataFrame({'docs': corpus_ja, 'scores': scores.flatten()})
scores_df.sort_values('scores', ascending=False).head()

Unnamed: 0,docs,scores
2,京都は大阪の北にある,0.730651
4,那覇は大阪の南にある,0.689983
0,東京は大阪の東にある,0.417311
1,大阪は東京の西にある,0.417311
3,札幌は東京の北にある,0.291591


In [6]:
print(tfidf_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 35 stored elements and shape (5, 13)>
  Coords	Values
  (0, 10)	0.4095392593497724
  (0, 3)	0.29139055604235914
  (0, 7)	0.34451733585505817
  (0, 2)	0.29139055604235914
  (0, 9)	0.6115158017123286
  (0, 1)	0.29139055604235914
  (0, 0)	0.29139055604235914
  (1, 10)	0.4095392593497724
  (1, 3)	0.29139055604235914
  (1, 7)	0.34451733585505817
  (1, 2)	0.29139055604235914
  (1, 1)	0.29139055604235914
  (1, 0)	0.29139055604235914
  (1, 11)	0.6115158017123286
  (2, 3)	0.2809520279449709
  (2, 7)	0.3321756390643166
  (2, 2)	0.2809520279449709
  (2, 1)	0.2809520279449709
  (2, 0)	0.2809520279449709
  (2, 4)	0.589609378371542
  (2, 5)	0.47569313398762636
  (3, 10)	0.386165564720347
  (3, 3)	0.27475997980494
  (3, 2)	0.27475997980494
  (3, 1)	0.27475997980494
  (3, 0)	0.27475997980494
  (3, 5)	0.4652090851374417
  (3, 8)	0.5766146700528488
  (4, 3)	0.26531423910850577
  (4, 7)	0.3136867442223721
  (4, 2)	0.26531423910850577
  (4, 1)	

In [7]:
import wikipedia
import pandas as pd

# Wikipedia の言語を日本語に設定
wikipedia.set_lang("ja")

# 都道府県名のリスト
prefectures = ["北海道","青森県","岩手県","秋田県","宮城県","山形県","福島県","茨城県","栃木県","群馬県","埼玉県","千葉県","東京都","神奈川県","新潟県","富山県","石川県","福井県","山梨県","長野県","岐阜県","静岡県","愛知県","三重県","滋賀県","京都府","大阪府","兵庫県","奈良県","和歌山県","鳥取県","島根県","岡山県","広島県","山口県","徳島県","香川県","愛媛県","高知県","福岡県","佐賀県","長崎県","熊本県","大分県","宮崎県","鹿児島県","沖縄県"]

# 各都道府県ごとに「〇〇の観光地」というタイトルの Wikipedia ページを取得
pages = [wikipedia.page(prefecture + "の観光地", auto_suggest=False) for prefecture in prefectures]

# 抽出したデータを Pandas DataFrame に格納
df = pd.DataFrame({
    'title': [page.title for page in pages],  # 各 Wikipedia ページのタイトル
    'url': [page.url for page in pages],  # 各 Wikipedia ページの URL
    'content': [page.content for page in pages]  # 各 Wikipedia ページの内容
})

# 各 Wikipedia ページの内容を corpus_ja に格納
corpus_ja = df.content.tolist()

In [8]:
corpus_ja

['北海道の観光地（ほっかいどうのかんこうち）は、北海道内の主要な観光地に関する項目である。「北海道」はブランド総合研究所による「都道府県の魅力度ランキング」で2018年現在、10年連続で1位に選ばれるなど人気が高い観光地である。\n\n\n== 対象別 ==\n\n\n=== 文化財等 ===\n\n\n==== 世界遺産 ====\n知床\n北海道・北東北の縄文遺跡群\nキウス周堤墓群\n北黄金貝塚\n入江・高砂貝塚\n大船遺跡\n垣ノ島遺跡\n\n\n==== 国の名勝 ====\n旧岩船氏庭園（香雪園）（函館市）\n天都山（網走市）\nピリカノカ（名寄市・石狩市・枝幸町・浜頓別町・えりも町・遠軽町・豊浦町・室蘭市・帯広市・中札内村・平取町・新冠町）\n九度山（クトウンヌプリ）\n黄金山（ピンネタイオルシペ）\n神威岬（カムイエトウ）\n襟裳岬（オンネエンルム）\n瞰望岩（インカルシ）\nカムイチャシ\n絵鞆半島外海岸\n十勝幌尻岳（ポロシリ）\n幌尻岳（ポロシリ）\nオキクルミのチャシ及びムイノカ\n\n\n==== 特別天然記念物 ====\n阿寒湖のマリモ\n野幌原始林\nアポイ岳高山植物群落\n昭和新山\n大雪山\n\n\n==== 重要伝統的建造物群保存地区 ====\n元町・末広町（函館市）\n\n\n==== 重要文化的景観\u3000 ====\nアイヌの伝統と開拓による沙流川流域の文化的景観\n\n\n==== 登録記念物 ====\n函館公園\n\n\n==== 重要文化財・史跡等 ====\n\n特別史跡\n五稜郭\n\n\n==== 文化施設 ====\n博物館・美術館\n\n水族館\n新さっぽろサンピアザ水族館\nサケのふるさと 千歳水族館\nおたる水族館\n市立室蘭水族館\n登別マリンパークニクス\n稚内市立ノシャップ寒流水族館\nオホーツクとっかりセンター\n山の水族館\n標津サーモン科学館\n\n\n==== その他 ====\n洞爺湖有珠山ジオパーク\nアポイ岳ジオパーク\n白滝ジオパーク\n三笠ジオパーク\nとかち鹿追ジオパーク\n十勝岳ジオパーク\nシリパ岬\n北海道遺産\n北海道ガーデン街道\n\n\n=== 公園等 ===\n\n\n==== 国立・国定・国営公園 ====\n国立公園\n利尻礼文サロベツ国立公園\n支

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 日本語トークナイザを指定して TF-IDF ベクトルを学習
vectorizer = TfidfVectorizer(tokenizer=mecab_tokenizer, token_pattern=None)
vectorizer.fit(corpus_ja)

# Sparse Vector (疎ベクトル) 取得関数
def get_sparse_embedding(text):
    """ 入力テキストを TF-IDF 疎ベクトルに変換 """
    tfidf_vector = vectorizer.transform([text])
    values = []
    dims = []
    for i, tfidf_value in enumerate(tfidf_vector.data):
        values.append(float(tfidf_value))
        dims.append(int(tfidf_vector.indices[i]))
    return {"values": values, "dimensions": dims}

In [11]:
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput

model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual")

# ドキュメント用 Dense Vector (密ベクトル) 取得関数
def get_document_dense_embedding(text):
    """ 入力ドキュメントを密ベクトルに変換 """
    input = TextEmbeddingInput(text=text, task_type="RETRIEVAL_DOCUMENT")
    return model.get_embeddings([input])[0].values

# クエリ用 Dense Vector (密ベクトル) 取得関数
def get_query_dense_embedding(text):
    """ 入力クエリを密ベクトルに変換 """
    input = TextEmbeddingInput(text=text, task_type="RETRIEVAL_QUERY")
    return model.get_embeddings([input])[0].values

In [12]:
# Vector Search 用のインデックスファイルを作成
items = []
for i in range(len(df)):
    id = i
    title = df.title[i]
    url = df.url[i]
    content = df.content[i]
    dense_embedding = get_document_dense_embedding(content)
    sparse_embedding = get_sparse_embedding(content)
    items.append({"id": id, "title": title, "url": url, "embedding": dense_embedding, "sparse_embedding": sparse_embedding})
items[0]

{'id': 0,
 'title': '北海道の観光地',
 'url': 'https://ja.wikipedia.org/wiki/%E5%8C%97%E6%B5%B7%E9%81%93%E3%81%AE%E8%A6%B3%E5%85%89%E5%9C%B0',
 'embedding': [0.059132542461156845,
  -0.031391605734825134,
  0.0008935830555856228,
  0.052873317152261734,
  -0.03662215918302536,
  -0.04367884621024132,
  -0.007036830298602581,
  0.02071959711611271,
  -0.002152226632460952,
  0.0025782426819205284,
  -0.010692545212805271,
  0.021710965782403946,
  0.05125866085290909,
  -0.054943207651376724,
  -0.008876468054950237,
  0.02062331885099411,
  -0.013011354953050613,
  -0.03350530192255974,
  -0.038624998182058334,
  -0.02116355299949646,
  0.010514289140701294,
  0.055579692125320435,
  -0.0611334964632988,
  0.04246290400624275,
  -0.011129066348075867,
  3.223287785658613e-05,
  0.003273614216595888,
  -0.01303123403340578,
  -0.01542520709335804,
  -0.03782333806157112,
  -0.035879313945770264,
  -0.009366322308778763,
  0.01237538456916809,
  0.011262339539825916,
  0.04744435474276543,
  0.

In [1]:
# Project ID & リージョンを設定
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "asia-northeast1"

In [2]:
PROJECT_ID, LOCATION


('gen-lang-client-0471694923', 'asia-northeast1')

In [17]:
# インデックスファイル格納用の GCS バケットを作成
BUCKET_URI = f"gs://{PROJECT_ID}-vs-hybridsearch-ja"
! gsutil mb -l $LOCATION -p $PROJECT_ID $BUCKET_URI

Creating gs://gen-lang-client-0471694923-vs-hybridsearch-ja/...


In [18]:
# インデックスファイルを GCS バケットに格納
with open("items.json", "w") as f:
    for item in items:
        f.write(f"{item}\n")
! gsutil cp items.json $BUCKET_URI

Copying file://items.json [Content-Type=application/json]...
/ [1 files][  1.7 MiB/  1.7 MiB]                                                
Operation completed over 1 objects/1.7 MiB.                                      


In [3]:
# Vertex AI を初期化
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [20]:
# Index を作成
my_hybrid_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="vs-hybridsearch-ja-index",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,
    approximate_neighbors_count=20,
    shard_size="SHARD_SIZE_SMALL"
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/1010478563120/locations/asia-northeast1/indexes/7556600370076581888/operations/3717599996359475200
MatchingEngineIndex created. Resource name: projects/1010478563120/locations/asia-northeast1/indexes/7556600370076581888
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/1010478563120/locations/asia-northeast1/indexes/7556600370076581888')


In [6]:
# Index Endppoint を作成
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"vs-hybridsearch-ja-index-endpoint",
    public_endpoint_enabled=False,
    enable_private_service_connect=True,
    project_allowlist=[PROJECT_ID],
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/1010478563120/locations/asia-northeast1/indexEndpoints/8864297927502200832/operations/965021014733881344
MatchingEngineIndexEndpoint created. Resource name: projects/1010478563120/locations/asia-northeast1/indexEndpoints/8864297927502200832
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1010478563120/locations/asia-northeast1/indexEndpoints/8864297927502200832')


In [4]:
my_hybrid_index = aiplatform.MatchingEngineIndex('projects/1010478563120/locations/asia-northeast1/indexes/7556600370076581888')

In [None]:
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1010478563120/locations/asia-northeast1/indexEndpoints/8864297927502200832')

NotFound: 404 IndexEndpoint `projects/1010478563120/locations/asia-northeast1/indexEndpoints/7532358337707376640` is not found.

In [8]:
! gcloud network-connectivity service-connection-policies create try-policy \
--project=gen-lang-client-0471694923 --network=projects/1010478563120/global/networks/try-vpc \
--service-class=gcp-vertexai --region=asia-northeast1 --subnets=try-subnet

API [networkconnectivity.googleapis.com] not enabled on project 
[gen-lang-client-0471694923]. Would you like to enable and retry (this will take
 a few minutes)? (y/N)?  

Command killed by keyboard interrupt

^C


In [10]:
# Index を Index Endpoint にデプロイ
DEPLOYED_HYBRID_INDEX_ID = f"vs_hybridsearch_ja_deployed"
my_index_endpoint.deploy_index(
    index=my_hybrid_index,
    deployed_index_id=DEPLOYED_HYBRID_INDEX_ID,
    min_replica_count=1,   
    psc_automation_configs=[("gen-lang-client-0471694923", "projects/1010478563120/global/networks/try-vpc")]
)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/1010478563120/locations/asia-northeast1/indexEndpoints/8864297927502200832
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/1010478563120/locations/asia-northeast1/indexEndpoints/8864297927502200832/operations/5436532494759362560
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/1010478563120/locations/asia-northeast1/indexEndpoints/8864297927502200832


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7fdd7c2ca230> 
resource name: projects/1010478563120/locations/asia-northeast1/indexEndpoints/8864297927502200832