### Przetwarzanie danych oraz załadowanie do bazy

In [1]:
import pandas as pd

# Load the validated book entries; row 0 of the CSV supplies the column names.
df = pd.read_csv("valid_entries.csv", header=0)

# Transpose the five-row preview so each record's fields read top to bottom.
raw_preview = df.head().T
print(raw_preview)


                                                             0  \
Title                                     Bratysława jest mała   
Publisher                              Oficyna Wydawnicza ATUT   
Author                                            Pavol Rankov   
Category                                     literatura piękna   
Pages                                                    189.0   
Tags                                       literatura słowacka   
Language                                                polski   
Description  Co haluszki mówią o słowackiej duszy? Kto i dl...   
Cover        http://books.google.com/books/content?id=ElAnE...   

                                                             1  \
Title                                     Gdy zniknęły gołębie   
Publisher                                               Czarne   
Author                                            Sofi Oksanen   
Category                                     literatura piękna   
Pages    

In [2]:
from pymilvus import MilvusException, Collection, CollectionSchema, FieldSchema, DataType, utility,connections
import numpy as np
def connect(host="172.25.161.17", port="19530"):
    """Open the default Milvus connection and print the existing collections.

    Args:
        host: Address of the Milvus server. Defaults to the address this
            notebook was originally written against.
        port: Port of the Milvus server, as a string.

    A failure while opening the connection is not handled here and propagates
    to the caller. A ``MilvusException`` raised while listing collections is
    caught and printed.
    """
    # Announce the attempt before connecting so the message is accurate even
    # when the connection call itself fails.
    print("Connecting to database...")
    connections.connect(
        host=host,
        port=port
    )
    try:
        collections = utility.list_collections()
        print("List all collections:\n", collections)
    except MilvusException as e:
        print(e)

In [68]:
from pymilvus import MilvusException, Collection, CollectionSchema, FieldSchema, DataType, utility
import numpy as np

def create_collection(name: str):
    """Build the "Books collection" schema and return a Collection named ``name``.

    The schema has an auto-generated INT64 primary key, VARCHAR fields for the
    book metadata, a FLOAT page count, and 1024-dimensional FLOAT_VECTOR fields
    for category, tags and description. No vector fields are declared for
    title or author. Dynamic fields are enabled and the collection is created
    on the 'default' connection with 4 shards.
    """

    def text_field(field_name, limit):
        # VARCHAR column holding at most `limit` characters.
        return FieldSchema(
            name=field_name,
            dtype=DataType.VARCHAR,
            max_length=limit
        )

    def embedding_field(field_name):
        # Dense float vector column with 1024 dimensions.
        return FieldSchema(
            name=field_name,
            dtype=DataType.FLOAT_VECTOR,
            dim=1024
        )

    # Fields are listed in the order they appear in the schema.
    book_fields = [
        FieldSchema(
            name="id",
            dtype=DataType.INT64,
            is_primary=True,
            auto_id=True
        ),
        text_field("title", 3000),
        text_field("publisher", 5000),
        text_field("author", 5000),
        text_field("category", 5000),
        embedding_field("vectorCategory"),
        FieldSchema(
            name="pages",
            dtype=DataType.FLOAT,
        ),
        text_field("tags", 3000),
        embedding_field("vectorTags"),
        text_field("language", 3000),
        text_field("description", 65535),
        embedding_field("vectorDescription"),
        text_field("cover", 65535),
    ]

    book_schema = CollectionSchema(
        fields=book_fields,
        description="Books collection",
        enable_dynamic_field=True
    )
    return Collection(
        name=name,
        schema=book_schema,
        using='default',
        shards_num=4
    )

In [69]:
def createIndex(inserted_rows,name):
    """Create collection ``name`` and (re)build IVF_FLAT indexes on its vector fields.

    Args:
        inserted_rows: Expected number of rows; ``nlist`` is sized as
            4 * round(sqrt(inserted_rows)).
        name: Collection name, passed to ``create_collection``.

    Any ``MilvusException`` raised while listing, dropping or creating indexes
    is caught and printed rather than raised.
    """
    collection = create_collection(name)

    # Clamp to the nlist range Milvus accepts (1..65536) so that zero or very
    # large row counts cannot produce invalid index parameters.
    nlist = 4 * int(np.round(np.sqrt(max(inserted_rows, 0))))
    nlist = min(max(nlist, 1), 65536)
    index_params = {
        "index_type": "IVF_FLAT",
        "metric_type": "COSINE",
        "params": {
            "nlist": nlist
        }
    }

    # (vector field, index name) pairs; title/author vectors are not indexed.
    vector_indexes = (
        ("vectorCategory", "SimpleIndex3"),
        ("vectorTags", "SimpleIndex4"),
        ("vectorDescription", "SimpleIndex5"),
    )

    try:
        existing_indexes = utility.list_indexes(
            collection_name=name
        )
        if len(existing_indexes) > 0:
            print("Droping previous index...")
            collection.release()
            # The indexes are created with explicit names, so each one has to
            # be dropped by name; a bare drop_index() only targets the default
            # index name.
            for existing_name in existing_indexes:
                collection.drop_index(index_name=existing_name)
        print(f"Creating index to collection: {collection.name}...")
        for field_name, index_name in vector_indexes:
            collection.create_index(
                field_name=field_name,
                index_params=index_params,
                index_name=index_name
            )
        print("Succesfull")
    except MilvusException as e:
        print(e)


In [70]:
# Open the Milvus connection, then create the "Books" collection and its
# vector indexes; 300 is the row count used to size the index's nlist.
connect()
createIndex(inserted_rows=300, name="Books")

Connecting to database...
List all collections:
 ['UserPreferences', 'USER5', 'USER6']
Creating index to collection: Books...
Succesfull


### Tworzenie modelu danych do wstawienia

In [4]:
from milvus_model.dense import JinaEmbeddingFunction
import os

# Read the Jina key from the JINA_API_KEY environment variable so the real
# credential never has to be saved in the notebook; the original "xxxx"
# placeholder remains as the fallback when the variable is not set.
jina_api_key = os.environ.get("JINA_API_KEY", "xxxx")
def initializeJina(api_key=None, model_name="jina-embeddings-v3", task="text-matching", dimensions=1024):
    """Create the Jina embedding function and publish it as the global ``ef``.

    Args:
        api_key: Jina API key. When omitted, the module-level ``jina_api_key``
            is used.
        model_name: Name of the Jina embedding model.
        task: Task passed to the embedding function.
        dimensions: Size of the returned embedding vectors.

    Called with no arguments it configures ``ef`` with the model
    "jina-embeddings-v3", task "text-matching" and 1024 dimensions.
    Returns None.
    """
    global ef
    if api_key is None:
        api_key = jina_api_key
    ef = JinaEmbeddingFunction(
        model_name,
        api_key,
        task=task,
        dimensions=dimensions
    )
    print("......init JINA")

# Create the embedding function once; it is stored in the global `ef`,
# which createSingleEmbedd reads.
initializeJina()

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


......init JINA


In [31]:
def createSingleEmbedd(text, verbose=True):
    """Embed a single text with the global embedding function ``ef``.

    Args:
        text: The string to embed.
        verbose: When true (the default, which keeps the original output)
            print a marker line and the resulting vector.

    Returns:
        The first element of what ``ef`` returns for ``[text]``.
    """
    if verbose:
        print("########single embede")
    # The embedding function takes a list of texts, so wrap the single string
    # instead of passing a bare str.
    embeddings = ef([text])
    vector = embeddings[0]
    if verbose:
        print(vector)
    return vector

In [67]:
z = createSingleEmbedd("lubie ksiazki historyczne interesuje mnie caly swiat raczej w historycznych realiach najbardziej interesuje mnie starozytnosc najbradziej interesuje mnie egipt")

# Format the vector with pandas' CSV writer (one component per line, no index,
# no header) in memory, instead of writing the file, reading it back and
# rewriting it.
lines = pd.DataFrame(z).to_csv(index=False, header=False).splitlines()

# Append a comma at the end of every line and write the file once.
with open('output.txt', 'w') as file:
    for line in lines:
        file.write(line.strip() + ',' + '\n')

########single embede
[ 0.04480504 -0.1434625   0.09870063 ...  0.00378283 -0.01837308
 -0.00113279]


In [71]:
from milvus_model.dense import JinaEmbeddingFunction
# Title                                     Bratysława jest mała   
# Publisher                              Oficyna Wydawnicza ATUT   
# Author                                            Pavol Rankov   
# Category                                     literatura piękna   
# Pages                                                    189.0   
# Tags                                       literatura słowacka   
# Language                                                polski   
# Description  Co haluszki mówią o słowackiej duszy? Kto i dl...   
# Cover 

# (CSV column, scalar field, vector field or None), in the same order as the
# fields listed by create_collection. Title and author get no vector field.
column_plan = [
    ("Title", "title", None),
    ("Publisher", "publisher", None),
    ("Author", "author", None),
    ("Category", "category", "vectorCategory"),
    ("Pages", "pages", None),
    ("Tags", "tags", "vectorTags"),
    ("Language", "language", None),
    ("Description", "description", "vectorDescription"),
    ("Cover", "cover", None),
]

data = {}
for source_column, scalar_field, vector_field in column_plan:
    data[scalar_field] = df[source_column].tolist()
    if vector_field is not None:
        # createSingleEmbedd is invoked once per row of this column.
        data[vector_field] = df[source_column].apply(createSingleEmbedd)

df1 = pd.DataFrame(data)
insert_preview = df1.head().T
print(insert_preview)

# Insert the prepared rows into the "Books" collection and load it; Milvus
# errors are printed instead of raised.
try:
    collection = Collection("Books")
    collection.insert(df1)
    collection.load()
except MilvusException as e:
    print(e)

########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.01479883]
########single embede
[-0.07084485  0.03296698  0.08101126 ... -0.00138667 -0.00718326
 -0.