In [379]:
# !pip install weaviate-client
# !pip install markdown
# !pip install bs4
import json
import os
import re
import string

import markdown
import pandas as pd
import torch
import weaviate
from bs4 import BeautifulSoup
# from weaviate.exceptions import RequestsConnectionError

In [380]:
def extract_info_from_md(md_filename):
    """Extract the title, author and plain-text body from a Markdown file.

    The file is read as UTF-8 and split on newlines. The first line is used
    as the title, the part of the third line before the first fullwidth
    comma ('，') is used as the author, and everything from the fourth line
    onward is the body. ASCII punctuation (``string.punctuation``) is removed
    from title and author, and both are stripped of surrounding whitespace.
    The body is rendered to HTML with ``markdown`` and then reduced to text
    with BeautifulSoup.

    Args:
        md_filename: Path of the Markdown file to read.

    Returns:
        A ``(title, author, cleaned_content)`` tuple of strings. ``author``
        is an empty string when the file has fewer than three lines.
    """
    with open(md_filename, 'r', encoding='utf_8') as md_file:
        md_content = md_file.read()

    lines = md_content.split('\n')

    # One translation table, shared by title and author, that deletes every
    # ASCII punctuation character (including the Markdown heading '#').
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Strip as well: removing a leading '# ' marker would otherwise leave a
    # stray space at the start of the title.
    title = lines[0].translate(strip_punctuation).strip()

    # Files shorter than three lines have no author line; fall back to ''
    # instead of raising IndexError.
    author_line = lines[2] if len(lines) > 2 else ''
    author = author_line.split('，')[0].translate(strip_punctuation).strip()

    # Everything after the author line is the article body.
    content = '\n'.join(lines[3:])

    # Markdown -> HTML -> text, which drops the markup from the body.
    html_content = markdown.markdown(content)
    cleaned_content = BeautifulSoup(html_content, 'html.parser').get_text()

    return title, author, cleaned_content

In [381]:
def process_md_files_in_folder(folder_path):
    """Build a DataFrame from every ``.md`` file directly inside a folder.

    Each file whose name ends with ".md" is passed to
    ``extract_info_from_md`` and contributes one row. Files are visited in
    sorted filename order so the row order is the same on every run
    (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``pandas.DataFrame`` with columns ``title``, ``author`` and
        ``content``; it has zero rows when no ``.md`` files are found.
    """
    records = [
        extract_info_from_md(os.path.join(folder_path, filename))
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith(".md")
    ]
    return pd.DataFrame(records, columns=["title", "author", "content"])

In [382]:
# Load every Markdown article found in the dataset folder into a DataFrame.
folder_path = 'dataset'
df = process_md_files_in_folder(folder_path)
# Preview only the first rows so the output stays small as the dataset grows.
df.head()

Unnamed: 0,title,author,content
0,科技投行｜重新认识和界定技术资产,杨川,从2019年到2022年底，科创板IPO融资规模近7600亿，再融资规模超1000亿，总市值...
1,创新驱动引擎：原理、模式与布局,杨川,前言：\n驱动一词如果从工程学角度，就涉及到驱动模式与动力能源这两大要素。\n创新驱动战略也...


In [401]:
# Schema for the Weaviate class that stores one article per object.
class_obj = {
    "class": "FooClass1",

    # Three text properties, one per DataFrame column.
    "properties": [
        {"name": field, "dataType": ["text"]}
        for field in ("title", "author", "content")
    ],

    # Vectorizer module used for this class.
    "vectorizer": "text2vec-huggingface",

    # Settings passed to the vectorizer module.
    # NOTE(review): "modelVersion" and "type" may not be text2vec-huggingface
    # settings -- confirm against the module documentation.
    "moduleConfig": {
        "text2vec-huggingface": {
            "vectorizeClassName": False,
            "model": "sentence-transformers/facebook-dpr-question_encoder-single-nq-base",
            "modelVersion": "002",
            "type": "text",
        },
    },
}

In [402]:
# Weaviate configuration
# Secrets are read from the environment instead of being written into the
# notebook: set WEAVIATE_API_KEY and HUGGINGFACE_APIKEY before running this
# cell (a missing variable raises KeyError). WEAVIATE_URL is optional and
# falls back to the test cluster endpoint.
# NOTE(review): any key that was ever committed in this notebook should be
# treated as exposed and rotated.
client = weaviate.Client(
    url=os.environ.get(
        "WEAVIATE_URL", "https://sinofaith-test-8hx8vad5.weaviate.network"
    ),
    auth_client_secret=weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
    additional_headers={
        # Inference API key forwarded to the HuggingFace vectorizer module.
        "HUGGINGFACE_APIKEY": os.environ["HUGGINGFACE_APIKEY"],
    },
)

In [403]:
# Create the class only when it is missing, so this cell can be re-run
# (e.g. Restart & Run All) without failing on an already existing class.
if not client.schema.exists(class_obj["class"]):
    client.schema.create_class(class_obj)

In [404]:
# Fetch the stored class definition to confirm the schema was applied.
client.schema.get(class_obj["class"])

{'class': 'FooClass1',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'text2vec-huggingface': {'model': 'sentence-transformers/facebook-dpr-question_encoder-single-nq-base',
   'modelVersion': '002',
   'type': 'text',
   'vectorizeClassName': False}},
 'multiTenancyConfig': {'enabled': False},
 'properties': [{'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-huggingface': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'title',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-huggingface': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'author',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'

In [405]:
from weaviate.util import generate_uuid5

# Upload every article row to Weaviate through the client's batch context:
# 10 objects per batch, sent with 2 workers.
with client.batch(batch_size=10, num_workers=2) as batch:
    for _, row in df.iterrows():
        question_object = {
            "title": row["title"],
            "author": row["author"],
            "content": row["content"],
        }
        # The UUID is derived from the object itself via generate_uuid5.
        object_id = generate_uuid5(question_object)
        batch.add_data_object(
            question_object,
            class_name="FooClass1",
            uuid=object_id,
        )

In [406]:
# Sanity check: fetch two stored objects together with their ids and vectors
# to confirm that the import worked.
res = (
    client.query
    .get("FooClass1", ["title", "author", "content"])
    .with_additional(["id", "vector"])
    .with_limit(2)
    .do()
)

print(json.dumps(res, ensure_ascii=False, indent=4))

{
    "data": {
        "Get": {
            "FooClass1": [
                {
                    "_additional": {
                        "id": "abf0a4bc-79e0-5e3d-8317-c833473d2c35",
                        "vector": [
                            0.065584056,
                            0.28925824,
                            0.23344772,
                            -0.38317525,
                            -0.08875944,
                            0.07105221,
                            0.64903724,
                            -0.7365322,
                            -0.048443463,
                            -0.24623615,
                            -0.24936068,
                            0.016688298,
                            -0.2043252,
                            -0.22883943,
                            0.66418964,
                            -0.24319267,
                            0.2600103,
                            0.1320533,
                            0.23900512,
           

In [408]:
# Semantic search: return the two articles closest to the concept
# "科技投行", together with their distance to the query.
response = (
    client.query.get("FooClass1", ["title", "author", "content"])
    .with_near_text({"concepts": ["科技投行"]})
    .with_limit(2)
    .with_additional(["distance"])
    .do()
)
print(json.dumps(response, ensure_ascii=False, indent=4))

{
    "data": {
        "Get": {
            "FooClass1": [
                {
                    "_additional": {
                        "distance": 0.2491107
                    },
                    "author": "杨川",
                    "content": "从2019年到2022年底，科创板IPO融资规模近7600亿，再融资规模超1000亿，总市值近6万亿，其中7家头部科技公司市值均破千亿；六大战略性产业，国家级专精特新，成为科创版上市公司的主要构成。\n市值最高的头部公司中，行业标杆企业相继涌现。科创板上市企业大多轻资产、高估值，共26家企业市值超过400亿元，科创板整体PB和PE估值水平高于A股主板，这表科创板大力支持极具潜力、高估值的科技创新企业发展。这也说明科创板对未实现盈利的高新技术企业容纳度更高，是高价值技术资产得到投资者认可的表现。\n毫无疑问，技术资产是未来支撑具有硬核科创属性和持续高效发展的上市企业市值和商誉的核心资产。但是对于技术资产的定义业界和学术界却一直存在争议。我们认为，面对战略新兴产业的大发展，结合全球的实践经验，有必要对技术资产做出一个清晰量化的、可落地，服务于投资、交易、并购估值和评价的务实定义。\n技术资产定义\n技术资产是科技类企业除财务资产以外的表达创新能力价值的资产。\n科技类企业指的是竞争优势与市场价值依靠研发与创新驱动的企业，包括科创企业与大型科技公式。\n人们往往会把技术本身，或者知识产权误认为技术资产。技术只是方法，知识产权也只是保护技术方法的权利，它们都是构成技术资产的要素。很多技术方法和相应的知识产权在类似摩尔定律快速迭代趋势中会很快失去效用。对科技类企业来说最重要的是持续创新能力。\n技术资产是由研发人或创新团队的技术成果、知识产权、非公开技术秘密（数据、经验和知识等），与持续创新能力组成的科创企业的核心竞争力资产。具体表现为研发人或创新团队在企业中的股权，期权，以及类股权的奖励与激励机制，如企业内部研发人或创新团队产生专利与企业的分成机制等。因此，技术资产本质上是一种能

从结果可以看出，第一篇文章的 distance（0.2491107）小于第二篇文章的 distance（0.25142026），即标题包含“科技投行”的文章与查询概念更接近，因此查询结果符合预期。