In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Memory
import json
import openai
import os
import sys
from tenacity import retry, stop_after_attempt, wait_random_exponential
import time
from dotenv import load_dotenv, find_dotenv
import torch
from langchain import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TextStreamer, GPTQConfig
from kg_rag.config_loader import *
import ast
import requests

In [None]:
import os
from pathlib import Path
import sys

# Absolute path of the current working directory.
current_dir = Path.cwd()

# Path of the local key module; the name is kept so it stays defined.
config_path = os.path.join(current_dir, "key_config.py")

# sys.path entries must be directories (or archives), not module files, so
# register the folder that holds key_config.py instead of the file itself.
config_dir = str(current_dir)
if config_dir not in sys.path:
    sys.path.append(config_dir)

# Import the API keys from the local key_config module.
from key_config import OPENAI_KEY
from key_config import DASHSCOPE_KEY

# Confirm the keys were loaded without echoing the secrets into the saved
# notebook output.
print("OPENAI_KEY loaded:", bool(OPENAI_KEY))
print("DASHSCOPE_KEY loaded:", bool(DASHSCOPE_KEY))

client = openai.OpenAI(api_key=OPENAI_KEY)

In [None]:
#爬取论文一级消息链接
import urllib.request
import time
import feedparser
from http import HTTPStatus
import dashscope

# Base URL of the arXiv export API; the query string is appended to it.
base_url = 'http://export.arxiv.org/api/query?'

# Search parameters.
search_query = 'all:llm'        # look for "llm" in all fields
start, total_results = 0, 20    # index of the first result; total results wanted
results_per_iteration = 20      # number of results requested per API call
wait_time = 1                   # seconds to wait between API calls



# entities_prompt = '''Now that you are an entity recognition expert, you are now given a text, you need to identify all the entities of the text,and return Entities in the following json format: \nEntities:<List of extracted entities>\nPlease return only the json format of the entity and nothing else'''

# Instruction sent to the chat model ahead of each abstract.
# The braces of the JSON template are written literally: "\{" and "\}" are not
# valid escape sequences, so the previous spelling put stray backslashes into
# the template shown to the model (and triggers an invalid-escape warning).
abstract_prompt = '''Now that you are now an expert in summarizing summaries, you are now given a abstract of an article, please extract from it which field it belongs to (like continuous learning, large language models, reinforcement learning, etc.), problems, methods and effects, and return it in the following json format \n{"field":, "problems":, "methods":, "effects":}\nPlease return only the json and nothing else'''


def fetch_GPT_response(instruction, base_prompt, chat_model_id, temperature=0):
    """Send ``instruction`` as a single user message and return the reply text.

    Args:
        instruction: Full text placed in the user message.
        base_prompt: Accepted but not sent; the system-message entry that
            would carry it is commented out.
        chat_model_id: Model name passed to the chat-completions call.
        temperature: Sampling temperature forwarded to the API (default 0).

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    print('Calling OpenAI...')
    chat_messages = [
        # {"role": "system", "content": base_prompt},
        {"role": "user", "content": instruction},
    ]
    completion = client.chat.completions.create(
        model=chat_model_id,
        messages=chat_messages,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_GPT_response(instruction, base_prompt, chat_model_id, temperature=0):
    """Forward all arguments unchanged to ``fetch_GPT_response`` and return its result."""
    reply = fetch_GPT_response(instruction, base_prompt, chat_model_id, temperature)
    return reply

# Configure the dashscope SDK with the key imported from key_config.
dashscope.api_key = DASHSCOPE_KEY

def call_with_messages(system_prompts, question):
    """Ask the DashScope ``qwen_72b_chat`` model a question.

    Args:
        system_prompts: Text sent as the system message.
        question: Text sent as the user message.

    Returns:
        The response object when the call reports ``HTTPStatus.OK``;
        ``None`` otherwise (the failure details are printed).
    """
    messages = [{'role': 'system', 'content': system_prompts},
                {'role': 'user', 'content': question}]

    response = dashscope.Generation.call(
        dashscope.Generation.Models.qwen_72b_chat,
        messages=messages,
        result_format='message',  # set the result to be "message" format.
    )
    if response.status_code != HTTPStatus.OK:
        print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            response.request_id, response.status_code,
            response.code, response.message
        ))
        return None

    print(response)
    # Hand the response back so callers can use the answer instead of only
    # seeing it printed.
    return response
        
# call_with_messages()

def abstract_extractor(text):
    """Ask GPT-4 to summarise an abstract and decode its JSON reply.

    The request text is ``abstract_prompt`` followed by the abstract.

    Args:
        text: Abstract text appended to the prompt.

    Returns:
        The object decoded from the model's reply, or ``None`` when the reply
        is not valid JSON.
    """
    prompt_updated = abstract_prompt + "\n" + "Text : " + text
    print('prompt_updated:', prompt_updated)

    resp = get_GPT_response(prompt_updated, abstract_prompt, 'gpt-4', temperature=0)
    print('resp:', resp)
    try:
        abstract_dict = json.loads(resp)
    except (json.JSONDecodeError, TypeError) as err:
        # Only parse failures mean "no result"; anything else should surface
        # instead of being swallowed by a bare except.
        print('Could not parse the reply as JSON:', err)
        return None
    print(abstract_dict)
    return abstract_dict

# Announce which query is about to be sent to arXiv.
print('Searching arXiv for {}'.format(search_query))

import json

# Sample metadata for a single paper.
title, author, published = (
    "A Comprehensive Survey on Code Large Language Models",
    "Jane Doe, John Smith",
    "2024-02-28",
)

# Sample extraction result carrying the four keys requested by abstract_prompt.
resp = dict(
    field='Large Language Models, Software Engineering',
    problems='Lack of systematic investigation into Code LLMs and their performance, frequent updates of Code LLMs influenced by base LLMs',
    methods='Comprehensive survey and analysis of Code LLMs, categorization of Code LLMs based on their publishers, investigation of performance differences between general LLMs and Code LLMs, maintenance of performance of LLMs across multiple mainstream benchmarks',
    effects='Assists developers of Code LLMs in choosing base models for the development of more advanced LLMs, provides insights for practitioners to better understand key improvement directions for Code LLMs',
)

# Graph container: "edges" holds [subject, relation, object] lists and
# "entities" maps an entity name to its attribute dict.
knowledge_graph = dict(edges=[], entities={})

def build_knowledge_graph(title, author, published, resp):
    """Register one paper in the module-level ``knowledge_graph``.

    The author and publication date become entities (created on first use)
    and are linked to the paper by ``[title, relation, value]`` edges. The
    paper itself is stored under its title with ``resp`` as its attributes.

    Args:
        title: Paper title; used as the entity key and as the edge subject.
        author: Author string linked through an "author" edge.
        published: Publication date linked through a "published" edge.
        resp: Extraction result with a 'field' key, or ``None`` when the
            extraction failed (the paper is then stored with no attributes).
    """
    print('resp:', resp)
    entities = knowledge_graph["entities"]
    edges = knowledge_graph["edges"]

    # A failed extraction must not abort the whole crawl.
    attributes = resp if resp is not None else {}

    for relation, value in (("author", author), ("published", published)):
        entities.setdefault(value, {})
        # The edge is recorded even when the author/date entity already
        # exists; otherwise later papers sharing it would lose their link.
        edge = [title, relation, value]
        if edge not in edges:
            edges.append(edge)

    field = attributes.get('field')
    if field is not None:
        entities.setdefault(field, {})

    entities[title] = attributes
    


import urllib.parse  # imported here so this block runs on its own

for i in range(start, total_results, results_per_iteration):

    print("Results %i - %i" % (i, i + results_per_iteration))

    # urlencode escapes characters such as spaces or '&' in the search term;
    # ':' is kept literal so field prefixes like "all:" are sent unchanged.
    query = urllib.parse.urlencode(
        {
            'search_query': search_query,
            'start': i,
            'max_results': results_per_iteration,
        },
        safe=':',
    )

    # Perform a GET request using the base_url and query; the context manager
    # closes the connection once the body has been read.
    with urllib.request.urlopen(base_url + query) as http_response:
        response = http_response.read()

    # Parse the response using feedparser.
    feed = feedparser.parse(response)

    # Run through each entry, print its details and add it to the graph.
    for entry in feed.entries:
        # Join wrapped lines with a space: dropping the newline outright
        # fuses the last word of one line with the first word of the next.
        abstract = ' '.join(entry.summary.split())

        print('arxiv-id: %s' % entry.id.split('/abs/')[-1])
        print('Link:  %s' % entry.id)
        print('Title:  %s' % entry.title)
        print('Abstract: %s' % abstract)
        # feedparser v4.1 only grabs the first author
        print('First Author:  %s' % entry.author)
        print('Publish_time: %s' % entry.published)
        print("=================================")

        resp = abstract_extractor(abstract)
        if resp is None:
            # The extractor returns None when the reply is not valid JSON;
            # skip the entry rather than crash on resp['field'].
            print('Skipping entry: the abstract could not be parsed into JSON')
            continue
        build_knowledge_graph(entry.title, entry.author, entry.published, resp)

    # Remember to play nice and sleep a bit before you call
    # the api again!
    print('Sleeping for %i seconds' % wait_time)
    time.sleep(wait_time)


# Write the accumulated knowledge graph to disk as indented JSON.
file_name = 'knowledge_graph_with_dynamic_entities.json'
with open(file_name, mode='w') as file:
    json.dump(knowledge_graph, fp=file, indent=4)

print("Knowledge graph saved to %s." % file_name)
# #下载指定链接的pdf
# from urllib import request

# url = "https://arxiv.org/pdf/2311.07989.pdf"
# request.urlretrieve(url,'1.pdf')

In [None]:
import json

# Sample metadata for a single paper.
title, author, published = (
    "A Comprehensive Survey on Code Large Language Models",
    "Jane Doe, John Smith",
    "2024-02-28",
)

# Sample extraction result with field / problems / methods / effects keys.
resp = dict(
    field='Large Language Models, Software Engineering',
    problems='Lack of systematic investigation into Code LLMs and their performance, frequent updates of Code LLMs influenced by base LLMs',
    methods='Comprehensive survey and analysis of Code LLMs, categorization of Code LLMs based on their publishers, investigation of performance differences between general LLMs and Code LLMs, maintenance of performance of LLMs across multiple mainstream benchmarks',
    effects='Assists developers of Code LLMs in choosing base models for the development of more advanced LLMs, provides insights for practitioners to better understand key improvement directions for Code LLMs',
)

# Graph container: "edges" holds [subject, relation, object] lists and
# "entities" maps an entity name to its attribute dict.
KG = dict(edges=[], entities={})

def build_knowledge_graph(title, author, published, resp):
    """Register one paper in the module-level ``KG`` graph.

    The author and publication date become entities (created on first use)
    and are linked to the paper by ``[title, relation, value]`` edges. The
    paper itself is stored under its title with ``resp`` as its attributes.

    Args:
        title: Paper title; used as the entity key and as the edge subject.
        author: Author string linked through an "author" edge.
        published: Publication date linked through a "published" edge.
        resp: Extraction result with a 'field' key, or ``None`` (the paper is
            then stored with no attributes).
    """
    # Read and write the same graph: membership used to be tested against the
    # separate ``knowledge_graph`` global while the writes went to ``KG``.
    entities = KG["entities"]
    edges = KG["edges"]

    attributes = resp if resp is not None else {}

    for relation, value in (("author", author), ("published", published)):
        entities.setdefault(value, {})
        # The edge is recorded even when the author/date entity already
        # exists; otherwise later papers sharing it would lose their link.
        edge = [title, relation, value]
        if edge not in edges:
            edges.append(edge)

    field = attributes.get('field')
    if field is not None:
        entities.setdefault(field, {})

    entities[title] = attributes
    

# Build the graph from the sample paper defined in this cell.
build_knowledge_graph(title, author, published, resp)

# # Suppose new entity data is passed in
# new_entity_name = "NewEntity"
# new_entity_info = {"attribute1": "value1", "attribute2": "value2"}

# # Add the new entity to the knowledge graph
# add_entity_to_graph(knowledge_graph, new_entity_name, new_entity_info)

# file_name = 'knowledge_graph_with_dynamic_entities.json'
# with open(file_name, 'w') as file:
#     json.dump(knowledge_graph, file, indent=4)

# Show the resulting graph as indented JSON.
print(json.dumps(obj=KG, indent=4))


In [None]:
# Display the graph accumulated by the arXiv crawl cell.
knowledge_graph

In [None]:


# entities_prompt = '''Now that you are an entity recognition expert, you are now given a text, you need to identify all the entities of the text,and return Entities in the following json format: \nEntities:<List of extracted entities>\nPlease return only the json format of the entity and nothing else'''

# Instruction sent to the chat model ahead of each abstract.
# The braces of the JSON template are written literally: "\{" and "\}" are not
# valid escape sequences, so the previous spelling put stray backslashes into
# the template shown to the model (and triggers an invalid-escape warning).
abstract_prompt = '''Now that you are now an expert in summarizing summaries, you are now given a abstract of an article, please extract from it which field it belongs to (like continuous learning, large language models, reinforcement learning, etc.), problems, methods and effects, and return it in the following json format \n{"field":, "problems":, "methods":, "effects":}\nPlease return only the json and nothing else'''


def fetch_GPT_response(instruction, base_prompt, chat_model_id, temperature=0):
    """Send ``instruction`` as a single user message and return the reply text.

    Args:
        instruction: Full text placed in the user message.
        base_prompt: Accepted but not sent; the system-message entry that
            would carry it is commented out.
        chat_model_id: Model name passed to the chat-completions call.
        temperature: Sampling temperature forwarded to the API (default 0).

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    print('Calling OpenAI...')
    chat_messages = [
        # {"role": "system", "content": base_prompt},
        {"role": "user", "content": instruction},
    ]
    completion = client.chat.completions.create(
        model=chat_model_id,
        messages=chat_messages,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_GPT_response(instruction, base_prompt, chat_model_id, temperature=0):
    """Forward all arguments unchanged to ``fetch_GPT_response`` and return its result."""
    reply = fetch_GPT_response(instruction, base_prompt, chat_model_id, temperature)
    return reply

def abstract_extractor(text):
    """Ask GPT-4 to summarise an abstract and decode its JSON reply.

    The request text is ``abstract_prompt``, a newline, then "Text : " and the
    abstract.

    Args:
        text: Abstract text appended to the prompt.

    Returns:
        The object decoded from the model's reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    prompt_updated = "\n".join([abstract_prompt, "Text : " + text])
    print('prompt_updated:', prompt_updated)

    resp = get_GPT_response(prompt_updated, abstract_prompt, 'gpt-4', temperature=0)
    print('resp:', resp)

    abstract_dict = json.loads(resp)
    print(abstract_dict)
    return abstract_dict

    

# Sample abstract used to exercise abstract_extractor.
# NOTE(review): several words are fused (e.g. "havedemonstrated",
# "softwareengineering"), presumably because newlines were removed without
# inserting spaces — confirm whether the spaces should be restored.
abstract = 'General large language models (LLMs), represented by ChatGPT, havedemonstrated significant potential in tasks such as code generation in softwareengineering. This has led to the development of specialized LLMs for softwareengineering, known as Code LLMs. A considerable portion of Code LLMs is derivedfrom general LLMs through model fine-tuning. As a result, Code LLMs are oftenupdated frequently and their performance can be influenced by the base LLMs.However, there is currently a lack of systematic investigation into Code LLMsand their performance. In this study, we conduct a comprehensive survey andanalysis of the types of Code LLMs and their differences in performancecompared to general LLMs. We aim to address three questions: (1) What LLMs arespecifically designed for software engineering tasks, and what is therelationship between these Code LLMs? (2) Do Code LLMs really outperformgeneral LLMs in software engineering tasks? (3) Which LLMs are more proficientin different software engineering tasks? To answer these questions, we firstcollect relevant literature and work from five major databases and open-sourcecommunities, resulting in 134 works for analysis. Next, we categorize the CodeLLMs based on their publishers and examine their relationships with generalLLMs and among themselves. Furthermore, we investigate the performancedifferences between general LLMs and Code LLMs in various software engineeringtasks to demonstrate the impact of base models and Code LLMs. Finally, wecomprehensively maintained the performance of LLMs across multiple mainstreambenchmarks to identify the best-performing LLMs for each software engineeringtask. Our research not only assists developers of Code LLMs in choosing basemodels for the development of more advanced LLMs but also provides insights forpractitioners to better understand key improvement directions for Code LLMs.'
# Run the extraction; the result is inspected in the next cell.
resp = abstract_extractor(abstract)

In [None]:
# Show the raw extraction result first.
print(resp)

# Then list every section with its comma-separated items, one per indented line.
for key, value in resp.items():
    print(f"{key}:")
    print("\n".join(f"    {item}" for item in value.split(', ')))
    print()  # blank line between sections

In [None]:
import os

# Point both the HTTP and the HTTPS proxy variables at the same proxy.
os.environ.update({
    'http_proxy': 'http://gpu013:7890',
    'https_proxy': 'http://gpu013:7890',
})

# Check that the variable was set.
print(os.environ['http_proxy'])


In [None]:
# config_data is not defined in the visible cells; presumably it comes from
# the star import of kg_rag.config_loader — TODO confirm.
config_file = config_data['GPT_CONFIG_FILE']
# Load the variables from that file into the environment.
load_dotenv(config_file)

# Display one stored prompt (system_prompts is presumably also provided by
# kg_rag.config_loader — verify).
system_prompts["DISEASE_ENTITY_EXTRACTION"]

In [None]:
# Relation-extraction instruction: the model is told to check the given
# entities pairwise and to report relations even when they are not stated
# directly in the text. A later cell assigns relation_prompt again.
relation_prompt = '''You are now a relationship extraction expert, now given a series of entities, and the text related to this entity, you need to go through all the entities in pairs, if there is a relationship between the two entities, this relationship may not be inferred directly from the text, please return all their relationships in the following json format.
{'triple':list of 'entities-relation-entities'},
please return only the json result and nothing else.'''

# Display the prompt text.
relation_prompt

In [None]:


# entities_prompt = '''Now that you are an entity recognition expert, you are now given a text, you need to identify all the entities of the text,and return Entities in the following json format: \nEntities:<List of extracted entities>\nPlease return only the json format of the entity and nothing else'''

# Entity-extraction instruction prepended to the text by entity_extractor.
# NOTE(review): the requested "Entities:<List ...>" layout is not valid JSON,
# yet entity_extractor parses the reply with json.loads; the closing sentence
# also mixes "Policy" with gene/protein exclusions — confirm the intended wording.
entities_pro_prompt = '''Now that you are an entity recognition expert, you are now given a text, you need to identify all the entities of the text,and return Entities in the following json format: \nEntities:<List of extracted entities>\nPlease report only Policy. Do not report any other entities like Genes, Proteins, Enzymes etc.'''


def fetch_GPT_response(instruction, base_prompt, chat_model_id, temperature=0):
    """Send ``instruction`` as a single user message and return the reply text.

    Args:
        instruction: Full text placed in the user message.
        base_prompt: Accepted but not sent; the system-message entry that
            would carry it is commented out.
        chat_model_id: Model name passed to the chat-completions call.
        temperature: Sampling temperature forwarded to the API (default 0).

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    print('Calling OpenAI...')
    chat_messages = [
        # {"role": "system", "content": base_prompt},
        {"role": "user", "content": instruction},
    ]
    completion = client.chat.completions.create(
        model=chat_model_id,
        messages=chat_messages,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_GPT_response(instruction, base_prompt, chat_model_id, temperature=0):
    """Forward all arguments unchanged to ``fetch_GPT_response`` and return its result."""
    reply = fetch_GPT_response(instruction, base_prompt, chat_model_id, temperature)
    return reply

def entity_extractor(text):
    """Ask GPT-4 to extract entities from ``text`` and decode its JSON reply.

    The request text is ``entities_pro_prompt`` followed by the text.

    Args:
        text: Text to analyse; appended to the prompt.

    Returns:
        The object decoded from the model's reply, or ``None`` when the reply
        is not valid JSON.
    """
    prompt_updated = entities_pro_prompt + "\n" + "Text : " + text
    print('prompt_updated:', prompt_updated)

    resp = get_GPT_response(prompt_updated, entities_pro_prompt, 'gpt-4', temperature=0)
    print('resp:', resp)
    try:
        entities_dict = json.loads(resp)
    except (json.JSONDecodeError, TypeError) as err:
        # Only parse failures mean "no result"; anything else should surface.
        print('Could not parse the reply as JSON:', err)
        return None
    print(entities_dict)
    # The parsed result used to be printed and then dropped, so every call
    # returned None.
    return entities_dict
    
    
# Sample Chinese-language passage about government-data policy, used as the
# input for entity_extractor.
text = '''从国家整体层面看，2020年政务数据的相关政策上有两大热点。
　　一是数据生产要素这一提法在政策层面得到了进一步明确，成为社会各界关注的热点。继《促进大数据发展行动纲要》、中央政治局第二次集体学习、十九届四中全会《决定》中提到数据要素之后，2020年3月30日，《中共中央国务院关于构建更加完善的要素市场化配置体制机制的意见》出台，这是中央关于要素市场化配置的第一份文件，明确提出“加快培育数据要素市场”，并强调要推进政府数据开放共享、提升社会数据资源价值、加强数据资源整合和安全保护的具体要求。
　　二是高度关注数据安全和隐私保护。2020年，我国公布了《中华人民共和国数据安全法（草案）》、《中华人民共和国个人信息保护法（草案）》，并发起了《全球数据安全倡议》，旨在明确数据安全法律责任，完善监管体系，保障国家安全、公民个人隐私权益和社会安全稳定。文件中，对政务数据的安全与开放也提出了明确的要求。'''
# Echo the passage, then run the extraction on it.
print(text)
entity_extractor(text)

In [None]:
# relation_prompt = '''You are now a relationship extraction expert, now given a series of entities, and the text related to this entity, you need to go through all the entities in pairs, if there is a relationship between the two entities, this relationship may not be inferred directly from the text, please return all their relationships in the following json format.
# {'triple':list of 'entities-relation-entities'},
# # please return only the json result and nothing else.'''

# Relation-extraction instruction used by triple_extractor.
# NOTE(review): the template shows single-quoted keys ({'triple': ...}), while
# triple_extractor decodes the reply with json.loads, which rejects
# single-quoted strings — confirm whether the template should use double quotes.
relation_prompt = '''You are now a relationship extraction expert, now given a series of entities, and the text related to this entity, please return all their relationships in the following json format.
{'triple':list of 'entities-relation-entities'},
please return only the json result and nothing else.'''

def triple_extractor(entities, text):
    """Ask GPT-4 for the relations between ``entities`` found in ``text``.

    The request text is ``relation_prompt`` followed by the entity list and
    the text.

    Args:
        entities: Entity names as one string, e.g. 'e1,e2,e3'.
        text: Passage the entities come from.

    Returns:
        The object decoded from the model's reply, or ``None`` when the reply
        is not valid JSON.
    """
    prompt_updated = relation_prompt + "\n" + "entities:" + entities + "\n" + "Text : " + text
    print('prompt_updated:', prompt_updated)

    resp = get_GPT_response(prompt_updated, relation_prompt, 'gpt-4', temperature=0)
    print('resp:', resp)
    try:
        triple_dict = json.loads(resp)
    except (json.JSONDecodeError, TypeError) as err:
        # Only parse failures mean "no result"; anything else should surface.
        print('Could not parse the reply as JSON:', err)
        return None
    print(triple_dict)
    # The parsed result used to be printed and then dropped, so every call
    # returned None.
    return triple_dict
    
# Sample sentence for triple extraction. The literal is opened with exactly
# three quotes; a fourth one put a stray apostrophe at the start of the text.
text = '''小陈是小李的父亲，小李是小刚的父亲'''
print(text)
triple_extractor('小陈, 小李, 小刚', text)