In [21]:
import os
import json
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI  
from langchain import PromptTemplate

# OpenAI API key. Read from the OPENAI_API_KEY environment variable so the
# secret does not have to be committed with the notebook; the placeholder is
# kept as a fallback so the variable is always defined.
OPENAI_KEY = os.environ.get('OPENAI_API_KEY', 'your_key')

# Chat model used to generate the questions.
gpt_model = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.1, max_tokens=512, api_key=OPENAI_KEY)

# Folder that holds one sub-folder per paper.
# Raw string: the path contains backslashes followed by letters (\C, \W, \V, \T),
# which are invalid escape sequences in a normal string literal.
paper_folder = r"F:\Code\Word_detecting\Vector_ku\Train_vector"

# Empty DataFrame that collects paper title, question and label.
questions_df = pd.DataFrame(columns=["Paper Title", "Question", "Label"])

# Prompt text for the question-generation chain. It asks the model for five
# numbered questions and contains two placeholders, {context} and {question}.
# The literal is runtime text sent to the model: do not reformat it casually,
# any change (including whitespace) changes what the model receives.
prompt_template = """Ask questions about the text content to generate the following five questions:
 1. What kind of spectral detection method was used in the related research of ...(the topic of the paper)? 
 2. If I use (the specific spectral detection method in the paper) to carry out in the related research of ...(the topic of the paper), What kind of feature processing methods can I take based on similar studies?
 3. If I use (the specific spectral detection method in the paper) to carry out in the related research of ...(the topic of the paper) , What kind of machine learning methods can i take based on similar studies?
 4.If I use (the specific spectral detection method in the paper) to carry out in the related research of ...(the topic of the paper) , What metric was chosen to evaluate model's performance, and how far can this metric go? (make sure you mention the specific spectral detecting method in this question!!!)
 5.You can ask anything related to the research of ...(the topic of the paper) using (the specific spectral detection method in the paper), generate questions randomly。
 dont point out the author's name,
 Notice:for the first four questions, dont use “how” to ask question ,
 In the questions gennerated, dont use words like “in the paper”, use “in the related study” instead of “in the study”,use 'in the related research' instead of 'in the research',
 Must make sure evevry question mentions the specific object of study and the specific spectral detecting method,
 You don't have to use 'if...' at all time, you can try another way to ask questions, but you must mentions mentions the specific object of study and the specific spectral detecting method,
            known information:
            {context}

            Question:
            {question}"""

# Batch bookkeeping, initialised together:
#   batch_size      - number of papers written to each output file
#   processed_count - number of papers processed so far
#   file_number     - sequence number of the next output file
batch_size, processed_count, file_number = 100, 0, 1

def extract_paper_number(filename):
    """Return the numeric prefix of a paper folder name, for use as a sort key.

    The prefix is the text before the first underscore, e.g. ``"12_Some_title"``
    gives ``12``.

    Args:
        filename: Name of a directory entry (str).

    Returns:
        int: The parsed prefix. If the prefix is not an integer (for example a
        stray ``desktop.ini`` in the folder), ``sys.maxsize`` is returned so the
        entry sorts after all numbered papers instead of aborting the sort with
        a ``ValueError``.
    """
    import sys  # local import keeps this function self-contained

    try:
        return int(filename.split('_')[0])
    except ValueError:
        return sys.maxsize

# Generate five questions for every paper folder and save them to Excel in
# batches of `batch_size` papers.

def _strip_question_number(line):
    """Remove a leading "N." enumeration marker from one generated line.

    Works with or without a space after the dot ("1. What ..." and "4.If ...")
    and keeps the full question text even when it contains further ". ".
    Lines without a numeric marker are returned stripped but otherwise unchanged.
    """
    marker, dot, remainder = line.partition('.')
    if dot and marker.strip().isdigit():
        return remainder.strip()
    return line.strip()


def _save_question_batch(frames, number):
    """Concatenate the per-paper frames and write them to one Excel batch file."""
    excel_file = f"Questions(Five)_batch_{number}.xlsx"
    pd.concat(frames, ignore_index=True).to_excel(excel_file, index=False)
    print(f"问题已保存到 {excel_file}")


# Loop-invariant objects: build them once instead of once per paper.
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
qa_prompt = PromptTemplate(template=prompt_template,
                           input_variables=["context", "question"])
query_question = "Ask questions about the text content to generate the following five questions."
# One label per generated question, in prompt order.
# NOTE(review): "Mechine" is a typo, kept as-is so new output matches existing files.
question_labels = ["Spectral Detection Method", "Feature Processing Method",
                   "Mechine learning Method", "Evaluated metrics", "Random questions"]

# Frames of the current, not yet saved batch. Collected in a list and
# concatenated once per batch; `questions_df` is therefore no longer grown here.
pending_frames = []

for paper_filename in sorted(os.listdir(paper_folder), key=extract_paper_number):
    vs_path = os.path.join(paper_folder, paper_filename, 'Vector')

    # Skip entries that have no vector-store directory.
    if not os.path.isdir(vs_path):
        print(f"路径 {vs_path} 不存在或不是一个目录，跳过...")
        continue

    # The index name is the stem of the first .faiss file in the directory.
    faiss_files = [f for f in os.listdir(vs_path) if f.endswith('.faiss')]
    if not faiss_files:
        print(f"在路径 {vs_path} 中找不到 .faiss 文件，跳过...")
        continue

    index_name = faiss_files[0].split('.faiss')[0]
    print('当前论文:', index_name)

    # NOTE(review): allow_dangerous_deserialization=True is required to load the
    # stored index; only use it on vector stores produced by this project.
    vector_store = FAISS.load_local(vs_path,
                                    embedding_model,
                                    index_name=index_name,
                                    allow_dangerous_deserialization=True)

    # Retrieve the top-2 chunks and let the chat model write the questions.
    knowledge_chain = RetrievalQA.from_llm(llm=gpt_model,
                                           retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
                                           prompt=qa_prompt)
    result = knowledge_chain({"query": query_question})

    # Drop blank lines so that spacing between the questions does not change the count.
    generated_questions = [line for line in result['result'].splitlines() if line.strip()]
    print("生成问题：", generated_questions)

    # Exactly one question per label is required; otherwise skip this paper.
    if len(generated_questions) != len(question_labels):
        print(f"未能生成五个问题：{generated_questions}")
        continue

    generated_questions = [_strip_question_number(line) for line in generated_questions]

    pending_frames.append(pd.DataFrame({
        # One copy of the folder name per row. (`paper_filename*5` would repeat
        # the string itself five times inside every cell.)
        "Paper Title": [paper_filename] * len(question_labels),
        "Question": generated_questions,
        "Label": question_labels,
    }))

    processed_count += 1

    # Write a file every `batch_size` successfully processed papers.
    if processed_count % batch_size == 0:
        _save_question_batch(pending_frames, file_number)
        pending_frames = []
        file_number += 1

# Save whatever is left over. This must happen after the loop: skipped papers
# mean `processed_count` never reaches the number of directory entries.
if pending_frames:
    _save_question_batch(pending_frames, file_number)
    pending_frames = []
    file_number += 1

print("处理完成！")




当前论文: 1_Cognitive_spectroscopy_for_evaluating_Ch
生成问题： ['1. What kind of spectral detection method was used in the related research of predicting Japanese green tea ranking?', '2. When using Fourier transform near-infrared reflectance spectroscopy to evaluate Japanese green tea ranking, what feature processing methods can be applied based on similar studies?', '3. In the related study of predicting Japanese green tea ranking using Fourier transform near-infrared reflectance spectroscopy, what machine learning methods have been utilized in similar research?', "4. When using Fourier transform near-infrared reflectance spectroscopy to predict Japanese green tea ranking, what metric was chosen to evaluate the model's performance, and how reliable is this metric?", '5. How can the prediction of Japanese green tea ranking be improved by incorporating different machine learning algorithms with Fourier transform near-infrared reflectance spectroscopy?']
当前论文: 6_Diffuse_reflectance_spectroscopy

按权重提取出训练的文本

In [10]:
import pandas as pd
import os
import shutil
import random

# Read the label spreadsheet. Paths are raw strings because they contain
# backslashes followed by letters (\C, \W, \L, \V, \T, \S), which are invalid
# escape sequences in a normal string literal.
excel_file = r'F:\Code\Word_detecting\Label_ver7.xlsx'
df = pd.read_excel(excel_file)

# Number of papers per label.
label_counts = df['Label6 - Subcategories of Research Methods'].value_counts()

# Per-label quota: an equal share of `total_selected`, capped by how many papers
# the label actually has. (The previous formula multiplied by `count`, so the
# cap was always the result.)
total_selected = 500
selected_per_label = {}
if len(label_counts) > 0:
    per_label_quota = total_selected // len(label_counts)
    for label, count in label_counts.items():
        selected_per_label[label] = min(per_label_quota, count)
# NOTE(review): `selected_per_label` is not used by the sampling below, which
# draws uniformly from all folders. Applying the quotas needs a folder -> label
# mapping that this cell does not build - TODO confirm the intended behaviour.

# Destination folder for the selected paper folders.
output_folder = r'F:\Code\Word_detecting\Vector_ku\Train_vector'
os.makedirs(output_folder, exist_ok=True)

# All sub-folders of the source folder. Plain files are ignored because
# copytree only accepts directories; sorting makes the seeded sample repeatable.
source_folder = r'F:\Code\Word_detecting\Vector_ku\Save_Vector_Store'
all_folders = sorted(
    path
    for path in (os.path.join(source_folder, name) for name in os.listdir(source_folder))
    if os.path.isdir(path)
)

# random.sample raises ValueError when asked for more items than exist.
sample_size = min(total_selected, len(all_folders))
if sample_size < total_selected:
    print(f"只找到 {len(all_folders)} 个论文文件夹，少于请求的 {total_selected} 个，将全部复制。")

# Fixed seed: re-running the cell copies the same folders again instead of
# adding a different random set on top of the previous one.
RANDOM_SEED = 42
selected_folders = random.Random(RANDOM_SEED).sample(all_folders, sample_size)
for folder in selected_folders:
    shutil.copytree(folder, os.path.join(output_folder, os.path.basename(folder)), dirs_exist_ok=True)

print("论文文件夹已成功保存到指定目标文件夹！")


论文文件夹已成功保存到指定目标文件夹！


合并excel

In [22]:
import pandas as pd

# Read the five batch files produced by the question-generation cell.
file_paths = [f'Questions(Five)_batch_{i}.xlsx' for i in range(1, 6)]
dfs = [pd.read_excel(file) for file in file_paths]

# The frames are stacked vertically, so their columns must match. Compare the
# column labels (not only the column count) and name the offending file.
expected_columns = list(dfs[0].columns)
for file, frame in zip(file_paths, dfs):
    if list(frame.columns) != expected_columns:
        raise ValueError(f"列不一致：{file}")

# Stack the frames row-wise with a fresh 0..n-1 index.
merged_df = pd.concat(dfs, axis=0, ignore_index=True)

# Save the merged result as a new Excel file.
merged_df.to_excel('Train_questions.xlsx', index=False)


将标签添加到问题文件（Excel）中

In [1]:
import pandas as pd

# Read the label spreadsheet and the question spreadsheet.
df1 = pd.read_excel('Label_ver7.xlsx')
df2 = pd.read_excel('Questions with labels.xlsx')

# Serial number and label columns of the first file, addressed by position.
serial_number_column = df1.iloc[:, 0]  # assumed to be the serial-number column
label_column = df1.iloc[:, 11]  # assumed to be the label column (12th column, index 11)

# Paper-name column of the second file.
paper_names = df2.iloc[:, 0]  # assumed to be the paper-name column

# Serial number -> label lookup, built once. setdefault keeps the first label
# when a serial number occurs more than once.
label_by_serial = {}
for serial, label in zip(serial_number_column, label_column):
    label_by_serial.setdefault(serial, label)

# Exactly one entry per row of df2. Rows that cannot be resolved get None, so
# the list always has the same length as df2 and the assignment below cannot
# fail with a length mismatch.
labels_for_second_excel = []
for paper_name in paper_names:
    # The serial number is the text before the first underscore. str() makes
    # empty cells (NaN) and numeric cells go through the same parsing path.
    try:
        serial_number = int(str(paper_name).split('_')[0])
    except ValueError:
        print(f"无法解析论文名称 {paper_name} 中的序号。")
        labels_for_second_excel.append(None)
        continue

    if serial_number not in label_by_serial:
        print(f"无法找到序号为 {serial_number} 的标签。")
        labels_for_second_excel.append(None)
        continue

    labels_for_second_excel.append(label_by_serial[serial_number])

# Add the labels to the second file as a new column.
df2['标签'] = labels_for_second_excel

# Save the modified second file.
df2.to_excel('modified_second_excel.xlsx', index=False)

