In [68]:
import spacy
import fitz  # PyMuPDF
import os

In [88]:
nlp = spacy.load("en_core_web_sm")

def extract_text_from_first_page(pdf_path):
    # 打开PDF文件
    doc = fitz.open(pdf_path)
    # 提取第一页的文本
    first_page_text = doc[0].get_text()
    return first_page_text

def find_title_with_spacy(text):
    # 使用spaCy处理文本
    doc = nlp(text)
    # 尝试获取文档中的第一个句子作为标题
    first_sentence = next(doc.sents, None)
    if first_sentence:
        # 将句子转换为文本，并替换掉其中的换行符\n为单个空格
        title = first_sentence.text.replace('\n', ' ')
        # 清除Windows文件名中不兼容的字符
        
        return title
    else:
        return "Title Not Found"

In [89]:
pdf_path = 'papers/2201.09058.pdf'
# 从第一页提取文本
text = extract_text_from_first_page(pdf_path)

In [90]:
title = find_title_with_spacy(text)

print("Extracted Title:", title)

Extracted Title: DeepTrader: A Deep Reinforcement Learning Approach for Risk-Return Balanced Portfolio Management with Market Conditions Embedding Zhicheng Wang1,2, Biwei Huang3, Shikui Tu1,2, Kun Zhang3, Lei Xu1,2 1Department of Computer Science and Engineering, Shanghai Jiao Tong University 2Centre for Cognitive Machines and Computational Health (CMaCH), Shanghai Jiao Tong University 3Department of Philosophy, Carnegie Mellon University {wwwangzhch, tushikui, leixu}@sjtu.edu.cn, biweih@andrew.cmu.edu, kunz1@cmu.edu Abstract Most existing reinforcement learning (RL)-based port- folio management models do not take into account the market conditions, which limits their performance in risk-return balancing.


In [72]:
title

'DeepScalper: A Risk-Aware Reinforcement Learning Framework to Capture Fleeting Intraday Trading Opportunities Shuo Sun Nanyang Technological University Wanqi Xue Nanyang Technological University Rundong Wang∗ Nanyang Technological University Xu'

In [73]:
def clean_title(text):
    # 使用spaCy处理文本
    doc = nlp(text)
    # 初始化一个空列表来存储要排除的实体（人名和组织名）
    exclude_entities = []
    for ent in doc.ents:
        # 如果实体类型是人名或组织，添加到排除列表
        if ent.label_ in ["PERSON", "ORG"]:
            print(ent.text)
            exclude_entities.append(ent.text)
    
    # 提取的标题初始化为完整文本
    cleaned_title = text
    # 从标题中移除所有排除的实体
    for entity in exclude_entities:
        cleaned_title = cleaned_title.replace(entity, "")
    for ch in ['<', '>', ':', '"', '/', '\\', '|', '?', '*']:
        cleaned_title = cleaned_title.replace(ch, '')
    # 返回清理后的标题，这里简单地取第一句作为标题
    return cleaned_title.split('\n', 1)[0].strip()

In [74]:
title = clean_title(title)

Nanyang Technological University
Rundong Wang∗
Nanyang Technological University


In [75]:
title

'DeepScalper A Risk-Aware Reinforcement Learning Framework to Capture Fleeting Intraday Trading Opportunities Shuo Sun  Wanqi Xue    Xu'

In [76]:
def rename_pdf(pdf_path, title):
    # Sanitize title to make it a valid filename
    
    new_filename = title + ".pdf"  # Truncate long titles and add .pdf extension
    
    # Get directory of the original PDF
    #dir_name = os.path.dirname(pdf_path)
    
    # Construct new path with the same directory but new title
    #new_path = os.path.join(dir_name, new_filename)
    
    # Rename file
    os.rename(pdf_path, new_filename)
    print(f"File renamed to: {new_filename}")

In [78]:
rename_pdf(pdf_path, title)

File renamed to: DeepScalper A Risk-Aware Reinforcement Learning Framework to Capture Fleeting Intraday Trading Opportunities Shuo Sun  Wanqi Xue    Xu.pdf


In [79]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [80]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [91]:
prompt_text = "Extract the title of this paper:" + title

In [97]:
input_ids = tokenizer.encode(prompt_text, return_tensors='pt')

# 生成文本
output_sequences = model.generate(
    input_ids=input_ids,
    max_length=190,  # 设置生成文本的最大长度
    temperature=1.0,  # 控制文本生成的随机性
    num_return_sequences=1,  # 生成文本的数量
)

# 解码生成的文本
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Extract the title of this paper:DeepTrader: A Deep Reinforcement Learning Approach for Risk-Return Balanced Portfolio Management with Market Conditions Embedding Zhicheng Wang1,2, Biwei Huang3, Shikui Tu1,2, Kun Zhang3, Lei Xu1,2 1Department of Computer Science and Engineering, Shanghai Jiao Tong University 2Centre for Cognitive Machines and Computational Health (CMaCH), Shanghai Jiao Tong University 3Department of Philosophy, Carnegie Mellon University {wwwangzhch, tushikui, leixu}@sjtu.edu.cn, biweih@andrew.cmu.edu, kunz1@cmu.edu Abstract Most existing reinforcement learning (RL)-based port- folio management models do not take into account the market conditions, which limits their performance in risk-return balancing. However, the current model is based on


In [98]:
# 假设生成的文本中标题在最前面，可以通过一些简单的字符串操作提取出来
# 这里只是一个示例，具体方法可能需要根据实际情况调整
title = generated_text.split('.')[0]
print("Generated Title:", title)

Generated Title: Extract the title of this paper:DeepTrader: A Deep Reinforcement Learning Approach for Risk-Return Balanced Portfolio Management with Market Conditions Embedding Zhicheng Wang1,2, Biwei Huang3, Shikui Tu1,2, Kun Zhang3, Lei Xu1,2 1Department of Computer Science and Engineering, Shanghai Jiao Tong University 2Centre for Cognitive Machines and Computational Health (CMaCH), Shanghai Jiao Tong University 3Department of Philosophy, Carnegie Mellon University {wwwangzhch, tushikui, leixu}@sjtu
