In [1]:
import glob
import os

In [2]:
# Directory that is searched for input PDF files; the relative path is
# resolved against the notebook's current working directory.
pdf_path = "pdf/"

In [3]:
# Collect every ``*.pdf`` file that sits directly inside ``pdf_path``.
# os.path.join is used instead of string formatting so that a trailing
# separator in ``pdf_path`` does not produce a doubled "//" in the pattern.
pdfs = glob.glob(os.path.join(pdf_path, "*.pdf"))
pdfs

['pdf/NASA UAM market Study 2018.pdf',
 'pdf/Innovation Driving Sustainable Aviation - November 2021.pdf',
 'pdf/roland_berger_urban_air_mobility_1.pdf',
 'pdf/Roland_Berger_Urban_Air_Mobility 2018.pdf']

In [16]:
# Download the NLTK stop-word corpus first; stopwords.words('english') reads it.
# Original note: the NLTK package ships stop-word lists for 16 different
# languages (TODO confirm the count for the installed NLTK version).
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhangquan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import hanlp
# 因为是英文文档，所以直接使用hanlp的英文包
#tokenizer = hanlp.utils.rules.tokenize_english 
#from hanlp.utils.lang.en.english_tokenizer import tokenize_english
#tokenizer = tokenize_english
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.tokenize import WhitespaceTokenizer 
from collections import defaultdict
import math
import operator
import pandas as pd
import xlwt


def feature_select(list_words):
    """Score every word with TF * IDF and return the 10 lowest-scoring entries.

    Args:
        list_words: A list of documents, each document being a list of
            (hashable) word tokens.

    Returns:
        A list of at most 10 ``(word, score)`` tuples: the tail of the list
        sorted by score in descending order, so the last element has the
        smallest score.

    Notes:
        * TF is corpus-wide: occurrences of the word across all documents
          divided by the total number of tokens.
        * IDF is ``log(doc_num / (docs_containing_word + 1))``. When a word
          occurs in every document this is negative; in particular, for a
          single-document corpus every IDF equals ``log(1/2)``, so the
          lowest scores belong to the most frequent words. That is why the
          tail slice is what yields the most frequent words when a single
          document is passed in.
    """
    # Total occurrences of each word over the whole corpus.
    doc_frequency = defaultdict(int)
    for word_list in list_words:
        for word in word_list:
            doc_frequency[word] += 1

    # TF of each word. The corpus size is computed once instead of once per
    # word, which made the original loop quadratic in the vocabulary size.
    total_words = sum(doc_frequency.values())
    word_tf = {word: count / total_words for word, count in doc_frequency.items()}

    # Number of documents containing each word. Each document is reduced to a
    # set once, avoiding a linear list scan per (word, document) pair.
    doc_num = len(list_words)
    word_doc = defaultdict(int)
    for word_list in list_words:
        for word in set(word_list):
            word_doc[word] += 1

    # IDF of each word (the +1 in the denominator is the original smoothing).
    word_idf = {
        word: math.log(doc_num / (word_doc[word] + 1)) for word in doc_frequency
    }

    # TF * IDF, built in first-occurrence order so ties keep a stable order.
    word_tf_idf = {word: word_tf[word] * word_idf[word] for word in doc_frequency}

    # Sort by score, highest first; adjust the slice to change how many
    # keywords are returned.
    dict_feature_select = sorted(
        word_tf_idf.items(), key=operator.itemgetter(1), reverse=True
    )
    return dict_feature_select[-10:]



def extract_pdf_content(pdf):
    """Extract the text of a PDF and return its tokens without English stop words.

    The file is rendered to plain text with pdfminer, split on whitespace and
    filtered against NLTK's English stop-word list.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        A list of whitespace-delimited tokens in document order, in their
        original casing, with stop words removed. Punctuation attached to a
        token is kept because the split is purely on whitespace.

    Raises:
        OSError: If the file cannot be opened.
        Any exception raised by pdfminer while parsing is propagated after
        the converter and text buffer have been closed.
    """
    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrcmgr=rsrcmgr, outfp=outfp, laparams=LAParams())
    try:
        with open(pdf, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set together with maxpages=0 is passed
            # unchanged from the original call (intended to process all pages).
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before it is closed in the finally clause.
        text = outfp.getvalue()
    finally:
        # Always release the converter and the buffer, even if parsing fails.
        device.close()
        outfp.close()

    stop_words = set(stopwords.words('english'))
    word_tokens = WhitespaceTokenizer().tokenize(text)
    # The stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words such as "The" would pass the filter.
    return [w for w in word_tokens if w.lower() not in stop_words]


#  将数据写入新文件
def data_write(file_path, datas, pdf):
    """Write a sequence of rows into a single-sheet workbook and save it.

    Args:
        file_path: Destination path handed to the workbook's ``save``.
        datas: Iterable of rows; each row is an indexable sequence whose
            items are written to consecutive columns.
        pdf: Not used by this function; kept for interface compatibility.
    """
    workbook = xlwt.Workbook()
    # cell_overwrite_ok allows the same cell to be written more than once.
    sheet = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)

    # Row r of ``datas`` goes to sheet row r, item c of the row to column c.
    for row_idx, row in enumerate(datas):
        for col_idx in range(len(row)):
            sheet.write(row_idx, col_idx, row[col_idx])

    workbook.save(file_path)

In [9]:
# Extract the tokens of every PDF, pick its keywords and append them to
# test.txt (one block per PDF: the path followed by one keyword per line).
mydict = {}  # file name -> list of tokens extracted from that PDF
# `datas` and `j` are not used in this cell; they are kept in case other
# cells rely on them being defined.
datas = []
j = 0
for pdf in pdfs:
    # basename handles whichever path separator glob returned.
    key = os.path.basename(pdf)
    if key in mydict:
        continue  # a file with this name was already processed
    print("Extracting content from {} ...".format(pdf))
    mydict[key] = extract_pdf_content(pdf)
    features = feature_select([mydict[key]])
    # feature_select returns its entries with the lowest score last; walk
    # them back to front. Iterating (instead of indexing 9..0) also copes
    # with PDFs that yield fewer than 10 distinct tokens.
    data = [str(pdf)] + [str(feature[0]) for feature in reversed(features)]
    # Explicit UTF-8 so non-ASCII tokens from the PDF cannot raise
    # UnicodeEncodeError under a non-UTF-8 default locale encoding.
    with open("test.txt", "a", encoding="utf-8") as f:
        f.write(" \n".join(data))
        f.write("\n")


Extracting content from pdf/NASA UAM market Study 2018.pdf ...
Extracting content from pdf/Innovation Driving Sustainable Aviation - November 2021.pdf ...
Extracting content from pdf/roland_berger_urban_air_mobility_1.pdf ...
Extracting content from pdf/Roland_Berger_Urban_Air_Mobility 2018.pdf ...
