-
Notifications
You must be signed in to change notification settings - Fork 0
/
Text mining from PDF.R
81 lines (63 loc) · 1.98 KB
/
Text mining from PDF.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#The implementation was coded in R
#import libraries
library(pdftools)
library(tmcn)
library(stringr)
library(jiebaR)
library(chinese.misc) #if the PDF is in Chinese version
library(wordcloud)
library(corpus)
library(tm)
# Merge every PDF in the working directory into "final-joined.pdf".
#
# pdf_combine() may fail when it is handed too many files in one call, so the
# inputs are first merged in batches of `batch_size` files into intermediate
# "joined-<k>.pdf" files, and those intermediates are then merged into the
# final document. The batches are derived from the number of files actually
# found instead of being hard-coded for one particular file count.
batch_size <- 300L
files <- list.files(pattern = "pdf$")
# Drop this script's own outputs so that a re-run does not feed previously
# generated PDFs back into the merge.
files <- files[!grepl("^(joined-[0-9]+|final-joined)\\.pdf$", files)]
length(files)
if (length(files) == 0L) {
  stop("No PDF files found in ", getwd(), call. = FALSE)
}
# Consecutive groups of at most `batch_size` file names.
batches <- split(files, ceiling(seq_along(files) / batch_size))
joined_files <- sprintf("joined-%d.pdf", seq_along(batches))
for (k in seq_along(batches)) {
  pdf_combine(batches[[k]], output = joined_files[k])
}
pdf_combine(joined_files, output = "final-joined.pdf")
# Pull the text out of the merged PDF, keep it as a character vector and open
# it in the data viewer for inspection.
pdf.text <- pdftools::pdf_text(pdf = "final-joined.pdf")
texts <- paste0(pdf.text)
View(texts)
# Tokenise the extracted texts with a default jiebaR worker.
cutter <- worker()
segWords <- segment(texts, cutter)
segWords
# Delete every ASCII digit, ASCII letter, "/" and "." from each token; tokens
# made up solely of such characters become empty strings.
no_number <- gsub(
  pattern = "[0-9a-zA-Z/.]+?",
  replacement = "",
  x = segWords
)
no_number
# Segment again with a tagging worker that uses the stop-word list stored in
# "stop_words.txt".
new_cutter <- worker(type = "tag", stop_word = "stop_words.txt")
pos <- segment(no_number, new_cutter)
pos
length(pos)
# Keep only the tokens whose tag is exactly "n".
#
# This relies on `pos` being a character vector whose element names carry the
# tag of each token (the selection is done purely by name). The filter is
# vectorised, so it covers every element of `pos` regardless of its length and
# needs neither a hard-coded loop bound nor error swallowing to skip the
# non-matching tokens. Missing tokens are excluded with is.na() rather than by
# comparing against the string "NA".
is_noun <- !is.na(pos) & names(pos) %in% "n"
# `res`: list with one element per selected token (each element named "n").
res <- as.list(pos[is_noun])
res
# `res1`: the same tokens as a plain, unnamed character vector.
res1 <- as.character(res)
# Create a keyword worker capped at the top 100 keywords and apply it to the
# tokens in `res1`.
keys <- worker(type = "keywords", topn = 100)
final <- vector_keywords(res1, keys)
final
# Visualise the selected tokens as a word cloud.
library(plyr)
# Frequency table of `res1`: first column is the token, second its count.
tableWord <- count(res1)
View(tableWord)
library(wordcloud)
# Register the Chinese typeface "楷体" under the name "myFont" so the cloud
# can render Chinese text. windowsFonts() exists only on Windows, so on other
# platforms the device's default font family ("") is used instead of failing.
font_family <- ""
if (.Platform$OS.type == "windows") {
  windowsFonts(myFont = windowsFont("楷体"))
  font_family <- "myFont"
}
# Draw tokens that occur at least 10 times, most frequent first.
# The colour was previously given as `length(freq)`; `freq` is not a variable
# of this script, so that expression could only evaluate to 1 (the length of a
# function of that name from an attached package), i.e. the first palette
# colour. The colour is now stated explicitly, and the full argument name
# `colors` is used instead of relying on partial matching of `col`.
wordcloud(
  words = tableWord[, 1],
  freq = tableWord[, 2],
  min.freq = 10,
  random.order = FALSE,
  colors = "black",
  family = font_family
)