In [1]:
#load text mining library in R
library(tm)

Loading required package: NLP


In [2]:
#set working directory to the peshitta directory
#the location can be overridden with the PESHITTA_DIR environment variable so
#the notebook is not tied to one machine; when the variable is unset the
#original path is used
setwd(Sys.getenv("PESHITTA_DIR", unset = "/Users/Coeckie/downloads/syrtxt/peshitta"))

In [3]:
#collect the names of all corpus documents in the working directory
#the CSV reports this notebook writes with write.csv land in the same
#directory, so they are skipped here; otherwise a re-run would read its own
#output as corpus documents
#NOTE(review): assumes none of the source documents has a .csv extension - confirm
peshitta <- grep("\\.csv$", list.files(getwd()), value = TRUE, invert = TRUE, ignore.case = TRUE)

In [4]:
#read every document and collapse its lines into a single string, giving one
#text per document
#a list of multi-line character vectors would be coerced with as.character
#when the corpus is built, leaving deparsed c("...", "...") text (quotes and
#commas included) as the document content
peshitta_vector <- vapply(
    peshitta,
    function(path) paste(readLines(path), collapse = "\n"),
    character(1),
    USE.NAMES = FALSE
)

In [5]:
#build the tm corpus, one document per element of peshitta_vector
peshitta_source <- VectorSource(peshitta_vector)
peshitta_corpus <- Corpus(peshitta_source)

In [6]:
#stopword list: markup tokens, the language label and the single digits 1-9
myStopwords <- c(
    "%verse", "%bookname", "%language", "syriac",
    as.character(1:9)
)

In [7]:
#remove stopwords
#tm_map returns a new corpus rather than modifying its argument, so the result
#has to be stored; otherwise the document term matrix is built from the
#unfiltered text
#NOTE(review): removeWords matches on word boundaries, so the entries starting
#with "%" may not be matched as written - check the term list to confirm
peshitta_corpus <- tm_map(peshitta_corpus, removeWords, myStopwords)
#show the corpus summary, as the unassigned call did
peshitta_corpus

“transformation drops documents”

<<SimpleCorpus>>
Metadata:  corpus specific: 1, document level (indexed): 0
Content:  documents: 72

In [8]:
#build the document term matrix from the corpus
dtm <- DocumentTermMatrix(peshitta_corpus)

In [13]:
#Total frequency of every term, plus the ordering from most to least frequent
#label the matrix rows with the document file names
rownames(dtm) <- peshitta
#sum each term column over all documents
freq <- colSums(as.matrix(dtm))
#number of distinct terms
length(freq)
#index vector that sorts freq in decreasing order
ord <- order(freq, decreasing = TRUE)

In [15]:
#Show the terms sorted by decreasing frequency, then save the same table as csv
freq[ord]
write.csv(x = freq[ord], file = "word_freq.csv")

In [16]:
#load topicmodels library
library(topicmodels)

In [18]:
#control parameters handed to LDA for Gibbs sampling
4000 -> burnin
2000 -> iter
500 -> thin
#five seeds, matching the five starts requested through nstart
list(2003, 5, 63, 100001, 765) -> seed
5 -> nstart
TRUE -> best

In [19]:
#how many topics the LDA model is fitted with
40 -> k

In [20]:
#fit the LDA topic model on the document term matrix with Gibbs sampling,
#using the control parameters defined above
Results <- LDA(
    dtm,
    k,
    method = "Gibbs",
    control = list(
        nstart = nstart,
        seed = seed,
        best = best,
        burnin = burnin,
        iter = iter,
        thin = thin
    )
)

In [22]:
#topic assigned to each document, saved as csv
as.matrix(topics(Results)) -> Results.topics
write.csv(
    Results.topics,
    file = paste("LDAGibbs", k, "DocsToTopics.csv", sep = " ")
)

In [23]:
#first 100 terms of every topic, saved as csv
as.matrix(terms(Results, 100)) -> Results.terms
write.csv(
    Results.terms,
    file = paste("LDAGibbs", k, "TopicsToTerms.csv", sep = " ")
)

In [24]:
#gamma slot of the fitted model (the probabilities behind each topic
#assignment), converted to a data frame and saved as csv
as.data.frame(slot(Results, "gamma")) -> topicProbabilities
write.csv(
    topicProbabilities,
    file = paste("LDAGibbs", k, "TopicProbabilities.csv", sep = " ")
)