In [1]:
#import modules
import os.path
import nltk
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

def getData(path,file):
    """Read a text file and treat every line as one document.

    Args:
        path: Directory containing the file ("" for the current directory).
        file: Name of the file to read.

    Returns:
        A tuple ``(documentsList, titles)`` of two equally long lists:
        the stripped text of every line, and a title per document made of
        the first 100 characters of that document.
    """
    documentsList = []
    titles = []
    with open(os.path.join(path, file), "r") as fin:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in fin:
            text = line.strip()
            documentsList.append(text)
            # One title per document; previously this ran once after the loop,
            # so only the last line got a title and an empty file crashed.
            titles.append(text[:100])
    print("Numarul total de documente:",len(documentsList))
    return documentsList,titles

def prepreparingData(docList):
    """Tokenize, remove stop words from, and stem every document.

    Args:
        docList: Iterable of raw document strings.

    Returns:
        A list with one entry per document, each entry being the list of
        stemmed tokens that survived English stop-word removal.
    """
    # Split on runs of word characters.
    word_splitter = RegexpTokenizer(r'\w+')
    # English stop words: connective words that carry little meaning.
    english_stopwords = set(stopwords.words('english'))
    # Reduces inflected forms of a word to a common stem.
    stemmer = PorterStemmer()

    def _clean(document):
        # Lowercase first so stop-word matching is case-insensitive.
        words = word_splitter.tokenize(document.lower())
        return [stemmer.stem(word) for word in words if word not in english_stopwords]

    return [_clean(document) for document in docList]

def preparingDoc(prepreparedDoc):
    """Build the gensim dictionary and the document-term matrix.

    Args:
        prepreparedDoc: List of tokenized documents (lists of terms).

    Returns:
        A tuple ``(dictionary, DocumentsTermsMatrix)``: the gensim
        ``Dictionary`` built from the documents, and the result of
        ``dictionary.doc2bow`` for every document, in input order.
    """
    # Every distinct term gets an integer index, forming the dictionary.
    term_index = corpora.Dictionary(prepreparedDoc)
    # Bag-of-words representation of each document.
    bow_corpus = list(map(term_index.doc2bow, prepreparedDoc))
    return term_index, bow_corpus

def createLSAModel(document,nrTopics,words):
    """Train an LSA (gensim ``LsiModel``) model and print its topics.

    Args:
        document: List of tokenized documents, passed to ``preparingDoc``.
        nrTopics: Number of topics requested from the model.
        words: Number of words shown per topic in the printed summary.

    Returns:
        The trained ``LsiModel``.
    """
    id_map, corpus = preparingDoc(document)
    lsi = LsiModel(corpus, id2word=id_map, num_topics=nrTopics)
    # Print the topic summary before handing the model back.
    topic_summary = lsi.print_topics(num_topics=nrTopics, num_words=words)
    print(topic_summary)
    return lsi

# Number of topics requested from the LSA model.
numberTopics = 7
# Number of words printed for each topic.
words = 10
# Load one document per line; the empty path leaves "papers.csv" relative to the working directory.
documentList,titles = getData("","papers.csv")
# Tokenize, drop stop words, and stem every document.
prepreparedText = prepreparingData(documentList)
# Train the LSA model (this also prints its topics).
model = createLSAModel(prepreparedText,numberTopics,words)

def coherenceValues(dictionary, documentTermMatrix, preparedDocument, maxNrTopics, start=2, step=3):
    """Train one LSA model per candidate topic count and score its coherence.

    Args:
        dictionary: gensim ``Dictionary`` used as ``id2word`` for the models.
        documentTermMatrix: Bag-of-words corpus the models are trained on.
        preparedDocument: Tokenized documents, passed as ``texts`` to the
            coherence model.
        maxNrTopics: Exclusive upper bound on the number of topics tried.
        start: First number of topics tried.
        step: Increment between consecutive topic counts.

    Returns:
        A tuple ``(modelList, scores)`` with one trained ``LsiModel`` and one
        ``c_v`` coherence value for every topic count in
        ``range(start, maxNrTopics, step)``.
    """
    modelList = []
    # Named `scores` rather than `coherenceValues` to avoid shadowing this function.
    scores = []
    for nrtopics in range(start, maxNrTopics, step):
        # Use the loop's topic count via the real `num_topics` keyword; the
        # previous `nrtopics=numberTopics` was an invalid keyword and ignored
        # the loop variable.
        model = LsiModel(documentTermMatrix, num_topics=nrtopics, id2word=dictionary)
        modelList.append(model)
        coherencemodel = CoherenceModel(model=model, texts=preparedDocument, dictionary=dictionary, coherence='c_v')
        scores.append(coherencemodel.get_coherence())
    return modelList, scores
def plot_graph(preparedDoc,start, stop, step):
    """Plot coherence against the number of topics.

    Args:
        preparedDoc: Tokenized documents (lists of terms).
        start: First number of topics evaluated.
        stop: Exclusive upper bound on the number of topics evaluated.
        step: Increment between consecutive topic counts.
    """
    dictionary, documentsTermsMatrix = preparingDoc(preparedDoc)
    # Bind the result to `scores`: assigning to a local called
    # `coherenceValues` would make that name local to this function and the
    # call itself would raise UnboundLocalError.
    _, scores = coherenceValues(dictionary, documentsTermsMatrix, preparedDoc,
                                stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, scores)
    plt.xlabel("Nr Topics")
    plt.ylabel("Coerenta")
    # The trailing comma makes this a 1-tuple; a bare string would be
    # iterated character by character and label the line "c".
    plt.legend(("coherence_values",), loc='best')
    plt.show()



# Evaluate topic counts 2..11 (stop is exclusive in range) in steps of 1 and plot their coherence.
start,stop,step=2,12,1
plot_graph(prepreparedText,start,stop,step)

Total Number of Documents: 4798879
