In [1]:
import re
import pandas as pd
import preprocess.pre as pre
df = pd.read_csv('../data/nlp.csv')
df.drop(['PostTypeId', 'ParentId', 'OwnerUserId', 'OwnerDisplayName','LastEditorUserId', 'LastEditorDisplayName', 'AcceptedAnswerId', 'Score',
       'ViewCount', 'AnswerCount', 'CommentCount', 'FavoriteCount'],axis=1,inplace=True)
df['nltk'] = False
df['spacy'] = False
df['gensim'] = False
df['stanford-nlp'] = False
df['scikit-learn'] = False

def isnltk(str):
    a = re.search(r'\bnltk\b', str)
    if a is None:
        return False
    else:
        return True
def isspacy(str):
    a = re.search(r'\bspacy\b', str)
    if a is None:
        return False
    else:
        return True
def isgensim(str):
    a = re.search(r'\bgensim\b', str)
    if a is None:
        return False
    else:
        return True
def isstanford_nlp(str):
    a = re.search(r'\b(stanford-nlp|stanford nlp)\b', str)
    if a is None:
        return False
    else:
        return True
def issklearn(str):
    a = re.search(r'\b(scikit-learn|scikit learn|sklearn)\b', str)
    if a is None:
        return False
    else:
        return True

In [2]:
for index, row in df.iterrows():
    title = str(row['Title'])
    tags = str(row['Tags'])
    body = str(row['Body'])
    title = pre.processbody(title)
    # print(title)
    body = pre.processbody(body)
    tags = pre.preprocesstag(tags)
    df.at[index, 'nltk'] = isnltk(title)|isnltk(body)|isnltk(tags)
    df.at[index, 'spacy'] = isspacy(title)|isspacy(body)|isspacy(tags)
    df.at[index, 'gensim'] = isgensim(title)|isgensim(body)|isgensim(tags)
    df.at[index, 'stanford-nlp'] = isstanford_nlp(title)|isstanford_nlp(body)|isstanford_nlp(tags)
    df.at[index, 'scikit-learn'] = issklearn(title)|issklearn(body)|issklearn(tags)

nltk = df[df['nltk']==True]
spacy = df[df['spacy']==True]
gensim = df[df['gensim']==True]
stanfordnlp = df[df['stanford-nlp']==True]
sklearn = df[df['scikit-learn']==True]

nltk.to_csv('../analysislib/nlp/nltk.csv')
spacy.to_csv('../analysislib/nlp/spacy.csv')
gensim.to_csv('../analysislib/nlp/gensim.csv')
stanfordnlp.to_csv('../analysislib/nlp/stanfordnlp.csv')
sklearn.to_csv('../analysislib/nlp/sklearn.csv')



In [3]:
import pandas as pd
import preprocess.pre as pre
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
# Remove rare and common tokens.
from gensim.corpora import Dictionary
# Train LDA model.
from gensim.models import LdaModel
from pprint import pprint
import numpy as np
import logging
from gensim.models import CoherenceModel
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [4]:
df = pd.read_csv('../analysislib/nlp/gensim.csv')
docs = []
for index, row in df.iterrows():
    body = pre.preprocess(row['Body'])
    docs.append(body)
print(len(docs))
print(docs[0][:50])

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
dictionary.save('../analysislib/nlp/gensim.dict')
# dictionary = Dictionary.load('../data/nlp.dict')

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
np.save('../analysislib/nlp/gensim.npy',np.array(corpus))
# corpus = np.load('../data/nlp_corpus.npy').tolist()
print('Number of unique tokens: %d' % len(dictionary))
# print('Number of documents: %d' % len(corpus))

2020-10-30 20:54:04,937 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-30 20:54:05,167 : INFO : built Dictionary(13927 unique tokens: ['array', 'computer', 'consume', 'convert', 'corpus']...) from 2301 documents (total 355651 corpus positions)
2020-10-30 20:54:05,168 : INFO : saving Dictionary object under ../analysislib/nlp/gensim.dict, separately None
2020-10-30 20:54:05,173 : INFO : saved ../analysislib/nlp/gensim.dict
  np.save('../analysislib/nlp/gensim.npy',np.array(corpus))


2301
p x csr matrix obtained using scikit tfidf vectori
Number of unique tokens: 13927


In [7]:
best_coherence = -100
best_num_topics = 0
coherences = []
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # # Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token
for i in range(5,15):

    num_topics = i
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )

    top_topics = model.top_topics(corpus)  # , num_words=20)
    coherence_model_lda = CoherenceModel(model=model, texts=docs, corpus=corpus, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    coherences.append(coherence_lda)
    if coherence_lda > best_coherence:
        best_num_topics = i
        best_coherence = coherence_lda
        # model.save('../model/nlp_10/nlp_10.model')
    model.print_topics(num_topics=i, num_words=15)

print("best coherence: "+str(best_coherence))
print("best topic nums: "+str(best_num_topics))
print(coherences)

num_topics = best_num_topics
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)
top_topics = model.top_topics(corpus)  # , num_words=20)
coherence_model_lda = CoherenceModel(model=model, texts=docs, corpus=corpus, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
model.save('../model/nlp_gensim/nlp_gensim.model')
model.print_topics(num_topics=best_num_topics, num_words=15)
print(coherence_lda)

2020-10-30 21:11:18,472 : INFO : using autotuned alpha, starting with [0.2, 0.2, 0.2, 0.2, 0.2]
2020-10-30 21:11:18,475 : INFO : using serial LDA version on this node
2020-10-30 21:11:18,485 : INFO : running online (multi-pass) LDA training, 5 topics, 20 passes over the supplied corpus of 2301 documents, updating model once every 2000 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2020-10-30 21:11:18,487 : INFO : PROGRESS: pass 0, at document #2000/2301
2020-10-30 21:11:24,469 : INFO : optimized alpha [0.05905144, 0.040657654, 0.04430619, 0.08075576, 0.08834776]
2020-10-30 21:11:24,473 : INFO : merging changes from 2000 documents into a model of 2301 documents
2020-10-30 21:11:24,479 : INFO : topic #0 (0.059): 0.027*"model" + 0.026*"code" + 0.018*"gt" + 0.017*"gensim" + 0.015*"py" + 0.014*"word" + 0.014*"vec" + 0.013*"python" + 0.013*"pre" + 0.012*"file"
2020-10-30 21:11:24,480 : INFO : topic #1 (0.041): 0.036*"word" + 0.030*

best coherence: 0.5798851261871809
best topic nums: 5
[0.5798851261871809, 0.558132382173376, 0.5394200484926956, 0.5447569280658122, 0.5069833558568301, 0.5480042168118564, 0.538884590999023, 0.5403941362042827, 0.4989240945307145, 0.543255647259518]
0.5433445778114379
