In [31]:
# Splitting text data and storing them in a list (of articles)
import io
docs = io.open("raw_data.txt", mode="r", encoding="utf-8", errors="ignore").read().split('\n') # list of strings 
titles_raw = [docs[i] for i in range(len(docs)) if i % 2 == 0] # list of string titles
contents_raw = [docs[i] for i in range(len(docs)) if i % 2 == 1] # list of string contents
titles = []
contents = []
for i in range(len(titles_raw)):
    if contents_raw[i] != '':
        titles.append(titles_raw[i])
        contents.append(contents_raw[i])
titles = list(set(titles))
contents = list(set(contents))

print(len(titles))
print(len(contents))


611
611


In [32]:
# Preprocessing/ cleaning the data
import re
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

# remove text between parenthesis
# contents = list(map(lambda x: re.sub(r"\(.*\)","",x), contents))

# remove all digits from text
contents = list(map(lambda x: re.sub(r"\d+","",x), contents))

stop = set(stopwords.words('english')) # set of stopwords
lemma = WordNetLemmatizer()
def clean(doc):
    # remove stopwords and words that are too short
    return [lemma.lemmatize(i, 'v') for i in word_tokenize(doc) if i not in stop and len(i) > 2]
cleaned = [clean(page.lower()) for page in contents]

print(len(cleaned))

611


In [33]:
import numpy as np
# Building word dicitonary
from gensim import corpora
# create the term dictionary of our corpus; terms are unique; each term is assigned an index
dictionary = corpora.Dictionary(cleaned)
print(dictionary)
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(dictionary)
#filtering for words that are semantically related within the dictionary 
stoplist = set('also use make people know many call include part find become like mean often different usually take wikt come give well get since type list say change see refer actually iii aisne kinds pas ask would way something need things want every str'.split())
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
print(dictionary)
dictionary.filter_n_most_frequent(50)
print(dictionary)

# This saves the dictionary to the local disk
dictionary.save_as_text('./dictionary.txt')


Dictionary(25740 unique tokens: ['.another', 'abstraction', 'act', 'adapt', 'algebraic']...)
Dictionary(5934 unique tokens: ['.another', 'abstraction', 'act', 'adapt', 'algebraic']...)
Dictionary(5896 unique tokens: ['.another', 'abstraction', 'act', 'adapt', 'algebraic']...)
Dictionary(5846 unique tokens: ['.another', 'abstraction', 'act', 'adapt', 'algebraic']...)


In [34]:
# Creating document-term matrix from vocabulary (dictionary)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned]
print(len(doc_term_matrix))
print(len(doc_term_matrix[1]))
print(doc_term_matrix[11])

611
738
[(4, 3), (9, 1), (10, 1), (14, 1), (15, 5), (20, 1), (38, 1), (39, 1), (50, 1), (60, 1), (63, 1), (69, 1), (75, 1), (85, 1), (93, 1), (98, 4), (103, 1), (109, 1), (174, 1), (196, 1), (202, 2), (217, 4), (226, 22), (252, 8), (253, 1), (290, 9), (320, 1), (331, 1), (348, 4), (349, 1), (426, 5), (427, 1), (471, 1), (483, 1), (517, 1), (527, 1), (528, 1), (533, 1), (537, 2), (557, 2), (562, 1), (564, 7), (592, 1), (613, 1), (625, 2), (627, 1), (641, 1), (655, 1), (679, 1), (681, 1), (689, 1), (699, 1), (703, 1), (745, 1), (750, 1), (751, 1), (757, 1), (769, 1), (788, 1), (813, 1), (832, 6), (890, 2), (900, 1), (940, 1), (1001, 1), (1010, 1), (1024, 3), (1031, 1), (1059, 1), (1096, 1), (1106, 1), (1119, 1), (1141, 3), (1150, 1), (1188, 1), (1202, 1), (1271, 2), (1276, 1), (1292, 1), (1328, 6), (1347, 3), (1386, 1), (1405, 2), (1430, 2), (1498, 1), (1532, 1), (1561, 1), (1572, 1), (1577, 1), (1599, 1), (1612, 2), (1625, 2), (1770, 1), (2091, 1), (2092, 1), (2097, 1), (2103, 3), (2178

In [35]:
# import numpy as np
# M = np.array([[0 for i in range(len(dictionary))] for i in range(len(dictionary))])
# for i in range (len(doc_term_matrix)):
#     for j in range (len(doc_term_matrix[i])):
#         for k in range(j+1, len(doc_term_matrix[i])):
#             M[doc_term_matrix[i][j][0]][doc_term_matrix[i][k][0]] +=1
#             M[doc_term_matrix[i][k][0]][doc_term_matrix[i][j][0]] +=1
#         freqMax = max(M[j])
#         M[j] = M[j] / freqMax
        
# #         M[j] = [x / freqMax for x in M[j]]

# for j in range(len(M)):
#     minM = min(M[j])
#     maxM = max(M[j])
#     R = minM/maxM
#     rel = R/(1+R)
#     for k in range(len(M[j])):
#         M[j][k] = rel
# M


In [38]:
# Training LDA model
# LDA automatically finds the mixture of similar words together, thus forming the topic or theme. we use this 
# unsupervised learning technique to identify the categories to which these articles belong, and the groups/clusters
# within the collection. 

from gensim.models.ldamodel import LdaModel as Lda

ldamodel = Lda(doc_term_matrix, num_topics=15, id2word = dictionary)

# Showing the 15 identified topics after the model is trained, where top 10 key terms are listed for each topic
for topic in ldamodel.print_topics(num_topics=15, num_words=10):
    print(topic[0]+1, " ", topic[1],"\n")

1   0.007*"matrices" + 0.005*"vectors" + 0.005*"coordinate" + 0.005*"group" + 0.005*"map" + 0.005*"transformation" + 0.004*"geometry" + 0.004*"euclidean" + 0.004*"manifold" + 0.004*"eigenvalues" 

2   0.007*"matrices" + 0.007*"row" + 0.007*"group" + 0.006*"line" + 0.006*"rank" + 0.006*"map" + 0.004*"vectors" + 0.004*"methods" + 0.004*"polynomial" + 0.004*"elements" 

3   0.009*"ring" + 0.008*"coordinate" + 0.005*"vectors" + 0.005*"group" + 0.004*"map" + 0.004*"multiplication" + 0.003*"matrices" + 0.003*"elements" + 0.003*"addition" + 0.003*"algorithm" 

4   0.011*"polynomial" + 0.006*"vectors" + 0.005*"sum" + 0.004*"model" + 0.004*"ring" + 0.004*"coefficients" + 0.004*"subspace" + 0.003*"polynomials" + 0.003*"matrices" + 0.003*"map" 

5   0.011*"group" + 0.009*"row" + 0.006*"matrices" + 0.005*"zero" + 0.005*"finite" + 0.004*"solution" + 0.004*"quadratic" + 0.004*"numerical" + 0.004*"ring" + 0.004*"algorithm" 

6   0.008*"polynomial" + 0.005*"image" + 0.004*"matrices" + 0.004*"ring" + 0

In [41]:
# Clustering documents based on topics extracted from LDA model 
from operator import itemgetter
def cluster(doc_term_matrix, num):
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    result = [[] for i in range(num)]
    for k,topic in enumerate(doc_topics):
        # Some articles do not have a topic
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            result[topic[0][0]].append(k)
    for k in range(len(result)):
        print('Articles(ID) in Cluster ' + str(k+1) + ': ' + ', '.join(map(str, result[k])))
        print()
    return result
cluster_result = cluster(doc_term_matrix, 15)

Articles(ID) in Cluster 1: 17, 21, 40, 41, 45, 74, 82, 94, 113, 116, 119, 172, 196, 197, 229, 259, 266, 296, 301, 309, 328, 394, 410, 435, 458, 468, 476, 492, 493, 503, 528, 545, 556, 563, 589

Articles(ID) in Cluster 2: 0, 7, 8, 13, 30, 42, 53, 73, 85, 98, 102, 106, 108, 114, 124, 147, 164, 173, 187, 189, 190, 201, 202, 207, 210, 212, 214, 215, 220, 224, 228, 243, 244, 247, 283, 293, 307, 316, 317, 318, 341, 346, 364, 367, 377, 388, 389, 411, 422, 426, 427, 438, 455, 477, 481, 530, 531, 537, 567, 580, 593, 596, 598, 601

Articles(ID) in Cluster 3: 64, 103, 105, 111, 205, 264, 276, 354, 379, 385, 396, 398, 478, 489, 508, 517, 544, 554, 587, 595, 599

Articles(ID) in Cluster 4: 14, 20, 24, 37, 48, 79, 96, 176, 181, 184, 231, 235, 277, 295, 314, 343, 344, 352, 371, 393, 424, 500, 524, 540, 541, 572, 604

Articles(ID) in Cluster 5: 5, 11, 38, 51, 57, 60, 61, 62, 70, 77, 97, 110, 127, 128, 137, 141, 153, 158, 159, 194, 213, 216, 218, 219, 246, 249, 253, 258, 270, 273, 291, 319, 324, 332, 3

In [42]:
# Showing the exact document titles in each cluster
for k in range(len(cluster_result)):
    print('Articles in Cluster ' + str(k+1) + ': ' + ', '.join(map(lambda x: titles[x], cluster_result[k])))
    print()

Articles in Cluster 1: Tensor operator, Set (mathematics), Triangle inequality, Null vector, Defective matrix, Approximation theory, Series acceleration, Remez algorithm, Frame (linear algebra), Joint spectral radius, Matrix norm, Row equivalence, Sesquilinear form, Graded (mathematics), Complex conjugate vector space, MATLAB, K-SVD, Orthogonal Procrustes problem, Entanglement-assisted stabilizer formalism, Row and column spaces, Rayleigh quotient, Quotient space (linear algebra), De Casteljau's algorithm, Explicit algebraic stress model, Self-adjoint, Sparse grid, Uzawa iteration, Predictor–corrector method, Wolfram Language, Linear equation over a ring, Range (mathematics), Matrix multiplication, Generalized singular value decomposition, Hilbert–Poincaré series, Dynamic relaxation

Articles in Cluster 2: Resolvent set, Piecewise linear continuation, Stiffness matrix, Portal:Linear algebra, S-procedure, Rod calculus, Engineering, Whitney inequality, Graphics processing unit, Clenshaw 

In [56]:
term_topics = ldamodel.get_term_topics('convex', minimum_probability=0.000001)
print(term_topics)

[(0, 0.0018019278), (1, 0.0003391568), (2, 0.0002601787), (3, 0.00022626804), (4, 0.00021372795), (5, 0.00057895511), (6, 0.00030629951), (7, 0.00067538727), (8, 0.00044926157), (9, 0.00035495567), (10, 0.00099961704), (11, 0.00024742377), (12, 0.00029577976), (13, 0.00029819994), (14, 0.00031532196)]


In [57]:
# Getting related documents based on a term 
def get_related_documents(term, top, doc_term_matrix):
    print('------- Top', top, 'articles related to',term,'-------')
    related_docs = []
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    term_topics = ldamodel.get_term_topics(term, minimum_probability=0.000001)
    term_topics.sort(key = itemgetter(1), reverse=True)
    for k,topic in enumerate(doc_topics):
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            if topic[0][0] == term_topics[0][0]:
                related_docs.append((k,topic[0][1]))
    related_docs.sort(key = itemgetter(1), reverse=True)
    result = []
    for j,doc in enumerate(related_docs):
        print(titles[doc[0]],"\n",doc[1],"\n")   
        result.append(titles[doc[0]])
        if j == top - 1:
            break
related_docs = get_related_documents('convex', 7, doc_term_matrix)


------- Top 7 articles related to convex -------
Entanglement-assisted stabilizer formalism 
 0.996538 

Row and column spaces 
 0.996451 

Matrix multiplication 
 0.995357 

Remez algorithm 
 0.995189 

Defective matrix 
 0.994574 

Complex conjugate vector space 
 0.992651 

Frame (linear algebra) 
 0.992157 



In [45]:
def get_theme(doc, cluster_result):
    doc_id = titles.index(doc)
    if doc_id == -1:
        print('Document not found.')
        return
    for i, cluster in enumerate(cluster_result):
        if doc_id in cluster:
            return i+1
    return 0
cluster_num = get_theme('Absolutely convex set', cluster_result)
print(cluster_num)

13


In [46]:
# Implementing tf-idf model; the only information needed from the previous part is the doc_term_matrix
from gensim.models import TfidfModel, LsiModel
tfidf_model = TfidfModel(doc_term_matrix, dictionary = dictionary)
print(tfidf_model)
vector = tfidf_model[doc_term_matrix[0]]
print(vector[0])


TfidfModel(num_docs=611, num_nnz=143903)
(0, 0.056852536755099284)


In [47]:
# Implementing LSI model; the only information needed from the previous part is the doc_term_matrix
lsi_model = LsiModel(doc_term_matrix, id2word=dictionary)
print(lsi_model)

LsiModel(num_terms=5846, num_topics=200, decay=1.0, chunksize=20000)


In [48]:
# Creating the similarity matrix from simple bag-of-words model (# of documents * # of documents)
from gensim import similarities

index = similarities.MatrixSimilarity(doc_term_matrix, num_features=len(dictionary))
print(len(index[doc_term_matrix[610]])) # 611 * 611 matrix

611


In [49]:
# Training tf-idf model from bag-of-word dataset
model_tfidf = TfidfModel(doc_term_matrix, id2word=dictionary, normalize=False)

In [50]:
# Applying tf-idf model to all vectors
from gensim.corpora import MmCorpus
MmCorpus.serialize('./corpus_tfidf.mm', model_tfidf[doc_term_matrix], progress_cnt=100)

In [51]:
corpus_tfidf = MmCorpus('./corpus_tfidf.mm') # Loading back the corpus file after applying tf-idf
model_lsi = LsiModel(corpus_tfidf, num_topics=15, id2word=dictionary)
# Applying LSI model to all vectors
index = similarities.MatrixSimilarity(model_lsi[corpus_tfidf], num_features=len(dictionary))
print(index)
index.save('./lsi_index.mm') # Saving the similarity matrix to a local matrix market file named './lsi_model.mm'

MatrixSimilarity<611 docs, 5846 features>


In [52]:
# Loading the similarity matrix back from the local file
similarity_matrix = similarities.MatrixSimilarity.load('./lsi_index.mm')
print(len(similarity_matrix))

611
