In [15]:
# Splitting text data and storing them in a list (of articles)
import io
docs = io.open("raw_data.txt", mode="r", encoding="utf-8", errors="ignore").read().split('\n') # list of strings 
titles_raw = [docs[i] for i in range(len(docs)) if i % 2 == 0] # list of string titles
contents_raw = [docs[i] for i in range(len(docs)) if i % 2 == 1] # list of string contents
titles = []
contents = []
for i in range(len(titles_raw)):
    if contents_raw[i] != '':
        titles.append(titles_raw[i])
        contents.append(contents_raw[i])
titles = list(set(titles))
contents = list(set(contents))

print(titles)
print(len(contents))

['Invariant subspace', 'Gershgorin circle theorem', "Wilkinson's polynomial", 'Computer vision', 'Angles between flats', 'Dieudonné determinant', 'Coordinate space', 'James H. Wilkinson', 'Fredholm alternative', 'Matrix congruence', 'Pseudovector', 'Three-dimensional rotation operator', 'Function (mathematics)', 'Order of accuracy', 'Discrete Fourier transform', 'Generalized singular value decomposition', 'Adaptive stepsize', 'Propagation of uncertainty', 'Rank factorization', 'Image (mathematics)', 'Faddeev–LeVerrier algorithm', 'Range (mathematics)', 'James Joseph Sylvester', 'Diagonal matrix', 'Characteristic polynomial', 'Sinc numerical methods', 'Finite field', 'Rule of Sarrus', 'Projection-valued measure', 'Rigid body dynamics', 'Mathematics', 'Multilinear algebra', 'Haynsworth inertia additivity formula', 'Local convergence', 'Boundary particle method', 'Conjugate transpose', 'Nonlinear eigenproblem', 'Inner product space', 'Matrix analysis', 'Charles Sanders Peirce', "Estrin's 

In [16]:
# Preprocessing/ cleaning the data
import re
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

# remove text between parenthesis
# contents = list(map(lambda x: re.sub(r"\(.*\)","",x), contents))

# remove all digits from text
contents = list(map(lambda x: re.sub(r"\d+","",x), contents))

stop = set(stopwords.words('english')) # set of stopwords
lemma = WordNetLemmatizer()
def clean(doc):
    # remove stopwords and words that are too short
    return [lemma.lemmatize(i, 'v') for i in word_tokenize(doc) if i not in stop and len(i) > 2]
cleaned = [clean(page.lower()) for page in contents]

print(len(cleaned))

611


In [17]:
import numpy as np
# Building word dicitonary
from gensim import corpora
# create the term dictionary of our corpus; terms are unique; each term is assigned an index
dictionary = corpora.Dictionary(cleaned)
print(dictionary)
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(dictionary)
#filtering for words that are semantically related within the dictionary 
stoplist = set('also use make people know many call include part find become like mean often different usually take wikt come give well get since type list say change see refer actually iii aisne kinds pas ask would way something need things want every str'.split())
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
print(dictionary)
dictionary.filter_n_most_frequent(50)
print(dictionary)

# This saves the dictionary to the local disk
dictionary.save_as_text('./dictionary.txt')
print(dictionary.token2id)

Dictionary(25740 unique tokens: ['common', 'companion', 'first', 'linearization', "'big"]...)
Dictionary(5934 unique tokens: ['common', 'companion', 'first', 'linearization', '...']...)
Dictionary(5896 unique tokens: ['common', 'companion', 'first', 'linearization', '...']...)
Dictionary(5846 unique tokens: ['common', 'companion', 'linearization', '...', '.the']...)
{'common': 0, 'companion': 1, 'linearization': 2, '...': 3, '.the': 4, 'ability': 5, 'abstraction': 6, 'abstractions': 7, 'accelerate': 8, 'access': 9, 'achieve': 10, 'acquire': 11, 'across': 12, 'add': 13, 'addition': 14, 'additional': 15, 'address': 16, 'adopt': 17, 'adoption': 18, 'advance': 19, 'advantage': 20, 'affect': 21, 'agree': 22, 'algorithms': 23, 'allow': 24, 'almost': 25, 'alternative': 26, 'alternatively': 27, 'alternatives': 28, 'although': 29, 'ambiguity': 30, 'amd': 31, 'amenable': 32, 'amount': 33, 'another': 34, 'application': 35, 'applications': 36, 'architectural': 37, 'architecture': 38, 'architecture

In [18]:
# Creating document-term matrix from vocabulary (dictionary)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned]
print(len(doc_term_matrix))
print(len(doc_term_matrix[1]))
print(doc_term_matrix[11])

611
533
[(0, 3), (3, 4), (4, 1), (6, 1), (12, 2), (13, 3), (17, 1), (20, 1), (24, 2), (29, 3), (30, 2), (34, 3), (35, 2), (36, 2), (44, 1), (56, 1), (59, 2), (61, 1), (66, 1), (68, 1), (73, 8), (78, 1), (84, 2), (87, 3), (94, 1), (96, 1), (98, 5), (106, 2), (110, 2), (127, 1), (128, 1), (135, 2), (136, 3), (137, 1), (140, 2), (143, 2), (148, 1), (153, 13), (155, 1), (161, 2), (164, 1), (170, 2), (171, 1), (175, 2), (178, 5), (182, 3), (184, 2), (197, 2), (203, 1), (208, 4), (211, 1), (213, 1), (220, 1), (221, 1), (223, 3), (226, 2), (229, 6), (234, 1), (242, 1), (245, 1), (270, 2), (304, 1), (326, 1), (328, 10), (330, 3), (331, 2), (333, 1), (336, 1), (339, 3), (341, 1), (347, 1), (354, 2), (359, 1), (371, 4), (372, 22), (379, 4), (382, 1), (384, 1), (385, 1), (391, 1), (399, 6), (408, 1), (411, 1), (419, 3), (431, 1), (435, 2), (441, 3), (443, 1), (452, 2), (455, 3), (457, 1), (465, 2), (466, 1), (469, 2), (472, 3), (474, 1), (479, 1), (484, 1), (486, 5), (503, 5), (508, 1), (509, 5),

In [19]:
# import numpy as np
# M = np.array([[0 for i in range(len(dictionary))] for i in range(len(dictionary))])
# for i in range (len(doc_term_matrix)):
#     for j in range (len(doc_term_matrix[i])):
#         for k in range(j+1, len(doc_term_matrix[i])):
#             M[doc_term_matrix[i][j][0]][doc_term_matrix[i][k][0]] +=1
#             M[doc_term_matrix[i][k][0]][doc_term_matrix[i][j][0]] +=1
#         freqMax = max(M[j])
#         M[j] = M[j] / freqMax
        
# #         M[j] = [x / freqMax for x in M[j]]

# for j in range(len(M)):
#     minM = min(M[j])
#     maxM = max(M[j])
#     R = minM/maxM
#     rel = R/(1+R)
#     for k in range(len(M[j])):
#         M[j][k] = rel
# M


In [20]:
# Training LDA model
# LDA automatically finds the mixture of similar words together, thus forming the topic or theme. we use this 
# unsupervised learning technique to identify the categories to which these articles belong, and the groups/clusters
# within the collection. 

from gensim.models.ldamodel import LdaModel as Lda

ldamodel = Lda(doc_term_matrix, num_topics=15, id2word = dictionary)

# Showing the 15 identified topics after the model is trained, where top 10 key terms are listed for each topic
for topic in ldamodel.print_topics(num_topics=15, num_words=10):
    print(topic[0]+1, " ", topic[1],"\n")

1   0.008*"model" + 0.007*"vectors" + 0.005*"coordinate" + 0.005*"map" + 0.004*"quantum" + 0.004*"study" + 0.004*"line" + 0.004*"group" + 0.004*"geometry" + 0.004*"describe" 

2   0.006*"map" + 0.005*"work" + 0.004*"line" + 0.004*"model" + 0.004*"geometry" + 0.004*"methods" + 0.004*"mathematical" + 0.004*"coordinate" + 0.003*"problems" + 0.003*"manifold" 

3   0.007*"ring" + 0.006*"polynomial" + 0.006*"matrices" + 0.005*"group" + 0.005*"vectors" + 0.004*"row" + 0.004*"element" + 0.004*"rotation" + 0.004*"map" + 0.003*"operator" 

4   0.007*"line" + 0.006*"ring" + 0.006*"vectors" + 0.005*"plane" + 0.004*"polynomial" + 0.004*"finite" + 0.004*"group" + 0.004*"scalar" + 0.004*"matrices" + 0.004*"map" 

5   0.016*"coordinate" + 0.006*"polynomial" + 0.005*"vectors" + 0.005*"map" + 0.005*"line" + 0.004*"matrices" + 0.004*"zero" + 0.004*"plane" + 0.003*"systems" + 0.003*"methods" 

6   0.007*"group" + 0.006*"elements" + 0.005*"line" + 0.005*"vectors" + 0.004*"ring" + 0.004*"finite" + 0.004*"se

In [21]:
# Clustering documents based on topics extracted from LDA model 
from operator import itemgetter
def cluster(doc_term_matrix, num):
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    result = [[] for i in range(num)]
    for k,topic in enumerate(doc_topics):
        # Some articles do not have a topic
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            result[topic[0][0]].append(k)
    for k in range(len(result)):
        print('Articles(ID) in Cluster ' + str(k+1) + ': ' + ', '.join(map(str, result[k])))
        print()
    return result
cluster_result = cluster(doc_term_matrix, 15)
print(cluster_result[0])

Articles(ID) in Cluster 1: 12, 20, 46, 81, 88, 92, 107, 112, 121, 124, 147, 163, 168, 174, 189, 194, 199, 212, 243, 264, 266, 272, 274, 283, 302, 309, 311, 341, 354, 366, 367, 371, 377, 382, 410, 466, 468, 470, 474, 484, 525, 533, 551, 592, 607

Articles(ID) in Cluster 2: 14, 24, 33, 62, 80, 96, 98, 126, 156, 170, 176, 186, 187, 227, 231, 246, 251, 301, 314, 321, 351, 379, 386, 403, 458, 476, 486, 499, 530, 537, 571

Articles(ID) in Cluster 3: 4, 9, 23, 34, 36, 43, 55, 69, 102, 114, 129, 135, 141, 145, 150, 165, 173, 180, 181, 198, 202, 206, 218, 223, 247, 254, 263, 265, 267, 275, 290, 292, 315, 319, 329, 339, 345, 348, 353, 355, 357, 361, 368, 378, 392, 394, 406, 423, 426, 429, 432, 434, 444, 445, 448, 452, 456, 459, 464, 465, 481, 502, 509, 512, 516, 517, 529, 534, 549, 558, 567, 572, 575, 582, 583, 600, 606

Articles(ID) in Cluster 4: 7, 18, 30, 48, 49, 63, 95, 101, 115, 125, 138, 142, 144, 154, 159, 164, 166, 179, 191, 197, 201, 203, 224, 226, 240, 255, 257, 258, 271, 277, 280, 288

In [22]:
# Showing the exact document titles in each cluster
for k in range(len(cluster_result)):
    print('Articles in Cluster ' + str(k+1) + ': ' + ', '.join(map(lambda x: titles[x], cluster_result[k])))
    print()

Articles in Cluster 1: Function (mathematics), Faddeev–LeVerrier algorithm, Centrosymmetric matrix, Quantum mechanics, Convex cone, Unrestricted algorithm, Spherical basis, Trigonometric tables, Abelian group, Affine space, Mathematical analysis, Peano kernel theorem, Möbius transformation, Levinson recursion, History of Lorentz transformations, Permanent (mathematics), Manifold, Isomorphism, Kahan summation algorithm, Invertible matrix, Polynomial ring, The Nine Chapters on the Mathematical Art, Computational complexity, Van Wijngaarden transformation, Rota's basis conjecture, Hundred-dollar, Hundred-digit Challenge problems, Hermann Grassmann, General linear group, Coopmans approximation, Semilinear map, Generalizations of Pauli matrices, Linear subspace, Intersection (Euclidean geometry), Radial basis function, List of vector spaces in mathematics, Processor (computing), Split-complex number, Dual number, Tensor operator, Weyl's inequality, Integer points in convex polyhedra, Gaussi

In [23]:
term_topics = ldamodel.get_term_topics('convex', minimum_probability=0.000001)
print(term_topics)

[(0, 0.00048728645), (1, 0.0001631483), (2, 0.0002070987), (3, 0.00079177413), (4, 0.0008591001), (5, 0.0006738753), (6, 0.00036574306), (7, 0.00026577167), (8, 0.00031582304), (9, 0.0004258684), (10, 0.0005861479), (11, 0.00058429944), (12, 0.00021070779), (13, 0.00023094674), (14, 0.00095615233)]


In [24]:
# Getting related documents based on a term 
def get_related_documents(term, top, doc_term_matrix):
    print('------- Top', top, 'articles related to',term,'-------')
    related_docs = []
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    term_topics = ldamodel.get_term_topics(term, minimum_probability=0.000001)
    term_topics.sort(key = itemgetter(1), reverse=True)
    for k,topic in enumerate(doc_topics):
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            if topic[0][0] == term_topics[0][0]:
                related_docs.append((k,topic[0][1]))
    related_docs.sort(key = itemgetter(1), reverse=True)
    result = []
    for j,doc in enumerate(related_docs):
        print(titles[doc[0]],"\n",doc[1],"\n")   
        result.append(titles[doc[0]])
        if j == top - 1:
            break
related_docs = get_related_documents('convex', 7, doc_term_matrix)


------- Top 7 articles related to convex -------
Dot product 
 0.99910426 

Category of modules 
 0.9987719 

Minimum polynomial extrapolation 
 0.9984649 

Matrix norm 
 0.9968992 

Runge–Kutta methods 
 0.9967592 

Quaternion 
 0.9966546 

Complex conjugate vector space 
 0.9959064 



In [45]:
def get_theme(doc, cluster_result):
    doc_id = titles.index(doc)
    if doc_id == -1:
        print('Document not found.')
        return
    for i, cluster in enumerate(cluster_result):
        if doc_id in cluster:
            return i+1
    return 0
cluster_num = get_theme('Absolutely convex set', cluster_result)
print(cluster_num)

13


In [46]:
# Implementing tf-idf model; the only information needed from the previous part is the doc_term_matrix
from gensim.models import TfidfModel, LsiModel
tfidf_model = TfidfModel(doc_term_matrix, dictionary = dictionary)
print(tfidf_model)
vector = tfidf_model[doc_term_matrix[0]]
print(vector[0])


TfidfModel(num_docs=611, num_nnz=143903)
(0, 0.056852536755099284)


In [47]:
# Implementing LSI model; the only information needed from the previous part is the doc_term_matrix
lsi_model = LsiModel(doc_term_matrix, id2word=dictionary)
print(lsi_model)

LsiModel(num_terms=5846, num_topics=200, decay=1.0, chunksize=20000)


In [48]:
# Creating the similarity matrix from simple bag-of-words model (# of documents * # of documents)
from gensim import similarities

index = similarities.MatrixSimilarity(doc_term_matrix, num_features=len(dictionary))
print(len(index[doc_term_matrix[610]])) # 611 * 611 matrix

611


In [49]:
# Training tf-idf model from bag-of-word dataset
model_tfidf = TfidfModel(doc_term_matrix, id2word=dictionary, normalize=False)

In [50]:
# Applying tf-idf model to all vectors
from gensim.corpora import MmCorpus
MmCorpus.serialize('./corpus_tfidf.mm', model_tfidf[doc_term_matrix], progress_cnt=100)

In [51]:
corpus_tfidf = MmCorpus('./corpus_tfidf.mm') # Loading back the corpus file after applying tf-idf
model_lsi = LsiModel(corpus_tfidf, num_topics=15, id2word=dictionary)
# Applying LSI model to all vectors
index = similarities.MatrixSimilarity(model_lsi[corpus_tfidf], num_features=len(dictionary))
print(index)
index.save('./lsi_index.mm') # Saving the similarity matrix to a local matrix market file named './lsi_model.mm'

MatrixSimilarity<611 docs, 5846 features>


In [52]:
# Loading the similarity matrix back from the local file
similarity_matrix = similarities.MatrixSimilarity.load('./lsi_index.mm')
print(len(similarity_matrix))

611
