In [2]:
# Splitting text data and storing them in a list (of articles)
import io
docs = io.open("raw_data_1130.txt", mode="r", encoding="utf-8", errors="ignore").read().split('\n') # list of strings 
titles = [docs[i] for i in range(len(docs)) if i % 2 == 0] # list of string titles
contents = [docs[i] for i in range(len(docs)) if i % 2 == 1] # list of string contents
print(contents[0])

In mathematics, a function[1] was originally the idealization of how a varying quantity depends on another quantity. For example, the position of a planet is a function of time. Historically, the concept was elaborated with the infinitesimal calculus at the end of the 17th century, and, until the 19th century, the functions that were considered were differentiable (that is, they had a high degree of regularity). The concept of function was formalized at the end of the 19th century in terms of set theory, and this greatly enlarged the domains of application of the concept. A function is a process or a relation that associates each element x of a set X,  the domain of the function, to a single element y of another set Y (possibly the same set), the codomain of the function. If the function is called f, this relation is denoted y = f (x) (read f of x), the element x is the argument or input of the function, and y is the value of the function, the output, or the image of x by f.[2] The sym

In [3]:
# Preprocessing/ cleaning the data
import re
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

# remove text between parenthesis
# contents = list(map(lambda x: re.sub(r"\(.*\)","",x), contents))

# remove all digits from text
contents = list(map(lambda x: re.sub(r"\d+","",x), contents))

stop = set(stopwords.words('english')) # set of stopwords
lemma = WordNetLemmatizer()
def clean(doc):
    # remove stopwords and words that are too short
    return [lemma.lemmatize(i, 'v') for i in word_tokenize(doc) if i not in stop and len(i) > 2]
cleaned = [clean(page.lower()) for page in contents]

print(len(cleaned))

197


In [4]:
# Building word dicitonary
from gensim import corpora
# create the term dictionary of our corpus; terms are unique; each term is assigned an index
dictionary = corpora.Dictionary(cleaned)
print(dictionary)
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(dictionary)
stoplist = set('also use make people know many call include part find become like mean often different usually take wikt come give well get since type list say change see refer actually iii aisne kinds pas ask would way something need things want every str'.split())
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
print(dictionary)
dictionary.filter_n_most_frequent(50)
print(dictionary)

# This saves the dictionary to the local disk
dictionary.save_as_text('./dictionary.txt')


Dictionary(16197 unique tokens: ['-tuples', '.the', 'abbreviation', 'above.according', 'above.this']...)
Dictionary(4085 unique tokens: ['.the', 'abbreviation', 'abstraction', 'abuse', 'act']...)
Dictionary(4050 unique tokens: ['.the', 'abbreviation', 'abstraction', 'abuse', 'act']...)
Dictionary(4000 unique tokens: ['.the', 'abbreviation', 'abstraction', 'abuse', 'act']...)


In [4]:
# Creating document-term matrix from vocabulary (dictionary)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned]
print(len(doc_term_matrix))
print(doc_term_matrix[693])

694
[(1, 1), (7, 1), (11, 1), (14, 1), (22, 2), (24, 2), (30, 3), (34, 29), (39, 1), (40, 2), (43, 1), (46, 1), (61, 2), (64, 1), (65, 1), (74, 2), (79, 1), (84, 2), (86, 2), (100, 1), (104, 1), (111, 1), (117, 2), (123, 3), (125, 1), (126, 1), (129, 1), (132, 3), (134, 7), (137, 1), (138, 1), (141, 3), (154, 4), (164, 1), (165, 1), (172, 3), (175, 1), (176, 2), (183, 1), (198, 1), (214, 3), (217, 1), (222, 1), (228, 1), (233, 1), (253, 1), (255, 2), (262, 2), (270, 1), (273, 2), (275, 2), (281, 1), (288, 2), (292, 3), (303, 1), (321, 1), (324, 1), (329, 1), (339, 2), (342, 1), (356, 6), (357, 1), (376, 1), (381, 1), (385, 1), (391, 1), (392, 1), (395, 2), (399, 1), (404, 6), (409, 1), (419, 4), (423, 4), (438, 2), (459, 1), (461, 1), (464, 2), (467, 1), (490, 1), (491, 2), (500, 4), (511, 24), (518, 3), (519, 1), (525, 2), (526, 1), (529, 1), (533, 1), (535, 1), (539, 2), (548, 3), (554, 3), (561, 1), (564, 1), (578, 1), (595, 1), (617, 1), (627, 1), (631, 1), (654, 1), (665, 1), (668

In [5]:
# Training LDA model
# LDA automatically finds the mixture of similar words together, thus forming the topic or theme. we use this 
# unsupervised learning technique to identify the categories to which these articles belong, and the groups/clusters
# within the collection. 

from gensim.models.ldamodel import LdaModel as Lda

ldamodel = Lda(doc_term_matrix, num_topics=15, id2word = dictionary)

# Showing the 15 identified topics after the model is trained, where top 10 key terms are listed for each topic
for topic in ldamodel.print_topics(num_topics=15, num_words=10):
    print(topic[0]+1, " ", topic[1],"\n")

1   0.008*"line" + 0.007*"map" + 0.006*"group" + 0.006*"matrices" + 0.005*"coordinate" + 0.004*"plane" + 0.004*"determinant" + 0.004*"projection" + 0.004*"manifold" + 0.004*"vectors" 

2   0.009*"line" + 0.007*"coordinate" + 0.005*"matrices" + 0.005*"group" + 0.005*"image" + 0.004*"ring" + 0.004*"equations" + 0.004*"polynomial" + 0.004*"product" + 0.004*"plane" 

3   0.007*"coordinate" + 0.006*"model" + 0.005*"product" + 0.005*"map" + 0.005*"cordic" + 0.004*"algorithm" + 0.004*"computer" + 0.004*"line" + 0.004*"geometry" + 0.003*"equations" 

4   0.012*"sequence" + 0.008*"group" + 0.007*"matrices" + 0.006*"equations" + 0.005*"row" + 0.005*"column" + 0.005*"solution" + 0.004*"rank" + 0.004*"vectors" + 0.004*"product" 

5   0.006*"polynomial" + 0.006*"equations" + 0.005*"row" + 0.004*"coordinate" + 0.004*"model" + 0.004*"line" + 0.004*"zero" + 0.004*"geometry" + 0.004*"solution" + 0.004*"data" 

6   0.012*"vectors" + 0.011*"product" + 0.010*"row" + 0.009*"matrices" + 0.005*"column" + 0.0

In [8]:
# Clustering documents based on topics extracted from LDA model 
from operator import itemgetter
def cluster(doc_term_matrix, num):
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    result = [[] for i in range(num)]
    for k,topic in enumerate(doc_topics):
        # Some articles do not have a topic
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            result[topic[0][0]].append(k)
    for k in range(len(result)):
        print('Articles(ID) in Cluster ' + str(k+1) + ': ' + ', '.join(map(str, result[k])))
        print()
    return result
cluster_result = cluster(doc_term_matrix, 15)

Articles(ID) in Cluster 1: 8, 67, 70, 86, 90, 150, 151, 152, 157, 160, 169, 187, 200, 208, 315, 319, 345, 369, 378, 384, 396, 412, 421, 422, 429, 440, 454, 460, 466, 505, 520, 522, 545, 566, 571, 578, 606, 620, 647, 653, 662, 692

Articles(ID) in Cluster 2: 37, 49, 60, 93, 110, 115, 117, 142, 158, 161, 180, 185, 211, 214, 286, 316, 328, 332, 359, 393, 439, 442, 443, 478, 515, 521, 535, 552, 564, 639, 649, 672

Articles(ID) in Cluster 3: 19, 35, 54, 56, 65, 71, 89, 113, 114, 130, 155, 156, 179, 204, 210, 212, 222, 231, 233, 284, 296, 297, 299, 304, 307, 308, 321, 324, 334, 347, 370, 394, 395, 414, 434, 457, 484, 496, 503, 519, 526, 527, 568, 572, 603, 618, 641, 646, 657, 675, 676

Articles(ID) in Cluster 4: 24, 82, 84, 96, 102, 119, 127, 134, 143, 172, 194, 213, 217, 247, 252, 265, 268, 273, 281, 311, 329, 338, 349, 350, 353, 360, 374, 379, 382, 400, 405, 415, 430, 447, 449, 508, 562, 565, 574, 638, 655, 664, 674, 679

Articles(ID) in Cluster 5: 6, 16, 53, 73, 78, 87, 109, 120, 126, 133

In [9]:
# Showing the exact document titles in each cluster
for k in range(len(cluster_result)):
    print('Articles in Cluster ' + str(k+1) + ': ' + ', '.join(map(lambda x: titles[x], cluster_result[k])))
    print()

Articles in Cluster 1: Curse of dimensionality, Trace (linear algebra), Linear map, Relative dimension, Kempner series, Overlap–save method, Matrix determinant lemma, Projection (linear algebra), Probability box, Line segment, Identity matrix, Successive parabolic interpolation, Trace identity, Intersection curve, Linear map, Möbius transformation, Linear inequality, Rota's basis conjecture, Intersection (Euclidean geometry), Piecewise linear continuation, 3D projection, Lapped transform, Equipollence (geometry), Linear map, Lie group integrator, K-SVD, Three-dimensional space, Dimension (vector space), Majorization, Trace diagram, Projection (mathematics), Dimension (vector space), Interval contractor, Complex plane, Orthographic projection, Well-posed problem, Nyström method, Unrestricted algorithm, Discrete Fourier transform, Golden–Thompson inequality, Manifold, Ambient space

Articles in Cluster 2: Skew-Hermitian matrix, Weyr canonical form, ND4S, Rank factorization, Image (mathem

In [10]:
term_topics = ldamodel.get_term_topics('convex', minimum_probability=0.000001)
print(term_topics)

[(0, 0.00062025624), (1, 0.0002851983), (2, 0.00018773491), (3, 0.00031237974), (4, 0.00075876398), (5, 0.00055337959), (6, 0.0020672388), (7, 0.00025084193), (8, 0.00046323851), (9, 0.00055092655), (10, 0.00044278661), (11, 0.00022622035), (12, 0.00034563636), (13, 0.00052796141), (14, 0.000323835)]


In [11]:
# Getting related documents based on a term 
def get_related_documents(term, top, doc_term_matrix):
    print('------- Top', top, 'articles related to',term,'-------')
    related_docs = []
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    term_topics = ldamodel.get_term_topics(term, minimum_probability=0.000001)
    term_topics.sort(key = itemgetter(1), reverse=True)
    for k,topic in enumerate(doc_topics):
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            if topic[0][0] == term_topics[0][0]:
                related_docs.append((k,topic[0][1]))
    related_docs.sort(key = itemgetter(1), reverse=True)
    result = []
    for j,doc in enumerate(related_docs):
        print(titles[doc[0]],"\n",doc[1],"\n")   
        result.append(titles[doc[0]])
        if j == top - 1:
            break
related_docs = get_related_documents('convex', 7, doc_term_matrix)


------- Top 7 articles related to convex -------
Homography 
 0.998413 

Flag (linear algebra) 
 0.993473 

Flag (linear algebra) 
 0.993473 

Linear system 
 0.992412 

Definite quadratic form 
 0.991884 

Definite quadratic form 
 0.989251 

Absolutely convex set 
 0.983626 



In [138]:
def get_theme(doc, cluster_result):
    doc_id = titles.index(doc)
    if doc_id == -1:
        print('Document not found.')
        return
    for i, cluster in enumerate(cluster_result):
        if doc_id in cluster:
            return i+1
    return 0
cluster_num = get_theme('Absolutely convex set', cluster_result)
print(cluster_num)

6


In [196]:
# Implementing tf-idf model; the only information needed from the previous part is the doc_term_matrix
from gensim.models import TfidfModel, LsiModel
tfidf_model = TfidfModel(doc_term_matrix, dictionary = dictionary)
print(tfidf_model)
vector = tfidf_model[doc_term_matrix[0]]
print(vector[0])


TfidfModel(num_docs=694, num_nnz=169581)
(0, 0.045342656413852538)


In [150]:
# Implementing LSI model; the only information needed from the previous part is the doc_term_matrix
lsi_model = LsiModel(doc_term_matrix, id2word=dictionary)
print(lsi_model)

LsiModel(num_terms=6533, num_topics=200, decay=1.0, chunksize=20000)


In [171]:
# Creating the similarity matrix from simple bag-of-words model (# of documents * # of documents)
from gensim import similarities

index = similarities.MatrixSimilarity(doc_term_matrix, num_features=len(dictionary))
print(len(index[doc_term_matrix[693]])) # 694 * 694 matrix

694


In [172]:
# Training tf-idf model from bag-of-word dataset
model_tfidf = TfidfModel(doc_term_matrix, id2word=dictionary, normalize=False)

In [187]:
# Applying tf-idf model to all vectors
from gensim.corpora import MmCorpus
MmCorpus.serialize('./corpus_tfidf.mm', model_tfidf[doc_term_matrix], progress_cnt=100)

In [188]:
corpus_tfidf = MmCorpus('./corpus_tfidf.mm') # Loading back the corpus file after applying tf-idf
model_lsi = LsiModel(corpus_tfidf, num_topics=15, id2word=dictionary)
# Applying LSI model to all vectors
index = similarities.MatrixSimilarity(model_lsi[corpus_tfidf], num_features=len(dictionary))
print(index)
index.save('./lsi_index.mm') # Saving the similarity matrix to a local matrix market file named './lsi_model.mm'

MatrixSimilarity<694 docs, 6533 features>


In [195]:
# Loading the similarity matrix back from the local file
similarity_matrix = similarities.MatrixSimilarity.load('./lsi_index.mm')
print(len(similarity_matrix))

694
