In [31]:
# Splitting text data and storing them in a list (of articles)
import io
docs = io.open("raw_data.txt", mode="r", encoding="utf-8", errors="ignore").read().split('\n') # list of strings 
titles = [docs[i] for i in range(len(docs)) if i % 2 == 0] # list of string titles
contents = [docs[i] for i in range(len(docs)) if i % 2 == 1] # list of string contents
print(len(titles))
print(len(contents))
print('\n')
print(len(list(set(titles))))
print(len(list(set(contents))))

# import collections
# print(len([item for item, count in collections.Counter(contents).items() if count > 1]))

694
694


616
612


In [6]:
# Preprocessing/ cleaning the data
import re
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

# remove text between parenthesis
# contents = list(map(lambda x: re.sub(r"\(.*\)","",x), contents))

# remove all digits from text
contents = list(map(lambda x: re.sub(r"\d+","",x), contents))

stop = set(stopwords.words('english')) # set of stopwords
lemma = WordNetLemmatizer()
def clean(doc):
    # remove stopwords and words that are too short
    return [lemma.lemmatize(i, 'v') for i in word_tokenize(doc) if i not in stop and len(i) > 2]
cleaned = [clean(page.lower()) for page in contents]

print(len(cleaned))

197


In [7]:
# Building word dicitonary
from gensim import corpora
# create the term dictionary of our corpus; terms are unique; each term is assigned an index
dictionary = corpora.Dictionary(cleaned)
print(dictionary)
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(dictionary)
stoplist = set('also use make people know many call include part find become like mean often different usually take wikt come give well get since type list say change see refer actually iii aisne kinds pas ask would way something need things want every str'.split())
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
print(dictionary)
dictionary.filter_n_most_frequent(50)
print(dictionary)

# This saves the dictionary to the local disk
dictionary.save_as_text('./dictionary.txt')


Dictionary(16197 unique tokens: ['-tuples', '.the', 'abbreviation', 'above.according', 'above.this']...)
Dictionary(4085 unique tokens: ['.the', 'abbreviation', 'abstraction', 'abuse', 'act']...)
Dictionary(4050 unique tokens: ['.the', 'abbreviation', 'abstraction', 'abuse', 'act']...)
Dictionary(4000 unique tokens: ['.the', 'abbreviation', 'abstraction', 'abuse', 'act']...)


In [9]:
# Creating document-term matrix from vocabulary (dictionary)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned]
print(len(doc_term_matrix))
print(doc_term_matrix[0])

197
[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 2), (10, 2), (11, 2), (12, 1), (13, 5), (14, 1), (15, 5), (16, 3), (17, 2), (18, 2), (19, 1), (20, 1), (21, 2), (22, 4), (23, 2), (24, 1), (25, 4), (26, 1), (27, 1), (28, 5), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 5), (37, 1), (38, 2), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 5), (46, 2), (47, 1), (48, 1), (49, 1), (50, 1), (51, 6), (52, 1), (53, 1), (54, 1), (55, 20), (56, 1), (57, 1), (58, 1), (59, 1), (60, 3), (61, 5), (62, 1), (63, 1), (64, 2), (65, 2), (66, 1), (67, 1), (68, 1), (69, 1), (70, 4), (71, 2), (72, 7), (73, 1), (74, 3), (75, 2), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 2), (88, 1), (89, 1), (90, 1), (91, 2), (92, 1), (93, 1), (94, 1), (95, 1), (96, 2), (97, 5), (98, 3), (99, 1), (100, 1), (101, 1), (102, 1), (103, 38), (104, 5), (105, 3), (106, 1), (107, 1), (108, 1), (109, 1), (11

In [10]:
# Training LDA model
# LDA automatically finds the mixture of similar words together, thus forming the topic or theme. we use this 
# unsupervised learning technique to identify the categories to which these articles belong, and the groups/clusters
# within the collection. 

from gensim.models.ldamodel import LdaModel as Lda

ldamodel = Lda(doc_term_matrix, num_topics=15, id2word = dictionary)

# Showing the 15 identified topics after the model is trained, where top 10 key terms are listed for each topic
for topic in ldamodel.print_topics(num_topics=15, num_words=10):
    print(topic[0]+1, " ", topic[1],"\n")

1   0.015*"group" + 0.012*"line" + 0.009*"geometry" + 0.008*"coordinate" + 0.008*"manifold" + 0.006*"euclidean" + 0.005*"determinant" + 0.005*"tensor" + 0.005*"plane" + 0.005*"equation" 

2   0.007*"ring" + 0.006*"affine" + 0.006*"system" + 0.006*"cross" + 0.005*"group" + 0.005*"coordinate" + 0.005*"time" + 0.005*"map" + 0.004*"row" + 0.004*"solution" 

3   0.007*"group" + 0.006*"inequality" + 0.006*"sequence" + 0.006*"ring" + 0.006*"method" + 0.005*"finite" + 0.005*"polynomial" + 0.004*"determinant" + 0.004*"methods" + 0.004*"element" 

4   0.015*"group" + 0.008*"map" + 0.006*"system" + 0.006*"equations" + 0.006*"ring" + 0.006*"kernel" + 0.005*"hilbert" + 0.005*"element" + 0.005*"row" + 0.005*"equation" 

5   0.016*"ring" + 0.006*"map" + 0.006*"system" + 0.006*"dual" + 0.005*"equations" + 0.005*"hilbert" + 0.004*"element" + 0.004*"transform" + 0.004*"continuous" + 0.004*"coordinate" 

6   0.011*"map" + 0.005*"element" + 0.005*"hilbert" + 0.005*"group" + 0.005*"equations" + 0.005*"oper

In [11]:
# Clustering documents based on topics extracted from LDA model 
from operator import itemgetter
def cluster(doc_term_matrix, num):
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    result = [[] for i in range(num)]
    for k,topic in enumerate(doc_topics):
        # Some articles do not have a topic
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            result[topic[0][0]].append(k)
    for k in range(len(result)):
        print('Articles(ID) in Cluster ' + str(k+1) + ': ' + ', '.join(map(str, result[k])))
        print()
    return result
cluster_result = cluster(doc_term_matrix, 15)

Articles(ID) in Cluster 1: 1, 10, 16, 37, 39, 61, 66, 67, 69, 94, 95, 113, 121, 133, 135, 154, 157, 158, 180, 195

Articles(ID) in Cluster 2: 2, 12, 17, 30, 86, 92, 102, 103, 106, 117, 150, 156, 190

Articles(ID) in Cluster 3: 9, 46, 75, 78, 91, 107, 178, 181, 185

Articles(ID) in Cluster 4: 5, 24, 29, 32, 33, 48, 49, 50, 56, 58, 60, 62, 63, 64, 65, 68, 71, 73, 79, 81, 87, 90, 98, 114, 118, 126, 141, 142, 144, 152, 153, 170, 171, 177, 189, 192

Articles(ID) in Cluster 5: 3, 47, 74, 112, 120, 174, 183

Articles(ID) in Cluster 6: 0, 8, 14, 31, 42, 45, 59, 76, 85, 97, 108, 129, 139, 166, 182, 193

Articles(ID) in Cluster 7: 15, 23, 35, 53, 88, 105, 140, 147

Articles(ID) in Cluster 8: 25, 40, 82, 101, 115, 122, 132, 137, 159, 161, 179

Articles(ID) in Cluster 9: 4, 7, 11, 18, 22, 41, 54, 93, 96, 128, 131, 163, 173, 176, 191

Articles(ID) in Cluster 10: 34, 44, 77, 100, 138

Articles(ID) in Cluster 11: 19, 20, 26, 51, 84, 89, 99, 111, 124, 146, 149

Articles(ID) in Cluster 12: 6, 13, 28, 3

In [12]:
# Showing the exact document titles in each cluster
for k in range(len(cluster_result)):
    print('Articles in Cluster ' + str(k+1) + ': ' + ', '.join(map(lambda x: titles[x], cluster_result[k])))
    print()

Articles in Cluster 1: Quadruple product, Norm (mathematics), Homogeneous coordinates, Trace (linear algebra), Real number, Flat (geometry), Semi-simple operator, Synthetic geometry, Endomorphism, Levi-Civita symbol, Line (geometry), Matrix calculus, Manifold, Abelian group, Diagonalizable matrix, Unit vector, Geometry, Three-dimensional space, Möbius transformation, Cartesian tensor

Articles in Cluster 2: MATLAB, Compressed sensing, Euclidean vector, Cross product, Field extension, Pseudovector, Seven-dimensional cross product, Row and column vectors, Lorentz transformation, René Descartes, Linear inequality, Weyl's inequality, Vector-valued function

Articles in Cluster 3: Finite field, Sequence, Numerical analysis, Triangle inequality, Cauchy–Schwarz inequality, The Nine Chapters on the Mathematical Art, Cauchy–Schwarz inequality, Runge–Kutta methods, Non-negative matrix factorization

Articles in Cluster 4: Butcher group, Generalizations of Pauli matrices, Vector space, Skew-Hermi

In [13]:
term_topics = ldamodel.get_term_topics('convex', minimum_probability=0.000001)
print(term_topics)

[(0, 0.00070352375), (1, 0.0013643978), (2, 0.00025655841), (3, 0.00050678279), (4, 0.00044210179), (5, 0.00096805347), (6, 0.00038708164), (7, 0.0001171499), (8, 0.00076638913), (9, 0.00036176638), (10, 0.0002326205), (11, 0.0015267399), (12, 0.00048151685), (13, 0.00038440333), (14, 0.00032594844)]


In [14]:
# Getting related documents based on a term 
def get_related_documents(term, top, doc_term_matrix):
    print('------- Top', top, 'articles related to',term,'-------')
    related_docs = []
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    term_topics = ldamodel.get_term_topics(term, minimum_probability=0.000001)
    term_topics.sort(key = itemgetter(1), reverse=True)
    for k,topic in enumerate(doc_topics):
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            if topic[0][0] == term_topics[0][0]:
                related_docs.append((k,topic[0][1]))
    related_docs.sort(key = itemgetter(1), reverse=True)
    result = []
    for j,doc in enumerate(related_docs):
        print(titles[doc[0]],"\n",doc[1],"\n")   
        result.append(titles[doc[0]])
        if j == top - 1:
            break
related_docs = get_related_documents('convex', 7, doc_term_matrix)


------- Top 7 articles related to convex -------
Convex cone 
 0.996011 

Linear span 
 0.995534 

Linear span 
 0.995534 

Dual basis 
 0.990476 

Dual basis 
 0.990476 

Vectorization (mathematics) 
 0.990378 

Predictor–corrector method 
 0.989513 



In [15]:
def get_theme(doc, cluster_result):
    doc_id = titles.index(doc)
    if doc_id == -1:
        print('Document not found.')
        return
    for i, cluster in enumerate(cluster_result):
        if doc_id in cluster:
            return i+1
    return 0
cluster_num = get_theme('Absolutely convex set', cluster_result)
print(cluster_num)

ValueError: 'Absolutely convex set' is not in list

In [16]:
# Implementing tf-idf model; the only information needed from the previous part is the doc_term_matrix
from gensim.models import TfidfModel, LsiModel
tfidf_model = TfidfModel(doc_term_matrix, dictionary = dictionary)
print(tfidf_model)
vector = tfidf_model[doc_term_matrix[0]]
print(vector[0])


TfidfModel(num_docs=197, num_nnz=77881)
(0, 0.01622820264288196)


In [17]:
# Implementing LSI model; the only information needed from the previous part is the doc_term_matrix
lsi_model = LsiModel(doc_term_matrix, id2word=dictionary)
print(lsi_model)

LsiModel(num_terms=4000, num_topics=200, decay=1.0, chunksize=20000)


In [18]:
# Creating the similarity matrix from simple bag-of-words model (# of documents * # of documents)
from gensim import similarities

index = similarities.MatrixSimilarity(doc_term_matrix, num_features=len(dictionary))
print(len(index[doc_term_matrix[693]])) # 694 * 694 matrix

IndexError: list index out of range

In [172]:
# Training tf-idf model from bag-of-word dataset
model_tfidf = TfidfModel(doc_term_matrix, id2word=dictionary, normalize=False)

In [187]:
# Applying tf-idf model to all vectors
from gensim.corpora import MmCorpus
MmCorpus.serialize('./corpus_tfidf.mm', model_tfidf[doc_term_matrix], progress_cnt=100)

In [188]:
corpus_tfidf = MmCorpus('./corpus_tfidf.mm') # Loading back the corpus file after applying tf-idf
model_lsi = LsiModel(corpus_tfidf, num_topics=15, id2word=dictionary)
# Applying LSI model to all vectors
index = similarities.MatrixSimilarity(model_lsi[corpus_tfidf], num_features=len(dictionary))
print(index)
index.save('./lsi_index.mm') # Saving the similarity matrix to a local matrix market file named './lsi_model.mm'

MatrixSimilarity<694 docs, 6533 features>


In [195]:
# Loading the similarity matrix back from the local file
similarity_matrix = similarities.MatrixSimilarity.load('./lsi_index.mm')
print(len(similarity_matrix))

694
