In [1]:
import numpy
import pandas as pd
import nltk
import re
import io

In [3]:
#titles = io.open("titles.txt", mode="r", encoding="utf-8", errors="ignore").read().split('\n')
# Read the corpus inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with io.open(".txt", mode="r", encoding="utf-8", errors="ignore") as corpus_file:
    raw_text = corpus_file.read()

# Remove bracketed numeric markers such as "[12]".
raw_text = re.sub(r"\[\d+\]", "", raw_text)
# Remove every word-like token that contains a digit, then split into lines.
lines = re.sub(r'\w*\d\w*', '', raw_text).split('\n')

# Even-indexed lines are treated as titles, odd-indexed lines as document bodies.
titles = lines[0::2]
docs = lines[1::2]


print(str(len(titles)) + ' titles')
print(str(len(docs)) + ' docs')
print(titles[-20])
print(docs[-20])

# One rank per title: simply its position in the input file.
ranks = list(range(len(titles)))

stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

682 titles
682 docs
Mathematical physics
Prominent contributors to the  century's mathematical physics (although the list contains some typically theoretical, not mathematical, physicists and leaves many contributors out; please also note that since the page can be edited by anyone, sometimes less deserved mentions can pop up in the list) include, ordered by birth date, William Thomson (Lord Kelvin) [–], Oliver Heaviside [–], Jules Henri Poincaré [–] , David Hilbert [–], Arnold Sommerfeld [–], Constantin Caratheodory [–], Albert Einstein [–], Max Born [–], George David Birkhoff [-], Niels Bohr [–], Hermann Weyl [–], Satyendra Nath Bose [–], Norbert Wiener [–], Wolfgang Pauli [–], Werner Heisenberg [–], Paul Dirac [–], Eugene Wigner [–], Lars Onsager [-], John von Neumann [–], Sin-Itiro Tomonaga [–], Hideki Yukawa [–], Lev Davidovich Landau [–], Nikolay Nikolayevich Bogolyubov [–], Subrahmanyan Chandrasekhar [-], Mark Kac [–], Julian Schwinger [–], Richard Phillips Feynman [–], Irving E

In [4]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every token that contains a letter.

    The text is split into sentences first and then into words, so that
    punctuation is emitted as its own token. Tokens without any ASCII letter
    (numeric tokens, raw punctuation) are dropped before stemming with the
    module-level ``stemmer``.

    Returns a list of stems in the order the tokens appear in ``text``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Keep only tokens that contain at least one ASCII letter.
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Tokenize ``text`` into lower-cased word tokens, without stemming.

    The text is split into sentences first and then into words, so that
    punctuation is emitted as its own token. Tokens without any ASCII letter
    (numeric tokens, raw punctuation) are discarded.

    Returns a list of lower-cased tokens in the order they appear in ``text``.
    """
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # Keep only tokens that contain at least one ASCII letter.
    return [word for word in lowered_words if re.search('[a-zA-Z]', word)]

# Flat vocabularies over the whole corpus: one entry per kept token, in
# document order, once stemmed and once merely lower-cased.
totalvocab_stemmed = []
totalvocab_tokenized = []
for doc_text in docs:
    totalvocab_stemmed += tokenize_and_stem(doc_text)
    totalvocab_tokenized += tokenize_only(doc_text)

In [5]:
# Lookup table from stem (index) to the lower-cased token it came from
# (column 'words'); both lists are built token-by-token from the same docs.
vocab_frame = pd.DataFrame(
    data={'words': totalvocab_tokenized},
    index=totalvocab_stemmed,
)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.1, stop_words= stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

print(tfidf_matrix.shape)

CPU times: user 9 s, sys: 191 ms, total: 9.19 s
Wall time: 10.2 s
(682, 121)


In [7]:
# Feature names (stemmed n-grams) in column order of the TF-IDF matrix.
# `get_feature_names()` no longer exists in recent scikit-learn releases, so
# prefer `get_feature_names_out()` and fall back to the old method when the
# installed version predates it. `.tolist()` keeps `terms` a plain list of str.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
print(terms)

["'s", 'abl', 'addit', 'algebra', 'algorithm', 'also', 'analysi', 'ani', 'anoth', 'appli', 'applic', 'appropri', 'base', 'becaus', 'book', 'book number', 'book sourc', 'call', 'case', 'chang', 'common', 'complex', 'comput', 'concept', 'consid', 'continu', 'defin', 'definit', 'describ', 'determin', 'develop', 'differ', 'doe', 'edit', 'element', 'english', 'equat', 'equival', 'even', 'exampl', 'exist', 'express', 'field', 'find', 'first', 'follow', 'form', 'function', 'general', 'given', 'help', 'howev', 'import', 'includ', 'intern', 'intern standard', 'intern standard book', 'isbn', 'known', 'languag', 'like', 'linear', 'link', 'look', 'mani', 'mathemat', 'may', 'mean', 'method', 'might', 'multipl', 'natur', 'need', 'number', 'object', 'often', 'one', 'onli', 'oper', 'order', 'origin', 'page', 'part', 'particular', 'point', 'problem', 'product', 'properti', 'real', 'record', 'refer', 'relat', 'repres', 'requir', 'result', 'search', 'see', 'set', 'sever', 'sinc', 'sourc', 'space', 'speci

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between documents: one minus their cosine similarity.
similarity = cosine_similarity(tfidf_matrix)
dist = 1 - similarity

In [8]:
from sklearn.cluster import KMeans

num_clusters = 7

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 17.8 s


In [9]:
# `sklearn.externals.joblib` is gone from recent scikit-learn releases; use the
# standalone `joblib` package when available and fall back to the vendored
# copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model and read it straight back, so the cells that follow
# use the pickled model.
joblib.dump(km,  'doc_cluster.pkl')
# NOTE: joblib.load unpickles the file; only load pickles from a trusted source.
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [10]:
# Per-document records; 'docs' is carried in the dict but not selected as a
# column of the frame below.
results = {
    'title': titles,
    'rank': ranks,
    'docs': docs,
    'cluster': clusters,
}

# Rows are labelled by cluster id so that `frame.loc[<cluster>]` selects the
# documents of one cluster.
frame = pd.DataFrame(
    results,
    index=[clusters],
    columns=['rank', 'title', 'cluster'],
)

In [11]:
from __future__ import print_function

print("Top terms per cluster:")
print()
# For each centroid, feature indices sorted by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for cluster_id in range(num_clusters):
    print("Cluster %d words:" % cluster_id, end='')
    for term_idx in order_centroids[cluster_id, :6]:
        # Map the stemmed term back to a readable word: look up its stems in
        # vocab_frame and show the first matching token.
        stem_parts = terms[term_idx].split(' ')
        matching_words = vocab_frame.loc[stem_parts].values.tolist()
        print(' %s' % matching_words[0][0], end=',')
    print()
    print()
    print("Cluster %d titles:" % cluster_id, end='')
    cluster_titles = frame.loc[cluster_id]['title'].values.tolist()
    for title in cluster_titles:
        print(' %s,' % title, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: mathematics, theory, universe, science, studied, 's,

Cluster 0 titles: Mathematics, Gottfried Wilhelm Leibniz, Carl Friedrich Gauss, Hermann Grassmann, James Joseph Sylvester, Arthur Cayley, Giuseppe Peano, School Mathematics Study Group, Axiom, Axiom, Marie A. Vitulli, Alan Tucker, Martha Siegel, Graduate Texts in Mathematics, James Demmel, Israel Gelfand, Ray Kunze, Mathematical Reviews, Paul Halmos, Undergraduate Texts in Mathematics, Igor Shafarevich, Michiel Hazewinkel, Encyclopedia of Mathematics, MathWorld, Template talk:Areas of mathematics, Areas of mathematics, Mathematics, Outline of mathematics, Lists of mathematics topics, History of mathematics, Philosophy of mathematics, Philosophy of mathematics education, Set theory, Theory of computation, Number theory, Combinatorics, Mathematical physics, Computer science, Recreational mathematics, Mathematics and art, Mathematics education, Mathematical logic, Pure mathematics, Applied mathe

In [12]:
# Print the rank / title / cluster table as plain text.
print(frame)

    rank                              title  cluster
6      0                 Elementary algebra        6
5      1                    Euclidean space        5
0      2                        Mathematics        0
6      3                    Linear equation        6
1      4                         Linear map        1
3      5               Matrix (mathematics)        3
1      6                       Vector space        1
0      7                           Geometry        0
5      8                    Line (geometry)        5
5      9                   Plane (geometry)        5
5     10             Rotation (mathematics)        5
5     11                Functional analysis        5
0     12                        Engineering        0
5     13                 Mathematical model        5
6     14                   Nonlinear system        6
3     15                        Determinant        3
6     16         System of linear equations        6
0     17          Gottfried Wilhelm Leibniz   