In [1]:
import numpy
import pandas as pd
import nltk
import re
import io

In [2]:
# Load the corpus. docs.txt interleaves its records line by line: lines at even
# 0-based positions are titles and lines at odd positions are the document bodies.
# The context manager closes the file handle as soon as the text has been read.
with io.open("docs.txt", mode="r", encoding="utf-8", errors="ignore") as corpus_file:
    docs = corpus_file.read()

# Remove bracketed numbers such as "[12]".
docs = re.sub(r"\[\d+\]", "", docs)
# Remove every token that contains a digit, then split the text into lines.
docs = re.sub(r'\w*\d\w*', '', docs).split('\n')

# De-interleave: titles first, because `docs` is rebound on the next line.
titles = docs[0::2]
docs = docs[1::2]


# Sanity check: how many titles/documents were loaded, followed by the last pair.
print('%d titles' % len(titles))
print('%d docs' % len(docs))
print(titles[-1])
print(docs[-1])

# The rank of a title is simply its position in load order: 0, 1, 2, ...
ranks = list(range(len(titles)))
# NLTK resources used further down: the English Snowball stemmer (used by
# tokenize_and_stem) and the English stop-word list (passed to the vectorizer).
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stopwords = nltk.corpus.stopwords.words('english')

421 titles
421 docs
National Diet Library
b'The National Diet Library (NDL) (\\\xbd\\xab\\\\xbd\\xbc\\\\\\\\\\, Kokuritsu Kokkai Toshokan) is the national library of Japan and among the largest libraries in the world. It was established in  for the purpose of assisting members of the National Diet of Japan (\\\xbd\\xbc\, Kokkai) in researching matters of public policy. The library is similar in purpose and scope to the United States Library of Congress.'b'The National Diet Library (NDL) consists of two main facilities in Tokyo and Kyoto, and several other branch libraries throughout Japan.'b''b''b"The National Diet Library is the successor of three separate libraries: the library of the House of Peers, the library of the House of Representatives, both of which were established at the creation of Japan's Imperial Diet in ; and the Imperial Library, which had been established in  under the jurisdiction of the Ministry of Education."b'The Diet\'s power in prewar Japan was limited, and its

In [3]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every token that contains a letter.

    The text is split into sentences and each sentence into word tokens, so
    punctuation becomes its own token. Tokens without at least one ASCII
    letter (e.g. numeric tokens, raw punctuation) are discarded. The surviving
    tokens are passed through the module-level ``stemmer``.

    Returns:
        list: stems, in the order the tokens appear in ``text``.
    """
    has_letter = re.compile('[a-zA-Z]').search
    kept_tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if has_letter(word):
                kept_tokens.append(word)
    return [stemmer.stem(word) for word in kept_tokens]


def tokenize_only(text):
    """Tokenize ``text`` into lower-cased word tokens, without stemming.

    The text is split into sentences and each sentence into word tokens, so
    punctuation becomes its own token. Every token is lower-cased, and tokens
    without at least one ASCII letter (e.g. numeric tokens, raw punctuation)
    are discarded.

    Returns:
        list: lower-cased tokens, in the order they appear in ``text``.
    """
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in lowered_words if re.search('[a-zA-Z]', word)]

# Build two parallel vocabularies over the whole corpus: the stemmed tokens and
# the lower-cased unstemmed tokens of every document, concatenated in document order.
totalvocab_stemmed = []
totalvocab_tokenized = []
for document in docs:
    totalvocab_stemmed += tokenize_and_stem(document)
    totalvocab_tokenized += tokenize_only(document)

In [4]:
# Lookup table from stem (the index) to an unstemmed token (the 'words' column).
# The index is built from the full stemmed token list, so labels may repeat.
vocab_frame = pd.DataFrame(
    data={'words': totalvocab_tokenized},
    index=totalvocab_stemmed,
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.1, stop_words= stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

print(tfidf_matrix.shape)

Wall time: 28.9 s
(421, 4964)


In [6]:
# Vocabulary of the fitted vectorizer, indexable by TF-IDF column number.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use the new accessor when it exists and fall back to
# the old one otherwise. Either way `terms` ends up as a plain list of strings.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
print(terms)

["'b also", "'b also true", "'b b", "'b b b'the", "'b edit", "'b edit intern", "'b registr", "'b registr group", "'b two", "'b two common", "'s", "'s back", "'s back cover", "'s eight", "'s eight ident", "'s front", "'s front cover", "'s least", "'s least two", "'s linkag", "'s linkag determin", "'s offici", "'s offici manual", "'s republ", "'s republ china", "'s sum", "'s sum ten", "'s theorem", '-digit', '-digit commerci', '-digit commerci book', '-digit data', '-digit data field', '-digit group', '-digit group identifi', '-digit isbn', '-digit isbn also', "-digit isbn b'the", '-digit isbn began', '-digit isbn check', '-digit isbn fall', '-digit isbn format', '-digit isbn migrat', '-digit isbn number', '-digit isbn separ', '-digit number', '-digit number valid', '-digit sbn', '-digit sbn code', '-digit sbn creat', '-digit standard', '-digit standard book', '-dimension', '-x', '-x could', '-x could reduc', ".\\n'b'alabama\\n'b'california\\n'b'colorado\\n'b'delaware\\n'b'florida\\n'b'g

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Turn the pairwise cosine similarity between the rows of the TF-IDF matrix
# into a distance: 1 - similarity.
pairwise_similarity = cosine_similarity(tfidf_matrix)
dist = 1 - pairwise_similarity

In [8]:
from sklearn.cluster import KMeans

num_clusters = 7

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 23 s


In [9]:
# joblib used to be re-exported as sklearn.externals.joblib; that alias was
# deprecated and later removed from scikit-learn. Prefer the standalone package
# and fall back to the old location only on installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model, then reload it from disk and recompute the labels.
joblib.dump(km,  'doc_cluster.pkl')
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load a 'doc_cluster.pkl' produced by this notebook.
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [10]:
# Collect the per-document results; only the columns listed below end up in the
# frame ('docs' is part of `results` but is not selected).
results = dict(title=titles, rank=ranks, docs=docs, cluster=clusters)

# The cluster ids double as the row index, so rows can be selected per cluster.
frame = pd.DataFrame(
    results,
    index=[clusters],
    columns=['rank', 'title', 'cluster'],
)

In [11]:
from __future__ import print_function

# Number of highest-weighted centroid terms reported for each cluster.
TOP_TERMS_PER_CLUSTER = 6

# Map every stem to the first unstemmed word recorded for it. Building this once
# avoids re-scanning the whole (duplicate-laden) vocab_frame index for every
# printed term, and yields the same word the original first-row lookup returned.
first_word_for_stem = vocab_frame.loc[~vocab_frame.index.duplicated(keep='first'), 'words']

print("Top terms per cluster:")
print()
# For each centroid, term column indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :TOP_TERMS_PER_CLUSTER]:
        # A term may be an n-gram of stems; it is represented by its first stem.
        leading_stem = terms[ind].split(' ')[0]
        print(' %s' % first_word_for_stem.loc[leading_stem], end=',')
    print()
    print()
    print("Cluster %d titles:" % i, end='')
    # Select by the 'cluster' column with a positional boolean mask: this always
    # yields a sequence of titles, even when a cluster holds a single document
    # (where label-based .loc[i] may collapse to one row and break the loop).
    in_cluster = (frame['cluster'] == i).values
    for title in frame.loc[in_cluster, 'title'].tolist():
        print(' %s,' % title, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: mathematics, theory, algebra, studied, group, science,

Cluster 0 titles: Mathematics, Geometry, Engineering, Gottfried Wilhelm Leibniz, Gabriel Cramer, Carl Friedrich Gauss, Hermann Grassmann, James Joseph Sylvester, Arthur Cayley, Hüseyin Tevfik Pasha, Giuseppe Peano, Abstract algebra, School Mathematics Study Group, Secondary school, Axiom, Axiom, Multilinear algebra, List of linear algebra topics, Numerical linear algebra, Marie A. Vitulli, Alan Tucker, Martha Siegel, Graduate Texts in Mathematics, James Demmel, Felix Gantmacher, Israel Gelfand, Ray Kunze, Mathematical Reviews, Paul Halmos, Undergraduate Texts in Mathematics, Leon Mirsky, Igor Shafarevich, Springer Science+Business Media, Georgiy Shilov, Michiel Hazewinkel, Encyclopedia of Mathematics, MathWorld, Template talk:Linear algebra, Multilinear algebra, Abstract algebra, Numerical linear algebra, Comparison of linear algebra libraries, List of linear algebra topics, Portal:Algebra,

In [12]:
# Plain-text dump of the results frame (columns: rank, title, cluster), whose
# row index is the cluster id of each document.
print(frame)

    rank                              title  cluster
6      0                 Elementary algebra        6
5      1                    Euclidean space        5
0      2                        Mathematics        0
6      3                    Linear equation        6
1      4                         Linear map        1
3      5               Matrix (mathematics)        3
1      6                       Vector space        1
0      7                           Geometry        0
5      8                    Line (geometry)        5
5      9                   Plane (geometry)        5
5     10             Rotation (mathematics)        5
5     11                Functional analysis        5
0     12                        Engineering        0
5     13                 Mathematical model        5
6     14                   Nonlinear system        6
3     15                        Determinant        3
6     16         System of linear equations        6
0     17          Gottfried Wilhelm Leibniz   