In [1]:
import numpy
import pandas as pd
import nltk
import re
import io

In [46]:
# Corpus file layout: lines alternate between a title and the text of the
# document that title belongs to (title, text, title, text, ...).
# The context manager guarantees the file handle is closed after reading.
with io.open("../data/linalg_text2.txt", mode="r", encoding="utf-8", errors="ignore") as corpus_file:
    raw_text = corpus_file.read()

# Remove bracketed numeric markers such as "[12]".
text_without_refs = re.sub(r"\[\d+\]", "", raw_text)
# Remove every word-like token that contains a digit, then split into lines.
corpus_lines = re.sub(r'\w*\d\w*', '', text_without_refs).split('\n')

# Even-indexed lines are titles, odd-indexed lines are the document bodies.
titles = corpus_lines[::2]
docs = corpus_lines[1::2]

print('{} titles'.format(len(titles)))
print('{} docs'.format(len(docs)))
print(titles[-1])
print(docs[-1])

# Rank of a document is simply its position in the corpus file.
ranks = list(range(len(titles)))

# Shared NLP resources used by the tokenizers and the vectorizer in later cells.
from nltk.stem.snowball import SnowballStemmer
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

122 titles
122 docs
Linear equation
b'If n =  the set of the solutions is a plane in a three-dimensional space. More generally, the set of the solutions is an (n\\\\\\\)-dimensional hyperplane in a n-dimensional Euclidean space (or affine space if the coefficients are complex numbers or belong to any field).'b'In other words, if ai \\\ , one may choose arbitrary values for all the unknowns except xi, and express xi in term of these values.'b'If at least one coefficient is nonzero, a permutation of the subscripts allows one to suppose  \\\ , and rewrite the equation'b'If all the coefficients are zero, then either b \\\  and the equation does not have any solution, or b =  and every set of values for the unknowns is a solution.'b'where, , , ..., an represent numbers, called the coefficients, , , ..., xn are the unknowns, and b is called the constant term. When dealing with three or fewer variables, it is common to use x, y and z instead of ,  and .'b'A linear equation can involve more th

In [47]:
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their stems.

    The text is tokenized sentence by sentence so that punctuation ends up
    as its own token. Tokens that contain no ASCII letter (numbers, bare
    punctuation) are discarded; every remaining token is passed through the
    module-level ``stemmer``.

    Returns a list of stemmed tokens in their original order.
    """
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    # Keep only tokens with at least one letter, stemming as we go.
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]


def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens, without stemming.

    The text is tokenized sentence by sentence so that punctuation ends up
    as its own token. Each token is lower-cased, and tokens that contain no
    ASCII letter (numbers, bare punctuation) are discarded.

    Returns a list of lower-cased tokens in their original order.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())
    # Drop anything that has no letter in it at all.
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

# Flatten the whole corpus into two token lists: one stemmed, one only
# lower-cased. Both tokenizers apply the same letter filter to the same
# tokens, so entry k of one list corresponds to entry k of the other.
totalvocab_stemmed = []
totalvocab_tokenized = []
for doc_text in docs:
    totalvocab_stemmed.extend(tokenize_and_stem(doc_text))
    totalvocab_tokenized.extend(tokenize_only(doc_text))

In [48]:
# Lookup table from stem (the index, not unique) to the lower-cased word it came from.
vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized},
    index=totalvocab_stemmed,
)

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.1, stop_words= stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

print(tfidf_matrix.shape)

Wall time: 9.18 s
(122, 1231)


In [50]:
# Vocabulary of the fitted vectorizer; position k is the term for column k of
# tfidf_matrix. `get_feature_names()` was removed in scikit-learn 1.2, so use
# `get_feature_names_out()` when it exists and fall back on older releases.
# `.tolist()` keeps `terms` a plain list of str, as the old method returned.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
print(terms)

["'b", "'b b", "'b b b'in", "'s", "'s formula", "'s rule", "'s theorem", '-dimension', '\\xce\\', '\\xce\\ \\xce\\', '\\xce\\xbb', '\\xcf\\', 'a.', 'a\\\\\\', 'ab', 'abelian', 'abl', 'abov', 'absolut', 'absolut valu', 'abstract', 'abstract algebra', 'accept', 'accord', 'account', 'accur', 'achiev', 'act', 'action', 'actual', 'ad', 'add', 'addit', 'addit multipl', 'addit scalar', 'addit scalar multipl', 'admit', 'advanc', 'advantag', 'affin', 'age', 'agre', 'ai', 'algebra', 'algebra close', 'algebra geometri', 'algebra structur', 'algorithm', 'allow', 'allow one', 'almost', 'along', 'alreadi', 'also call', 'also known', 'also use', 'altern', 'although', 'alway', 'among', 'amount', 'analog', 'analysi', 'analyt', 'analyt geometri', 'ancient', 'angl', 'ani', 'ani field', 'ani given', 'ani two', 'ani vector', 'anoth', 'answer', 'appear', 'appli', 'applic', 'approach', 'appropri', 'approxim', 'arbitrari', 'area', 'argument', 'aris', 'arithmet', 'around', 'arriv', 'art', 'articl', 'aspect', '

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between documents: 1 minus their cosine similarity.
similarity = cosine_similarity(tfidf_matrix)
dist = 1 - similarity

In [52]:
from sklearn.cluster import KMeans

num_clusters = 7

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 1.62 s


In [53]:
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; the standalone
# `joblib` package is the replacement. Fall back to the bundled copy only on
# old installations that lack the standalone package.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model, then reload it from disk so the rest of the
# notebook works from the saved copy.
# NOTE: joblib.load unpickles the file -- only load model files you trust.
joblib.dump(km,  'doc_cluster.pkl')
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [54]:
# One entry per document. 'docs' is collected here but is not among the
# columns selected below, so `frame` holds only rank, title and cluster.
results = {
    'title': titles,
    'rank': ranks,
    'docs': docs,
    'cluster': clusters,
}

# Rows are indexed by cluster label so that frame.loc[k] selects cluster k.
frame = pd.DataFrame(
    results,
    index = [clusters],
    columns = ['rank', 'title', 'cluster'],
)

In [55]:
from __future__ import print_function

print("Top terms per cluster:")
print()
# For every cluster, feature indices ordered by descending centroid weight
# (argsort is ascending, so the columns are reversed).
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for cluster_id in range(num_clusters):
    print("Cluster %d words:" % cluster_id, end='')
    for term_idx in order_centroids[cluster_id, :6]:
        # A term may be an n-gram of stems; look the stems up in vocab_frame
        # and print the first matching un-stemmed word.
        stems = terms[term_idx].split(' ')
        first_word = vocab_frame.loc[stems].values.tolist()[0][0]
        print(' %s' % first_word, end=',')
    print()
    print()
    print("Cluster %d titles:" % cluster_id, end='')
    cluster_titles = frame.loc[cluster_id]['title'].values.tolist()
    for title in cluster_titles:
        print(' %s,' % title, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: matrix, diagonal, matrices, eigenvalues, eigenvectors, entries,

Cluster 0 titles: Diagonalizable matrix, Diagonal matrix,

Cluster 1 words: set, mathematical, function, 's, theory, algebra,

Cluster 1 titles: Algebraic geometry, Representation theory, Mathematical analysis, Algebra over a field, Multilinear algebra, Module (mathematics), Field (mathematics), Free module, Cramer's rule, Homogeneous coordinates, Inner product space, Dirichlet conditions, Fourier series, Linear least squares (mathematics), Triangular matrix, Normal matrix, Hermitian adjoint, CauchySchwarz inequality, Axiom, Inner product space, Polynomial, Algebraically closed field, Complex number, Invariant (mathematics), Eigenvalues and eigenvectors, Eigenvalues and eigenvectors, Inverse element, Cramer's rule, Standard basis, Coordinate system, Linear combination, Cardinality, Well-defined, Dimension theorem for vector spaces, Axiom of choice, Rational number, Linear combinati

In [59]:
# Preview the first rows that were assigned to cluster 1.
in_cluster_1 = frame['cluster'] == 1
frame.loc[in_cluster_1].head()

Unnamed: 0,rank,title,cluster
1,1,Algebraic geometry,1
1,3,Representation theory,1
1,5,Mathematical analysis,1
1,7,Algebra over a field,1
1,8,Multilinear algebra,1


In [75]:
# Link table: one row per (origin page, outgoing link) pair; the first line
# of the CSV is the header row.
df = pd.read_csv(
    '../data/linalg_links2.csv',
    header=0,
    encoding='latin1',
)
df.head()

Unnamed: 0,origin_link,outgoing_link,origin_title,outgoing_title
0,b'https://en.wikipedia.org/wiki/Linear_algebra',b'/wiki/Algebraic_geometry',b'Linear algebra',b'Algebraic geometry'
1,b'https://en.wikipedia.org/wiki/Linear_algebra',b'/wiki/Systems_of_polynomial_equations',b'Linear algebra',b'Systems of polynomial equations'
2,b'https://en.wikipedia.org/wiki/Linear_algebra',b'/wiki/Representation_theory',b'Linear algebra',b'Representation theory'
3,b'https://en.wikipedia.org/wiki/Linear_algebra',b'/wiki/Functional_analysis',b'Linear algebra',b'Functional analysis'
4,b'https://en.wikipedia.org/wiki/Linear_algebra',b'/wiki/Mathematical_analysis',b'Linear algebra',b'Mathematical analysis'


In [77]:
def remGross(s):
    """Strip the ``b'...'`` / ``b"..."`` bytes-repr wrapper from a string.

    The link table stores values such as ``b'Linear algebra'``. The leading
    ``b`` and the surrounding quote pair are removed and the payload is
    returned intact. (The previous ``s[2:-2]`` slice also cut off the last
    character of the payload.)

    Parameters
    ----------
    s : str
        A string that may be wrapped as ``b'...'`` or ``b"..."``.

    Returns
    -------
    str
        The text between the quotes, or ``s`` unchanged when it does not
        carry the wrapper.
    """
    has_wrapper = (
        len(s) >= 3
        and s[0] == 'b'
        and s[1] in ('\'', '"')
        and s[-1] == s[1]
    )
    if has_wrapper:
        return s[2:-1]
    return s

SyntaxError: invalid syntax (<ipython-input-77-15e5f577fbe7>, line 1)