In [8]:
import spacy
# To tag words
from textblob import TextBlob
import pandas as pd
from pandas import DataFrame
# To use new datatypes
from collections import Counter
from urllib import request
import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex
import plotly.offline
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import seaborn as sns
# To vectorize texts
from sklearn.feature_extraction.text import CountVectorizer
# To decompose texts
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.decomposition import LatentDirichletAllocation
# To visualize high dimensional dataset
from sklearn.manifold import TSNE
import pyLDAvis
import pyLDAvis.gensim

In [10]:
# Download the Project Gutenberg plain-text file that the rest of the notebook analyses.
url = "http://www.gutenberg.org/files/11/11-0.txt"
# The context manager closes the HTTP connection even if reading or decoding fails.
with request.urlopen(url) as response:
    book = response.read().decode('utf8')

# Load the spaCy pipeline. Later cells read `nlp.vocab`, so leaving this line
# commented out made a fresh "Restart & Run All" fail with a NameError.
# If the model is not installed yet, run this once beforehand:
#     spacy.cli.download("en_core_web_lg")
nlp = spacy.load('en_core_web_lg')

In [12]:
# Character offsets into `book`: each value is where one slice starts and the
# previous one ends. They are hard-coded for the text downloaded above.
chapterBounds = [
    1492, 13267, 24430, 33906, 48059, 60374, 74550,
    87604, 101587, 114545, 126262, 136916, 148851,
]

# The full span, from the first offset to the last.
myBook = book[chapterBounds[0]:chapterBounds[-1]]
# Twelve consecutive slices, one between every pair of neighbouring offsets.
myChap = [book[start:end] for start, end in zip(chapterBounds, chapterBounds[1:])]

# Bind the last slice to `chap` and display its type.
chap = myChap[-1]
type(chap)

str

In [181]:
# `myBook` is sliced from the downloaded text, i.e. a plain `str`. Iterating a
# str yields single characters, which have no `.lemma_`, so the text has to be
# parsed first. `nlp` is the spaCy pipeline that the stop-word cell also uses;
# an already parsed document is accepted unchanged.
book_doc = nlp(myBook) if isinstance(myBook, str) else myBook

# One lemma per token of the parsed book.
lemma_book = [word.lemma_ for word in book_doc]

In [182]:
# Keep only the lemmas whose entry in `nlp.vocab` is not flagged as a stop word.
filtered_book = [
    lemma
    for lemma in lemma_book
    if not nlp.vocab[lemma].is_stop
]

In [183]:
# Matches runs of characters outside a-z, A-Z, À-ÿ and 0-9.
nonWordRun = re.compile(r"[^a-zA-ZÀ-ÿ0-9]+")

# Drop tokens that collapse to a single blank (nothing but such characters)
# and strip those characters out of every token that is kept.
refiltered_book = [
    nonWordRun.sub('', token)
    for token in filtered_book
    if nonWordRun.sub(' ', token) != ' '
]

In [184]:
# Additional tokens to discard after the stop-word and punctuation filters.
my_stopwords = ['PRON', 'chapter']

# Everything from `refiltered_book` that is not one of those tokens.
second_filter = [token for token in refiltered_book if token not in my_stopwords]

In [185]:
# Vectorize text: each entry of `second_filter` is handed to the vectorizer
# as one document.
countVectorizer = CountVectorizer()
vectorizedText = countVectorizer.fit_transform(second_filter)

# Report the size of the resulting matrix.
shapeMessage = 'Shape Vectorized Text: {}'.format(vectorizedText.shape)
print(shapeMessage)

Shape Vectorized Text: (9473, 1783)


In [186]:
# Settings for the word-frequency plot: number of most frequent words to show
n = 20
# Label inserted into the plot titles
subject = 'Alice'


def nMostFrequentWords(n, countVectorizer, vectorizedText):
    """Return the `n` most frequent vocabulary words and their total counts.

    Parameters
    ----------
    n : int
        Number of words to return. Values larger than the vocabulary are
        capped at the vocabulary size; values below zero are treated as zero.
    countVectorizer : fitted vectorizer
        Only its `vocabulary_` mapping (term -> column index) is read.
    vectorizedText : sparse matrix or array of shape (documents, vocabulary)
        Term counts per document, with columns indexed as in `vocabulary_`.

    Returns
    -------
    tuple
        `(words, counts)`: `words` is a list of str ordered from most to least
        frequent and `counts` is the list of the matching column totals.
    """
    # Count word appearances in the text. Flattening to 1-D makes the code
    # independent of whether the column sums come back as a matrix or an array.
    vectorizedCount = np.asarray(np.sum(vectorizedText, axis=0)).ravel()

    # Never request more words than there are columns.
    n = max(0, min(n, vectorizedCount.shape[0]))

    # Column indices ordered by descending count, truncated to the top n.
    wordIndices = np.argsort(vectorizedCount)[::-1][:n]

    # Map the column indices back to words through the fitted vocabulary.
    # The words are returned as-is, so non-ASCII terms no longer raise.
    indexToWord = {index: word for word, index in countVectorizer.vocabulary_.items()}
    words = [indexToWord[int(index)] for index in wordIndices]

    # Return words and word counts
    return (words, vectorizedCount[wordIndices].tolist())



# The n most frequent words together with how often each occurs
words, wordCounts = nMostFrequentWords(n, countVectorizer, vectorizedText)

# One viridis colour per bar, sampled at evenly spaced positions of the colormap
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(position)) for position in np.arange(0, 1.000001, 1/(n-1))]

# Bar chart: words on the x axis, their counts on the y axis
data = go.Bar(x=words, y=wordCounts, marker={'color': colors})
layout = go.Layout(
    title='Most Frequent {} Words In {}'.format(n, subject),
    xaxis={'title': 'Words'},
    yaxis={'title': 'Count'},
)

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [187]:
# Human-readable description (with examples) for each part-of-speech tag;
# used as hover text in the tag-frequency plots.
tag_dict = {
    # Conjunctions, numbers, determiners and other function words
    "CC": "conjunction, coordinating; and, or, but",
    "CD": "cardinal number; five, three, 13%",
    "DT": "determiner; the, a, these",
    "EX": "existential there; there were six boys",
    "FW": "foreign word; mais",
    "IN": "conjunction, subordinating or preposition; of, on, before, unless",
    # Adjectives
    "JJ": "adjective; nice, easy",
    "JJR": "adjective, comparative; nicer, easier",
    "JJS": "adjective, superlative; nicest, easiest",
    "LS": "list item marker; ",
    "MD": "verb, modal auxillary; may, should",
    # Nouns
    "NN": "noun, singular or mass; tiger, chair, laughter",
    "NNS": "noun, plural; tigers, chairs, insects",
    "NNP": "noun, proper singular; Germany, God, Alice",
    "NNPS": "noun, proper plural; we met two Christmases ago",
    "PDT": "predeterminer; both his children",
    "POS": "possessive ending; 's",
    # Pronouns
    "PRP": "pronoun, personal; me, you, it",
    "PRP$": "pronoun, possessive; my, your, our",
    # Adverbs and particles
    "RB": "adverb; extremely, loudly, hard",
    "RBR": "adverb, comparative; better",
    "RBS": "adverb, superlative; best",
    "RP": "adverb, particle; about, off, up",
    "SYM": "symbol; %",
    "TO": "infinitival to; what to do?",
    "UH": "interjection; oh, oops, gosh",
    # Verbs
    "VB": "verb, base form; think",
    "VBZ": "verb, 3rd person singular present; she thinks",
    "VBP": "verb, non-3rd person singular present; I think",
    "VBD": "verb, past tense; they thought",
    "VBN": "verb, past participle; a sunken ship",
    "VBG": "verb, gerund or present participle; thinking is fun",
    # Wh-words
    "WDT": "wh-determiner; which, whatever, whichever",
    "WP": "wh-pronoun, personal; what, who, whom",
    "WP$": "wh-pronoun, possessive; whose, whosever",
    "WRB": "wh-adverb; where, when",
}

In [188]:
# One row per entry of `myChap`
df = DataFrame(myChap, columns=['text'])

# Tag each text with TextBlob; only the tag of every (word, tag) pair is kept
chapterTags = df['text'].apply(lambda x: [tag for word, tag in TextBlob(x).pos_tags])
# Lay the tag lists out as a table, stack them into one column and count each tag
tagCounts = pd.DataFrame.from_records(chapterTags.tolist()).stack().value_counts()
tag_df = tagCounts.reset_index().rename(columns={'index':'tag', 0:'count'})

# One viridis colour per distinct tag
n = tag_df.shape[0]
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(position)) for position in np.arange(0, 1.000001, 1/(n-1))]

# Hover text: the description from `tag_dict` when there is one, else the tag itself
hoverText = tag_df['tag'].apply(lambda x: tag_dict.get(x, x))

# Bar chart of tag counts
data = go.Bar(x=tag_df['tag'], y=tag_df['count'], text=hoverText, marker={'color': colors})
layout = go.Layout(
    title='Most Frequent Tags In {}'.format(subject),
    xaxis={'title': 'Type Of Word'},
    yaxis={'title': 'Count'},
)

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [189]:
# One row per token that survived the filtering steps
filtered_df = DataFrame(second_filter, columns=['text'])

# Tag each token with TextBlob; only the tag of every (word, tag) pair is kept
tokenTags = filtered_df['text'].apply(lambda x: [tag for word, tag in TextBlob(x).pos_tags])
# Lay the tag lists out as a table, stack them into one column and count each tag
tagCounts = pd.DataFrame.from_records(tokenTags.tolist()).stack().value_counts()
tag_df = tagCounts.reset_index().rename(columns={'index':'tag', 0:'count'})

# One viridis colour per distinct tag
n = tag_df.shape[0]
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(position)) for position in np.arange(0, 1.000001, 1/(n-1))]

# Hover text: the description from `tag_dict` when there is one, else the tag itself
hoverText = tag_df['tag'].apply(lambda x: tag_dict.get(x, x))

# Bar chart of tag counts
data = go.Bar(x=tag_df['tag'], y=tag_df['count'], text=hoverText, marker={'color': colors})
layout = go.Layout(
    title='Most Frequent Tags In {}'.format(subject),
    xaxis={'title': 'Type Of Word'},
    yaxis={'title': 'Count'},
)

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [190]:
# Number of topics extracted by the LSI and LDA models
nTopics = 8

# Create LSI and fit. The random state is pinned, as it already is for the LDA
# and t-SNE models in this notebook, so that re-running the cell reproduces
# the same decomposition.
lsiModel = TruncatedSVD(n_components=nTopics, random_state=0)
lsiTopicMatrix = lsiModel.fit_transform(vectorizedText)
print('Shape LSI Topic Matrix: {}'.format(lsiTopicMatrix.shape))

# Assign every row to the component with the largest value, then collect the
# distinct topic ids and how many rows fall into each.
lsiKeys = lsiTopicMatrix.argmax(axis=1)
lsiCategories, lsiCounts = zip(*Counter(lsiKeys).items())

Shape LSI Topic Matrix: (9473, 8)


In [191]:
def getTopWords(n, lsiKeys, vectorizedText, countVectorizer, nTopics=None):
    """Describe every topic by its `n` words with the highest mean count.

    Parameters
    ----------
    n : int
        Number of words listed per topic.
    lsiKeys : array-like of int
        Topic index assigned to each row of `vectorizedText`.
    vectorizedText : sparse matrix or array of shape (documents, vocabulary)
        Term counts per document.
    countVectorizer : fitted vectorizer
        Only its `vocabulary_` mapping (term -> column index) is read.
    nTopics : int, optional
        Length of the returned list. When omitted, the notebook-level
        `nTopics` variable is used if it exists; otherwise the largest key
        plus one. It is never smaller than the largest key plus one.

    Returns
    -------
    list of str
        One entry per topic index, e.g. ``'66% alice, 33% rabbit'``. The
        percentage is the word's share of the topic's summed mean counts,
        truncated to an integer. Topics without any document get ``''``.
    """
    keys = np.asarray(lsiKeys)
    presentTopics = np.unique(keys)

    # Resolve the number of topics without requiring a global to exist.
    largestNeeded = int(presentTopics.max()) + 1 if presentTopics.size else 0
    if nTopics is None:
        nTopics = globals().get('nTopics', largestNeeded)
    nTopics = max(int(nTopics), largestNeeded)

    # Densify once instead of once per topic.
    if hasattr(vectorizedText, 'toarray'):
        counts = vectorizedText.toarray()
    else:
        counts = np.asarray(vectorizedText)

    # Mean count of every word over the documents of each topic;
    # rows of topics without documents stay zero.
    wordMean = np.zeros((nTopics, counts.shape[1]))
    for topic in presentTopics:
        wordMean[topic] = counts[keys == topic].mean(axis=0)

    # Indices of the n largest means per topic, largest first, and the share
    # of each of those words in the topic total (epsilon avoids division by 0).
    topWordsIndices = np.flip(np.argsort(wordMean, axis=1)[:, -n:], axis=1)
    topValues = np.take_along_axis(wordMean, topWordsIndices, axis=1)
    topicTotals = (np.sum(wordMean, axis=1) + 0.0000001)[:, None]
    topWordsPercentage = (np.divide(topValues, topicTotals) * 100).astype(int)

    # Column index -> word, taken straight from the fitted vocabulary. Words
    # are used as-is, so non-ASCII terms no longer raise an encoding error.
    indexToWord = {index: word for word, index in countVectorizer.vocabulary_.items()}
    topicsWithDocuments = set(presentTopics.tolist())

    # Store all words for all topics
    topWords = []
    for i, (topic, percentage) in enumerate(zip(topWordsIndices, topWordsPercentage)):
        topicWords = []
        if i in topicsWithDocuments:
            for index, percent in zip(topic, percentage):
                topicWords.append('{}% '.format(percent) + indexToWord[int(index)])
        # Store all words for the topic
        topWords.append(', '.join(topicWords))

    return topWords

In [192]:
# Five highest-ranked words for each LSI topic
topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)

# One output line per topic, numbered by its position in the list
for i, words in enumerate(topWords):
    topicLine = 'Topic {}: {}'.format(i, words)
    print(topicLine)

Topic 0: 95% alice, 0% spectacle, 0% notion, 0% treat, 0% archbishop
Topic 1: 17% think, 8% king, 6% cat, 5% good, 4% grow
Topic 2: 49% little, 21% mock, 11% foot, 2% bat, 2% growl
Topic 3: 29% know, 8% right, 4% footman, 3% mouth, 3% fan
Topic 4: 21% look, 15% queen, 11% find, 10% rabbit, 5% caterpillar
Topic 5: 4% come, 3% thing, 2% hatter, 2% gryphon, 2% mouse
Topic 6: 4% begin, 3% head, 2% way, 2% hear, 2% oh
Topic 7: 2% like, 2% time, 1% turtle, 1% voice, 1% try


In [193]:
# Order the (topic, count) pairs by topic id and split them back into two tuples
topicCountPairs = sorted(zip(lsiCategories, lsiCounts))
lsiCategoriesSorted, lsiCountsSorted = zip(*topicCountPairs)

# Axis labels for the topics that occur, and the word summaries used as hover text
topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)
labels = list(map('Topic {}'.format, lsiCategoriesSorted))
hoverText = list(filter(None, topWords))

# One viridis colour per topic
n = nTopics
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(position)) for position in np.arange(0, 1.000001, 1/(n-1))]

# Bar chart: number of documents assigned to each topic
data = go.Bar(x=labels, y=lsiCountsSorted, text=hoverText, marker={'color': colors})
layout = go.Layout(
    title='Most Frequent LSI Topics In {}'.format(subject),
    xaxis={'title': 'Topic'},
    yaxis={'title': 'Count'},
)

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [194]:
# Project the LSI topic matrix down to two dimensions so it can be plotted
tsneModel = TSNE(
    n_components=2,
    perplexity=50,
    learning_rate=100,
    n_iter=2000,
    angle=0.75,
    random_state=0,
    verbose=1,
)
tsneModelVectors = tsneModel.fit_transform(lsiTopicMatrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 9473 samples in 0.032s...
[t-SNE] Computed neighbors for 9473 samples in 0.875s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9473
[t-SNE] Computed conditional probabilities for sample 2000 / 9473
[t-SNE] Computed conditional probabilities for sample 3000 / 9473
[t-SNE] Computed conditional probabilities for sample 4000 / 9473
[t-SNE] Computed conditional probabilities for sample 5000 / 9473
[t-SNE] Computed conditional probabilities for sample 6000 / 9473
[t-SNE] Computed conditional probabilities for sample 7000 / 9473
[t-SNE] Computed conditional probabilities for sample 8000 / 9473
[t-SNE] Computed conditional probabilities for sample 9000 / 9473
[t-SNE] Computed conditional probabilities for sample 9473 / 9473
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.075146
[t-SNE] KL divergence after 2000 iterations: -0.252572


In [195]:
# Number of embedded documents; each topic shows at most points/nTopics of them.
# Taken from the data instead of a hard-coded literal.
points = tsneModelVectors.shape[0]

# Create colormap
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Get n top words
topWords = getTopWords(3, lsiKeys, vectorizedText, countVectorizer)

# Row i of the vectorized text was built from second_filter[i], so the same
# masks that select the coordinates also select the matching hover tokens.
tokens = np.asarray(second_filter)
# Seeded generator so the per-topic sampling is the same on every run
sampler = np.random.RandomState(0)

# Create plot
data = []
# Iterate over each topic
for topic in range(nTopics):
    # Mask for a single topic
    mask = lsiKeys==topic
    # Mask for sampling: mark the first points/nTopics slots, then shuffle them
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(points/nTopics)] = True
    sampler.shuffle(sample_mask)

    scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                         y = tsneModelVectors[mask,1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         # Previously the whole token list was passed here, so
                         # hover labels did not belong to the plotted points.
                         text = tokens[mask][sample_mask].tolist(),
                         marker = dict(color = colors[topic]))
    data.append(scatter)

layout = go.Layout(title = 't-SNE Clustering of {} LSI Topics'.format(nTopics),
                   showlegend=True,
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [196]:
# Create LDA and fit
ldaModel = LatentDirichletAllocation(n_components=nTopics, learning_method='online', random_state=0, verbose=0)
ldaTopicMatrix = ldaModel.fit_transform(vectorizedText)
# The message used to say "LSI" although it reports the LDA matrix.
print('Shape LDA Topic Matrix: {}'.format(ldaTopicMatrix.shape))

# Assign every row to its highest-valued topic, then collect the distinct
# topic ids and how many rows fall into each.
ldaKeys = ldaTopicMatrix.argmax(axis=1)
ldaCategories, ldaCounts = zip(*Counter(ldaKeys).items())

Shape LSI Topic Matrix: (9473, 8)


In [197]:
# Five highest-ranked words for each LDA topic
topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)

# One output line per topic, numbered by its position in the list
for i, words in enumerate(topWords):
    topicLine = 'Topic {}: {}'.format(i, words)
    print(topicLine)

Topic 0: 33% alice, 3% turn, 3% long, 2% away, 1% arm
Topic 1: 5% turtle, 4% voice, 4% mouse, 3% large, 3% dormouse
Topic 2: 11% think, 4% way, 4% hear, 3% duchess, 3% tone
Topic 3: 8% look, 6% time, 4% hatter, 3% oh, 3% great
Topic 4: 10% little, 5% head, 4% cat, 3% try, 2% eye
Topic 5: 5% king, 4% mock, 3% good, 2% reply, 2% word
Topic 6: 8% know, 7% come, 6% thing, 4% gryphon, 3% tell
Topic 7: 7% begin, 7% like, 6% queen, 4% find, 4% rabbit


In [198]:
# Order the (topic, count) pairs by topic id and split them back into two tuples
topicCountPairs = sorted(zip(ldaCategories, ldaCounts))
ldaCategoriesSorted, ldaCountsSorted = zip(*topicCountPairs)

# Axis labels for the topics that occur, and the word summaries used as hover text
topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)
labels = list(map('Topic {}'.format, ldaCategoriesSorted))
hoverText = list(filter(None, topWords))

# One viridis colour per topic
n = nTopics
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(position)) for position in np.arange(0, 1.000001, 1/(n-1))]

# Bar chart: number of documents assigned to each topic
data = go.Bar(x=labels, y=ldaCountsSorted, text=hoverText, marker={'color': colors})
layout = go.Layout(
    title='Most Frequent LDA Topics In {}'.format(subject),
    xaxis={'title': 'Topic'},
    yaxis={'title': 'Count'},
)

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [199]:
# Project the LDA topic matrix down to two dimensions so it can be plotted
tsneModel = TSNE(
    n_components=2,
    perplexity=50,
    learning_rate=100,
    n_iter=2000,
    angle=0.75,
    random_state=0,
    verbose=1,
)
tsneModelVectors = tsneModel.fit_transform(ldaTopicMatrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 9473 samples in 0.032s...
[t-SNE] Computed neighbors for 9473 samples in 0.765s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9473
[t-SNE] Computed conditional probabilities for sample 2000 / 9473
[t-SNE] Computed conditional probabilities for sample 3000 / 9473
[t-SNE] Computed conditional probabilities for sample 4000 / 9473
[t-SNE] Computed conditional probabilities for sample 5000 / 9473
[t-SNE] Computed conditional probabilities for sample 6000 / 9473
[t-SNE] Computed conditional probabilities for sample 7000 / 9473
[t-SNE] Computed conditional probabilities for sample 8000 / 9473
[t-SNE] Computed conditional probabilities for sample 9000 / 9473
[t-SNE] Computed conditional probabilities for sample 9473 / 9473
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 52.588856
[t-SNE] KL divergence after 2000 iterations: -0.135033


In [200]:
# Number of embedded documents; each topic shows at most points/nTopics of them.
# Defined here so the cell does not rely on a value left over from another cell.
points = tsneModelVectors.shape[0]

# Create colormap
n = nTopics
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]

# Get n top words
topWords = getTopWords(3, ldaKeys, vectorizedText, countVectorizer)

# Row i of the vectorized text was built from second_filter[i], so the same
# masks that select the coordinates also select the matching hover tokens.
tokens = np.asarray(second_filter)
# Seeded generator so the per-topic sampling is the same on every run
sampler = np.random.RandomState(0)

# Create plot
data = []
# Iterate over each topic
for topic in range(nTopics):
    # Mask for a single topic
    mask = ldaKeys==topic
    # Mask for sampling: mark the first points/nTopics slots, then shuffle them
    sample_mask = np.zeros(mask.sum()).astype(bool)
    sample_mask[:int(points/nTopics)] = True
    sampler.shuffle(sample_mask)

    scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                         y = tsneModelVectors[mask,1][sample_mask],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         # Previously the whole token list was passed here, so
                         # hover labels did not belong to the plotted points.
                         text = tokens[mask][sample_mask].tolist(),
                         marker = dict(color = colors[topic]))
    data.append(scatter)

layout = go.Layout(title = 't-SNE Clustering of {} LDA Topics'.format(nTopics),
                   showlegend=True,
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)