In [21]:
import pandas as pd
import numpy as np
import re
import nltk

#### Sample corpus of text documents
Let’s now load some sample text documents, do some basic pre-processing, and learn about various
feature engineering strategies to deal with text data. 

In [22]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animals


#### Simple text pre-processing

In [23]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [24]:
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

#### Bag of Words Model

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus).toarray()
cv_matrix

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [26]:
vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix,columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,1,1,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,1,0,1,0
2,0,0,1,1,1,1,1,0,1,0,0
3,0,1,1,1,1,0,1,0,1,0,0
4,1,1,0,0,0,0,0,0,0,2,1
5,0,0,1,1,1,0,1,0,1,0,0


#### Bag of N-Grams Model

In [27]:
# Bi-grams indicate n-grams of order 2 (two words), Tri-grams indicate n-grams of order 3 (three words), and so on. 
# We can easily extend the bag of words model to use a bag of n-grams model to give us n-gram based feature vectors
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus).toarray()
vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog lazy,fox jumps,fox quick,jumps lazy,lazy brown,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0
3,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
5,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0


#### TF-IDF Model
TF-IDF stands for Term Frequency-Inverse Document Frequency, which uses a combination of two metrics in
its computation, namely: term frequency (tf) and inverse document frequency (idf). This technique was
developed for ranking results for queries in search engines and now it is an indispensable model in the
world of information retrieval and text analytics.

$tfidf(w,D)=tf(w,D)\times\ idf(w,D)=tf(w,D)\times\ log\bigg(\frac{C}{df(w)}\bigg)$

Here, $tfidf (w, D)$ is the TF-IDF score for word w in document D. The term $tf (w, D)$ represents the term
frequency of the word w in document D, which can be obtained from the Bag of Words model. The term
$idf (w, D)$ is the inverse document frequency for the term w, which can be computed as the log transform
of the total number of documents in the corpus C divided by the document frequency of the word w,
which is basically the frequency of documents in the corpus where the word w occurs.

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

['beautiful', 'blue', 'brown', 'dog', 'fox', 'jumps', 'lazy', 'love', 'quick', 'sky', 'today']


Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0
1,0.46,0.39,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.46,0.0
2,0.0,0.0,0.38,0.38,0.38,0.54,0.38,0.0,0.38,0.0,0.0
3,0.0,0.36,0.42,0.42,0.42,0.0,0.42,0.0,0.42,0.0,0.0
4,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.52
5,0.0,0.0,0.45,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.0


#### Document Similarity

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.753128,0.0,0.185447,0.807539,0.0
1,0.753128,1.0,0.0,0.139665,0.608181,0.0
2,0.0,0.0,1.0,0.784362,0.0,0.839987
3,0.185447,0.139665,0.784362,1.0,0.109653,0.933779
4,0.807539,0.608181,0.0,0.109653,1.0,0.0
5,0.0,0.0,0.839987,0.933779,0.0,1.0


#### Clustering documents using similarity features

In [31]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1


### Topic models
There are various techniques for topic modeling and most of them involve some form of matrix
decomposition. Some techniques like Latent Semantic Indexing (LSI) use matrix decomposition operations,
more specifically Singular Valued Decomposition (refer back to important mathematical concepts in
Chapter 1), to split a term-document matrix (transpose of our TF-IDF document-term feature matrix) into
three matrices, U, S & VT. You can use the left singular vectors in matrix U and multiply it by the singular
vectors S to get terms and their weights (signifying importance) per topic. You can use scikit-learn or
gensim to use LSI based topic modeling.

In [41]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
feature = pd.DataFrame(dt_matrix,columns=['T1', 'T2'])
feature

Unnamed: 0,T1,T2
0,0.190548,0.809452
1,0.176804,0.823196
2,0.846184,0.153816
3,0.814863,0.185137
4,0.180516,0.819484
5,0.839172,0.160828


#### Show topics and their weights
Thus, the `dt_matrix` refers to the document-topic matrix giving us two features since we chose number
of topics to be 2. You can also use the other matrix obtained from the decomposition, the topic-term matrix
to see the topics extracted from our corpus using the LDA model using the following code

In [46]:
tt_matrix = lda.components_
print(tt_matrix)
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    topic = sorted(topic, key=lambda x: -x[1])
    topic = [item for item in topic if item[1] > 0.6]
    print(topic)
    print()

[[0.51095945 0.77315732 1.72736387 1.72736387 1.72736387 1.03283253
  1.72736387 0.50974622 1.72736387 0.51098278 0.50968019]
 [1.90682693 1.79962821 0.51310187 0.51310187 0.51310187 0.50977437
  0.51310187 1.14812724 0.51310187 2.26438664 1.00682512]]
[('brown', 1.7273638692668465), ('dog', 1.7273638692668465), ('fox', 1.7273638692668465), ('lazy', 1.7273638692668465), ('quick', 1.7273638692668465), ('jumps', 1.0328325272484777), ('blue', 0.7731573162915626)]

[('sky', 2.264386643135622), ('beautiful', 1.9068269319456903), ('blue', 1.7996282104933266), ('love', 1.148127242397004), ('today', 1.0068251160429935)]



#### Clustering documents using topic model features
The preceding output represents each of the two topics as a collection of terms and their importance
is depicted by the corresponding weight. It is definitely interesting to see that the two topics are quite
distinguishable from each other by looking at the terms. The first topic shows terms relevant to animals and
the second topic shows terms relevant to weather. This is reinforced by applying our unsupervised K-means
clustering algorithm on our document-topic feature matrix (dt_matrix) using the following code snippet.

In [60]:
km = KMeans(2)    
km.fit(feature)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df,cluster_labels],axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1


### Word Embeddings
- **size**: Represents the feature vector size for each word in the corpus when
transformed.
- **window**: Sets the context window size specifying the length of the window of words to
be taken into account as belonging to a single, similar context when training.
- **min_count**: Specifies the minimum word frequency value needed across the corpus
to consider the word as a part of the final vocabulary during training the model.
- **sample**: Used to downsample the effects of words which occur very frequently.

In [62]:
from gensim.models import word2vec

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Set values for various parameters
feature_size = 10    # Word vector dimensionality  
window_context = 10          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, 
                          window=window_context, min_count = min_word_count,
                          sample=sample)

TypeError: 'Word2VecKeyedVectors' object is not callable

In [63]:
w2v_model.wv('sky')

TypeError: 'Word2VecKeyedVectors' object is not callable

A question might arise in your mind now that so far, we had feature vectors for each complete
document, but now we have vectors for each word. How on earth do we represent entire documents now?
We can do that using various aggregation and combinations. A simple scheme would be to use an averaged
word vector representation, where we simply sum all the word vectors occurring in a document and then
divide by the count of word vectors to represent an averaged word vector for the document. The following
code enables us to do the same

In [64]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    
   
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [65]:
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
pd.DataFrame(w2v_feature_array)

  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.028813,0.014459,-0.001859,0.033134,-0.015159,-0.017498,-0.015926,-0.035119,0.012561,0.010375
1,-0.027867,0.021286,-0.000418,0.014098,-0.015677,-0.00465,-0.017075,-0.033211,0.003152,-0.001771
2,0.010377,-0.006664,-0.004633,0.000897,0.001042,0.013745,-0.002968,0.002782,0.002518,0.00525
3,-0.006176,0.000572,-0.003033,0.011313,-0.008103,0.010579,-0.005232,7.1e-05,0.008494,0.004596
4,-0.024877,0.017566,0.004884,0.032732,-0.005788,-0.010325,-0.015727,-0.033549,0.008134,0.01612
5,0.002542,0.000231,-0.009711,0.005324,-0.001228,0.011612,0.003586,0.005812,0.011193,0.013413


Thus, we have our averaged word vector based feature set for all our corpus documents, as depicted
by the dataframe. Let’s use a different clustering algorithm this time known as **Affinity
Propagation** to try to cluster our documents based on these new features. Affinity Propagation is based on
the concept of message passing and you do not need to specify the number of clusters beforehand like you
did in K-means clustering

In [72]:
from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation()
ap.fit(w2v_feature_array)
cluster_labels = ap.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1
