In [1]:
# TF - Term Frequency
# IDF - Inverse Document Frequency
# TF-IDF = TF * IDF
# Term Frequency = Number of Occurrences of a word in a Document / Number of Words in that Document
# Inverse Document Frequency = Log(Number of Documents / Number of Documents containing the word)
#   (the IDF cell below computes log(N/df + 1), i.e. it adds 1 inside the log)
# TF-IDF = Term Frequency * Inverse Document frequency
# TFIDF(word) = TF(document, word) * IDF(word)
import numpy as np
import nltk
import sys
import sklearn
import heapq # Sort the histo
import re # To Preprocess

In [2]:
# Sample corpus for the TF-IDF walkthrough. Later cells split this text into
# sentences and treat each sentence as one "document".
paragraph = """
What are the qualities of your voice?
The Throat Center. {Chakra of the Month}
Mine is usually calm, soothing and clear. Yoga students often tell me my voice is relaxing, which is a sweet compliment. However, when I’m caught up in a moment of anger, fear or stress, my voice can get sharp and sound anything but calm and relaxing.

Let’s bring more mindfulness to our voices and how we use them this month. Our December chakra focus is our communication center, the throat center.
Just joining in? Read about the first, second, third and fourth chakras.

Chakra author Anothea Judith writes:
Vaguely Relephant Reads:

You Don’t have to be an Alcoholic to be Wasted. Read 

How to Get Over the One you Can’t Get Over. Read 

Chakra five is the center related to communication through sound, vibration, self-expression and creativity.
It is the realm of consciousness that controls, creates, transmits and receives communication, both within ourselves and between each other.
It is the center of dynamic creativity, of synthesizing old ideas into something new.
The fifth chakra color is bright, cerulean or turquoise blue.
Some symptoms of an out-of-balance fifth chakra include suffering from throat-centered illnesses, like strep throat or losing our voice.
Throaty ailments may mean we need to slow down, rest and communicate more clearly, both with ourselves and others, or they may be a result of excess communication or over-sharing—and a sign that we should give our voice a break and intentionally soak in silence.
If our lives are busy, we need to relax and remember how to set boundaries, express our needs and allow time and space to nurture ourselves.
When our throat chakra is healthy and balanced, we express ourselves and communicate clearly, honestly and openly—but not excessively.
Here are some other ways to help balance and align our throat chakra:
Chant
Any time we speak, sing, scream, whisper or make any other sort of sound with our voice, we invoke throat energy. The element of vishuddha chakra is sound, so belt out some long OMs, or whatever other chants you like.
Asana
Warm up by practicing gentle neck stretches in all directions. Many heart openers/back bends are also throat openers. The ultimate is probably fish pose (matsyasana).
Experiment with Music
Try practicing yoga or meditation to healing sounds like Tibetan singing bowls, African drums, ocean waves, rock music—and sometimes in sweet silence.
Mantra Meditation
Perfect for sitting, standing or walking meditation, repeating a silent mantra is a form of meditation that helps us quickly focus and calm the mind. It can be a syllable sound like “Om” or a word or phrase in Sanskrit, English or whatever language you prefer. A nice one is “sat nam,” which means “I am the truth,” in Sanskrit, but there are millions to choose from.
Mindfully Drink Tea
Brew up a mugful of your favorite herbal tea and have your own little ceremony as you meditate and enjoy a delicious, soothing hot drink.
Express Yourself
Speak up! Share something about yourself with someone. Write a poem. Sing. Express your unique voice in whatever way you wish.
On that note, if you’d like to participate in a two-week writing challenge this month, check out the reverb13 community writing project happening here at elephant."""

In [4]:
# Tokenize: split the raw paragraph into a list of sentences. The following
# cells use each sentence as a separate document for the TF and IDF counts.
# NOTE(review): sent_tokenize presumably needs NLTK's Punkt tokenizer data to
# be installed in the environment — confirm before a fresh run.
sentences = nltk.sent_tokenize(paragraph)

In [5]:
# Preprocess: normalise every sentence in place.
#   1. lower-case the text,
#   2. turn each non-word character (punctuation, symbols) into a space,
#   3. squeeze every run of whitespace down to a single space.
for idx, sentence in enumerate(sentences):
    cleaned = re.sub(r'\W', ' ', sentence.lower())
    sentences[idx] = re.sub(r'\s+', ' ', cleaned)

In [6]:
# Vocabulary histogram: word -> total number of occurrences across all
# preprocessed sentences.
word2count = {}
for sentence in sentences:
    for token in nltk.word_tokenize(sentence):
        # dict.get supplies 0 the first time a token is seen.
        word2count[token] = word2count.get(token, 0) + 1

In [7]:
# Keep the 100 most frequent words, ordered from most to least frequent.
# Iterating the dict yields its keys; each key is ranked by its count.
freq_words = sorted(word2count, key=word2count.get, reverse=True)[:100]

In [10]:
# IDF table: word -> inverse document frequency, with each sentence counted
# as one document. The score is log(N / df + 1), where N is the number of
# sentences and df is the number of sentences containing the word.

# Tokenize every sentence once up front instead of once per frequent word.
# Sets make the membership test below a constant-time lookup.
sentence_vocab = [set(nltk.word_tokenize(data)) for data in sentences]

word_idfs = {}
for word in freq_words:
    # freq_words is built from these same sentences, so every word occurs in
    # at least one of them and doc_count is never zero.
    doc_count = sum(1 for vocab in sentence_vocab if word in vocab)
    word_idfs[word] = np.log((len(sentences) / doc_count) + 1)

In [11]:
# TF table: word -> list holding one term frequency per sentence, where
# TF = (occurrences of the word in the sentence) / (tokens in the sentence).

# Tokenize each sentence once instead of twice per (word, sentence) pair.
sentence_tokens = [nltk.word_tokenize(data) for data in sentences]

tf_matrix = {}
for word in freq_words:
    # A sentence that yields no tokens gets TF 0.0 rather than raising
    # ZeroDivisionError on the length division.
    tf_matrix[word] = [
        tokens.count(word) / len(tokens) if tokens else 0.0
        for tokens in sentence_tokens
    ]

In [12]:
# Inspect the TF table (word -> list of per-sentence term frequencies).
tf_matrix

{'and': [0.0,
  0.0,
  0.09090909090909091,
  0.0,
  0.07692307692307693,
  0.06666666666666667,
  0.0,
  0.0,
  0.1111111111111111,
  0.0,
  0.0,
  0.0625,
  0.1,
  0.0,
  0.0,
  0.0,
  0.0851063829787234,
  0.11538461538461539,
  0.15,
  0.02857142857142857,
  0.0,
  0.0,
  0.0,
  0.0,
  0.038461538461538464,
  0.037037037037037035,
  0.0,
  0.0,
  0.07142857142857142,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'the': [0.14285714285714285,
  0.3333333333333333,
  0.09090909090909091,
  0.0,
  0.0,
  0.0,
  0.09090909090909091,
  0.0,
  0.1111111111111111,
  0.0,
  0.08333333333333333,
  0.0625,
  0.05,
  0.07142857142857142,
  0.1,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.05263157894736842,
  0.0,
  0.0,
  0.14285714285714285,
  0.0,
  0.037037037037037035,
  0.0,
  0.047619047619047616,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.03571428571428571],
 'a': [0.0,
  0.0,
  0.0,
  0.07142857142857142,
  0.038461538461538464,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0

In [13]:
# TF-IDF scores: one row per word (in tf_matrix order), one column per
# sentence. Each entry is that sentence's TF multiplied by the word's IDF.
tfidf_matrix = [
    [tf_value * word_idfs[word] for tf_value in tf_values]
    for word, tf_values in tf_matrix.items()
]

In [14]:
# Inspect the TF-IDF scores (one inner list per word, in tf_matrix order).
tfidf_matrix

[[0.0,
  0.0,
  0.11875015031330494,
  0.0,
  0.10048089641895033,
  0.08708344356309028,
  0.0,
  0.0,
  0.14513907260515047,
  0.0,
  0.0,
  0.08164072834039714,
  0.13062516534463542,
  0.0,
  0.0,
  0.0,
  0.11117035348479609,
  0.15072134462842549,
  0.19593774801695313,
  0.037321475812752976,
  0.0,
  0.0,
  0.0,
  0.0,
  0.050240448209475166,
  0.048379690868383486,
  0.0,
  0.0,
  0.09330368953188244,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.17199611490370514,
  0.40132426810864535,
  0.10945207312053964,
  0.0,
  0.0,
  0.0,
  0.10945207312053964,
  0.0,
  0.1337747560362151,
  0.0,
  0.10033106702716134,
  0.07524830027037101,
  0.06019864021629681,
  0.08599805745185257,
  0.12039728043259362,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.06336698970136506,
  0.0,
  0.0,
  0.17199611490370514,
  0.0,
  0.04459158534540504,
  0.0,
  0.05733203830123505,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.042999028725926286],
 [0.0,
  0.0,
  0.0,
  0.11335464689871727,
  0.061037117