In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math


nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/pt-lp048/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
df=pd.read_csv("transcripts.csv") 
data=df

# Preprocess

In [4]:
def convert_lower_case(data):
    return np.char.lower(data)


In [5]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text


In [6]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data


In [7]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")


In [8]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text


In [9]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [10]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

In [17]:
processed_text = []
processed_title = []

for i in data.index:
    text = data['transcript'][i].strip()
    processed_text.append(word_tokenize(str(preprocess(text))))
#     processed_title.append(word_tokenize(str(preprocess(i[1]))))

# print(processed_text)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
DF = {}

for i in range(len(data)):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

for i in DF:
    DF[i] = len(DF[i])


In [19]:
DF

{'good': 1707,
 'morn': 498,
 'laughter': 1812,
 'great': 1394,
 'hasnt': 197,
 'ive': 1418,
 'blown': 71,
 'away': 1192,
 'whole': 1152,
 'thing': 2176,
 'fact': 1424,
 'im': 1995,
 'leav': 852,
 'three': 1846,
 'theme': 139,
 'run': 1029,
 'confer': 252,
 'relev': 149,
 'want': 2130,
 'talk': 1736,
 'one': 2415,
 'extraordinari': 265,
 'evid': 358,
 'human': 1374,
 'creativ': 396,
 'present': 609,
 'weve': 1331,
 'peopl': 2198,
 'varieti': 164,
 'rang': 315,
 'second': 1265,
 'put': 1689,
 'us': 2111,
 'place': 1488,
 'idea': 1446,
 'go': 2278,
 'happen': 1808,
 'term': 806,
 'futur': 905,
 'may': 1113,
 'play': 969,
 'interest': 1240,
 'educ': 591,
 'actual': 1843,
 'find': 1558,
 'everybodi': 681,
 'dont': 2057,
 'dinner': 197,
 'parti': 294,
 'say': 1959,
 'work': 2080,
 'often': 868,
 'frankli': 118,
 'ask': 1552,
 'never': 1414,
 'back': 1917,
 'curiou': 177,
 'strang': 307,
 'somebodi': 599,
 'know': 2195,
 'see': 2185,
 'blood': 310,
 'face': 967,
 'theyr': 1606,
 'like': 2367

In [20]:
    total_vocab_size = len(DF)


In [21]:
total_vocab = [x for x in DF]


In [22]:
print(total_vocab[:20])


['good', 'morn', 'laughter', 'great', 'hasnt', 'ive', 'blown', 'away', 'whole', 'thing', 'fact', 'im', 'leav', 'three', 'theme', 'run', 'confer', 'relev', 'want', 'talk']


In [23]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [32]:
doc = 0

tf_idf = {}

for i in range(len(data)):
    
    tokens = processed_text[i]
    counter = Counter(tokens)#converts transcript into hash table word:count of word in that doc
    words_count = len(tokens)#total number of words in the transcript
    for token in np.unique(tokens):
     
        tf = counter[token]/words_count#count of word in this doc/total no. of words in doc
        df = doc_freq(token)#count of this word in whole set of docs
        idf = np.log(((len(data))+1)/(df+1))#log(N+1/(whole_count+1))
        tf_idf[doc, token] = tf*idf

    doc += 1

Counter({'laughter': 39, 'educ': 28, 'think': 26, 'said': 22, 'thing': 16, 'one': 16, 'peopl': 16, 'know': 13, 'way': 11, 'dont': 10, 'say': 10, 'like': 10, 'school': 10, 'come': 10, 'want': 9, 'creativ': 9, 'see': 9, 'kid': 9, 'system': 9, 'talk': 8, 'human': 8, 'theyr': 8, 'children': 8, 'year': 8, 'realli': 8, 'get': 8, 'whole': 7, 'go': 7, 'world': 7, 'went': 7, 'wrong': 7, 'gillian': 7, 'im': 6, 'futur': 6, 'interest': 6, 'actual': 6, 'work': 6, 'never': 6, 'capac': 6, 'didnt': 6, 'head': 6, 'move': 6, 'life': 6, 'earth': 6, 'isnt': 6, 'danc': 6, 'intellig': 6, 'wonder': 6, 'idea': 5, 'happen': 5, 'four': 5, 'look': 5, 'speak': 5, 'minut': 5, 'girl': 5, 'draw': 5, 'boy': 5, 'shakespear': 5, 'would': 5, 'need': 5, 'job': 5, 'ballet': 5, 'mother': 5, 'three': 4, 'run': 4, 'extraordinari': 4, 'weve': 4, 'put': 4, 'often': 4, 'night': 4, 'right': 4, 'two': 4, 'time': 4, 'wasnt': 4, 'talent': 4, 'applau': 4, 'thank': 4, 'heard': 4, 'got': 4, 'grow': 4, 'rather': 4, 'live': 4, 'stratfor

Counter({'one': 34, 'world': 23, 'africa': 22, 'realli': 19, 'like': 18, 'live': 18, 'know': 14, 'america': 14, 'wish': 13, 'get': 13, 'chang': 13, 'peopl': 13, 'want': 12, 'need': 12, 'think': 12, 'would': 12, 'see': 11, 'make': 11, 'could': 11, 'us': 11, 'well': 10, 'much': 10, 'believ': 10, 'actual': 10, 'start': 10, 'everi': 10, 'right': 10, 'time': 9, 'lot': 9, 'talk': 9, 'ethiopia': 9, 'look': 9, 'idea': 9, 'laughter': 8, 'cant': 8, 'aid': 8, 'thing': 8, 'million': 8, 'american': 8, 'say': 8, 'histori': 8, 'first': 7, 'moment': 7, 'im': 7, 'person': 7, 'digit': 7, 'kind': 7, 'day': 7, 'give': 7, 'said': 6, 'turn': 6, 'life': 6, 'anyway': 6, 'contin': 6, 'feel': 6, 'poverti': 6, 'transform': 6, 'help': 6, 'anoth': 5, 'countri': 5, 'technolog': 5, 'excit': 5, 'got': 5, 'year': 5, 'big': 5, 'mind': 5, 'cau': 5, 'african': 5, 'equal': 5, 'problem': 5, 'afford': 5, 'fact': 5, 'line': 5, 'war': 5, 'togeth': 5, 'movement': 5, 'campaign': 5, 'three': 5, 'call': 4, 'thank': 4, 'half': 4, 

Counter({'life': 16, 'earth': 10, 'time': 8, 'first': 8, 'water': 8, 'like': 7, 'today': 7, 'still': 7, 'new': 7, 'turn': 6, 'form': 6, 'becam': 6, 'plant': 6, 'emerg': 5, 'becom': 5, 'aro': 5, 'world': 5, 'journey': 4, 'stori': 4, 'thing': 4, 'stromatolit': 4, 'came': 4, 'differ': 4, 'oxygen': 4, 'shape': 4, 'learn': 4, 'sea': 4, 'grew': 4, 'bird': 4, 'went': 3, 'back': 3, 'way': 3, 'sky': 3, 'live': 3, 'go': 3, 'breath': 3, 'mold': 3, 'made': 3, 'forc': 3, 'membran': 3, 'among': 3, 'land': 3, 'echo': 3, 'got': 3, 'mammal': 3, 'natur': 2, 'ive': 2, 'mani': 2, 'year': 2, 'ago': 2, 'one': 2, 'id': 2, 'share': 2, 'australia': 2, 'hope': 2, 'see': 2, 'place': 2, 'sen': 2, 'exhal': 2, 'fire': 2, 'alien': 2, 'planet': 2, 'born': 2, 'around': 2, 'mar': 2, 'multipli': 2, 'grow': 2, 'atmosph': 2, 'cell': 2, 'nurtur': 2, 'earli': 2, 'move': 2, 'began': 2, 'long': 2, 'follow': 2, 'adapt': 2, 'co': 2, 'fungi': 2, 'alga': 2, 'rock': 2, 'upright': 2, 'size': 2, 'later': 2, 'hard': 2, 'ventur': 2, '

Counter({'us': 21, 'year': 19, 'human': 19, 'see': 13, 'know': 12, 'like': 12, 'tell': 11, 'million': 10, 'would': 10, 'ancestor': 9, 'speci': 9, 'look': 9, 'africa': 8, 'place': 8, 'first': 8, 'make': 8, 'today': 8, 'three': 8, 'dont': 8, 'time': 8, 'fossil': 8, 'question': 8, 'teeth': 8, 'find': 7, 'selam': 7, 'ago': 7, 'way': 6, 'evid': 6, 'walk': 6, 'use': 6, 'call': 6, 'think': 6, 'five': 6, 'bone': 6, 'brain': 6, 'come': 5, 'long': 5, 'say': 5, 'african': 5, 'upright': 5, 'child': 5, 'old': 5, 'girl': 5, 'live': 5, 'give': 5, 'chimpanz': 5, 'inform': 5, 'actual': 5, 'go': 5, 'game': 5, 'park': 5, 'cour': 5, 'peopl': 5, 'still': 5, 'grow': 5, 'nineti': 4, 'took': 4, 'technolog': 4, 'form': 4, 'skeleton': 4, 'die': 4, 'belong': 4, 'found': 4, 'ethiopia': 4, 'mani': 4, 'learn': 4, 'analysi': 4, 'laughter': 4, 'help': 4, 'environ': 4, 'said': 4, 'excit': 4, 'featur': 4, 'babi': 4, 'much': 4, 'posit': 4, 'past': 3, 'right': 3, 'region': 3, 'percent': 3, 'earliest': 3, 'im': 3, 'job': 

Counter({'one': 24, 'go': 19, 'thing': 18, 'see': 18, 'like': 17, 'littl': 15, 'time': 15, 'way': 14, 'actual': 12, 'get': 12, 'kind': 12, 'theyr': 12, 'year': 12, 'think': 11, 'simpl': 11, 'two': 11, 'shape': 11, 'hundr': 11, 'look': 10, 'understand': 10, 'children': 10, 'car': 10, 'teacher': 9, 'complex': 8, 'make': 8, 'old': 8, 'six': 8, 'say': 7, 'tri': 7, 'im': 7, 'put': 7, 'drop': 7, 'speed': 7, 'start': 6, 'would': 6, 'find': 6, 'thought': 6, 'fact': 6, 'world': 6, 'learn': 6, 'idea': 6, 'second': 6, 'first': 6, 'wheel': 6, 'yesterday': 5, 'object': 5, 'order': 5, 'draw': 5, 'imag': 5, 'everi': 5, 'well': 5, 'measur': 5, 'saw': 5, 'happen': 5, 'four': 5, 'student': 5, 'someth': 5, 'million': 5, 'kid': 5, 'proof': 5, 'next': 5, 'steer': 5, 'increa': 5, 'ball': 5, 'take': 4, 'point': 4, 'rosl': 4, 'might': 4, 'fool': 4, 'side': 4, 'cant': 4, 'size': 4, 'ill': 4, 'use': 4, 'talk': 4, 'call': 4, 'insid': 4, 'differ': 4, 'goe': 4, 'triangl': 4, 'five': 4, 'nine': 4, 'number': 4, 'acc

Counter({'life': 22, 'go': 16, 'look': 16, 'surfac': 14, 'like': 14, 'live': 13, 'one': 13, 'could': 12, 'creatur': 12, 'sun': 12, 'ocean': 12, 'cour': 11, 'europa': 11, 'would': 11, 'object': 11, 'sunlight': 11, 'happen': 10, 'cold': 10, 'much': 9, 'laughter': 9, 'univ': 9, 'year': 8, 'fact': 8, 'talk': 8, 'im': 8, 'thing': 8, 'place': 8, 'detect': 8, 'radio': 8, 'know': 7, 'grow': 7, 'planet': 7, 'great': 6, 'time': 6, 'anim': 6, 'someth': 6, 'astronomi': 6, 'jupit': 6, 'ice': 6, 'find': 6, 'move': 6, 'anoth': 6, 'rememb': 5, 'littl': 5, 'princeton': 5, 'new': 5, 'peopl': 5, 'away': 5, 'take': 5, 'interest': 5, 'earth': 5, 'call': 5, 'origin': 5, 'still': 5, 'reflect': 5, 'turn': 4, 'twenti': 4, 'want': 4, 'realli': 4, 'soon': 4, 'breeder': 4, 'us': 4, 'way': 4, 'real': 4, 'estat': 4, 'dont': 4, 'probabl': 4, 'hard': 4, 'thick': 4, 'onto': 4, 'air': 4, 'concentr': 4, 'long': 4, 'well': 4, 'shine': 4, 'get': 4, 'run': 4, 'hundr': 3, 'battl': 3, 'import': 3, 'point': 3, 'five': 3, 'act

Counter({'kind': 29, 'link': 29, 'one': 25, 'thing': 24, 'interest': 18, 'know': 17, 'lot': 17, 'peopl': 16, 'right': 15, 'googl': 15, 'emerg': 14, 'look': 13, 'start': 13, 'think': 13, 'page': 13, 'web': 12, 'put': 11, 'well': 11, 'talk': 11, 'basic': 10, 'littl': 10, 'way': 10, 'citi': 10, 'two': 9, 'laughter': 9, 'power': 9, 'see': 9, 'chang': 9, 'person': 9, 'would': 9, 'get': 9, 'new': 9, 'like': 9, 'live': 8, 'could': 8, 'realli': 8, 'actual': 8, 'say': 8, 'search': 8, 'weblog': 8, 'want': 7, 'happen': 7, 'share': 7, 'great': 7, 'work': 7, 'system': 7, 'blog': 7, 'percent': 7, 'take': 6, 'seen': 6, 'day': 6, 'somebodi': 6, 'book': 6, 'street': 6, 'much': 6, 'make': 6, 'thousand': 6, 'hundr': 6, 'sen': 6, 'twenti': 6, 'go': 6, 'question': 6, 'pictur': 5, 'time': 5, 'densiti': 5, 'bit': 5, 'obviou': 5, 'neighborhood': 5, 'whole': 5, 'decid': 5, 'click': 5, 'steven': 5, 'ill': 5, 'site': 5, 'someth': 5, 'didnt': 5, 'term': 5, 'publish': 4, 'west': 4, 'thought': 4, 'first': 4, 'still

Counter({'second': 48, 'life': 48, 'know': 46, 'think': 41, 'go': 37, 'world': 37, 'like': 35, 'peopl': 34, 'thing': 32, 'virtual': 24, 'kind': 23, 'use': 21, 'one': 20, 'idea': 18, 'would': 18, 'way': 18, 'realli': 17, 'space': 16, 'want': 15, 'inform': 15, 'time': 14, 'internet': 14, 'sort': 14, 'jh': 14, 'real': 13, 'happen': 13, 'experi': 13, 'talk': 12, 'make': 12, 'exampl': 12, 'creat': 12, 'say': 12, 'build': 11, 'could': 11, 'someth': 11, 'gener': 11, 'human': 11, 'possibl': 11, 'right': 11, 'pr': 11, 'chang': 11, 'differ': 10, 'big': 10, 'question': 10, 'put': 10, 'mani': 10, 'us': 10, 'web': 10, 'im': 9, 'see': 9, 'anyth': 9, 'togeth': 9, 'avatar': 9, 'look': 8, 'get': 8, 'first': 8, 'creativ': 8, 'comput': 8, 'built': 8, 'thousand': 8, 'two': 8, 'five': 8, 'well': 8, 'mean': 8, 'thought': 7, 'actual': 7, 'lot': 7, 'great': 7, 'percent': 7, 'critic': 7, 'rememb': 7, 'age': 7, 'littl': 6, 'interact': 6, 'ask': 6, 'alway': 6, 'amount': 6, 'find': 6, 'fact': 6, 'twenti': 6, 'ter

Counter({'tree': 32, 'peopl': 27, 'one': 20, 'look': 19, 'forest': 16, 'get': 16, 'year': 15, 'start': 14, 'also': 14, 'go': 14, 'area': 14, 'plant': 14, 'like': 13, 'grow': 13, 'use': 13, 'palm': 11, 'orangutan': 10, 'three': 10, 'make': 10, 'thousand': 9, 'come': 9, 'see': 9, 'fire': 9, 'local': 9, 'need': 9, 'provid': 8, 'creat': 8, 'place': 8, 'land': 8, 'work': 8, 'sugar': 8, 'back': 7, 'sure': 7, 'everi': 7, 'much': 7, 'famili': 7, 'grass': 7, 'speci': 7, 'recip': 7, 'soil': 7, 'actual': 7, 'crop': 7, 'incom': 7, 'babi': 6, 'two': 6, 'thing': 6, 'mani': 6, 'know': 6, 'timber': 6, 'left': 6, 'bit': 6, 'water': 6, 'hundr': 6, 'measur': 6, 'want': 6, 'rainfal': 6, 'produc': 6, 'first': 5, 'way': 5, 'still': 5, 'put': 5, 'realli': 5, 'percent': 5, 'later': 5, 'happen': 5, 'fertil': 5, 'combin': 5, 'would': 5, 'differ': 5, 'time': 5, 'rain': 5, 'live': 4, 'quit': 4, 'applau': 4, 'save': 4, 'everyon': 4, 'right': 4, 'oil': 4, 'problem': 4, 'twenti': 4, 'materi': 4, 'dont': 4, 'solut': 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Counter({'patient': 38, 'one': 19, 'care': 17, 'know': 16, 'said': 16, 'go': 15, 'data': 14, 'thing': 13, 'health': 13, 'cancer': 12, 'year': 11, 'doctor': 11, 'well': 11, 'talk': 10, 'say': 9, 'im': 9, 'look': 9, 'like': 9, 'medic': 9, 'went': 9, 'get': 8, 'want': 8, 'help': 8, 'nine': 8, 'someth': 8, 'inform': 7, 'found': 7, 'start': 7, 'let': 7, 'thousand': 7, 'dont': 7, 'dave': 6, 'first': 6, 'peopl': 6, 'back': 6, 'could': 6, 'laughter': 6, 'think': 6, 'grow': 6, 'two': 6, 'got': 6, 'scan': 6, 'kidney': 6, 'googl': 6, 'websit': 6, 'amaz': 5, 'stori': 5, 'take': 5, 'fact': 5, 'find': 5, 'hospit': 5, 'hundr': 5, 'realli': 5, 'came': 5, 'saw': 5, 'actual': 5, 'bodi': 5, 'big': 5, 'term': 4, 'four': 4, 'mean': 4, 'tell': 4, 'way': 4, 'week': 4, 'earth': 4, 'access': 4, 'disea': 4, 'home': 4, 'lung': 4, 'make': 4, 'give': 4, 'muscl': 4, 'ive': 4, 'onlin': 4, 'cant': 4, 'read': 4, 'end': 4, 'tumor': 4, 'thought': 4, 'number': 4, 'come': 4, 'damn': 4, 'raw': 4, 'heard': 3, 'today': 3, 'a

Counter({'oxytocin': 43, 'peopl': 27, 'moral': 22, 'money': 18, 'us': 17, 'found': 15, 'experi': 14, 'one': 14, 'make': 13, 'know': 12, 'brain': 11, 'molecul': 11, 'want': 10, 'relea': 10, 'trust': 10, 'give': 10, 'person': 9, 'said': 9, 'also': 8, 'go': 8, 'trustworthi': 8, 'two': 8, 'dont': 8, 'show': 8, 'get': 8, 'hundr': 8, 'way': 8, 'feel': 8, 'connect': 8, 'think': 7, 'see': 7, 'measur': 7, 'use': 7, 'tell': 7, 'social': 6, 'say': 6, 'laughter': 6, 'men': 6, 'blood': 6, 'percent': 6, 'guy': 6, 'testosteron': 6, 'creatur': 5, 'need': 5, 'mother': 5, 'like': 5, 'call': 5, 'idea': 5, 'world': 5, 'cant': 5, 'cau': 5, 'produc': 5, 'thousand': 5, 'let': 5, 'second': 5, 'share': 5, 'system': 5, 'wed': 5, 'love': 5, 'hug': 5, 'lot': 4, 'ten': 4, 'would': 4, 'women': 4, 'right': 4, 'first': 4, 'half': 4, 'life': 4, 'keep': 4, 'countri': 4, 'im': 4, 'lab': 4, 'back': 4, 'stranger': 4, 'fact': 4, 'didnt': 4, 'increa': 4, 'investig': 4, 'took': 4, 'empathi': 4, 'immor': 4, 'day': 4, 'ill': 4

Counter({'women': 34, 'busi': 21, 'would': 10, 'world': 10, 'entrepreneur': 9, 'job': 9, 'invest': 8, 'econom': 8, 'met': 8, 'much': 8, 'think': 8, 'except': 8, 'see': 7, 'one': 7, 'work': 7, 'time': 7, 'said': 7, 'come': 7, 'year': 6, 'start': 6, 'talk': 6, 'market': 6, 'dollar': 6, 'small': 5, 'dont': 5, 'stori': 5, 'turn': 5, 'went': 5, 'five': 5, 'financ': 5, 'around': 5, 'countri': 5, 'even': 5, 'girl': 5, 'know': 5, 'peopl': 5, 'could': 5, 'loan': 5, 'way': 4, 'us': 4, 'great': 4, 'school': 4, 'differ': 4, 'go': 4, 'two': 4, 'thousand': 4, 'actual': 4, 'couldnt': 4, 'surviv': 4, 'employ': 4, 'micro': 4, 'say': 4, 'incr': 4, 'interest': 4, 'seek': 4, 'need': 4, 'emerg': 4, 'make': 4, 'cant': 3, 'write': 3, 'conflict': 3, 'left': 3, 'news': 3, 'age': 3, 'earn': 3, 'famili': 3, 'percent': 3, 'nearli': 3, 'men': 3, 'help': 3, 'import': 3, 'never': 3, 'laughter': 3, 'gayl': 3, 'first': 3, 'mani': 3, 'imf': 3, 'offici': 3, 'twenti': 3, 'applau': 3, 'word': 3, 'beyond': 3, 'theyr': 3, '

Counter({'laughter': 27, 'book': 26, 'one': 17, 'stori': 15, 'like': 13, 'go': 12, 'design': 11, 'look': 10, 'get': 10, 'first': 7, 'want': 6, 'im': 6, 'thing': 6, 'know': 6, 'right': 6, 'two': 5, 'ink': 5, 'paper': 5, 'new': 5, 'hundr': 5, 'came': 5, 'said': 5, 'dont': 5, 'put': 5, 'see': 5, 'differ': 5, 'ladi': 4, 'use': 4, 'someth': 4, 'start': 4, 'knopf': 4, 'publish': 4, 'job': 4, 'best': 4, 'form': 4, 'word': 4, 'cover': 4, 'hepburn': 4, 'dietrich': 4, 'went': 4, 'page': 4, 'took': 4, 'piec': 4, 'would': 4, 'author': 4, 'talk': 4, 'reason': 3, 'give': 3, 'happen': 3, 'public': 3, 'becom': 3, 'life': 3, 'graphic': 3, 'york': 3, 'upon': 3, 'nine': 3, 'art': 3, 'idea': 3, 'need': 3, 'content': 3, 'day': 3, 'pictur': 3, 'appl': 3, 'say': 3, 'dinosaur': 3, 'hand': 3, 'tape': 3, 'us': 3, 'love': 3, 'even': 3, 'back': 3, 'michael': 3, 'applau': 3, 'call': 3, 'think': 3, 'respon': 3, 'take': 3, 'much': 3, 'opposit': 3, 'type': 3, 'print': 3, 'shelf': 3, 'text': 3, 'plane': 3, 'good': 2, 

Counter({'tesla': 13, 'nt': 8, 'hundr': 7, 'mt': 7, 'one': 5, 'nikola': 5, 'could': 4, 'idea': 4, 'world': 4, 'use': 3, 'begin': 3, 'thousand': 3, 'six': 3, 'work': 3, 'mind': 3, 'would': 3, 'invent': 3, 'becam': 3, 'perform': 2, 'illu': 2, 'tanagra': 2, 'theater': 2, 'mirror': 2, 'creat': 2, 'let': 2, 'lit': 2, 'babi': 2, 'name': 2, 'two': 2, 'thirti': 2, 'imag': 2, 'instantli': 2, 'imagin': 2, 'great': 2, 'age': 2, 'electr': 2, 'scienc': 2, 'applau': 2, 'telegraphi': 2, 'point': 2, 'citi': 2, 'still': 2, 'man': 2, 'magician': 1, 'im': 1, 'alway': 1, 'interest': 1, 'incorpor': 1, 'element': 1, 'remark': 1, 'popular': 1, 'earli': 1, 'part': 1, '20th': 1, 'centuri': 1, 'tini': 1, 'peopl': 1, 'miniatur': 1, 'stage': 1, 'wont': 1, 'digit': 1, 'tribut': 1, 'stori': 1, 'dark': 1, 'stormi': 1, 'night': 1, 'realli': 1, '10th': 1, 'juli': 1, 'eight': 1, 'fifti': 1, 'lightn': 1, 'sky': 1, 'born': 1, 'grew': 1, 'smart': 1, 'guy': 1, 'show': 1, 'multipli': 1, 'five': 1, 'result': 1, 'eighteen': 1

Counter({'peopl': 17, 'slaveri': 15, 'work': 15, 'like': 13, 'slave': 12, 'enslav': 11, 'children': 10, 'hundr': 9, 'one': 8, 'day': 8, 'year': 8, 'famili': 8, 'forc': 8, 'mine': 7, 'shaft': 7, 'free': 7, 'dont': 7, 'mani': 7, 'see': 6, 'met': 6, 'today': 6, 'live': 6, 'theyr': 6, 'thousand': 6, 'world': 6, 'help': 6, 'even': 6, 'would': 6, 'hour': 6, 'water': 6, 'want': 6, 'talk': 5, 'head': 5, 'hole': 5, 'often': 5, 'know': 5, 'us': 5, 'back': 5, 'get': 5, 'anoth': 5, 'lake': 5, 'becom': 5, 'feet': 4, 'men': 4, 'stone': 4, 'make': 4, 'three': 4, 'hand': 4, 'still': 4, 'without': 4, 'go': 4, 'two': 4, 'start': 4, 'time': 4, 'entir': 4, 'yet': 4, 'women': 4, 'carri': 4, 'made': 4, 'id': 4, 'run': 4, 'could': 4, 'road': 4, 'difficult': 4, 'sex': 4, 'traffick': 4, 'safe': 4, 'fish': 4, 'kofi': 4, 'imag': 4, 'fifti': 3, 'ghana': 3, 'dust': 3, 'feel': 3, 'dark': 3, 'much': 3, 'hear': 3, 'rememb': 3, 'stand': 3, 'die': 3, 'got': 3, 'home': 3, 'twenti': 3, 'within': 3, 'trade': 3, 'better': 

Counter({'laughter': 23, 'one': 17, 'know': 11, 'go': 11, 'littl': 11, 'four': 10, 'thank': 10, 'would': 10, 'run': 10, 'got': 10, 'get': 10, 'dont': 7, 'start': 7, 'like': 7, 'oh': 7, 'actual': 6, 'okay': 6, 'come': 6, 'brain': 6, 'peopl': 5, 'ill': 5, 'bad': 5, 'im': 5, 'applau': 5, 'abu': 5, 'voic': 5, 'thousand': 5, 'everi': 5, 'news': 5, 'wed': 5, 'mental': 4, 'good': 4, 'even': 4, 'thing': 4, 'chemic': 4, 'talk': 4, 'eat': 4, 'didnt': 4, 'friend': 4, 'show': 4, 'realli': 4, 'hear': 4, 'hundr': 4, 'kill': 4, 'see': 4, 'zizz': 4, 'god': 4, 'two': 3, 'real': 3, 'think': 3, 'complet': 3, 'lot': 3, 'girli': 3, 'neuron': 3, 'want': 3, 'might': 3, 'let': 3, 'say': 3, 'weve': 3, 'long': 3, 'state': 3, 'lobe': 3, 'fill': 3, 'theyr': 3, 'insan': 3, 'suddenli': 3, 'constant': 3, 'happi': 3, 'yeah': 2, 'isnt': 2, 'right': 2, 'mother': 2, 'crawl': 2, 'around': 2, 'kind': 2, 'well': 2, 'alway': 2, 'breakdown': 2, 'happen': 2, 'daughter': 2, 'day': 2, 'back': 2, 'sausag': 2, 'went': 2, 'everybo

Counter({'mind': 25, 'time': 18, 'go': 14, 'thought': 14, 'life': 12, 'way': 11, 'get': 11, 'thing': 10, 'know': 10, 'medit': 9, 'take': 7, 'moment': 7, 'focu': 7, 'realli': 7, 'present': 7, 'like': 6, 'see': 6, 'back': 6, 'emot': 6, 'dont': 6, 'might': 6, 'learn': 6, 'littl': 6, 'need': 6, 'noth': 5, 'sit': 5, 'right': 5, 'stress': 5, 'distract': 5, 'peopl': 5, 'sort': 5, 'much': 5, 'bit': 5, 'everi': 4, 'kind': 4, 'look': 4, 'round': 4, 'deal': 4, 'quit': 4, 'differ': 4, 'start': 4, 'becom': 4, 'chang': 4, 'relax': 4, 'feel': 4, 'anxiou': 4, 'live': 3, 'alway': 3, 'think': 3, 'ten': 3, 'minut': 3, 'even': 3, 'lot': 3, 'laughter': 3, 'talk': 3, 'experi': 3, 'spend': 3, 'difficult': 3, 'assum': 3, 'also': 3, 'happen': 3, 'one': 3, 'work': 3, 'monk': 3, 'let': 3, 'lost': 3, 'almost': 3, 'potenti': 3, 'step': 3, 'come': 3, 'worri': 3, 'busi': 2, 'world': 2, 'often': 2, 'someth': 2, 'id': 2, 'last': 2, 'say': 2, 'mean': 2, 'eat': 2, 'face': 2, 'long': 2, 'upon': 2, 'best': 2, 'everyth': 2

Counter({'theyr': 26, 'go': 24, 'peopl': 24, 'one': 24, 'two': 22, 'see': 20, 'brain': 19, 'right': 18, 'play': 18, 'game': 17, 'think': 17, 'theori': 16, 'activ': 16, 'chimp': 14, 'like': 13, 'might': 13, 'time': 13, 'player': 13, 'use': 12, 'pick': 12, 'differ': 12, 'left': 12, 'get': 11, 'want': 11, 'three': 11, 'littl': 10, 'lot': 10, 'number': 10, 'im': 9, 'know': 9, 'step': 9, 'actual': 9, 'everyon': 8, 'someth': 8, 'say': 8, 'realli': 8, 'equilibrium': 8, 'human': 8, 'bit': 7, 'predict': 7, 'case': 7, 'comput': 7, 'averag': 7, 'well': 7, 'dont': 7, 'way': 7, 'fifti': 7, 'behavior': 7, 'first': 7, 'make': 7, 'inform': 7, 'money': 7, 'tri': 7, 'mismatch': 7, 'bargain': 6, 'data': 6, 'kind': 6, 'interest': 6, 'much': 6, 'studi': 6, 'look': 6, 'uninform': 6, 'disagr': 6, 'give': 5, 'el': 5, 'thing': 5, 'win': 5, 'thirti': 5, 'better': 5, 'four': 5, 'call': 5, 'happen': 5, 'close': 5, 'around': 5, 'trial': 5, 'area': 5, 'region': 5, 'also': 5, 'dollar': 5, 'deal': 5, 'arrow': 5, 'rew

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Counter({'world': 28, 'chang': 27, 'thousand': 25, 'climat': 25, 'two': 22, 'one': 16, 'peopl': 16, 'year': 16, 'countri': 15, 'human': 14, 'issu': 14, 'hundr': 14, 'right': 12, 'presid': 11, 'develop': 11, 'nine': 10, 'much': 10, 'emiss': 10, 'five': 10, 'cour': 9, 'energi': 9, 'togeth': 8, 'problem': 8, 'forti': 7, 'would': 7, 'safe': 7, 'behind': 7, 'need': 7, 'must': 7, 'renew': 7, 'justic': 6, 'ireland': 6, 'commun': 6, 'build': 6, 'ind': 6, 'go': 6, 'think': 6, 'realli': 6, 'come': 6, 'left': 6, 'move': 6, 'live': 6, 'four': 5, 'first': 5, 'island': 5, 'unit': 5, 'part': 5, 'didnt': 5, 'know': 5, 'dont': 5, 'look': 5, 'degr': 5, 'billion': 5, 'big': 5, 'challeng': 5, 'rapidli': 5, 'back': 5, 'new': 5, 'sustain': 5, 'start': 4, 'laughter': 4, 'stay': 4, 'call': 4, 'time': 4, 'solidar': 4, 'could': 4, 'impact': 4, 'tong': 4, 'face': 4, 'knew': 4, 'nation': 4, 'women': 4, 'enough': 4, 'destroy': 4, 'struck': 4, 'around': 4, 'moral': 4, 'busi': 4, 'carbon': 4, 'fifti': 4, 'fossil': 4

Counter({'solar': 26, 'cell': 25, 'light': 17, 'energi': 12, 'led': 10, 'harvest': 10, 'receiv': 9, 'video': 8, 'first': 6, 'internet': 6, 'devic': 6, 'use': 6, 'data': 6, 'shelf': 5, 'lamp': 5, 'fi': 5, 'also': 5, 'chang': 5, 'transmit': 4, 'standard': 4, 'connect': 4, 'li': 4, 'encod': 4, 'subtl': 4, 'bright': 4, 'would': 3, 'like': 3, 'demonstr': 3, 'time': 3, 'possibl': 3, 'act': 3, 'point': 3, 'close': 3, 'billion': 3, 'mean': 3, 'exist': 3, 'infrastructur': 3, 'fast': 3, 'around': 3, 'inform': 3, 'want': 3, 'way': 3, 'show': 3, 'instrument': 3, 'moment': 3, 'notic': 3, 'drop': 3, 'stream': 3, 'stop': 3, 'handkerchief': 3, 'integr': 3, 'public': 2, 'laptop': 2, 'massiv': 2, 'exten': 2, 'digit': 2, 'divid': 2, 'thing': 2, 'need': 2, 'much': 2, 'come': 2, 'two': 2, 'look': 2, 'us': 2, 'charg': 2, 'fluctuat': 2, 'shown': 2, 'lab': 2, 'broadband': 2, 'let': 2, 'switch': 2, 'simpli': 2, 'turn': 2, 'next': 2, 'ive': 2, 'recogn': 2, 'well': 2, 'blockag': 2, 'applau': 2, 'street': 2, 'fog

Counter({'cell': 37, 'brain': 27, 'implant': 11, 'patient': 7, 'exactli': 7, 'stem': 7, 'monkey': 7, 'us': 6, 'repair': 6, 'time': 6, 'cultur': 6, 'lesion': 6, 'neurosurgeon': 5, 'human': 5, 'sever': 5, 'function': 5, 'help': 5, 'abl': 5, 'piec': 5, 'know': 5, 'perform': 5, 'recov': 5, 'trial': 5, 'bg': 5, 'like': 4, 'one': 4, 'major': 4, 'small': 4, 'kind': 4, 'divid': 4, 'first': 4, 'thank': 4, 'case': 4, 'cour': 4, 'use': 4, 'littl': 3, 'tri': 3, 'improv': 3, 'differ': 3, 'call': 3, 'order': 3, 'realli': 3, 'dream': 3, 'think': 3, 'take': 3, 'head': 3, 'come': 3, 'swollen': 3, 'jean': 3, 'françoi': 3, 'grow': 3, 'need': 3, 'look': 3, 'theyr': 3, 'get': 3, 'long': 3, 'sure': 3, 'prove': 3, 'go': 3, 'could': 3, 'spontan': 3, 'applau': 3, 'jocelyn': 3, 'jb': 3, 'treatment': 3, 'im': 2, 'colleagu': 2, 'day': 2, 'realiz': 2, 'life': 2, 'second': 2, 'stroke': 2, 'bodi': 2, 'often': 2, 'remain': 2, 'neurolog': 2, 'deep': 2, 'depth': 2, 'neuron': 2, 'amaz': 2, 'parkinson': 2, 'disea': 2, 'm

Counter({'rock': 29, 'find': 24, 'dinosaur': 22, 'would': 15, 'go': 13, 'earth': 13, 'one': 13, 'like': 13, 'dreadnoughtu': 12, 'get': 11, 'bone': 11, 'geolog': 10, 'fossil': 9, 'age': 8, 'five': 8, 'time': 8, 'place': 8, 'right': 7, 'year': 7, 'sixti': 7, 'us': 7, 'world': 7, 'histori': 6, 'long': 6, 'record': 6, 'planet': 6, 'desert': 6, 'see': 6, 'big': 6, 'paleontologist': 5, 'first': 5, 'want': 5, 'realli': 5, 'old': 5, 'look': 5, 'two': 5, 'everi': 5, 'live': 5, 'thousand': 5, 'speci': 5, 'formula': 4, 'sedimentari': 4, 'three': 4, 'thing': 4, 'ground': 4, 'billion': 4, 'evolv': 4, 'million': 4, 'map': 4, 'next': 4, 'page': 4, 'back': 4, 'plant': 4, 'new': 4, 'around': 4, 'giant': 4, 'anoth': 4, 'day': 4, 'ton': 4, 'anim': 4, 'noth': 4, 'missouri': 4, 'must': 3, 'layer': 3, 'natur': 3, 'chanc': 3, 'good': 3, 'paleozo': 3, 'dont': 3, 'hundr': 3, 'bottom': 3, 'vast': 3, 'never': 3, 'unlik': 3, 'moon': 3, 'thu': 3, 'thought': 3, 'prospect': 3, 'give': 3, 'could': 3, 'particularli': 

Counter({'speci': 20, 'cavefish': 11, 'new': 9, 'tell': 9, 'one': 7, 'geolog': 6, 'us': 6, 'cave': 5, 'time': 5, 'laughter': 4, 'biolog': 4, 'blind': 4, 'gene': 4, 'fish': 3, 'look': 3, 'like': 3, 'yolo': 3, 'know': 3, 'life': 3, 'im': 3, 'see': 3, 'discov': 3, 'year': 3, 'mayb': 3, 'littl': 3, 'move': 3, 'name': 3, 'closest': 3, 'rel': 3, 'tri': 3, 'extinct': 3, 'ichthyolog': 2, 'big': 2, 'actual': 2, 'excit': 2, 'alreadi': 2, 'go': 2, 'spend': 2, 'find': 2, 'lot': 2, 'around': 2, 'sight': 2, 'eye': 2, 'start': 2, 'mani': 2, 'differ': 2, 'way': 2, 'describ': 2, 'southern': 2, 'system': 2, 'million': 2, 'behind': 2, 'also': 2, 'madagascar': 2, 'sick': 2, 'swim': 2, 'full': 2, 'thing': 2, 'fact': 2, 'dna': 2, 'contin': 2, 'yet': 2, 'mexico': 2, 'probabl': 2, 'someth': 2, 'studi': 1, 'bore': 1, 'word': 1, 'quit': 1, 'olog': 1, 'cool': 1, 'kid': 1, 'audienc': 1, 'stand': 1, 'live': 1, 'alway': 1, 'dreamt': 1, 'hidden': 1, 'wonder': 1, 'world': 1, 'get': 1, 'recent': 1, 'realli': 1, 'focu'

Counter({'know': 16, 'thing': 9, 'go': 8, 'everi': 7, 'knowledg': 7, 'dont': 6, 'time': 6, 'one': 5, 'night': 5, 'around': 5, 'life': 5, 'could': 5, 'much': 5, 'ignor': 5, 'year': 5, 'japan': 5, 'would': 4, 'home': 4, 'us': 4, 'way': 4, 'make': 4, 'realli': 4, 'never': 4, 'three': 4, 'day': 4, 'street': 3, 'trishaw': 3, 'began': 3, 'come': 3, 'live': 3, 'mani': 3, 'first': 3, 'twenti': 3, 'get': 3, 'someth': 3, 'littl': 3, 'new': 3, 'point': 3, 'travel': 3, 'remind': 3, 'moment': 3, 'peopl': 3, 'ive': 3, 'couldnt': 3, 'morn': 2, 'train': 2, 'rough': 2, 'came': 2, 'show': 2, 'quot': 2, 'told': 2, 'teacher': 2, 'cour': 2, 'actual': 2, 'visitor': 2, 'found': 2, 'foreign': 2, 'bump': 2, 'lost': 2, 'realiz': 2, 'happen': 2, 'final': 2, 'see': 2, 'ever': 2, 'friend': 2, 'said': 2, 'also': 2, 'take': 2, 'well': 2, 'even': 2, 'danger': 2, 'assum': 2, 'world': 2, 'cant': 2, 'discoveri': 2, 'push': 2, 'forward': 2, 'give': 2, 'decid': 2, 'love': 2, 'law': 2, 'nine': 2, 'tell': 2, 'minu': 2, 'alw

Counter({'go': 28, 'genet': 27, 'think': 26, 'babi': 21, 'like': 21, 'could': 18, 'design': 17, 'human': 17, 'make': 16, 'year': 14, 'modif': 13, 'kind': 12, 'differ': 12, 'technolog': 12, 'new': 11, 'peopl': 11, 'next': 10, 'jenna': 10, 'right': 10, 'crispr': 10, 'theyr': 10, 'eugen': 10, 'look': 9, 'kid': 9, 'one': 9, 'thousand': 8, 'born': 8, 'use': 8, 'talk': 8, 'actual': 7, 'gm': 7, 'scienc': 7, 'thing': 7, 'better': 7, 'swing': 7, 'parent': 6, 'famili': 6, 'let': 6, 'two': 6, 'modifi': 6, 'know': 6, 'million': 6, 'gener': 6, 'feel': 6, 'may': 6, 'mayb': 6, 'us': 6, 'lot': 6, 'today': 6, 'month': 6, 'happen': 6, 'might': 6, 'scientist': 5, 'decid': 5, 'deci': 5, 'thirti': 5, 'mariann': 5, 'embryo': 5, 'would': 5, 'even': 5, 'realli': 5, 'research': 5, 'world': 5, 'public': 5, 'ago': 5, 'time': 5, 'tri': 5, 'say': 5, 'back': 5, 'mani': 4, 'friend': 4, 'best': 4, 'someth': 4, 'littl': 4, 'fact': 4, 'children': 4, 'imagin': 4, 'halloween': 4, 'possibl': 4, 'im': 4, 'see': 4, 'still':

Counter({'black': 32, 'one': 19, 'white': 17, 'health': 13, 'discrimin': 13, 'racism': 11, 'nine': 10, 'educ': 10, 'america': 9, 'even': 9, 'thousand': 8, 'hundr': 8, 'state': 8, 'year': 7, 'everi': 7, 'unit': 7, 'incom': 7, 'live': 7, 'clyde': 6, 'age': 6, 'five': 6, 'medic': 6, 'stori': 5, 'seventi': 5, 'success': 5, 'also': 5, 'two': 5, 'die': 5, 'experi': 5, 'higher': 5, 'would': 5, 'twenti': 5, 'matter': 5, 'racial': 5, 'differ': 5, 'gap': 5, 'said': 5, 'research': 5, 'provid': 5, 'occur': 5, 'care': 5, 'institut': 5, 'segreg': 5, 'us': 5, 'must': 5, 'dissent': 5, 'yale': 4, 'member': 4, 'peopl': 4, 'race': 4, 'exampl': 4, 'level': 4, 'high': 4, 'school': 4, 'colleg': 4, 'scale': 4, 'captur': 4, 'minor': 4, 'receiv': 4, 'found': 4, 'effect': 4, 'group': 4, 'process': 4, 'well': 4, 'commun': 4, 'american': 4, 'famili': 4, 'univ': 4, 'rippl': 4, 'articl': 3, 'magazin': 3, 'class': 3, 'lawyer': 3, 'person': 3, 'blood': 3, 'fact': 3, 'forti': 3, 'graduat': 3, 'time': 3, 'averag': 3, '

Counter({'im': 15, 'immigr': 13, 'one': 13, 'journalist': 12, 'report': 12, 'jr': 12, 'go': 11, 'countri': 9, 'neutral': 9, 'us': 9, 'question': 8, 'say': 8, 'alway': 8, 'presid': 8, 'two': 7, 'true': 7, 'like': 7, 'said': 7, 'applau': 7, 'journal': 6, 'power': 6, 'know': 6, 'laughter': 6, 'happen': 6, 'first': 6, 'ask': 6, 'dont': 6, 'never': 6, 'right': 6, 'life': 5, 'unit': 5, 'state': 5, 'would': 5, 'million': 5, 'peopl': 5, 'cell': 5, 'think': 5, 'make': 5, 'person': 5, 'want': 5, 'candid': 5, 'thousand': 5, 'nine': 5, 'hundr': 5, 'twenti': 5, 'censor': 5, 'four': 5, 'thing': 5, 'citizen': 5, 'man': 5, 'word': 5, 'mexico': 4, 'learn': 4, 'side': 4, 'end': 4, 'record': 4, 'didnt': 4, 'take': 4, 'time': 4, 'call': 4, 'work': 4, 'also': 4, 'chang': 4, 'number': 4, 'trump': 4, 'sit': 4, 'back': 4, 'ive': 3, 'fear': 3, 'much': 3, 'tape': 3, 'case': 3, 'cover': 3, 'confront': 3, 'next': 3, 'assign': 3, 'presidenti': 3, 'elect': 3, 'let': 3, 'critic': 3, 'hand': 3, 'idea': 3, 'boss': 3, 

In [27]:
tf_idf

{(0, '19th'): 0.0022021815060441685,
 (0, 'abil'): 0.002056948235635901,
 (0, 'abstract'): 0.0020306358212107854,
 (0, 'academ'): 0.008122543284843142,
 (0, 'accord'): 0.001640949906870416,
 (0, 'achiev'): 0.0012137045482774587,
 (0, 'actual'): 0.0011937378057695807,
 (0, 'adhd'): 0.00706089563878501,
 (0, 'adopt'): 0.0018548895455416492,
 (0, 'adult'): 0.0014734977506871182,
 (0, 'advic'): 0.002047084661879086,
 (0, 'affect'): 0.0012088113813978599,
 (0, 'afford'): 0.0014855793732805656,
 (0, 'afterward'): 0.002052656887571346,
 (0, 'agent'): 0.0020754121232488833,
 (0, 'ago'): 0.0004067736561741848,
 (0, 'agr'): 0.0013216605147940259,
 (0, 'al'): 0.0020360748448621005,
 (0, 'alien'): 0.0021171557568006446,
 (0, 'allow'): 0.0006618074461076358,
 (0, 'along'): 0.0008759081688755822,
 (0, 'alway'): 0.0004990695461451796,
 (0, 'america'): 0.0010617155926770728,
 (0, 'andrew'): 0.0029049933949050273,
 (0, 'angel'): 0.005818464338803982,
 (0, 'anniversari'): 0.0026753195132523224,
 (0, 'an

# TF-IDF Matching Score Ranking¶


In [28]:
def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    for i in query_weights[:10]:
        l.append(i[0])
    
    print(l)
    

matching_score(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")


Matching Score

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[864, 841, 2121, 2268, 1445, 2349, 2255, 1147, 1300, 586]


In [31]:
data['transcript'][864]

"As a boy, I loved cars. When I turned 18, I lost my best friend to a car accident. Like this. And then I decided I'd dedicate my life to saving one million people every year. Now I haven't succeeded, so this is just a progress report, but I'm here to tell you a little bit about self-driving cars.I saw the concept first in the DARPA Grand Challenges where the U.S. government issued a prize to build a self-driving car that could navigate a desert. And even though a hundred teams were there, these cars went nowhere. So we decided at Stanford to build a different self-driving car. We built the hardware and the software. We made it learn from us, and we set it free in the desert. And the unimaginable happened: it became the first car to ever return from a DARPA Grand Challenge, winning Stanford 2 million dollars. Yet I still hadn't saved a single life.Since, our work has focused on building driving cars that can drive anywhere by themselves — any street in California. We've driven 140,000 