# NLP on Ted Talk transcropts using nltk

In [1]:
from __future__ import print_function

In [2]:
import nltk, re, pickle, os
import pandas as pd
import numpy as np


from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize, MWETokenizer
from nltk.stem import porter, WordNetLemmatizer

from nltk.corpus import stopwords
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation,  TruncatedSVD 



In [4]:
nltk.download('punkt')
nltk.download('brown')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/summerrankin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /Users/summerrankin/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/summerrankin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# import 
1. unzip the `ted_files.zip` into the ./data directory
2. there should be 2 files `ted_main.csv` and `ted_trans.csv`
3. set a variable for our data directory called `path`
4. open these two csv files


In [5]:
path = './data'

Ted_main contains all of the metadata from each video.

We import this csv using the pandas package which dumps it into a nice dataframe that we can manipulate. We will use .head() to view the first 5 rows. 

In [7]:
ted_main = pd.read_csv(path + '/ted_main.csv')

ted_main.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


ted_trans contains the transcripts and url for the video

In [8]:
ted_trans = pd.read_csv(path +'/ted_trans.csv')   
ted_trans.head()

Unnamed: 0.1,Unnamed: 0,transcript,url
0,0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


Which column do these 2 dataframes have in common? 

you are going to merge on this column (using pandas again) and save it for our recommender later on.

In [10]:
ted_all = pd.merge(ted_trans, right=ted_main, on=##TODO: which column?)
ted_all.head(5)

Unnamed: 0.1,Unnamed: 0,transcript,url,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,views
0,0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,47227110
1,1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,3200520
2,2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,1636292
3,3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,1697550
4,4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,12005869


Save our new merged dataframe as a pickle file.

In [11]:
with open(path + '/ted_all.pkl', 'wb') as f:
    pickle.dump(ted_all, f)

# add an id col (of the index) for later and for iterating

In [12]:
ted_all['id'] = ted_all.index

# keep only the transcripts

In [13]:
talks = ted_all['transcript']

now let's take a look at this text. First we will print the 0th row, words 0 to 500. 

In [14]:
talks[0][0:500]

"Good morning. How are you?(Laughter)It's been great, hasn't it? I've been blown away by the whole thing. In fact, I'm leaving.(Laughter)There have been three themes running through the conference which are relevant to what I want to talk about. One is the extraordinary evidence of human creativity in all of the presentations that we've had and in all of the people here. Just the variety of it and the range of it. The second is that it's put us in a place where we have no idea what's going to hap"

In [16]:
#TODO: Print the 17th talk, words 200 to 300, Print a few more to get an idea of what the data looks like. 

### What are some things you notice about the transcripts? 

In [17]:
# number of transcripts you want to analyze
fileids = range(0,51)

# remove parethetical non-speech sounds from text
clean_parens_docs= [re.sub(r'\([^)]*\)', ' ', talks[fileid]) \
                    for fileid in fileids]

# Tokenize (split) into sentences

Below are multiple methods for sentence tokenization

1. the built in text blob.sentence

In [18]:
doc_sents = [TextBlob(clean_parens_docs[fileid])
             .sentences for fileid in fileids]

In [19]:
print(doc_sents[0][0:5])

[Sentence("Good morning."), Sentence("How are you?"), Sentence("It's been great, hasn't it?"), Sentence("I've been blown away by the whole thing."), Sentence("In fact, I'm leaving.")]


### another methods for sentence tokenization 

In [20]:
doc_sents1 = [sent_tokenize(clean_parens_docs[fileid]) for fileid in fileids]
print('\n-----\n'.join(sent_tokenize(clean_parens_docs[0][0:50])))


Good morning.
-----
How are you?
-----
It's been great, hasn't


# Word tokenization 

1. text blob.words 

In [21]:
doc_words = [TextBlob(str(doc_sents1[fileid]))
              .words for fileid in fileids]

In [22]:
print('\n-----\n'.join(TextBlob(str(doc_sents1[0][0:2])).words))

'Good
-----
morning
-----
'How
-----
are
-----
you


## this is another tokenizer. the one below is leaving in punctuation 

In [23]:
doc_words1 = [word_tokenize(clean_parens_docs[fileid]) \
             for fileid in fileids]

In [29]:
print('\n-----\n'.join(word_tokenize(clean_parens_docs[0][0:20])))

Good
-----
morning
-----
.
-----
How
-----
ar


### the wordpunct version takes care of all of the punctuaiton nicely

In [24]:
doc_words2 = [wordpunct_tokenize(clean_parens_docs[fileid]) \
             for fileid in fileids]

In [25]:
print('\n-----\n'.join(wordpunct_tokenize(clean_parens_docs[0][0:20])))

Good
-----
morning
-----
.
-----
How
-----
ar


# text blob allows us to pull out interesting things

In [26]:
talks_blob = [TextBlob(clean_parens_docs[fileid]) for fileid in fileids]

In [27]:
# pulls all the nouns and all the things that are associated with it 
print('\n-----\n'.join(talks_blob[0][0:500].noun_phrases))  

good morning
-----
whole thing
-----
extraordinary evidence
-----
human creativity


In [28]:
print(talks_blob[0].sentences[129] + '\n')
print(talks_blob[0].sentences[129].sentiment)

Truthfully, what happens is, as children grow up, we start to educate them progressively from the waist up.

Sentiment(polarity=0.5, subjectivity=0.5)


In [30]:
# def get_count(item):
#     return item[1]

# for word, count in sorted(talks_blob[1]
#                           .word_counts
#                           .items(), key=get_count, reverse=True):
#     print("%15s %i" % (word, count))

# Lemmatizer

an alternative method for getting the word roots
This one appears to be more conservative and also more 'correct' in that it will replace the ending with the correct letters instead of chopping it off.  i.e. children -> child,   capacities -> capacity, but also, unpredictability -> unpredictability . 

Thus, we run this one first, and then do the stemming on that result

In [29]:
lemmizer = WordNetLemmatizer()

for fileid in fileids[0:1]: 
    doc = TextBlob(clean_parens_docs[fileid]).words
    for w in doc[0:10]:
        print(lemmizer.lemmatize(w), w)

Good Good
morning morning
How How
are are
you you
It It
's 's
been been
great great
ha has


# clean up text: stemming

print out the original word next to the stemmed word to check

In [30]:
stemmer = nltk.stem.porter.PorterStemmer()

for fileid in fileids[0:1]: 
    doc = TextBlob(clean_parens_docs[fileid]).words
    for w in doc[0:10]:
        print(stemmer.stem(w.lower()),w)

good Good
morn morning
how How
are are
you you
it It
's 's
been been
great great
ha has


## now stem using the tokenized version that separated punctuation better

In [31]:
stemmer = nltk.stem.porter.PorterStemmer()

for fileid in fileids[0:1]: 
    for w in doc_words2[fileid][0:10]:
        print(stemmer.stem(w.lower()),w)

good Good
morn morning
. .
how How
are are
you you
? ?
it It
' '
s s


# Exploration OVER: Now, we will clean it up in a nice function based on the best methods from above

In [32]:
# a function to clean one document only

def clean_text_onedoc(text):

    lemmizer = WordNetLemmatizer()
    stop = stopwords.words('english')
    stop += ['.', ',',':','...','!"','?"', "'", '"',' - ',' — ',',"','."','!', ';',\
             '.\'"','[',']','—',".\'", 'ok','okay','yeah','ya','stuff', ' 000 ',' em ','get','got',\
             ' oh ','la','was','wa','?','like','go',' le ',' ca ',' I '," ? ","s", " t ","ve","re"]
    
    for word in wordpunct_tokenize(text): 
        cleaned = []
        if word.lower() not in stop:
            keepw = lemmizer.lemmatize(word)
            if keepw.lower not in stop:
                cleaned.append(keepw.lower())
                
    return cleaned

In [33]:
def clean_text(text):
    
    """ 
    Takes in a corpus of documents and cleans. ONly works with multiple docs for now
    
    1. remove parentheticals
    2. tokenize into words using wordpunct
    3. lowercase and remove stop words
    4. lemmatize 
    5. lowercase and remove stop words
    
    
    OUT: cleaned text = a list (documents) of lists (cleaned word in each doc)
    """

    lemmizer = WordNetLemmatizer()
    #stemmer = porter.PorterStemmer()

    stop = stopwords.words('english')
    stop += ['.', ',',':','...','!"','?"', "'", '"',' - ',' — ',',"','."','!', ';','♫♫','♫',\
             '.\'"','[',']','—',".\'", 'ok','okay','yeah','ya','stuff', ' 000 ',' em ',\
             ' oh ','thank','thanks','la','was','wa','?','like','go',' le ',' ca ',' I '," ? ","s", " t ","ve","re"]
    #stop = set(stop)

    cleaned_text = []
    
    for post in text:
        cleaned_words = []
        
        # remove parentheticals
        clean_parens = re.sub(r'\([^)]*\)', ' ', post)
        
        # tokenize into words
        for word  in wordpunct_tokenize(clean_parens):  
            
            # lowercase and throw out any words in stop words
            if word.lower() not in stop:
            
                # lemmatize  to roots
                low_word = lemmizer.lemmatize(word)  

                # stem and lowercase ( an alternative to lemmatize)
                #low_word = stemmer.stem(root.lower())  
            
                # keep if not in stopwords (yes, again)
                if low_word.lower() not in stop: 
                    
                    # put into a list of words for each document
                    cleaned_words.append(low_word.lower())
        
        # keep corpus of cleaned words for each document    
        cleaned_text.append(' '.join(cleaned_words))
    
    return cleaned_text

In [34]:
cleaned_talks = clean_text(talks)

In [35]:
with open(path + '/cleaned_talks.pkl', 'wb') as picklefile:
    pickle.dump(cleaned_talks, picklefile)

In [36]:
cleaned_talks[0][0:300]

'good morning great blown away whole thing fact leaving three theme running conference relevant want talk one extraordinary evidence human creativity presentation people variety range second put u place idea going happen term future idea may play interest education actually find everybody interest ed'

## Top n-grams, your favorite breakfast cereal

Note that these tri-grams are not very informative aside from new york city and X year ago, which will still get picked up in the bi-grams

In [37]:
from collections import Counter
from operator import itemgetter

counter = Counter()

n = 3
for doc in cleaned_talks:
    words = TextBlob(doc).words
    bigrams = ngrams(words, n)
    counter += Counter(bigrams)

for phrase, count in counter.most_common(30):
    print('%20s %i' % (" ".join(phrase), count))
    

       new york city 236
        000 year ago 135
      new york times 123
         10 year ago 118
    million year ago 109
    every single day 109
 people around world 101
        two year ago 100
        world war ii 99
       one two three 97
     couple year ago 96
         20 year ago 83
       five year old 78
     talk little bit 71
      spend lot time 71
    every single one 69
        six year old 69
      three year ago 69
  sub saharan africa 68
        last 20 year 67
     tell little bit 66
         12 year old 65
       four year old 64
         10 000 year 64
        last 10 year 64
      world around u 63
            da da da 61
       five year ago 61
       let take look 60
       four year ago 59


## Bi-grams
these are much better. Still some useless items like 'one thing' and 'can not'. interesting that we get some little musical notes in here ? i guess the way it was transcribed was using some other encoding for that?

In [38]:
counter = Counter()

n = 2
for doc in cleaned_talks:
    words = TextBlob(doc).words
    bigrams = ngrams(words, n)
    counter += Counter(bigrams)

for phrase, count in counter.most_common(30):
    print('%20s %i' % (" ".join(phrase), count))
    

            year ago 2074
          little bit 1607
            year old 1365
       united states 1103
           one thing 1041
        around world 938
            new york 894
             can not 877
          first time 751
           every day 692
         many people 656
           last year 604
        every single 573
             one day 559
             10 year 541
              tell u 521
         even though 519
      million people 499
           come back 492
          lot people 485
            two year 474
           long time 471
             20 year 464
           would say 464
           five year 449
      climate change 437
          every time 406
          year later 405
         high school 388
           going get 381


# Vectorize the data (only)  also look at binomial vecorizer in additino to count vectorizer for other topic modeling methods
Using Sklearn algorithms with text data
CountVectorizer: Convert a collection of text documents to a matrix of token counts This implementation produces a sparse representation.

In [39]:
# CountVectorizer is a class; so `vectorizer` below represents an instance of that object.
c_vectorizer = CountVectorizer(ngram_range=(1,3), 
                             stop_words='english', 
                             max_df = 0.6, 
                             max_features=10000)

t_vectorizer = TfidfVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6)


# call `fit` to build the vocabulary
c_vectorizer.fit(cleaned_talks)
# finally, call `transform` to convert text to a bag of words
c_x = c_vectorizer.transform(cleaned_talks)


# call `fit` to build the vocabulary
t_vectorizer.fit(cleaned_talks)
# finally, call `transform` to convert text to a bag of words
t_x = t_vectorizer.transform(cleaned_talks)
