# TedTalk Recommender for Encouraging Discourse

In [22]:
import json
import pandas as pd
import numpy as np
import gensim
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
from textblob import TextBlob
import matplotlib.pyplot as plt 
%matplotlib inline

### Importing Data

In [23]:
json_talk = pd.read_json("/Users/cherylto/Dropbox/Ryerson Course/Capstone/Data/ted_talks-10-Sep-2012.json")

### Defining Functions

In [24]:
#This nifty function gets all the values from specific key pairs despite nesting :) :) 
def find(key, dictionary):
    for k, v in dictionary.items():
        if k == key:
            yield v
        elif isinstance(v, dict):
            for result in find(key, v):
                yield result
        elif isinstance(v, list):
            for d in v:
                for result in find(key, d):
                    yield result

In [25]:
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))
def preprocess(document):
    words = document.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(words)
    filtered_words = ' '.join([w for w in tokens if not w in stopwords])
    filtered_words = ''.join([i for i in filtered_words if not i.isdigit()])
    lemmas = lemmatizer.lemmatize(filtered_words)
    return ''.join(lemmas)

### Preparing Dataframe

In [26]:
#gets comment and video id and puts it in a dataframe
_id = []
zipped_list = []
for i in range(len(json_talk['comments'])):   
    doc_id = json_talk['id'][i] #gets the ted_talk id
    _id.append(doc_id)
    zipped = ()
    text = []
    date = []
    for j in range(len(json_talk['comments'][i])):
        dt, txt = list(find('date', json_talk['comments'][i][j]))[0], list(find('text', json_talk['comments'][i][j]))[0]   #this actually creates list of list of lists
        text.append(txt)
        date.append(dt)
    zipped = (date, text, doc_id)
    zipped_list.append(zipped)
    
df = pd.DataFrame(zipped_list, columns = ['date', 'comments', '_id'])

In [27]:
#adding transcripts to the dataframe
df['transcripts'] = json_talk['transcript']

In [28]:
df.head()

Unnamed: 0,date,comments,_id,transcripts
0,"[Sep 10 2012, Sep 10 2012, Sep 10 2012, Sep 10...",[Doesn't gunfire produce visual illumination a...,062dd0f773cd5999a09714a371e1f8017163e2a1,The murder happened a little over 21 years ago...
1,"[Jul 25 2012:, Jul 25 2012:, Jul 25 2012:, Jul...",[I would love to know how they solved the prob...,62f6479a5eca39725798b1ee300bd8d5de3a4ae3,"As a kid, I was fascinated with all things air..."
2,"[Aug 9 2012:, Jul 26 2012:, Jul 11 2012:, Jul ...","[Actually, It is simple idea that we use solar...",b35c0cd294cd10748019833cafa625fc33487065,Good evening. We are in this wonderful open-ai...
3,"[Jul 19 2012:, Jul 16 2012:, Jul 14 2012:, Jul...","[I used to do this as a kid all the time, thou...",0fa6bca242ccb96697e8de570882c6b38746591a,"So, last month, the Encyclopaedia Britannica a..."
4,"[Sep 10 2012, 2012-09-07, Aug 29 2012:, Aug 18...",[Where is the video where this guy shows us th...,41db62481aeb978fd13f591755b596ff0616be70,"So a few weeks ago, a friend of mine gave this..."


In [29]:
def sentiment_scores(comment):
    senti = []
    for i in comment:
        try:
            b = TextBlob(i)
            senti.append(b.sentiment.polarity)
        except:
            senti.append(0)
    return senti


In [64]:
sentiment_scores(df.iloc[0]['comments'])

[0.13333333333333333,
 0.13896103896103895,
 0.5,
 0.15625,
 0.0,
 0.43,
 0.03333333333333334]

### Using Gensim's LSI model to find simlarity scores

In [31]:
from gensim import corpora, models, similarities

In [32]:
#comments is a list of strings
#transcript is string
def doc_vec_sims(comments, transcript):
    similarity = []
    texts = []

    #preprocess the text data and build the dictionary
    documents = comments
    texts = [preprocess(document).split() for document in documents]
    dictionary = corpora.Dictionary(texts)

    #builds the corpus
    corpus = [dictionary.doc2bow(text) for text in texts]

    #Market Matrix Format
    corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
    corpus = corpora.MmCorpus('/tmp/corpus.mm')

    #comparison document
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics = 10)
    query_doc = preprocess(transcript)
    vec_bow = dictionary.doc2bow(query_doc.split())
    vec_lsi = lsi[vec_bow]
    #print(vec_lsi)

    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = index[vec_lsi]
    similarity.append(sims)
    return similarity
    del dictionary
    


### Finding Questions

In [33]:
#Questions starters; another way to identify a question is the use of "?"
wh_ = ["who's", 'who', 'where', 'how', 'why', 'tell me', 'explain', "isn't", "is", "doesn't", "wouldn't", "shouldn't", "couldn't"]

In [34]:
def find_questions(comments):
    #sent_tokenizes each comment, and checks to see if it start with a question word or ends with a question mark
    #each comment is labelled, each sentence in the each comment is also labelled
    #there are 2 labeling systems here
    line = 0
    big_list = []
    for k, v in enumerate(comments): 
        #k is comment index
        #v is comment
        #line is comment idex 
        sents = sent_tokenize(v)
        line += 1
        for i in sents:
            try:
                if i.partition(' ')[0].lower() in wh_ or v.partition(' ')[2].split()[0].lower() in wh_ or i.split()[-1][-1] == '?':
                    var = (1, k, i)
                elif i.partition(' ')[0].lower() == 'when' and i.split()[-1] == '?':
                    var = (1,k, i)
                elif i.partition(' ')[0].lower() == 'what' and i.split()[-1] == '?':
                    var = (1,k, i)
                elif i.partition(' ')[0].lower() == "can't" and i.split()[-1] == '?':
                    var = (1,k, i)           
                else:
                    var = (0, k, i)

            except:
                var = (0, k, i)
            big_list.append(var)
    return big_list

### Date

In [35]:
from datetime import datetime

In [36]:
def clean_date(date_list):
    date = []
    for i in range(len(date_list)):
        da = date_list[i].replace(":","").replace("-"," ")
        date.append(da)

    date2 = []
    for i in date:
        try: 
            da = datetime.strptime(i, '%b %d %Y')
            date2.append(da)
        except:
            da = datetime.strptime(i, '%Y %m %d')
            date2.append(da)
            
    days_posted=[]
    for i in date2:
        da = most_recent - i
        days_posted.append(da.days)
    return days_posted

# Prioritizing TedTalk Comments

In [42]:
most_recent = datetime.strptime('Sep 20 2012', '%b %d %Y')
def prioritize_comments(doc_index):

    comments = df.iloc[doc_index]['comments']
    transcripts = df['transcripts'][doc_index]
    d_list = df.iloc[doc_index]['date']
    
    similarity = doc_vec_sims(comments, transcripts)
    sims = similarity[0]
    sentiments = sentiment_scores(comments)
    
    comm_ = [c for c in comments]
    
    #Sentiment
    sentiment = sentiment_scores(comments)
    
    #Find Questions
    big_list = find_questions(comments)
    question_df = pd.DataFrame(big_list, columns = ['TF', 'comment_id', 'comment'])	
    tmp = question_df.groupby('comment_id').sum()
    
    ##True/False
    question = []
    for i in tmp.values:
        if i >= 1:
            question.append(1)
        else:
            question.append(0)
    
    ##Comment Length
    comment_length = [len(i) for i in comments]
    
    ##Date
    d_posted = clean_date(d_list)

    #create dataframe for sorting
    tmp = pd.DataFrame({"date_posted": d_list, "days_posted": d_posted, "similarity": sims, 
                        "question": question, "comment length": comment_length, "comments": comm_,
                        'sentiment': sentiment})
    
    #top 10 items according to definition
    ind = tmp.sort_values(['days_posted', 'similarity', 'sentiment', 'question', 'comment length'], ascending = [True, False, False, False, False])[:10].index
    
    #print recommendations
    li = ind.values
    for i in li:
        print("document_number: "+str(i), 
              "days_posted: "+str(tmp.iloc[i]['days_posted']), 
              "similarity_score: "+str(tmp.iloc[i]['similarity']), 
              "sentiment: "+str(round(tmp.iloc[i]['sentiment'], 2)),
              "comment: "+str(tmp.iloc[i]['comments']), sep="\n")
        print()

In [43]:
prioritize_comments(150)

document_number: 0
days_posted: 50
similarity_score: 0.4660036
sentiment: 0.14
comment: Although the soldiers' plight is undoubtedly real and any suffering is worthy of our sympathy, this piece is undoubtedly propaganda and doesn't belong in TED.

document_number: 1
days_posted: 57
similarity_score: 0.44467694
sentiment: 0.0
comment: Thank you for showing sympathy and humanity to the soldiers.

document_number: 2
days_posted: 114
similarity_score: 0.6133998
sentiment: -0.5
comment: if we could only put the camera on the minds of the u.s. soldiers themselves... the suicide death toll has now exceeded the combat death toll for these folks. horrific numbers to digest. http://www.psychalive.org/2012/05/memorial-day-an-opportunity-to-reach-out-to-veterans/ 
what the hell do we do about that?

document_number: 3
days_posted: 122
similarity_score: 0.095796265
sentiment: 0.0
comment: Crocodile tears for imperialist invaders. Hardly an Idea Worth Spreading Chris Anderson.

document_number: 4
da

In [44]:
#This one removes the sort criteria for date, and prioritizes question in the search criteria.
most_recent = datetime.strptime('Sep 20 2012', '%b %d %Y')
def pc_model2(doc_index):

    comments = df.iloc[doc_index]['comments']
    transcripts = df['transcripts'][doc_index]
    d_list = df.iloc[doc_index]['date']
    
    similarity = doc_vec_sims(comments, transcripts)
    sims = similarity[0]
    
    comm_ = [c for c in comments]
    
    #Sentiment
    sentiment = sentiment_scores(comments)
    
    #Find Questions
    big_list = find_questions(comments)
    question_df = pd.DataFrame(big_list, columns = ['TF', 'comment_id', 'comment'])	
    tmp = question_df.groupby('comment_id').sum()
    
    ##True/False
    question = []
    for i in tmp.values:
        if i >= 1:
            question.append(1)
        else:
            question.append(0)
    
    ##Comment Length
    comment_length = [len(i) for i in comments]
    
    ##Date
    d_posted = clean_date(d_list)

    #create dataframe for sorting
    tmp = pd.DataFrame({"date_posted": d_list, "days_posted": d_posted, "similarity": sims, 
                        "sentiment": sentiment, "question": question, 
                        "comment length": comment_length, "comments": comm_})
    
    #top 10 items according to definition
    ind = tmp.sort_values(['question', 'similarity', 'sentiment', 'comment length'], ascending = [False, False, False, False])[:10].index
    
    #print recommendations
    li = ind.values
    for i in li:
        print("comment_number: "+str(i), "days_posted: "+str(tmp.iloc[i]['days_posted']), 
              "similarity_score: "+str(tmp.iloc[i]['similarity']), 
              "sentiment: "+str(round(tmp.iloc[i]['sentiment'], 2)),
              "comment: "+str(tmp.iloc[i]['comments']), sep="\n")
        print()

In [46]:
pc_model2(150)

comment_number: 50
days_posted: 1306
similarity_score: 0.88319564
sentiment: 0.18
comment: Like so many others, I must thank you for giving this talk. Whether you are in favour of the war or not does not matter here. I simply appreciate that you have forced us to ask uncomfortable questions. We should know, indeed as a Briton I would like to know, what we are dealing with, what we ask of our soldiers and the effects of their presence on the Iraqi people. Your film tells a vital part of the ongoing story that is not available through the restrictions of reporting a story objectively by even the most even handed news broadcaster. Surely we must have all the information, statistics, perspectives and feelings to really undertake a worthwhile discussion about Iraq and the situation we have found ourselves in? So thank you Ms Scranton. Keep up the good work and be sure to keep the passion that you so obviously carry with you.

comment_number: 34
days_posted: 1005
similarity_score: 0.7300179


In [47]:
json_talk.iloc[150]['description']

['Filmmaker Deborah Scranton talks about and shows clips from her documentary The War Tapes, which puts cameras in the hands of soldiers fighting in Iraq. ',
 '\n\t\t\t\t\t\t\tThe director of the award-winning documentary The War Tapes, Deborah Scranton is committed to using new technology to give people power to tell their own stories.  ',
 '\t\t\t\t\t\t']

In [48]:
json_talk[json_talk['id'] == '59129aacfb6cebbe2c52f30ef3424209f7252e82']

Unnamed: 0,comments,description,film_date,id,publish_date,related_tags,related_themes,related_videos,speaker,ted_event,title,transcript,url,views
37,[{'user_id': 'f8557b5fc3a84049d38e33bba6a3dedb...,[Sir Ken Robinson makes an entertaining and pr...,Feb 2006,59129aacfb6cebbe2c52f30ef3424209f7252e82,Jun 2006,"[Children, Creativity, Culture, Dance, Educati...","[Bold Predictions, Stern Warnings, How the Min...",[Sir Ken Robinson: Bring on the learning revol...,Ken Robinson,TED2006,[Ken Robinson says schools kill creativity],"Good morning. How are you? It's been great, ha...",http://www.ted.com/talks/ken_robinson_says_sch...,"[12,056,699]"


In [49]:
pc_model2(37)

comment_number: 964
days_posted: 1176
similarity_score: 0.9878814
sentiment: 0.16
comment: beautifully said. in my less elegant and more outraged sense, modern compulsory schooling has been a tool of the dominant class to dump the basic training costs of their labor force on the rest of us. schools act like big factories producing obedient and compliant workers (sheep for the slaughter). BlankEmpty slates to be filled in based on industrial needs. They really have no need for artists or dancers or musicians. The question is what are we all willing to do about it? I don't understand how we willingly allow kids to be treated like nothing but just another human "resource". I say we fight back and let these bastards that run our schools know, that enough is enough. lets educate our children outside these behavioral laboratories they call schools. get out of kids way and help them find what they want to learn. Help them discover a genuine enthusiasm! DAMN THE MAN! Not for any moral imperati

In [50]:
json_talk[json_talk['id'] == 'e993215bfdaa515f6ea00fafc1918f549119f993']

Unnamed: 0,comments,description,film_date,id,publish_date,related_tags,related_themes,related_videos,speaker,ted_event,title,transcript,url,views
187,[{'user_id': '36258a9b4188d47adb520cfcdb2a85a6...,[Richard Dawkins urges all atheists to openly ...,Feb 2002,e993215bfdaa515f6ea00fafc1918f549119f993,Apr 2007,"[Atheism, Culture, God, Politics, Religion, Sc...","[Bold Predictions, Stern Warnings, Is There a ...","[Julia Sweeney: Letting go of God, Dan Dennett...",Richard Dawkins,TED2002,[Richard Dawkins: Militant atheism],"That splendid music, the coming-in music -- ""T...",http://www.ted.com/talks/richard_dawkins_on_mi...,"[1,763,738]"


In [51]:
json_talk.iloc[187]['url']

'http://www.ted.com/talks/richard_dawkins_on_militant_atheism.html'

In [52]:
pc_model2(187)

comment_number: 1483
days_posted: 1353
similarity_score: 0.9488484
sentiment: 0.19
comment: @ Francois You clearly already have already decided for me what my position is 
I try  to use terms like "seem" and "appear" I am not trying to define you I was just pointing out that when questioned on why you would back militant atheism you  seem  to deviate from the talk.  I think I do understand what you're saying as best as anyone could being that you insist on using certain vocabulary then when questioned you give me what  your  definition is for the word in question  (atheism, zealot, indoctrination).  So please forgive me for not knowing ahead of time what your definitions are for words before you mince, no julienne them.  Why you think I should have the exact same opinion as Dawkins boggles my mind 
Let me remind you.  My initial point was that the presenter was no different than any grandstanding religious fundamentalist and who's talk was more bravado than substance.  You defended his

In [56]:
scores_187 = sentiment_scores(df.iloc[187]['comments'])
scores_150 = sentiment_scores(df.iloc[150]['comments'])

In [63]:
print(min(scores_187), max(scores_187))
print(min(scores_150), max(scores_150))
#Here we see that sentiment scores for RD's talk has a wider range. This could be an indication that the topic 
#was more controversial.

-1.0 1.0
-0.6 0.7
