# TedTalk Recommender for Encouraging Discourse

In [1]:
import json
import pandas as pd
import numpy as np
import gensim
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
from textblob import TextBlob
import matplotlib.pyplot as plt 
%matplotlib inline

### Importing Data

In [2]:
json_talk = pd.read_json("/Users/cherylto/Dropbox/Ryerson Course/Capstone/Data/ted_talks-10-Sep-2012.json")

### Defining Functions

In [3]:
#This nifty function gets all the values from specific key pairs despite nesting :) :) 
def find(key, dictionary):
    for k, v in dictionary.items():
        if k == key:
            yield v
        elif isinstance(v, dict):
            for result in find(key, v):
                yield result
        elif isinstance(v, list):
            for d in v:
                for result in find(key, d):
                    yield result

In [4]:
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))
def preprocess(document):
    words = document.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(words)
    filtered_words = ' '.join([w for w in tokens if not w in stopwords])
    filtered_words = ''.join([i for i in filtered_words if not i.isdigit()])
    lemmas = lemmatizer.lemmatize(filtered_words)
    return ''.join(lemmas)

### Preparing Dataframe

In [5]:
#gets comment and video id and puts it in a dataframe
_id = []
zipped_list = []
for i in range(len(json_talk['comments'])):   
    doc_id = json_talk['id'][i] #gets the ted_talk id
    _id.append(doc_id)
    zipped = ()
    text = []
    date = []
    for j in range(len(json_talk['comments'][i])):
        
        #this actually creates list of list of lists
        #gets only first comment
        dt, txt = list(find('date', json_talk['comments'][i][j]))[0], list(find('text', json_talk['comments'][i][j]))[0]   
        text.append(txt)
        date.append(dt)
        
    zipped = (date, text, doc_id)
    zipped_list.append(zipped)
    
df = pd.DataFrame(zipped_list, columns = ['date', 'comments', '_id'])

In [6]:
#adding transcripts to the dataframe
df['transcripts'] = json_talk['transcript']

In [7]:
df.head()

Unnamed: 0,date,comments,_id,transcripts
0,"[Sep 10 2012, Sep 10 2012, Sep 10 2012, Sep 10...",[Doesn't gunfire produce visual illumination a...,062dd0f773cd5999a09714a371e1f8017163e2a1,The murder happened a little over 21 years ago...
1,"[Jul 25 2012:, Jul 25 2012:, Jul 25 2012:, Jul...",[I would love to know how they solved the prob...,62f6479a5eca39725798b1ee300bd8d5de3a4ae3,"As a kid, I was fascinated with all things air..."
2,"[Aug 9 2012:, Jul 26 2012:, Jul 11 2012:, Jul ...","[Actually, It is simple idea that we use solar...",b35c0cd294cd10748019833cafa625fc33487065,Good evening. We are in this wonderful open-ai...
3,"[Jul 19 2012:, Jul 16 2012:, Jul 14 2012:, Jul...","[I used to do this as a kid all the time, thou...",0fa6bca242ccb96697e8de570882c6b38746591a,"So, last month, the Encyclopaedia Britannica a..."
4,"[Sep 10 2012, 2012-09-07, Aug 29 2012:, Aug 18...",[Where is the video where this guy shows us th...,41db62481aeb978fd13f591755b596ff0616be70,"So a few weeks ago, a friend of mine gave this..."


### Using Gensim's LSI to find simlarity scores

In [8]:
from gensim import corpora, models, similarities

In [9]:
#comments is a list of strings
#transcript is string
def doc_vec_sims(comments, transcript):
    similarity = []
    texts = []

    #preprocess the text data and build the dictionary
    documents = comments
    texts = [preprocess(document).split() for document in documents]
    dictionary = corpora.Dictionary(texts)

    #builds the corpus
    corpus = [dictionary.doc2bow(text) for text in texts]

    #Market Matrix Format
    corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
    corpus = corpora.MmCorpus('/tmp/corpus.mm')

    #comparison document
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics = 10)
    query_doc = preprocess(transcript)
    vec_bow = dictionary.doc2bow(query_doc.split())
    vec_lsi = lsi[vec_bow]
    #print(vec_lsi)

    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = index[vec_lsi]
    similarity.append(sims)
    return similarity
    del dictionary
    


### Finding Questions

In [10]:
#Questions starters; another way to identify a question is the use of "?"
wh_ = ["who's", 'who', 'where', 'how', 'why', 'tell me', 'explain', "isn't", "is", "doesn't", "wouldn't", "shouldn't", "couldn't"]

In [11]:
def find_questions(comments):
    #sent_tokenizes each comment, and checks to see if it start with a question word or ends with a question mark
    #each comment is labelled, each sentence in the each comment is also labelled
    #there are 2 labeling systems here
    line = 0
    big_list = []
    for k, v in enumerate(comments): 
        #k is comment index
        #v is comment
        #line is comment idex 
        sents = sent_tokenize(v)
        line += 1
        for i in sents:
            try:
                if i.partition(' ')[0].lower() in wh_ or v.partition(' ')[2].split()[0].lower() in wh_ or i.split()[-1][-1] == '?':
                    var = (1, k, i)
                elif i.partition(' ')[0].lower() == 'when' and i.split()[-1] == '?':
                    var = (1,k, i)
                elif i.partition(' ')[0].lower() == 'what' and i.split()[-1] == '?':
                    var = (1,k, i)
                elif i.partition(' ')[0].lower() == "can't" and i.split()[-1] == '?':
                    var = (1,k, i)           
                else:
                    var = (0, k, i)

            except:
                var = (0, k, i)
            big_list.append(var)
    return big_list

### Date

In [12]:
from datetime import datetime

In [13]:
def clean_date(date_list):
    date = []
    for i in range(len(date_list)):
        da = date_list[i].replace(":","").replace("-"," ")
        date.append(da)

    date2 = []
    for i in date:
        try: 
            da = datetime.strptime(i, '%b %d %Y')
            date2.append(da)
        except:
            da = datetime.strptime(i, '%Y %m %d')
            date2.append(da)
            
    days_posted=[]
    for i in date2:
        da = most_recent - i
        days_posted.append(da.days)
    return days_posted

# Prioritizing TedTalk Comments

In [22]:
most_recent = datetime.strptime('Sep 20 2012', '%b %d %Y')
def prioritize_comments(doc_index):

    comments = df.iloc[doc_index]['comments']
    transcripts = df['transcripts'][doc_index]
    d_list = df.iloc[doc_index]['date']
    
    similarity = doc_vec_sims(comments, transcripts)
    sims = similarity[0]
    
    comm_ = [c for c in comments]
    
    #Find Questions
    big_list = find_questions(comments)
    question_df = pd.DataFrame(big_list, columns = ['TF', 'comment_id', 'comment'])	
    tmp = question_df.groupby('comment_id').sum()
    
    ##True/False
    question = []
    for i in tmp.values:
        if i >= 1:
            question.append(1)
        else:
            question.append(0)
    
    ##Comment Length
    comment_length = [len(i) for i in comments]
    
    ##Date
    d_posted = clean_date(d_list)

    #create dataframe for sorting
    tmp = pd.DataFrame({"date_posted": d_list, "days_posted": d_posted, "similarity": sims, "question": question, "comment length": comment_length, "comments": comm_})
    
    #top 10 items according to definition
    ind = tmp.sort_values(['days_posted', 'similarity', 'question', 'comment length'], ascending = [True, False, False, False])[:10].index
    
    #print recommendations
    li = ind.values
    for i in li:
        print("document number: "+str(i), 
              "days posted: "+str(tmp.iloc[i]['days_posted']), 
              "similarity score: "+str(tmp.iloc[i]['similarity']),
              "comment: "+str(tmp.iloc[i]['comments']), 
               sep="\n")
        print()

In [23]:
prioritize_comments(150)

document number: 0
days posted: 50
similarity score: 0.4660036
comment: Although the soldiers' plight is undoubtedly real and any suffering is worthy of our sympathy, this piece is undoubtedly propaganda and doesn't belong in TED.

document number: 1
days posted: 57
similarity score: 0.44467694
comment: Thank you for showing sympathy and humanity to the soldiers.

document number: 2
days posted: 114
similarity score: 0.6133998
comment: if we could only put the camera on the minds of the u.s. soldiers themselves... the suicide death toll has now exceeded the combat death toll for these folks. horrific numbers to digest. http://www.psychalive.org/2012/05/memorial-day-an-opportunity-to-reach-out-to-veterans/ 
what the hell do we do about that?

document number: 3
days posted: 122
similarity score: 0.095796265
comment: Crocodile tears for imperialist invaders. Hardly an Idea Worth Spreading Chris Anderson.

document number: 4
days posted: 167
similarity score: 0.18614173
comment: I am appl

In [29]:
#This one removes the sort criteria for date, and prioritizes question in the search criteria.
most_recent = datetime.strptime('Sep 20 2012', '%b %d %Y')
def pc_model2(doc_index):

    comments = df.iloc[doc_index]['comments']
    transcripts = df['transcripts'][doc_index]
    d_list = df.iloc[doc_index]['date']
    
    similarity = doc_vec_sims(comments, transcripts)
    sims = similarity[0]
    
    comm_ = [c for c in comments]
    
    #Find Questions
    big_list = find_questions(comments)
    question_df = pd.DataFrame(big_list, columns = ['TF', 'comment_id', 'comment'])	
    tmp = question_df.groupby('comment_id').sum()
    
    ##True/False
    question = []
    for i in tmp.values:
        if i >= 1:
            question.append(1)
        else:
            question.append(0)
    
    ##Comment Length
    comment_length = [len(i) for i in comments]
    
    ##Date
    d_posted = clean_date(d_list)

    #create dataframe for sorting
    tmp = pd.DataFrame({"date_posted": d_list, "days_posted": d_posted, "similarity": sims, "question": question, "comment length": comment_length, "comments": comm_})
    
    #top 10 items according to definition
    ind = tmp.sort_values(['question', 'similarity', 'comment length'], ascending = [False, False, False])[:10].index
    
    #print recommendations
    li = ind.values
    for i in li:
        print("document number: "+str(i),
              "days posted: "+str(tmp.iloc[i]['days_posted']), 
              "similarity score: "+str(tmp.iloc[i]['similarity']), 
              "comment: "+str(tmp.iloc[i]['comments']),
             sep='\n')
        print()

In [30]:
pc_model2(89)

document number: 53
days posted: 1075
similarity score: 0.9377416
comment: Malcolm Gladwell covers this in Tipping Point, that you should market to "Mavens," people who are interested in specific products and get excited about it with others.  People like "remarkable" things?  Didn't Piaget cover this 60 years ago with the children's tendency to look at "novel" things?  I like his other talk about Tribes better.

document number: 104
days posted: 1985
similarity score: 0.8754969
comment: If you have anything to sell, watch this video.  
Seth gets you thinking, as usual. Is what you are selling remarkable? Do you or your product/idea have otaku? Are you focusing on the people who are like you? Who care about what you have to say?  
"Favorite quote from this video: "Oh, you like my ring? It's my grandmother!" Sorry, you gotta watch it if you want to know.

document number: 52
days posted: 1067
similarity score: 0.7711279
comment: There are many remarkable products and services that hav