In [3]:
import pandas as pd
from datetime import datetime

In [4]:
import nltk
# Make sure the 'vader_lexicon' NLTK data package is available locally; the
# VADER analyzer imported in the next cell comes from nltk.sentiment.vader.
# The call reports "already up-to-date" when the package is already installed.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\cassi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single shared analyzer instance; the sentiment cells below call
# sid.polarity_scores(text) once per comment.
sid = SentimentIntensityAnalyzer()

In [6]:
# unify date formats and sort comments according to post date
def unify_and_sort(comments):
    """Normalise every comment's date to '%b %d %Y' and sort oldest first.

    Besides the target format (e.g. 'Sep 10 2012'), two raw variants are
    accepted: a trailing ':' (e.g. 'Sep 10 2012:') and ISO dates
    ('2012-09-10'). The two fixes are applied independently, so a date that
    needs both (e.g. '2012-09-10:') is handled as well.

    Args:
        comments: list of dicts, each with a 'date' string. The dicts and
            the list are modified in place.

    Returns:
        The same list object, sorted ascending by date.

    Raises:
        ValueError: if a date does not match any of the formats above.
    """
    target_format = '%b %d %Y'

    for comment in comments:
        date = comment['date']
        if date.endswith(':'):
            date = date[:-1]
        # Checked after the colon strip (not `elif`) so both fixes can apply.
        if '-' in date:
            date = datetime.strptime(date, '%Y-%m-%d').strftime(target_format)
        comment['date'] = date

    comments.sort(key=lambda c: datetime.strptime(c['date'], target_format))

    return comments

In [7]:
# count the number of negative/neutral/positive comments of each talk
def count_sentiment(scores, threshold=0.4):
    """Bucket comment scores into negative / neutral / positive counts.

    Args:
        scores: iterable of dicts that each carry a numeric 'compound' entry
            (the dicts produced by sid.polarity_scores in this notebook).
        threshold: cutoff on the compound score. Values strictly above
            +threshold count as positive, values strictly below -threshold
            count as negative, everything else (including values exactly at
            the cutoff) counts as neutral. Defaults to 0.4, the value this
            notebook has always used.

    Returns:
        dict with integer counts under the keys 'neg', 'neu' and 'pos'.
    """
    counts = {'neg': 0, 'neu': 0, 'pos': 0}
    for score in scores:
        compound = score['compound']
        if compound > threshold:
            counts['pos'] += 1
        elif compound < -threshold:
            counts['neg'] += 1
        else:
            counts['neu'] += 1

    return counts

In [15]:
# Load the two raw TED-talk JSON files: talks1 is the 10-Sep-2012 file,
# talks2 is the 25-Apr-2012 file.
talks1, talks2 = (
    pd.read_json(json_path)
    for json_path in ("ted_talks-10-Sep-2012.json", "ted_talks-25-Apr-2012.json")
)

In [None]:
# Show the talks in the first dataset that share a title (candidate duplicates).
# At this point the raw 'title' cells are still single-element lists (they are
# unwrapped in the cleaning cell below) and lists cannot be used as groupby
# keys, so compare an unwrapped copy of the titles instead. Already-unwrapped
# titles pass through unchanged, so the cell also works after cleaning.
talks1_titles = talks1['title'].map(lambda t: t[0] if isinstance(t, list) else t)
talks1_repeated_titles = talks1_titles[talks1_titles.notna() & talks1_titles.duplicated(keep=False)]
# Stable sort keeps rows with the same title adjacent and in file order; the
# result is simply an empty frame when no title repeats.
talks1.loc[talks1_repeated_titles.sort_values(kind='mergesort').index]

In [16]:
# Clean the first dataset.
# 'views' and 'title' hold their value as the first element of a sequence;
# 'description' is a sequence of text fragments that are joined into one string.
talks1['views'] = talks1['views'].map(
    lambda raw_views: int(raw_views[0].replace(',', ''))  # strip thousands separators
)
talks1['title'] = talks1['title'].map(lambda raw_title: raw_title[0])
talks1['description'] = talks1['description'].map(
    lambda fragments: ''.join(fragments).replace('\t', '').replace('\n', ' ').rstrip()
)
# Tag every row with the collection month used for this dataset.
talks1['collect_date'] = 'Sep 2012'

In [17]:
# Sentiment analysis of the comments in the first dataset.
# 1) keep only text + date of every comment, 2) normalise the dates and sort
# chronologically, 3) score each comment text with VADER, 4) tally the
# negative / neutral / positive comments per talk.
talks1['filtered_comments'] = talks1['comments'].apply(
    lambda raw_comments: [{'text': c['text'], 'date': c['date']} for c in raw_comments]
)
talks1['filtered_comments'] = talks1['filtered_comments'].apply(unify_and_sort)
talks1['scores'] = talks1['filtered_comments'].apply(
    lambda kept: [sid.polarity_scores(entry['text']) for entry in kept]
)
talks1['sentiment_count'] = talks1['scores'].apply(count_sentiment)

In [None]:
# Show the talks in the second dataset that share a title (candidate
# duplicates), grouped by title and in file order within each title.
# Selecting with duplicated() instead of concatenating groupby groups yields
# an empty frame, rather than a ValueError, when no title repeats
# (e.g. when this cell is re-run after the duplicates have been dropped).
(
    talks2[talks2['title'].notna() & talks2['title'].duplicated(keep=False)]
    .sort_values('title', kind='mergesort')
)

In [19]:
# Clean the second dataset.
# NOTE: run this cell exactly once per load of talks2. The rows are removed
# by index label and the index is renumbered right afterwards, so running the
# cell again would remove four different rows.
talks2.drop(index=[114, 1146, 690, 1145], inplace=True)
talks2.reset_index(drop=True, inplace=True)
talks2['description'] = talks2['description'].map(lambda text: text.rstrip())
# Tag every row with the collection month used for this dataset.
talks2['collect_date'] = 'Apr 2012'

In [20]:
# Sentiment analysis of the comments in the second dataset.
# 1) keep only text + date of every comment, 2) normalise the dates and sort
# chronologically, 3) score each comment text with VADER, 4) tally the
# negative / neutral / positive comments per talk.
talks2['filtered_comments'] = talks2['comments'].apply(
    lambda raw_comments: [{'text': c['text'], 'date': c['date']} for c in raw_comments]
)
talks2['filtered_comments'] = talks2['filtered_comments'].apply(unify_and_sort)
talks2['scores'] = talks2['filtered_comments'].apply(
    lambda kept: [sid.polarity_scores(entry['text']) for entry in kept]
)
talks2['sentiment_count'] = talks2['scores'].apply(count_sentiment)

In [25]:
# Put both frames into one shared column order (this also drops any column
# that is not listed) so they line up when they are combined.
talk_columns = [
    'id', 'film_date', 'publish_date', 'title', 'speaker', 'ted_event',
    'description', 'related_tags', 'related_themes', 'related_videos',
    'views', 'comments', 'transcript', 'url',
    'filtered_comments', 'scores', 'sentiment_count', 'collect_date',
]
talks1, talks2 = talks1[talk_columns], talks2[talk_columns]

In [28]:
# Stack the April rows on top of the September rows, renumber the index and
# drop the 'id' column before exporting.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True yields the same fresh 0..n-1 index
# that append + reset_index(drop=True) produced.
talks = pd.concat([talks2, talks1], ignore_index=True).drop(columns=['id'])
talks.to_csv('talks.csv')

In [32]:
# Collect every talk whose title occurs more than once in the combined frame,
# kept together title by title, and export it.
repeated_talks = pd.concat(
    [same_title for _title, same_title in talks.groupby('title') if same_title.shape[0] >= 2]
)
repeated_talks.to_csv('repeated_talks.csv')