In [119]:
import ast
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from collections import Counter
import statsmodels
import datetime


In [131]:
df = pd.read_csv("ted_main.csv")
df['film_date'] = df['film_date'].apply(lambda x: datetime.datetime.fromtimestamp( int(x)))
df['published_date'] = df['published_date'].apply(lambda x: datetime.datetime.fromtimestamp( int(x)))
df['film_year'] = df['film_date'].apply(lambda it: it.year)
df['published_year'] = df['published_date'].apply(lambda it: it.year)

print(f"Total talks between 2010-2016: {len(df.loc[(df['film_year'] >= 2010) & (df['film_year'] <= 2016), :])}")

mean_views_per_year = df.groupby('published_year')['views'].mean()
print(mean_views_per_year)
fig = px.line(mean_views_per_year, x=mean_views_per_year.index, y=mean_views_per_year)
fig.show()

# We will use videos between 2010-2016
df = df.loc[(df['film_year'] >= 2010) & (df['film_year'] <= 2016), :]
print(len(df))

Total talks between 2010-2016: 1796
published_year
2006    4.130967e+06
2007    1.446360e+06
2008    1.384420e+06
2009    1.540809e+06
2010    1.566904e+06
2011    1.534482e+06
2012    1.600203e+06
2013    2.234803e+06
2014    1.856316e+06
2015    1.911760e+06
2016    1.704276e+06
2017    1.151947e+06
Name: views, dtype: float64


1796


# Views analysis

In [121]:
df.sort_values(by="views", ascending=False)[["main_speaker", "views"]]

Unnamed: 0,main_speaker,views
1346,Amy Cuddy,43155405
837,Brené Brown,31168150
1776,Julian Treasure,21594632
2114,James Veitch,20475972
1416,Cameron Russell,19787465
...,...,...
1079,Leslie Dodson,121186
1141,Keith Nolan,120274
1229,José Bowen,117756
1133,Aleph Molinari,115346


In [122]:
spk_mean_views = df.groupby('main_speaker')['views'].mean().sort_values(ascending=False)
px.scatter(spk_mean_views)


In [123]:
# We take the log value of views and normalize with mean and std

log_views = np.log(df['views'])
px.histogram((log_views - log_views.mean())/log_views.std())

How to normalize views?
- Take the difference from the total average (log values)

Define groups of topics and predict per topic
- Use tags and get most common
- Get embeddings and perform clustering to get unsupervised topics

In [124]:
# Find the most appeared tags

total_tags = [tag for row in df.tags for tag in ast.literal_eval(row)]
tags_counter = Counter(total_tags).most_common()
fig = px.histogram(x=[entry[0] for entry in tags_counter], y=[entry[1] for entry in tags_counter])
fig.show()

most_common_tags = ['technology', 'science', 'global issues', 'design', 'business', 'entertainment', 'health', 'innovation', 'society', 'art', 'social change']

In [125]:
active_tags = most_common_tags[:7]
print(f"Active tags: {active_tags}")
df['included'] = df['tags'].map(lambda it: len(set(ast.literal_eval(it)).intersection(active_tags)) > 0)
print(len(df))
print(len(df.loc[df['included'] == True, :]))

dominant_group = []
for idx, row in df.iterrows():
    entry = pd.NA
    for common_tag in reversed(active_tags):
        for tag in ast.literal_eval(row['tags']):
            if tag == common_tag and pd.isna(entry):
                entry = tag
    dominant_group.append(entry)
    
df['dominant_group'] = dominant_group
df

Active tags: ['technology', 'science', 'global issues', 'design', 'business', 'entertainment', 'health']
1796
1296


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,related_talks,speaker_occupation,tags,title,url,views,film_year,published_year,included,dominant_group
614,1137,Sharing powerful stories from his anti-obesity...,1313,TED2010,2010-02-20 02:00:00,49,Jamie Oliver,Jamie Oliver: Teach every child about food,1,2010-02-11 17:36:00,...,"[{'id': 10, 'hero': 'https://pe.tedcdn.com/ima...","Chef, activist","['business', 'education', 'food', 'global issu...",Teach every child about food,https://www.ted.com/talks/jamie_oliver\n,7638978,2010,2010,True,health
615,209,"In a demo that drew gasps at TED2010, Blaise A...",465,TED2010,2010-02-11 02:00:00,28,Blaise Agüera y Arcas,Blaise Agüera y Arcas: Augmented-reality maps,1,2010-02-13 11:54:00,...,"[{'id': 129, 'hero': 'https://pe.tedcdn.com/im...",Software architect,"['cities', 'design', 'map', 'technology', 'vir...",Augmented-reality maps,https://www.ted.com/talks/blaise_aguera\n,1718568,2010,2010,True,design
616,250,The leader of Britain's Conservative Party say...,839,TED2010,2010-02-10 02:00:00,29,David Cameron,David Cameron: The next age of government,1,2010-02-15 16:23:00,...,"[{'id': 604, 'hero': 'https://pe.tedcdn.com/im...",Politician,"['business', 'global issues', 'politics']",The next age of government,https://www.ted.com/talks/david_cameron\n,656762,2010,2010,True,business
618,948,"At TED2010, Bill Gates unveils his vision for ...",1669,TED2010,2010-02-12 02:00:00,38,Bill Gates,Bill Gates: Innovating to zero!,1,2010-02-18 03:00:00,...,"[{'id': 51, 'hero': 'https://pe.tedcdn.com/ima...",Philanthropist,"['TED Brain Trust', 'business', 'energy', 'glo...",Innovating to zero!,https://www.ted.com/talks/bill_gates\n,4329332,2010,2010,True,business
620,277,The land of the free has become a legal minefi...,1101,TED2010,2010-02-13 02:00:00,24,Philip K. Howard,Philip K. Howard: Four ways to fix a broken le...,1,2010-02-21 11:15:00,...,"[{'id': 187, 'hero': 'https://pe.tedcdn.com/im...",Legal activist,"['business', 'design', 'health care', 'law']",Four ways to fix a broken legal system,https://www.ted.com/talks/philip_howard\n,610454,2010,2010,True,business
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,18,Could it be wrong to help children in need by ...,791,TEDxSydney,2016-05-24 03:00:00,5,Tara Winkler,Tara Winkler: Why we need to end the era of or...,1,2017-08-29 23:00:46,...,"[{'id': 1596, 'hero': 'https://pe.tedcdn.com/i...","Child protection leader, activist, author","['TEDx', 'activism', 'children', 'family', 'po...",Why we need to end the era of orphanages,https://www.ted.com/talks/tara_winkler_why_we_...,656113,2016,2017,False,
2528,3,Digital archaeologist Chance Coughenour is usi...,717,TEDxHamburg,2016-06-08 03:00:00,5,Chance Coughenour,Chance Coughenour: How your pictures can help ...,1,2017-08-31 23:00:31,...,"[{'id': 2673, 'hero': 'https://pe.tedcdn.com/i...",Digital archaeologist,"['TEDx', 'ancient world', 'archaeology', 'cons...",How your pictures can help reclaim lost history,https://www.ted.com/talks/chance_coughenour_ho...,539207,2016,2017,True,technology
2529,45,"We all have origin stories and identity myths,...",1156,TEDxExeter,2015-04-24 03:00:00,1,Chetan Bhatt,Chetan Bhatt: Dare to refuse the origin myths ...,1,2017-09-01 17:29:03,...,"[{'id': 2811, 'hero': 'https://pe.tedcdn.com/i...","Sociologist, human rights activist","['TEDx', 'community', 'humanity', 'identity', ...",Dare to refuse the origin myths that claim who...,https://www.ted.com/talks/chetan_bhatt_dare_to...,857850,2015,2017,False,
2531,18,Terrorists and extremists aren't all naturally...,698,TEDxGhent,2016-06-18 03:00:00,1,Erin Marie Saltman,Erin Marie Saltman: How young people join viol...,1,2017-09-05 23:00:24,...,"[{'id': 2309, 'hero': 'https://pe.tedcdn.com/i...",Policy researcher,"['TEDx', 'security', 'social media', 'terroris...",How young people join violent extremist groups...,https://www.ted.com/talks/erin_marie_saltman_h...,665328,2016,2017,False,


In [126]:
# Views per topic
# The median values seems not to differ much between topics
# But we should run the experiments per topic either-way

topic_df = df.dropna()
px.histogram(np.log(topic_df['views']), color=topic_df['dominant_group'], barmode="overlay", marginal="box").show()
print("Median views:")
print(topic_df.groupby("dominant_group")["views"].median())
print("Total views:")
print(topic_df.groupby("dominant_group")["views"].sum())

Median views:
dominant_group
business         1392085.5
design           1165468.0
entertainment    1260939.0
global issues    1023184.5
health           1128877.0
science          1178746.0
technology       1244156.5
Name: views, dtype: float64
Total views:
dominant_group
business         409190395
design           259211305
entertainment    308607362
global issues    278043573
health           319192478
science          308502779
technology       258500545
Name: views, dtype: int64


In [127]:
# Views are independent of duration

px.scatter(x=df['duration'], y=np.log(df['views'])).show()

In [128]:
# Correlation between views-comments

fig = px.scatter(x=np.log(df['comments']), y=np.log(df['views']), trendline="ols")
fig.data[1].line.color = 'red'
fig.show()

In [129]:
views_per_event = df.groupby('event')['views'].sum()


# Ratings analysis

In [130]:
set([entry['name'] for ratings in df.ratings for entry in ast.literal_eval(ratings)])

{'Beautiful',
 'Confusing',
 'Courageous',
 'Fascinating',
 'Funny',
 'Informative',
 'Ingenious',
 'Inspiring',
 'Jaw-dropping',
 'Longwinded',
 'OK',
 'Obnoxious',
 'Persuasive',
 'Unconvincing'}