In [4]:
import ast
import os

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from collections import Counter
import statsmodels
import datetime
from sklearn.model_selection import train_test_split

df = pd.read_csv("../metadata/ted_main.csv")
df['film_date'] = df['film_date'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
df['published_date'] = df['published_date'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
df['film_year'] = df['film_date'].apply(lambda it: it.year)
df['published_year'] = df['published_date'].apply(lambda it: it.year)

# Filter dataset
print(f"Total talks between 2010-2016: {len(df.loc[(df['film_year'] >= 2010) & (df['film_year'] <= 2016), :])}")

# We will use videos between 2010-2016
df = df.loc[(df['film_year'] >= 2010) & (df['film_year'] <= 2016), :]
# Num_speakers == 1
df = df.loc[df['num_speaker'] == 1, :]
print(len(df))
df

Total talks between 2010-2016: 1796
1758


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,film_year,published_year
614,1137,Sharing powerful stories from his anti-obesity...,1313,TED2010,2010-02-20 02:00:00,49,Jamie Oliver,Jamie Oliver: Teach every child about food,1,2010-02-11 17:36:00,"[{'id': 23, 'name': 'Jaw-dropping', 'count': 1...","[{'id': 10, 'hero': 'https://pe.tedcdn.com/ima...","Chef, activist","['business', 'education', 'food', 'global issu...",Teach every child about food,https://www.ted.com/talks/jamie_oliver\n,7638978,2010,2010
615,209,"In a demo that drew gasps at TED2010, Blaise A...",465,TED2010,2010-02-11 02:00:00,28,Blaise Agüera y Arcas,Blaise Agüera y Arcas: Augmented-reality maps,1,2010-02-13 11:54:00,"[{'id': 23, 'name': 'Jaw-dropping', 'count': 8...","[{'id': 129, 'hero': 'https://pe.tedcdn.com/im...",Software architect,"['cities', 'design', 'map', 'technology', 'vir...",Augmented-reality maps,https://www.ted.com/talks/blaise_aguera\n,1718568,2010,2010
616,250,The leader of Britain's Conservative Party say...,839,TED2010,2010-02-10 02:00:00,29,David Cameron,David Cameron: The next age of government,1,2010-02-15 16:23:00,"[{'id': 8, 'name': 'Informative', 'count': 233...","[{'id': 604, 'hero': 'https://pe.tedcdn.com/im...",Politician,"['business', 'global issues', 'politics']",The next age of government,https://www.ted.com/talks/david_cameron\n,656762,2010,2010
618,948,"At TED2010, Bill Gates unveils his vision for ...",1669,TED2010,2010-02-12 02:00:00,38,Bill Gates,Bill Gates: Innovating to zero!,1,2010-02-18 03:00:00,"[{'id': 21, 'name': 'Unconvincing', 'count': 1...","[{'id': 51, 'hero': 'https://pe.tedcdn.com/ima...",Philanthropist,"['TED Brain Trust', 'business', 'energy', 'glo...",Innovating to zero!,https://www.ted.com/talks/bill_gates\n,4329332,2010,2010
620,277,The land of the free has become a legal minefi...,1101,TED2010,2010-02-13 02:00:00,24,Philip K. Howard,Philip K. Howard: Four ways to fix a broken le...,1,2010-02-21 11:15:00,"[{'id': 23, 'name': 'Jaw-dropping', 'count': 1...","[{'id': 187, 'hero': 'https://pe.tedcdn.com/im...",Legal activist,"['business', 'design', 'health care', 'law']",Four ways to fix a broken legal system,https://www.ted.com/talks/philip_howard\n,610454,2010,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,18,Could it be wrong to help children in need by ...,791,TEDxSydney,2016-05-24 03:00:00,5,Tara Winkler,Tara Winkler: Why we need to end the era of or...,1,2017-08-29 23:00:46,"[{'id': 10, 'name': 'Inspiring', 'count': 119}...","[{'id': 1596, 'hero': 'https://pe.tedcdn.com/i...","Child protection leader, activist, author","['TEDx', 'activism', 'children', 'family', 'po...",Why we need to end the era of orphanages,https://www.ted.com/talks/tara_winkler_why_we_...,656113,2016,2017
2528,3,Digital archaeologist Chance Coughenour is usi...,717,TEDxHamburg,2016-06-08 03:00:00,5,Chance Coughenour,Chance Coughenour: How your pictures can help ...,1,2017-08-31 23:00:31,"[{'id': 9, 'name': 'Ingenious', 'count': 16}, ...","[{'id': 2673, 'hero': 'https://pe.tedcdn.com/i...",Digital archaeologist,"['TEDx', 'ancient world', 'archaeology', 'cons...",How your pictures can help reclaim lost history,https://www.ted.com/talks/chance_coughenour_ho...,539207,2016,2017
2529,45,"We all have origin stories and identity myths,...",1156,TEDxExeter,2015-04-24 03:00:00,1,Chetan Bhatt,Chetan Bhatt: Dare to refuse the origin myths ...,1,2017-09-01 17:29:03,"[{'id': 9, 'name': 'Ingenious', 'count': 24}, ...","[{'id': 2811, 'hero': 'https://pe.tedcdn.com/i...","Sociologist, human rights activist","['TEDx', 'community', 'humanity', 'identity', ...",Dare to refuse the origin myths that claim who...,https://www.ted.com/talks/chetan_bhatt_dare_to...,857850,2015,2017
2531,18,Terrorists and extremists aren't all naturally...,698,TEDxGhent,2016-06-18 03:00:00,1,Erin Marie Saltman,Erin Marie Saltman: How young people join viol...,1,2017-09-05 23:00:24,"[{'id': 8, 'name': 'Informative', 'count': 64}...","[{'id': 2309, 'hero': 'https://pe.tedcdn.com/i...",Policy researcher,"['TEDx', 'security', 'social media', 'terroris...",How young people join violent extremist groups...,https://www.ted.com/talks/erin_marie_saltman_h...,665328,2016,2017


In [5]:
# Generate target set
log_views = np.log(df['views'])
df['log_views_norm'] = (log_views - log_views.mean()) / log_views.std()


def map_to_label(value):
    if value < -0.5:
        return "low"
    elif value < 0.5:
        return "medium"
    else:
        return "high"

df['log_views_norm_cat'] = df['log_views_norm'].apply(map_to_label)

In [6]:
merged_metadata_df = pd.read_csv("../metadata/merged_metadata_popularity.csv")
merged_metadata_df

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,tags,title,url,views,transcript,filename,film_year,published_year,log_views_norm,log_views_norm_cat
0,1137,Sharing powerful stories from his anti-obesity...,1313,TED2010,2010-02-20 02:00:00,49,Jamie Oliver,Jamie Oliver: Teach every child about food,1,2010-02-11 17:36:00,...,"['business', 'education', 'food', 'global issu...",Teach every child about food,https://www.ted.com/talks/jamie_oliver,7638978,"Sadly, in the next 18 minutes when I do our ch...",2010-jamie-oliver-007-fallback-93948ab6f9d8306...,2010,2010,2.595681,high
1,209,"In a demo that drew gasps at TED2010, Blaise A...",465,TED2010,2010-02-11 02:00:00,28,Blaise Agüera y Arcas,Blaise Agüera y Arcas: Augmented-reality maps,1,2010-02-13 11:54:00,...,"['cities', 'design', 'map', 'technology', 'vir...",Augmented-reality maps,https://www.ted.com/talks/blaise_aguera,1718568,"About a year and a half ago, Stephen Lawler, w...",2010-blaise-aguera-y-arcas-016-fallback-a39ea7...,2010,2010,0.369330,normal
2,250,The leader of Britain's Conservative Party say...,839,TED2010,2010-02-10 02:00:00,29,David Cameron,David Cameron: The next age of government,1,2010-02-15 16:23:00,...,"['business', 'global issues', 'politics']",The next age of government,https://www.ted.com/talks/david_cameron,656762,"Someone once said that politics is, of course,...",2010-david-cameron-009-fallback-e23a18509bb2a0...,2010,2010,-1.066266,low
3,948,"At TED2010, Bill Gates unveils his vision for ...",1669,TED2010,2010-02-12 02:00:00,38,Bill Gates,Bill Gates: Innovating to zero!,1,2010-02-18 03:00:00,...,"['TED Brain Trust', 'business', 'energy', 'glo...",Innovating to zero!,https://www.ted.com/talks/bill_gates,4329332,I'm going to talk today about energy and clima...,2010-bill-gates-006-fallback-69469ceb41a7e17ed...,2010,2010,1.748209,high
4,277,The land of the free has become a legal minefi...,1101,TED2010,2010-02-13 02:00:00,24,Philip K. Howard,Philip K. Howard: Four ways to fix a broken le...,1,2010-02-21 11:15:00,...,"['business', 'design', 'health care', 'law']",Four ways to fix a broken legal system,https://www.ted.com/talks/philip_howard,610454,I've always been interested in the relationshi...,2010-philip-k-howard-005-fallback-a9ddfb85ca66...,2010,2010,-1.175390,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1697,18,Could it be wrong to help children in need by ...,791,TEDxSydney,2016-05-24 03:00:00,5,Tara Winkler,Tara Winkler: Why we need to end the era of or...,1,2017-08-29 23:00:46,...,"['TEDx', 'activism', 'children', 'family', 'po...",Why we need to end the era of orphanages,https://www.ted.com/talks/tara_winkler_why_we_...,656113,These are some photos of me volunteering in a ...,2016x-tara-winkler-008-fallback-88253cf89ec47f...,2016,2017,-1.067742,low
1698,3,Digital archaeologist Chance Coughenour is usi...,717,TEDxHamburg,2016-06-08 03:00:00,5,Chance Coughenour,Chance Coughenour: How your pictures can help ...,1,2017-08-31 23:00:31,...,"['TEDx', 'ancient world', 'archaeology', 'cons...",How your pictures can help reclaim lost history,https://www.ted.com/talks/chance_coughenour_ho...,539207,Why do people deliberately destroy cultural he...,2016x-chance-coughenour-006-fallback-dbb5bbc79...,2016,2017,-1.360604,low
1699,45,"We all have origin stories and identity myths,...",1156,TEDxExeter,2015-04-24 03:00:00,1,Chetan Bhatt,Chetan Bhatt: Dare to refuse the origin myths ...,1,2017-09-01 17:29:03,...,"['TEDx', 'community', 'humanity', 'identity', ...",Dare to refuse the origin myths that claim who...,https://www.ted.com/talks/chetan_bhatt_dare_to...,857850,"I'm Chetan Bhatt and when I give my name, I'm ...",2015x-chetan-bhatt-004-fallback-e542bdf061f08d...,2015,2017,-0.667629,low
1700,18,Terrorists and extremists aren't all naturally...,698,TEDxGhent,2016-06-18 03:00:00,1,Erin Marie Saltman,Erin Marie Saltman: How young people join viol...,1,2017-09-05 23:00:24,...,"['TEDx', 'security', 'social media', 'terroris...",How young people join violent extremist groups...,https://www.ted.com/talks/erin_marie_saltman_h...,665328,"So in 2011, I altered my name so that I could ...",2016x-erin-marie-saltman-004-fallback-92f5c6aa...,2016,2017,-1.046927,low


In [9]:
# Concatenate with features
import json

def get_related_result(row):
    for entry in os.listdir('../modeling_api_results'):
        if not entry.endswith(".wav_summary.json"):
            continue
        
        if entry.split(".wav_summary.json")[0] == row['filename'].split(".wav")[0]:
            with open(os.path.join('../modeling_api_results', entry)) as f:
                return json.load(f)
            
    # print(f"File {row['url']} not found in modeling_api_results")
    return None


features_cols = {
    "emotion_angry": [],
    "emotion_happy": [],
    "emotion_neutral": [],
    "emotion_sad": [],
    "strength_weak": [],
    "strength_neutral": [],
    "strength_strong": [],
    "positivity_negative": [],
    "positivity_neutral": [],
    "positivity_positive": [],
}

for idx, row in merged_metadata_df.iterrows():
    try:
        # Modeling api result
        result = get_related_result(row)
    
        # Aggregate with Thodoris script
        # TODO
        for k, v in features_cols.items():
            task, cls = k.split("_")
            
            if result is None:
                v.append(pd.NA)
            else:
                total = np.sum(list(result[task].values()))
                value = result[task].get(cls, 0)
                v.append(value/total)
    except Exception:
        print(result, row['filename'])
        for k, v in features_cols.items():
            v.append(pd.NA)
            
for k, v in features_cols.items():
    merged_metadata_df[k] = v
    
features_metadata = merged_metadata_df.dropna()
features_metadata

{'diarization': {'UNKNOWN': 12}, 'asr': {' you': 3, ' Bye!': 3, ' Oh, oh, oh.': 1, ' Thanks for watching!': 1, " I'll see you next time.": 1, ' Bye for now.': 1, ' See you soon.': 1, ' Okay.': 1}} 2011s-onyx-ashanti-014-fallback-f8e55d8b759b83e36fb3fc4aeb7f205a-1200k.wav


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,emotion_angry,emotion_happy,emotion_neutral,emotion_sad,strength_weak,strength_neutral,strength_strong,positivity_negative,positivity_neutral,positivity_positive
0,1137,Sharing powerful stories from his anti-obesity...,1313,TED2010,2010-02-20 02:00:00,49,Jamie Oliver,Jamie Oliver: Teach every child about food,1,2010-02-11 17:36:00,...,0.373418,0.335443,0.278481,0.012658,0.006329,0.341772,0.651899,0.221519,0.746835,0.031646
1,209,"In a demo that drew gasps at TED2010, Blaise A...",465,TED2010,2010-02-11 02:00:00,28,Blaise Agüera y Arcas,Blaise Agüera y Arcas: Augmented-reality maps,1,2010-02-13 11:54:00,...,0.0,0.233333,0.766667,0.0,0.033333,0.883333,0.083333,0.0,0.933333,0.066667
2,250,The leader of Britain's Conservative Party say...,839,TED2010,2010-02-10 02:00:00,29,David Cameron,David Cameron: The next age of government,1,2010-02-15 16:23:00,...,0.008696,0.026087,0.965217,0.0,0.0,0.947826,0.052174,0.0,0.991304,0.008696
3,948,"At TED2010, Bill Gates unveils his vision for ...",1669,TED2010,2010-02-12 02:00:00,38,Bill Gates,Bill Gates: Innovating to zero!,1,2010-02-18 03:00:00,...,0.09542,0.167939,0.736641,0.0,0.003817,0.469466,0.526718,0.068702,0.923664,0.007634
4,277,The land of the free has become a legal minefi...,1101,TED2010,2010-02-13 02:00:00,24,Philip K. Howard,Philip K. Howard: Four ways to fix a broken le...,1,2010-02-21 11:15:00,...,0.0125,0.03125,0.925,0.03125,0.0,0.875,0.125,0.025,0.975,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,82,"Onstage at TED2012, Jack Choi demonstrates a p...",396,TED2012,2012-02-29 02:00:00,31,Jack Choi,Jack Choi: On the virtual dissection table,1,2012-04-05 18:14:11,...,0.0,0.041667,0.958333,0.0,0.166667,0.833333,0.0,0.0,0.895833,0.104167
492,177,TED Fellow Lucy McRae is a body architect -- s...,239,TED2012,2012-02-27 02:00:00,41,Lucy McRae,Lucy McRae: How can technology transform the h...,1,2012-04-06 18:40:36,...,0.0,0.0,1.0,0.0,0.0,0.928571,0.071429,0.0,0.928571,0.071429
493,419,A skyrocketing demand for food means that agri...,1066,TEDxTC,2010-10-13 03:00:00,25,Jonathan Foley,Jonathan Foley: The other inconvenient truth,1,2012-04-08 17:00:31,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
494,168,"""Secrets can take many forms -- they can be sh...",684,TED2012,2012-02-29 02:00:00,40,Frank Warren,Frank Warren: Half a million secrets,1,2012-04-09 18:08:11,...,0.0,0.192308,0.807692,0.0,0.0,0.961538,0.038462,0.0,0.961538,0.038462


In [11]:
import plotly.express as px

topic_metadata = features_metadata.loc[:, :]


active_feats = ['emotion_happy', 'strength_strong', 'positivity_positive']
sum_of_feats = np.sum([topic_metadata[k] for k in active_feats], axis=0)

X = np.asarray(sum_of_feats).astype(np.float64)
Y = np.asarray(topic_metadata['log_views_norm']).astype(np.float64)

idx = np.where(X > 0)[0]
X = X[idx]
Y = Y[idx]

fig = px.scatter(x=X, y=Y, trendline="ols")
fig.data[1].line.color = 'red'
fig.show()