In [38]:
import ast
import os

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from collections import Counter
import statsmodels
import datetime
from sklearn.model_selection import train_test_split

df = pd.read_csv("ted_main.csv")
df['film_date'] = df['film_date'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
df['published_date'] = df['published_date'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
df['film_year'] = df['film_date'].apply(lambda it: it.year)
df['published_year'] = df['published_date'].apply(lambda it: it.year)

# Filter dataset
print(f"Total talks between 2010-2016: {len(df.loc[(df['film_year'] >= 2010) & (df['film_year'] <= 2016), :])}")

# We will use videos between 2010-2016
df = df.loc[(df['film_year'] >= 2010) & (df['film_year'] <= 2016), :]
# Num_speakers == 1
df = df.loc[df['num_speaker'] == 1, :]
print(len(df))
df

Total talks between 2010-2016: 1796
1758


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,film_year,published_year
614,1137,Sharing powerful stories from his anti-obesity...,1313,TED2010,2010-02-20 02:00:00,49,Jamie Oliver,Jamie Oliver: Teach every child about food,1,2010-02-11 17:36:00,"[{'id': 23, 'name': 'Jaw-dropping', 'count': 1...","[{'id': 10, 'hero': 'https://pe.tedcdn.com/ima...","Chef, activist","['business', 'education', 'food', 'global issu...",Teach every child about food,https://www.ted.com/talks/jamie_oliver\n,7638978,2010,2010
615,209,"In a demo that drew gasps at TED2010, Blaise A...",465,TED2010,2010-02-11 02:00:00,28,Blaise Agüera y Arcas,Blaise Agüera y Arcas: Augmented-reality maps,1,2010-02-13 11:54:00,"[{'id': 23, 'name': 'Jaw-dropping', 'count': 8...","[{'id': 129, 'hero': 'https://pe.tedcdn.com/im...",Software architect,"['cities', 'design', 'map', 'technology', 'vir...",Augmented-reality maps,https://www.ted.com/talks/blaise_aguera\n,1718568,2010,2010
616,250,The leader of Britain's Conservative Party say...,839,TED2010,2010-02-10 02:00:00,29,David Cameron,David Cameron: The next age of government,1,2010-02-15 16:23:00,"[{'id': 8, 'name': 'Informative', 'count': 233...","[{'id': 604, 'hero': 'https://pe.tedcdn.com/im...",Politician,"['business', 'global issues', 'politics']",The next age of government,https://www.ted.com/talks/david_cameron\n,656762,2010,2010
618,948,"At TED2010, Bill Gates unveils his vision for ...",1669,TED2010,2010-02-12 02:00:00,38,Bill Gates,Bill Gates: Innovating to zero!,1,2010-02-18 03:00:00,"[{'id': 21, 'name': 'Unconvincing', 'count': 1...","[{'id': 51, 'hero': 'https://pe.tedcdn.com/ima...",Philanthropist,"['TED Brain Trust', 'business', 'energy', 'glo...",Innovating to zero!,https://www.ted.com/talks/bill_gates\n,4329332,2010,2010
620,277,The land of the free has become a legal minefi...,1101,TED2010,2010-02-13 02:00:00,24,Philip K. Howard,Philip K. Howard: Four ways to fix a broken le...,1,2010-02-21 11:15:00,"[{'id': 23, 'name': 'Jaw-dropping', 'count': 1...","[{'id': 187, 'hero': 'https://pe.tedcdn.com/im...",Legal activist,"['business', 'design', 'health care', 'law']",Four ways to fix a broken legal system,https://www.ted.com/talks/philip_howard\n,610454,2010,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,18,Could it be wrong to help children in need by ...,791,TEDxSydney,2016-05-24 03:00:00,5,Tara Winkler,Tara Winkler: Why we need to end the era of or...,1,2017-08-29 23:00:46,"[{'id': 10, 'name': 'Inspiring', 'count': 119}...","[{'id': 1596, 'hero': 'https://pe.tedcdn.com/i...","Child protection leader, activist, author","['TEDx', 'activism', 'children', 'family', 'po...",Why we need to end the era of orphanages,https://www.ted.com/talks/tara_winkler_why_we_...,656113,2016,2017
2528,3,Digital archaeologist Chance Coughenour is usi...,717,TEDxHamburg,2016-06-08 03:00:00,5,Chance Coughenour,Chance Coughenour: How your pictures can help ...,1,2017-08-31 23:00:31,"[{'id': 9, 'name': 'Ingenious', 'count': 16}, ...","[{'id': 2673, 'hero': 'https://pe.tedcdn.com/i...",Digital archaeologist,"['TEDx', 'ancient world', 'archaeology', 'cons...",How your pictures can help reclaim lost history,https://www.ted.com/talks/chance_coughenour_ho...,539207,2016,2017
2529,45,"We all have origin stories and identity myths,...",1156,TEDxExeter,2015-04-24 03:00:00,1,Chetan Bhatt,Chetan Bhatt: Dare to refuse the origin myths ...,1,2017-09-01 17:29:03,"[{'id': 9, 'name': 'Ingenious', 'count': 24}, ...","[{'id': 2811, 'hero': 'https://pe.tedcdn.com/i...","Sociologist, human rights activist","['TEDx', 'community', 'humanity', 'identity', ...",Dare to refuse the origin myths that claim who...,https://www.ted.com/talks/chetan_bhatt_dare_to...,857850,2015,2017
2531,18,Terrorists and extremists aren't all naturally...,698,TEDxGhent,2016-06-18 03:00:00,1,Erin Marie Saltman,Erin Marie Saltman: How young people join viol...,1,2017-09-05 23:00:24,"[{'id': 8, 'name': 'Informative', 'count': 64}...","[{'id': 2309, 'hero': 'https://pe.tedcdn.com/i...",Policy researcher,"['TEDx', 'security', 'social media', 'terroris...",How young people join violent extremist groups...,https://www.ted.com/talks/erin_marie_saltman_h...,665328,2016,2017


In [3]:
# Generate target set
log_views = np.log(df['views'])
df['log_views_norm'] = (log_views - log_views.mean()) / log_views.std()


def map_to_label(value):
    if value < -1.5:
        return "very_low"
    elif value < -0.5:
        return "low"
    elif value < 0.5:
        return "medium"
    elif value < 1.5:
        return "high"
    else:
        return "very_high"


df['log_views_norm_cat'] = df['log_views_norm'].apply(map_to_label)

In [41]:
# Concatenate with features
import json

url_filenames = pd.read_csv('url_filenames_all.txt', names=['url', 'filename'], header=None)

def get_related_result(row):
    related_filenames = url_filenames.loc[url_filenames['url'] == row['url'].split("\n")[0], 'filename']
    if not len(related_filenames):
        print(f"No matching for file {row['url']}")
        return None
    else:
        related_filename = related_filenames.iloc[0]
    
    for entry in os.listdir('modeling_api_results'):
        if entry.split(".json")[0] == related_filename:
            with open(os.path.join('modeling_api_results', entry)) as f:
                return json.load(f)
            
    print(f"File {row['url']} not found in modeling_api_results")
    return None


for idx, row in df.iterrows():
    # Modeling api result
    result = get_related_result(row)

    # Aggregate with Thodoris script
    # TODO
    

No matching for file https://www.ted.com/talks/jesse_schell_when_games_invade_real_life

No matching for file https://www.ted.com/talks/a_choir_as_big_as_the_internet
No matching for file https://www.ted.com/talks/jeff_bezos_gifts_vs_choices

No matching for file https://www.ted.com/talks/jeremy_rifkin_on_the_empathic_civilization
No matching for file https://www.ted.com/talks/joel_burns_tells_gay_teens_it_gets_better
No matching for file https://www.ted.com/talks/ken_robinson_changing_education_paradigms
No matching for file https://www.ted.com/talks/gel_gotta_share

No matching for file https://www.ted.com/talks/emiliano_salinas_a_civil_response_to_violence

No matching for file https://www.ted.com/talks/lucianne_walkowicz_finding_planets_around_other_stars

No matching for file https://www.ted.com/talks/sasha_dichter
No matching for file https://www.ted.com/talks/iain_mcgilchrist_the_divided_brain

No matching for file https://www.ted.com/talks/lucianne_walkowicz_look_up_for_a_chang