In [1]:
import pandas as pd
import numpy as np
import re
import networkx as nx
import itertools
import matplotlib.pyplot as plt

In [2]:
# Load the raw talk and user records from the JSON files in the local
# Datasets/ folder (paths are relative to the notebook's working directory).
talks =  pd.read_json("Datasets/talks.json")
users = pd.read_json("Datasets/users.json")

In [3]:
talks.head()

Unnamed: 0,id,film_date,publish_date,title,speaker,ted_event,description,related_tags,related_themes,related_videos,views,comments,transcript,url
0,062dd0f773cd5999a09714a371e1f8017163e2a1,May 2012,Sep 2012,[Scott Fraser: Why eyewitnesses get it wrong],Scott Fraser,TEDxUSC,[Scott Fraser studies how humans remember crim...,"[Crime, Global issues, Government, Law, Memory...",[How the Mind Works],[Bryan Stevenson: We need to talk about an inj...,21025,[{'user_id': '3b6dbeb2375b632ad5f9dc9d15a9aa94...,The murder happened a little over 21 years ago...,http://www.ted.com/talks/scott_fraser_the_prob...
1,62f6479a5eca39725798b1ee300bd8d5de3a4ae3,Dec 2011,Jun 2012,[Jon Nguyen: Tour the solar system from home],Jon Nguyen,TEDxSanDiego,[Want to navigate the solar system without hav...,"[Demo, Exploration, Science, Software, Space, ...",[Peering into Space],[Carter Emmart demos a 3D atlas of the universe ],220795,[{'user_id': '7cb24a132dd5f8fd6c2d0c8e248b9708...,"As a kid, I was fascinated with all things air...",http://www.ted.com/talks/jon_nguyen_tour_the_s...
2,b35c0cd294cd10748019833cafa625fc33487065,Apr 2012,Jun 2012,[Wolfgang Kessling: How to air-condition outdo...,Wolfgang Kessling,TEDxSummit,"[During the hot summer months, watching an out...","[Entertainment, Environment, Global issues, In...","[Design That Matters, Presentation Innovation,...","[Peter Diamandis: Abundance is our future, Amy...",230813,[{'user_id': '658b3da5aefb6a2ede53560d8ea4d960...,Good evening. We are in this wonderful open-ai...,http://www.ted.com/talks/wolfgang_kessling_how...
3,0fa6bca242ccb96697e8de570882c6b38746591a,Apr 2012,Jun 2012,[Rives: Reinventing the encyclopedia game],Rives,TEDxSummit,[Prompted by the Encyclopaedia Britannica endi...,"[Entertainment, Internet, Science, Web, Wikipe...","[Master Storytellers, How We Learn]","[Rives: A story of mixed emoticons, A.J. Jacob...",200203,[{'user_id': '75adf5391c6cd74a190e5259e6caafeb...,"So, last month, the Encyclopaedia Britannica a...",http://www.ted.com/talks/rives_reinventing_the...
4,41db62481aeb978fd13f591755b596ff0616be70,Jun 2012,Jun 2012,[Massimo Banzi: How Arduino is open-sourcing i...,Massimo Banzi,TEDGlobal 2012,"[Massimo Banzi helped invent the Arduino, a ti...","[Creativity, Open-source, Robots, Technology]","[Tales of Invention, Art Unusual]",[Kate Hartman: The art of wearable communicati...,358737,[{'user_id': '520b4877fcc4047921af318801a168d3...,"So a few weeks ago, a friend of mine gave this...",http://www.ted.com/talks/massimo_banzi_how_ard...


In [4]:
users.head()

Unnamed: 0,user_id,favorites
0,e57cec766488c5a72d02dd6bcdbd1d67201ddc7f,"[Adam Ostrow: After your final status update, ..."
1,4c3e7cf74b5c596cf234e9055a436a23d32cb1b7,"[David S. Rose on pitching to VCs, Elizabeth G..."
2,394723943ac2a83beb72c860d77a8eca22087185,[Temple Grandin: The world needs all kinds of ...
3,a2715f02d578bfc667e0fb4691f5a5b1572b9b2e,"[Richard St. John's 8 secrets of success, Geve..."
4,2c0871325f6f3e10bdeee9059d7a2e745929f702,"[Richard St. John: ""Success is a continuous jo..."


In [6]:
type(talks['title'][0]) is list

True

In [16]:
def remove_col_array(df, col_name):
    """Unwrap the list-valued cells of one column, keeping each list's first element.

    Cells that are not lists are kept unchanged. An empty list has no first
    element and is replaced by ``None`` instead of raising ``IndexError``.

    :param df: the dataframe to clean; its column is overwritten in place
    :param col_name: name of the column whose cells may be wrapped in a list
    :type df: pandas.core.frame.DataFrame
    :type col_name: str
    :return: the same ``df`` object, with ``col_name`` unwrapped
    :rtype: pandas.core.frame.DataFrame
    """
    def _first_or_value(cell):
        # Only lists are unwrapped; strings, numbers, NaN, ... pass through untouched.
        if isinstance(cell, list):
            return cell[0] if cell else None
        return cell

    # Iterate over the values themselves (not over labels 0..n-1) so the
    # function also works when the frame does not have a default integer index.
    # Assigning a plain list is positional, so the row order is preserved.
    df[col_name] = [_first_or_value(cell) for cell in df[col_name]]
    return df

In [17]:
# title and description cells are list-valued; replace each with its first element
talks = remove_col_array(talks, 'title')
talks = remove_col_array(talks, 'description')

In [18]:
# check duplicated talks with same titles but different ids

len(talks['title'].unique())

1424

In [19]:
len(talks['id'].unique())

1203

In [20]:
len(talks)

2352

### Extract features from the talks dataset over time and tags

In [96]:
# Discard the 22 talks whose publish date is an empty string and renumber the rows
talks = talks[talks['publish_date'] != ''].reset_index(drop=True)

# Parse the date strings and keep month resolution only
talks['publish_date'] = pd.to_datetime(talks['publish_date']).dt.to_period('M')

# Number of words in the transcript, used as a stand-in for the talk's 'duration'
talks['word_cnt'] = [len(re.findall(r'\w+', text)) for text in talks['transcript']]

# The raw transcript and comments are not needed past this point
talks = talks.drop(columns=['transcript', 'comments'])

In [83]:
def split_row(data, column):
    """Split a list-valued column so that every list element gets its own row.

    Every other column is repeated once per list element. Column order is
    kept, each repeated column keeps its own dtype, and the result gets a
    fresh 0..n-1 index. Rows whose list is empty produce no output rows, and
    an empty input frame yields an empty frame with the same columns.

    :param data: the dataframe
    :param column: name of the list-valued column to split
    :type data: pandas.core.frame.DataFrame
    :type column: str
    :return: a new dataframe with one row per list element; ``data`` is not modified
    :rtype: pandas.core.frame.DataFrame
    """
    # Number of elements in each row's list (explicit dtype keeps this valid for 0 rows).
    row_len = np.fromiter(
        (len(cell) for cell in data[column]), dtype=np.intp, count=len(data)
    )

    # Position of the source row for every output row, e.g. lengths [2, 0, 1] -> [0, 0, 2].
    src_pos = np.repeat(np.arange(len(data)), row_len)

    # Repeat the remaining columns through pandas rather than stacking them into
    # one NumPy array: stacking forces a single common dtype on all columns and
    # can, for instance, turn integers into strings.
    out = data.drop(columns=[column]).iloc[src_pos].reset_index(drop=True)

    # Flatten the lists and put the column back at its original position.
    flat = [item for cell in data[column] for item in cell]
    out.insert(data.columns.get_loc(column), column, flat)
    return out

In [101]:
# split related_tags into separate rows (one row per talk/tag pair)
talk_tags=split_row(talks, column='related_tags')

In [100]:
np.sort(talks['publish_date'].unique())

array([Period('2006-06', 'M'), Period('2006-07', 'M'),
       Period('2006-08', 'M'), Period('2006-09', 'M'),
       Period('2006-10', 'M'), Period('2006-11', 'M'),
       Period('2006-12', 'M'), Period('2007-01', 'M'),
       Period('2007-02', 'M'), Period('2007-04', 'M'),
       Period('2007-05', 'M'), Period('2007-06', 'M'),
       Period('2007-07', 'M'), Period('2007-08', 'M'),
       Period('2007-09', 'M'), Period('2007-10', 'M'),
       Period('2007-11', 'M'), Period('2007-12', 'M'),
       Period('2008-01', 'M'), Period('2008-02', 'M'),
       Period('2008-03', 'M'), Period('2008-04', 'M'),
       Period('2008-05', 'M'), Period('2008-06', 'M'),
       Period('2008-07', 'M'), Period('2008-08', 'M'),
       Period('2008-09', 'M'), Period('2008-10', 'M'),
       Period('2008-11', 'M'), Period('2008-12', 'M'),
       Period('2009-01', 'M'), Period('2009-02', 'M'),
       Period('2009-03', 'M'), Period('2009-04', 'M'),
       Period('2009-05', 'M'), Period('2009-06', 'M'),
       Per

In [103]:
# drop the columns that the per-month / per-tag aggregation does not use
talks_feature_cnt=talk_tags.drop(columns=['film_date','title','ted_event','description','related_themes','related_videos','url'])

In [110]:
# For every (publish month, tag) pair: total views, total transcript words
# and the number of talk rows in the group.
feature_cnt = (
    talks_feature_cnt
    .groupby(['publish_date', 'related_tags'], as_index=False)
    .agg(
        view_cnt=("views", "sum"),
        word_cnt=("word_cnt", "sum"),
        publish_cnt=("word_cnt", "count"),  # counts the non-null word_cnt entries
    )
)

In [113]:
# average views per talk within each (month, tag) group, rounded up to a whole number
feature_cnt['view_per_video']=np.ceil(feature_cnt['view_cnt']/feature_cnt['publish_cnt'])

In [114]:
feature_cnt.head()

Unnamed: 0,publish_date,related_tags,view_cnt,word_cnt,publish_cnt,view_per_video
0,2006-06,Activism,1052541,6344,2,526271.0
1,2006-06,Africa,8380681,6506,2,4190341.0
2,2006-06,Alternative energy,2918261,4392,2,1459131.0
3,2006-06,Asia,8380681,6506,2,4190341.0
4,2006-06,Business,5389526,16634,4,1347382.0


In [120]:
feature_cnt.describe()

Unnamed: 0,view_cnt,word_cnt,publish_cnt,view_per_video
count,4081.0,4081.0,4081.0,4081.0
mean,1851855.0,7703.489341,3.312178,570908.8
std,2524340.0,7733.225622,2.912207,758575.6
min,21025.0,0.0,1.0,21025.0
25%,540345.0,3122.0,2.0,252754.0
50%,974381.0,5724.0,2.0,382591.0
75%,2034498.0,8817.0,4.0,606395.0
max,29469800.0,107293.0,36.0,11107280.0


In [191]:
# save the per-month, per-tag aggregates; the row index is not written
feature_cnt.to_csv('talks_feature_cnt.csv',index=False)

In [121]:
len(feature_cnt[feature_cnt['view_per_video']>600000]['related_tags'].unique())

220

### Construct topic derivation network

In [170]:
# add edges to the network, where node i -> node j means topic i derives topic j
def topic_derivation_network(df, nodes):
    """
    Construct the topic derivation network.

    Talks are processed month by month in ascending ``publish_date`` order.
    For every talk, each of its tags that already appeared in an earlier
    month is a source and each tag that did not is a target; a directed edge
    source -> target is added for every such pair. Tags first seen in a month
    only count as "already seen" from the following month on, so talks of the
    same month do not derive from each other. The earliest month therefore
    yields no edges without needing a hard-coded start date.

    :df: the dataframe contains the publish date ('publish_date') and
         related tags ('related_tags', one list of tags per talk)
    :nodes: all the tags form the node set
    :return: a networkx.DiGraph holding ``nodes`` plus the derived edges
    """
    # Construct an empty network and add all the nodes
    G = nx.DiGraph()
    G.add_nodes_from(nodes)

    # A dict is used as an insertion-ordered set, so duplicate edges collapse
    # while the edge order stays the same from run to run.
    edges = {}
    prev_tags = set()  # every tag seen in strictly earlier months

    # sort=True walks the months in ascending order; row order inside a month is kept
    for _, month_df in df.groupby('publish_date', sort=True):
        month_tags = set()
        for tags in month_df['related_tags']:
            source = [tag for tag in tags if tag in prev_tags]
            target = [tag for tag in tags if tag not in prev_tags]
            for edge in itertools.product(source, target):
                edges[edge] = None
            month_tags.update(tags)
        # Only now do this month's tags become "previous" tags
        prev_tags |= month_tags

    G.add_edges_from(list(edges))

    return G

In [171]:
# Node set: every distinct tag found in the aggregated feature table
nodes = feature_cnt['related_tags'].unique().tolist()

# The network builder only needs each talk's publish month and its tag list
talks_tag_time = talks.loc[:, ['publish_date', 'related_tags']]

topic_net = topic_derivation_network(talks_tag_time, nodes)

In [185]:
len(topic_net.edges())

1677

In [198]:
# Earliest publish month of each tag, kept as an attribute of its node
tags_time = talk_tags.loc[:, ['related_tags', 'publish_date']]
earliest_date = tags_time.groupby('related_tags', as_index=False).min()

# Export the node table as CSV; the row index is written as the first column
nodes_df = earliest_date.rename(
    columns={"related_tags": "Name", "publish_date": "Earliest_date"}
)
nodes_df.to_csv('topic_net_nodes.csv', index=True)


In [199]:
nodes_df.head()

Unnamed: 0,Name,Earliest_date
0,AI,2007-05
1,AIDS,2006-10
2,Activism,2006-06
3,Adventure,2007-06
4,Advertising,2009-10


In [201]:
# Give every tag a numeric ID equal to its position in nodes_df['Name']
nodes_list = nodes_df['Name'].tolist()
node_dict = {name: node_id for node_id, name in enumerate(nodes_list)}

# Rewrite each (source tag, target tag) edge as a pair of node IDs
edges = [[node_dict[src], node_dict[dst]] for src, dst in topic_net.edges()]

edges_df = pd.DataFrame(edges, columns=['source', 'target'])
edges_df.to_csv('topic_net_edges.csv', index=False)

In [206]:
# Tabulate every tag's out-degree; tags with a high out-degree are treated as popular topics
out_deg_cnt = [[tag, deg] for tag, deg in topic_net.out_degree()]
out_deg_df = pd.DataFrame(out_deg_cnt, columns=['tag', 'out_deg'])

In [208]:
# rank the tags by out-degree (highest first) and save the ranking without the row index
out_deg_df=out_deg_df.sort_values(by=['out_deg'],ascending=False)
out_deg_df.to_csv('topic_net_deg_cnt.csv',index=False)