## 0 - Packages

In [31]:
from datetime import date
from plotly.subplots import make_subplots

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

## 1 - Read Data

In [19]:
# Load the TED talks export; the path is relative to the notebook's folder.
csv_path = '../data/ted-talks.csv'
df = pd.read_csv(csv_path)

# Print column dtypes / non-null counts, then preview the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5440 entries, 0 to 5439
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5440 non-null   object
 1   author  5439 non-null   object
 2   date    5440 non-null   object
 3   views   5440 non-null   int64 
 4   likes   5440 non-null   int64 
 5   link    5440 non-null   object
dtypes: int64(2), object(4)
memory usage: 255.1+ KB


Unnamed: 0,title,author,date,views,likes,link
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,https://ted.com/talks/sydney_iaukea_the_dark_h...
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,https://ted.com/talks/martin_reeves_how_play_c...
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,https://ted.com/talks/mahendra_singhi_cement_s...


## 2 - Clean Data

In [32]:
# Discard every row that has at least one missing value (spelled out here;
# these are the defaults of `dropna`).
df = df.dropna(axis=0, how='any')

# Re-check the non-null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5439 entries, 0 to 5439
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   title   5439 non-null   object        
 1   author  5439 non-null   object        
 2   date    5439 non-null   datetime64[ns]
 3   views   5439 non-null   int64         
 4   likes   5439 non-null   int64         
 5   link    5439 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 297.4+ KB


In [34]:
# Parse the textual dates into pandas timestamps and store them back in the
# same column.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

# Confirm the new dtype, then show the parsed column itself.
df.info()
df['date']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5439 entries, 0 to 5439
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   title   5439 non-null   object        
 1   author  5439 non-null   object        
 2   date    5439 non-null   datetime64[ns]
 3   views   5439 non-null   int64         
 4   likes   5439 non-null   int64         
 5   link    5439 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 297.4+ KB


0      2021-12-01
1      2022-02-01
2      2021-09-01
3      2021-10-01
4      2021-10-01
          ...    
5435   2006-02-01
5436   2006-02-01
5437   2006-02-01
5438   2006-02-01
5439   2006-02-01
Name: date, Length: 5439, dtype: datetime64[ns]

## 3 - The most popular TED talks

In [5]:
# The 20 most-viewed talks, kept in ascending order of views so that the
# most-viewed talk is the last row. With a horizontal bar chart the first
# category is drawn at the bottom, so the last row ends up on top.
top_views = df.sort_values('views').tail(20)

# NOTE: `category_orders` expects a dict mapping a column name to an ordered
# list of that column's values. The list that was passed here previously did
# not match that form, so the argument is dropped and the row order above
# determines the bar order.
px.bar(top_views, x='views', y='title', title='Top TED talks by views')

## 4 - Finding TED talks with the best view to like ratio

In [6]:
# Views per like for every talk. Talks without any likes are masked to NaN
# before dividing: otherwise the division yields inf (or NaN for 0/0), and
# since those values sort to the end they would crowd real talks out of the
# "top 20" slice taken below.
likes_nonzero = df['likes'].where(df['likes'] > 0)
top_view_like_ratio = pd.concat(
    [df['title'], (df['views'] / likes_nonzero).rename('view_like_ratio')],
    axis=1,
).dropna(subset=['view_like_ratio'])

# Keep the 20 largest ratios in ascending order, so the largest is the last
# row and is drawn as the top bar of the horizontal chart.
top_view_like_ratio = top_view_like_ratio.sort_values('view_like_ratio').tail(20)

# NOTE: `category_orders` expects a dict of column name -> ordered values;
# the list passed previously did not match that form, so it is dropped and
# the row order above determines the bar order.
px.bar(
    top_view_like_ratio,
    x='view_like_ratio', y='title',
    title='Top TED talks by view to like ratio'
)

## 5 - The most popular speakers

### in terms of video counts

In [7]:
# Number of talks per author. Only the `views` column is counted instead of
# aggregating every column and discarding all but one afterwards.
talks_per_author = df.groupby('author')['views'].count()

# The 20 authors with the most talks, ascending so the busiest speaker is the
# last row (drawn as the top bar). Resulting columns: `author`, `count`.
top_speakers_byCount = (
    talks_per_author
    .sort_values()
    .tail(20)
    .rename('count')
    .reset_index()
)

# NOTE: `category_orders` expects a dict of column name -> ordered values;
# the list passed previously did not match that form, so it is dropped.
px.bar(top_speakers_byCount, x='count', y='author', title='Top speakers')

### in terms of views

In [8]:
# Total views (and likes) per author. The sum is restricted to the numeric
# columns: summing the whole frame would also try to add up the datetime
# `date` column and the text columns, which is meaningless and is rejected
# by recent pandas versions.
views_per_author = df.groupby('author')[['views', 'likes']].sum()

# The 20 authors with the most total views, ascending so the most-viewed
# speaker is the last row (drawn as the top bar).
top_speakers_byViews = views_per_author.sort_values('views').tail(20).reset_index()

# NOTE: `category_orders` expects a dict of column name -> ordered values;
# the list passed previously did not match that form, so it is dropped.
px.bar(top_speakers_byViews, x='views', y='author', title='Top speakers')

## 6 - Month-wise Analysis of TED talk frequency

#### Video counts per upload month

In [9]:
# Calendar month (1-12) of each talk's date, used as the grouping key.
upload_month = df['date'].dt.month

# Count the talks that fall in each month; the group key keeps the name
# `date` and the counted column the name `views`, so both are relabelled.
monthly_counts = df.groupby(upload_month)['views'].count()
videos_per_month = monthly_counts.reset_index().rename(
    columns={'date': 'month', 'views': 'count'}
)

videos_per_month.head()

#### Average views per upload month

In [10]:
# Mean number of views of the talks dated in each calendar month (1-12).
upload_month = df['date'].dt.month
mean_views_by_month = df.groupby(upload_month)['views'].mean()

# Turn the group key back into a column and label it `month`.
views_per_upload_month = mean_views_by_month.reset_index().rename(columns={'date': 'month'})

views_per_upload_month.head()

#### Plot

In [11]:
# Two stacked line charts sharing the same month axis layout:
#   row 1 - number of talks per upload month
#   row 2 - average views per upload month
fig = make_subplots(rows=2, cols=1, subplot_titles=("Video uploads per month", "Average views per upload month"))

# One tick per calendar month, reused by both subplots.
month_ticks = np.arange(1, 13)

# `add_trace` with row/col replaces the deprecated `append_trace`.
fig.add_trace(go.Scatter(
    x=videos_per_month['month'],
    y=videos_per_month['count'],
    name='counts'
), row=1, col=1)

fig.update_xaxes(title_text="month", row=1, col=1, tickvals=month_ticks)
fig.update_yaxes(title_text="count", row=1, col=1)

fig.add_trace(go.Scatter(
    x=views_per_upload_month['month'],
    y=views_per_upload_month['views'],
    name='views'
), row=2, col=1)

fig.update_xaxes(title_text="month", row=2, col=1, tickvals=month_ticks)
fig.update_yaxes(title_text="views", row=2, col=1)

fig.show()

## 7 - Year-wise Analysis of TED talk frequency

#### Video counts per upload year

In [12]:
# Year of each talk's date, used as the grouping key.
upload_year = df['date'].dt.year

# Count the talks per year; the group key keeps the name `date` and the
# counted column the name `views`, so both are relabelled.
yearly_counts = df.groupby(upload_year)['views'].count()
videos_per_year = yearly_counts.reset_index().rename(
    columns={'date': 'year', 'views': 'count'}
)

# Only years after 2000 are kept.
after_2000 = videos_per_year['year'] > 2000
videos_per_year = videos_per_year[after_2000]

videos_per_year.head()

#### Total views per upload year

In [13]:
# Total views of the talks dated in each year.
upload_year = df['date'].dt.year
total_views_by_year = df.groupby(upload_year)['views'].sum()

# Turn the group key back into a column and label it `year`.
views_per_upload_year = total_views_by_year.reset_index().rename(columns={'date': 'year'})

# Only years after 2000 are kept.
after_2000 = views_per_upload_year['year'] > 2000
views_per_upload_year = views_per_upload_year[after_2000]

views_per_upload_year.head()

#### Plot

In [14]:
# Two stacked line charts sharing the same year axis layout:
#   row 1 - number of talks per upload year
#   row 2 - total views per upload year
fig = make_subplots(rows=2, cols=1, subplot_titles=("Video uploads per year", "Total views per upload year"))

# One tick per year from 1970 through 2022, reused by both subplots.
year_ticks = np.arange(1970, 2023)

# `add_trace` with row/col replaces the deprecated `append_trace`.
fig.add_trace(go.Scatter(
    x=videos_per_year['year'],
    y=videos_per_year['count'],
    name='counts'
), row=1, col=1)

fig.update_xaxes(title_text="year", row=1, col=1, tickvals=year_ticks, tickangle=45)
fig.update_yaxes(title_text="count", row=1, col=1)

fig.add_trace(go.Scatter(
    x=views_per_upload_year['year'],
    y=views_per_upload_year['views'],
    name='views'
), row=2, col=1)

fig.update_xaxes(title_text="year", row=2, col=1, tickvals=year_ticks, tickangle=45)
fig.update_yaxes(title_text="views", row=2, col=1)

fig.show()

## 8 - Finding TED talks of your favorite author