### 1. Loading the dataset

In [1]:
import numpy as np
import pandas as pd
# Location of the TED talks CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/ted-talks/data.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,title,author,date,views,likes,link
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,https://ted.com/talks/sydney_iaukea_the_dark_h...
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,https://ted.com/talks/martin_reeves_how_play_c...
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,https://ted.com/talks/mahendra_singhi_cement_s...


### 2. Dealing with Null & Duplicate Values
##### 2.1 Checking null values

In [2]:
# Count missing values in each column.
df.isna().sum()

title     0
author    1
date      0
views     0
likes     0
link      0
dtype: int64

##### 2.2 Removing null values

In [3]:
# Drop every row that has a missing value in any column.
# Rebinding instead of `inplace=True` keeps the step explicit and chainable.
df = df.dropna()

##### 2.3 Checking duplicate values

In [4]:
# Number of rows that are exact repeats of an earlier row.
print(df.duplicated().sum())

0


### 3. Exploratory Data Analysis

##### 3.1 Explore Views Column

In [5]:
# Summary statistics (min, max, total, truncated mean) for the views column.
arr = df['views'].to_numpy()

print('----- Views Analysis -----')
print('Minimun : ', arr.min())
print('Maximum : ', arr.max())
print('Total   : ', arr.sum())
print('Average : ', int(arr.mean()))  # int() truncates the mean to a whole number
print('--------------------------')

----- Views Analysis -----
Minimun :  1200
Maximum :  72000000
Total   :  11214972679
Average :  2061954
--------------------------


##### 3.2 Explore Likes Column

In [6]:
# Summary statistics (min, max, total, truncated mean) for the likes column.
arr = df['likes'].to_numpy()

print('----- Likes Analysis -----')
print('Minimun : ', arr.min())
print('Maximum : ', arr.max())
print('Total   : ', arr.sum())
print('Average : ', int(arr.mean()))  # int() truncates the mean to a whole number
print('--------------------------')

----- Likes Analysis -----
Minimun :  37
Maximum :  2100000
Total   :  340585420
Average :  62619
--------------------------


##### 3.3 Explore Titles Column

In [7]:
# Number of rows whose title repeats an earlier title (0 means all titles are unique).
print(df['title'].duplicated().sum())

0


##### 3.4 Explore Authors Column

In [8]:
# Count distinct author values; dropna=False matches len(unique()), which counts NaN as a value.
print('Number of authors : ', df['author'].nunique(dropna=False))

Number of authors :  4443


### 4. Data Cleaning & Feature Engineering
##### 4.1 Creating new columns from data | month & year

In [9]:
# Split the "Month YYYY" date strings into their space-separated parts in one
# vectorized pass, instead of calling str.split twice per row in a Python loop.
date_parts = df['date'].str.split(' ', expand=True)

df['month'] = date_parts[0]
# Store the year as an integer so it can be compared and grouped numerically.
df['year'] = date_parts[1].astype('int64')
df.head()

Unnamed: 0,title,author,date,views,likes,link,month,year
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...,December,2021
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,https://ted.com/talks/sydney_iaukea_the_dark_h...,February,2022
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,https://ted.com/talks/martin_reeves_how_play_c...,September,2021
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...,October,2021
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,https://ted.com/talks/mahendra_singhi_cement_s...,October,2021


##### 4.2 Cleaning link column

In [10]:
# Keep only the final path segment of each URL (the text after the last '/').
df['link'] = df['link'].str.rsplit('/', n=1).str[-1]
df.head()

Unnamed: 0,title,author,date,views,likes,link,month,year
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,ozawa_bineshi_albert_climate_action_needs_new_...,December,2021
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,sydney_iaukea_the_dark_history_of_the_overthro...,February,2022
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,martin_reeves_how_play_can_spark_new_ideas_for...,September,2021
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,james_k_thornton_why_is_china_appointing_judge...,October,2021
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,mahendra_singhi_cement_s_carbon_problem_and_2_...,October,2021


##### 4.3 Creating the view_to_like_ratio column

In [14]:
# Views per like for each talk, rounded to two decimal places.
df['view_to_like_ratio'] = df['views'].div(df['likes']).round(2)
df.head()

Unnamed: 0,title,author,date,views,likes,link,month,year,view_to_like_ratio
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,ozawa_bineshi_albert_climate_action_needs_new_...,December,2021,33.67
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,sydney_iaukea_the_dark_history_of_the_overthro...,February,2022,33.44
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,martin_reeves_how_play_can_spark_new_ideas_for...,September,2021,34.33
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,james_k_thornton_why_is_china_appointing_judge...,October,2021,35.58
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,mahendra_singhi_cement_s_carbon_problem_and_2_...,October,2021,33.33


##### 4.4 Removing year outliers

In [15]:
# Keep only talks dated 2001 or later; earlier years are treated as outliers.
df = df.query('year >= 2001')

### 5. Data Analysis
##### 5.1 Authors

5.1.1) Top 10 authors with the most TED talks

In [41]:
# Count talks per author and show the ten authors with the most talks.
(
    df.groupby(by='author')
      .size()
      .sort_values(ascending=False)
      .head(10)
)

author
Alex Gendler        45
Iseult Gillespie    33
Matt Walker         18
Alex Rosenthal      15
Elizabeth Cox       13
Emma Bryce          12
Juan Enriquez       11
Daniel Finkel       11
Greg Gage            9
Mona Chalabi         9
dtype: int64

5.1.2) Top 10 authors with the most total views on TED

In [17]:
# Total views per author, ten highest.
# Select the column before aggregating so only `views` is summed; summing the
# whole frame first also aggregates every other column (string columns included).
(
    df.groupby(by='author')['views']
      .sum()
      .sort_values(ascending=False)
      .head(10)
)

author
Alex Gendler        187196000
Sir Ken Robinson     95654000
Bill Gates           77800000
Simon Sinek          74800000
Brené Brown          72000000
Julian Treasure      64300000
Amy Cuddy            64000000
Tim Urban            60000000
Iseult Gillespie     54998000
Mia Nacamulli        44174000
Name: views, dtype: int64

5.1.3) Top 10 authors with the most total likes on TED

In [18]:
# Total likes per author, ten highest.
# Select the column before aggregating so only `likes` is summed; summing the
# whole frame first also aggregates every other column (string columns included).
(
    df.groupby(by='author')['likes']
      .sum()
      .sort_values(ascending=False)
      .head(10)
)

author
Alex Gendler        5691000
Sir Ken Robinson    2833600
Bill Gates          2349000
Simon Sinek         2246000
Brené Brown         2204000
Amy Cuddy           1900000
Julian Treasure     1870000
Tim Urban           1800000
Iseult Gillespie    1660900
Mia Nacamulli       1395000
Name: likes, dtype: int64

5.1.4) Top 10 authors with the highest average view-to-like ratio

In [23]:
# Mean view_to_like_ratio per author, ten highest.
# Select the column before aggregating so no other column is averaged needlessly.
(
    df.groupby(by='author')['view_to_like_ratio']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

author
David Lindell            36.40
Ioannis Papachimonas     36.18
Sandra Fisher-Martins    36.10
Srdja Popovic            36.09
Virginia Postrel         36.08
Paul Snelgrove           36.00
Philip Krinks            36.00
Nathalia Holt            36.00
Mick Mountz              36.00
Jonas Gahr Støre         35.92
Name: view_to_like_ratio, dtype: float64

##### 5.2 Timeframe | Date, Month, Year

5.2.1) Number of videos published each month

In [27]:
# Number of talks per month name (all years pooled), smallest count first.
df.groupby(by='month').size().sort_values(ascending=True)

month
January      145
August       200
May          321
December     333
September    349
July         445
June         493
April        576
March        579
October      585
November     682
February     717
dtype: int64

5.2.2) Number of videos published each year

In [29]:
# Number of talks per year, in chronological order (groupby sorts the keys).
df.groupby(by='year', sort=True).size()

year
2001      5
2002     26
2003     34
2004     33
2005     65
2006     49
2007    113
2008     84
2009    233
2010    267
2011    271
2012    302
2013    388
2014    357
2015    376
2016    399
2017    495
2018    473
2019    544
2020    501
2021    390
2022     20
dtype: int64

5.2.3) Top 10 month-and-year periods with the most videos published

In [35]:
# Count talks per "Month YYYY" value and show the ten busiest periods.
(
    df.groupby(by='date')
      .size()
      .sort_values(ascending=False)
      .head(10)
)

date
April 2018       127
April 2019       124
April 2017       123
November 2018    115
November 2017    109
October 2020      97
March 2014        96
February 2016     89
March 2015        88
June 2012         84
dtype: int64

5.2.4) Total views per month

In [38]:
# Total views per month name (all years pooled), largest first.
# Select the column before aggregating so only `views` is summed; summing the
# whole frame first also aggregates every other column (string columns included).
(
    df.groupby(by='month')['views']
      .sum()
      .sort_values(ascending=False)
)

month
February     1975366996
March        1374553896
April        1214604196
November     1180802199
June         1164034699
October       965331498
July          864042799
May           656959600
September     606392100
December      601964300
January       302598399
August        295820997
Name: views, dtype: int64

In [43]:
# Total likes per month name (all years pooled), largest first.
# Select the column before aggregating so only `likes` is summed; summing the
# whole frame first also aggregates every other column (string columns included).
(
    df.groupby(by='month')['likes']
      .sum()
      .sort_values(ascending=False)
)

month
February     59897756
March        41815175
April        36985230
November     35867373
June         35242674
October      29302963
July         26234756
May          19920849
September    18433456
December     18309597
January       9188542
August        9008649
Name: likes, dtype: int64

In [47]:
# Total likes for each (month, year) pair, largest first.
(
    df.groupby(by=['month', 'year'])['likes']
      .sum()
      .sort_values(ascending=False)
)

month      year
April      2017    10449301
February   2016     9869383
March      2015     9442066
June       2013     8515000
March      2014     8397200
                     ...   
August     2011       16200
May        2001       15000
September  2005       14000
December   2009       10000
October    2004        1600
Name: likes, Length: 192, dtype: int64

In [48]:
# Display the working DataFrame with all engineered columns
# (the notebook shows only the first and last rows, eliding the middle).
df

Unnamed: 0,title,author,date,views,likes,link,month,year,view_to_like_ratio
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,ozawa_bineshi_albert_climate_action_needs_new_...,December,2021,33.67
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,sydney_iaukea_the_dark_history_of_the_overthro...,February,2022,33.44
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,martin_reeves_how_play_can_spark_new_ideas_for...,September,2021,34.33
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,james_k_thornton_why_is_china_appointing_judge...,October,2021,35.58
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,mahendra_singhi_cement_s_carbon_problem_and_2_...,October,2021,33.33
...,...,...,...,...,...,...,...,...,...
5435,The best stats you've ever seen,Hans Rosling,February 2006,15000000,458000,hans_rosling_the_best_stats_you_ve_ever_seen,February,2006,32.75
5436,Do schools kill creativity?,Sir Ken Robinson,February 2006,72000000,2100000,sir_ken_robinson_do_schools_kill_creativity,February,2006,34.29
5437,Greening the ghetto,Majora Carter,February 2006,2900000,88000,majora_carter_greening_the_ghetto,February,2006,32.95
5438,Simplicity sells,David Pogue,February 2006,2000000,60000,david_pogue_simplicity_sells,February,2006,33.33


Finding the most popular TED talks

Month-wise Analysis of TED talk frequency

Year-wise Analysis of TED talk frequency

Finding TED talks of your favorite Author

Finding TED talks with the best view to like ratio

Finding TED talks based on tags (like climate)

Finding the most popular TED talks Speaker (in terms of number of views)

Finding the top 10 authors by their ratio of views to number of videos


In [None]:
# Summary statistics for the view_to_like_ratio column.
arr = np.array(df['view_to_like_ratio'])

print('----- Ratio Analysis -----')
print('Minimum : ', np.min(arr))
print('Maximum : ', np.max(arr))
print('Total   : ', np.sum(arr))
# The ratio is fractional (e.g. 33.67), so round to two decimals instead of
# truncating with int(), which would discard the fractional part.
print('Average : ', round(float(np.mean(arr)), 2))
print('--------------------------')