In [492]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import ast
from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor




df_ted_main = pd.read_csv("data/ted_main.csv")
df_ted_transcripts = df = pd.read_csv("data/transcripts.csv")

In [442]:
############### PREPROCESSING OF THE DATA ###############
# - removing columns that are not needed
# - one hot encoding columns
# - adding sentiment analysis of transcripts 
# - combining columns together to create a new structure of the data frame

In [443]:
# ----------------------------------------------------------
# 1. remove unwanted columns
# ----------------------------------------------------------

df_ted_main.drop(['description', 'film_date', 'main_speaker', 'name', 'num_speaker', 'related_talks', 'speaker_occupation', 'title', 'url'], axis=1, inplace=True)
df_ted_main.head()

Unnamed: 0,comments,duration,event,languages,published_date,ratings,tags,views
0,4553,1164,TED2006,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","['children', 'creativity', 'culture', 'dance',...",47227110
1,265,977,TED2006,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","['alternative energy', 'cars', 'climate change...",3200520
2,124,1286,TED2006,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","['computers', 'entertainment', 'interface desi...",1636292
3,200,1116,TED2006,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","['MacArthur grant', 'activism', 'business', 'c...",1697550
4,593,1190,TED2006,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","['Africa', 'Asia', 'Google', 'demo', 'economic...",12005869


In [444]:
# ----------------------------------------------------------
# 2. one hot encoding
# ----------------------------------------------------------

# one hot encoding tags

# iterate trough the data set
for i, row in df_ted_main.iterrows():
    # change string formed list into normal list
    element_list_form = ast.literal_eval(df_ted_main.at[i,'tags'])
    element_list_form = [i.strip() for i in element_list_form]
    # finish converting by replacing string version of a list for the list type
    df_ted_main.at[i,'tags'] = element_list_form

# one hot encoding with column named "tags"
# since "tags" is a Series on each row, we need to split them first and then we can do one hot encoding
df_ted_main = df_ted_main.drop('tags', 1).join(
    pd.get_dummies(
        pd.DataFrame(df_ted_main.tags.tolist()).stack()
    ).astype(int).sum(level=0)
)

df_ted_main.head()

Unnamed: 0,comments,duration,event,languages,published_date,ratings,views,3d printing,AI,AIDS,...,wikipedia,wind energy,women,women in business,work,work-life balance,world cultures,writing,wunderkind,youth
0,4553,1164,TED2006,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",47227110,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,265,977,TED2006,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",3200520,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,124,1286,TED2006,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",1636292,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200,1116,TED2006,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",1697550,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,593,1190,TED2006,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",12005869,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [445]:
df_ted_main['event_class'] = 'Other'

In [446]:
#Loop over every talk and assign event category based on name of event
for ii in range(len(df_ted_main)):
    if df_ted_main['event'][ii].count('TED20') >0:
        df_ted_main['event_class'][ii] = 'Yearly TED Conference'
    elif df_ted_main['event'][ii].count('TED19') >0:
        df_ted_main['event_class'][ii] = 'Yearly TED Conference'
    elif df_ted_main['event'][ii].count('TEDx') >0:
        df_ted_main['event_class'][ii] = 'TEDx'
    elif df_ted_main['event'][ii].count('TEDGlobal') >0:
        df_ted_main['event_class'][ii] = 'TEDGlobal'
    elif df_ted_main['event'][ii].count('TEDWomen') >0:
        df_ted_main['event_class'][ii] = 'TEDWomen'
    if df_ted_main['event'][ii].count('TEDSalon') >0:
        df_ted_main['event_class'][ii] = 'TEDSalon'
    if df_ted_main['event'][ii].count('TEDNYC') >0:
        df_ted_main['event_class'][ii] = 'TEDNYC'
    else:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ted_main['event_class'][ii] = 'Yearly TED Conference'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ted_main['event_class'][ii] = 'TEDGlobal'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ted_main['event_class'][ii] = 'TEDSalon'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ted_main['event_cla

In [447]:
df_ted_main['event_class'].unique()

array(['Yearly TED Conference', 'TEDGlobal', 'TEDSalon', 'Other', 'TEDx',
       'TEDWomen', 'TEDNYC'], dtype=object)

In [448]:
del df_ted_main['event']

In [449]:
# one hot encoding "event" column

def one_hot_encode(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res)

df_ted_main = one_hot_encode(df_ted_main, 'event_class')
df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,ratings,views,3d printing,AI,AIDS,Addiction,...,writing,wunderkind,youth,event_class_Other,event_class_TEDGlobal,event_class_TEDNYC,event_class_TEDSalon,event_class_TEDWomen,event_class_TEDx,event_class_Yearly TED Conference
0,4553,1164,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",47227110,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,265,977,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",3200520,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,124,1286,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",1636292,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,200,1116,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",1697550,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,593,1190,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",12005869,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [450]:
# filtered_df = df_ted_main[['comments','duration','languages','event_class_TEDGlobal','views','event_class_TEDSalon','event_class_TEDWomen','event_class_Yearly TED Conference']]

# pca = PCA(n_components=2)
# principalComponents = pca.fit_transform(filtered_df)


# print (pca.explained_variance_)
# print (pca.explained_variance_ratio_)
# print (pca.explained_variance_ratio_.cumsum())

# feat_cols = ['feature'+str(i) for i in range(principalComponents.shape[1])]
# normalised_data = pd.DataFrame(principalComponents,columns=feat_cols)
# normalised_data.tail()



In [451]:
#rating analysis

df_ted_main['ratings']=df_ted_main['ratings'].str.replace("'",'"')

df_ted_main=df_ted_main.merge(df_ted_main.ratings.apply(lambda x: pd.Series(pd.read_json(x)['count'].values,index=pd.read_json(x)['name'])), 
            left_index=True, right_index=True)

df_ted_main.drop('ratings', axis=1, inplace=True)

df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Longwinded,Confusing,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,387,242,7346,10581,300,10704,4439,1174,209,24924
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,113,62,443,132,258,268,116,203,131,413
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,78,27,395,166,104,230,54,146,142,230
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,53,32,380,132,36,460,230,85,35,1070
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,110,72,5433,4606,67,2542,3736,248,61,2893


In [452]:
# sentiment analysis of transcript
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

df_ted_transcripts['polarity'] = df_ted_transcripts['transcript'].apply(pol)
df_ted_transcripts['subjectivity'] = df_ted_transcripts['transcript'].apply(sub)
df_ted_transcripts

Unnamed: 0,transcript,url,polarity,subjectivity
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,0.146452,0.462051
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,0.157775,0.424101
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,0.136579,0.475229
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,0.082928,0.439165
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,0.096483,0.450631
...,...,...,...,...
2462,"So, Ma was trying to explain something to me a...",https://www.ted.com/talks/duarte_geraldino_wha...,0.038878,0.352668
2463,This is a picture of a sunset on Mars taken by...,https://www.ted.com/talks/armando_azua_bustos_...,0.127087,0.538377
2464,"In my early days as a graduate student, I went...",https://www.ted.com/talks/radhika_nagpal_what_...,0.193538,0.526466
2465,I took a cell phone and accidentally made myse...,https://www.ted.com/talks/theo_e_j_wilson_a_bl...,0.100929,0.498050


In [453]:
# merging both datasets and delete the rows that don't have a transcipt
result = pd.merge(df_ted_main, df_ted_transcripts, left_index=True, right_index=True)

In [454]:
result

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,transcript,url,polarity,subjectivity
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,300,10704,4439,1174,209,24924,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,0.146452,0.462051
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,258,268,116,203,131,413,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,0.157775,0.424101
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,104,230,54,146,142,230,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,0.136579,0.475229
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,36,460,230,85,35,1070,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,0.082928,0.439165
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,67,2542,3736,248,61,2893,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,0.096483,0.450631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2462,35,920,20,1496156399,1079698,0,1,0,0,0,...,9,103,11,26,4,258,"So, Ma was trying to explain something to me a...",https://www.ted.com/talks/duarte_geraldino_wha...,0.038878,0.352668
2463,34,452,17,1496242092,957803,0,0,0,0,0,...,11,74,21,30,4,185,This is a picture of a sunset on Mars taken by...,https://www.ted.com/talks/armando_azua_bustos_...,0.127087,0.538377
2464,18,1230,14,1496327443,874817,0,0,0,0,1,...,0,55,10,22,3,196,"In my early days as a graduate student, I went...",https://www.ted.com/talks/radhika_nagpal_what_...,0.193538,0.526466
2465,10,885,8,1496400857,946815,0,0,0,0,0,...,0,9,24,0,9,63,I took a cell phone and accidentally made myse...,https://www.ted.com/talks/theo_e_j_wilson_a_bl...,0.100929,0.498050


In [455]:
# deleting the transcript and url columns because we don't need them.
del result['transcript']
del result['url']

In [456]:
for c in result.columns:
    print(c)

comments
duration
languages
published_date
views
3d printing
AI
AIDS
Addiction
Africa
Alzheimer's
Anthropocene
Asia
Autism spectrum disorder
Bioethics
Blindness
Brand
Brazil
Buddhism
CRISPR
Christianity
Criminal Justice
DNA
Debate
Egypt
Europe
Foreign Policy
Gender equality
Gender spectrum
God
Google
Guns
HIV
Human body
Internet
Iran
Islam
LGBT
MacArthur grant
Mars
Middle East
Moon
NASA
Natural resources
New York
Nobel prize
PTSD
Planets
Senses
Slavery
South America
String theory
Surgery
Surveillance
Syria
TED Books
TED Brain Trust
TED Fellows
TED Prize
TED Residency
TED en Español
TED-Ed
TEDMED
TEDNYC
TEDYouth
TEDx
Transgender
United States
Vaccines
activism
adventure
advertising
aging
agriculture
aircraft
algorithm
alternative energy
ancient world
animals
animation
anthropology
ants
apes
archaeology
architecture
art
asteroid
astrobiology
astronomy
atheism
augmented reality
bacteria
beauty
bees
behavioral economics
big bang
big problems
biodiversity
biology
biomechanics
biomimicry
bio

In [457]:
result

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,7346,10581,300,10704,4439,1174,209,24924,0.146452,0.462051
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,443,132,258,268,116,203,131,413,0.157775,0.424101
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,395,166,104,230,54,146,142,230,0.136579,0.475229
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,380,132,36,460,230,85,35,1070,0.082928,0.439165
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,5433,4606,67,2542,3736,248,61,2893,0.096483,0.450631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2462,35,920,20,1496156399,1079698,0,1,0,0,0,...,88,120,9,103,11,26,4,258,0.038878,0.352668
2463,34,452,17,1496242092,957803,0,0,0,0,0,...,50,22,11,74,21,30,4,185,0.127087,0.538377
2464,18,1230,14,1496327443,874817,0,0,0,0,1,...,49,38,0,55,10,22,3,196,0.193538,0.526466
2465,10,885,8,1496400857,946815,0,0,0,0,0,...,13,43,0,9,24,0,9,63,0.100929,0.498050


In [458]:
scaler = MinMaxScaler() 
views = result['views']
result = result.loc[:, result.columns != 'views']
scaled_values = scaler.fit_transform(result) 
result.loc[:,:] = scaled_values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [459]:
result

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity
0,0.710736,0.200937,0.833333,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.750588,0.732401,0.136737,1.000000,0.301399,0.875466,0.153564,1.000000,0.424460,0.594066
1,0.040631,0.164421,0.597222,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045264,0.009137,0.117593,0.025037,0.007876,0.151380,0.096253,0.016452,0.439007,0.545273
2,0.018597,0.224761,0.361111,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.040360,0.011490,0.047402,0.021487,0.003666,0.108874,0.104335,0.009109,0.411776,0.611009
3,0.030474,0.191564,0.486111,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038827,0.009137,0.016408,0.042975,0.015617,0.063386,0.025716,0.042815,0.342850,0.564641
4,0.091889,0.206014,0.666667,0.000213,0.0,0.0,0.0,0.0,1.0,0.0,...,0.555124,0.318821,0.030538,0.237481,0.253666,0.184937,0.044820,0.115966,0.360264,0.579382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2462,0.004688,0.153290,0.277778,0.999242,0.0,1.0,0.0,0.0,0.0,0.0,...,0.008992,0.008306,0.004102,0.009623,0.000747,0.019389,0.002939,0.010232,0.286259,0.453431
2463,0.004532,0.061902,0.236111,0.999490,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005109,0.001523,0.005014,0.006913,0.001426,0.022371,0.002939,0.007303,0.399582,0.692199
2464,0.002032,0.213825,0.194444,0.999737,0.0,0.0,0.0,0.0,1.0,0.0,...,0.005007,0.002630,0.000000,0.005138,0.000679,0.016406,0.002204,0.007744,0.484953,0.676884
2465,0.000781,0.146456,0.111111,0.999950,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001328,0.002976,0.000000,0.000841,0.001630,0.000000,0.006613,0.002408,0.365977,0.640350


In [460]:
views

0       47227110
1        3200520
2        1636292
3        1697550
4       12005869
          ...   
2462     1079698
2463      957803
2464      874817
2465      946815
2466     1323235
Name: views, Length: 2467, dtype: int64

In [461]:
result['views'] = views

In [462]:
result

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
0,0.710736,0.200937,0.833333,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.732401,0.136737,1.000000,0.301399,0.875466,0.153564,1.000000,0.424460,0.594066,47227110
1,0.040631,0.164421,0.597222,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009137,0.117593,0.025037,0.007876,0.151380,0.096253,0.016452,0.439007,0.545273,3200520
2,0.018597,0.224761,0.361111,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011490,0.047402,0.021487,0.003666,0.108874,0.104335,0.009109,0.411776,0.611009,1636292
3,0.030474,0.191564,0.486111,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009137,0.016408,0.042975,0.015617,0.063386,0.025716,0.042815,0.342850,0.564641,1697550
4,0.091889,0.206014,0.666667,0.000213,0.0,0.0,0.0,0.0,1.0,0.0,...,0.318821,0.030538,0.237481,0.253666,0.184937,0.044820,0.115966,0.360264,0.579382,12005869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2462,0.004688,0.153290,0.277778,0.999242,0.0,1.0,0.0,0.0,0.0,0.0,...,0.008306,0.004102,0.009623,0.000747,0.019389,0.002939,0.010232,0.286259,0.453431,1079698
2463,0.004532,0.061902,0.236111,0.999490,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001523,0.005014,0.006913,0.001426,0.022371,0.002939,0.007303,0.399582,0.692199,957803
2464,0.002032,0.213825,0.194444,0.999737,0.0,0.0,0.0,0.0,1.0,0.0,...,0.002630,0.000000,0.005138,0.000679,0.016406,0.002204,0.007744,0.484953,0.676884,874817
2465,0.000781,0.146456,0.111111,0.999950,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002976,0.000000,0.000841,0.001630,0.000000,0.006613,0.002408,0.365977,0.640350,946815


In [463]:
#60% - train set, 20% - validation set, 20% - test set
train, validate, test = np.split(result.sample(frac=1), [int(.6*len(result)), int(.8*len(result))])

In [464]:
train

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
1857,0.010783,0.088069,0.458333,0.764633,0.0,0.0,0.0,0.0,0.0,0.0,...,0.010521,0.020966,0.018685,0.000543,0.162565,0.005143,0.031299,0.343511,0.629229,2230481
1132,0.020472,0.186878,0.361111,0.512899,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034609,0.003191,0.019806,0.006518,0.033557,0.006613,0.022511,0.380749,0.619161,940841
1081,0.013440,0.114626,0.347222,0.497710,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002907,0.014129,0.010463,0.001290,0.047726,0.005878,0.006661,0.508807,0.691767,276132
391,0.025004,0.059559,0.486111,0.243646,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006922,0.006837,0.036435,0.030350,0.040268,0.011756,0.006139,0.518362,0.623865,1097511
1342,0.058603,0.131615,0.472222,0.572063,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014605,0.006837,0.077635,0.032930,0.034303,0.003674,0.004895,0.452132,0.603079,2228138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,0.013596,0.211482,0.402778,0.816952,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021665,0.009572,0.006353,0.003870,0.031320,0.000000,0.005818,0.373229,0.654997,1617977
1505,0.014377,0.064636,0.319444,0.629652,0.0,0.0,0.0,0.0,0.0,0.0,...,0.010383,0.002279,0.001214,0.003599,0.024609,0.002939,0.002608,0.461612,0.689562,1304597
1823,0.012815,0.079867,0.375000,0.752603,0.0,0.0,0.0,0.0,0.0,0.0,...,0.022150,0.013218,0.011771,0.003191,0.115585,0.019104,0.019100,0.325899,0.475271,2526255
1680,0.031724,0.099004,0.430556,0.698523,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025403,0.067457,0.023356,0.001765,0.215511,0.027921,0.025802,0.477356,0.594878,3840850


In [465]:
validate

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
901,0.044382,0.023238,0.597222,0.435342,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025403,0.024157,0.018965,0.023832,0.097688,0.011021,0.015168,0.342945,0.695218,2442977
327,0.006407,0.185901,0.375000,0.214641,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001869,0.005014,0.003270,0.000543,0.019389,0.008817,0.005818,0.431652,0.631521,304800
1671,0.011095,0.104472,0.430556,0.695770,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002838,0.005014,0.008221,0.000951,0.022371,0.006613,0.009510,0.315188,0.581657,1034678
576,0.017190,0.084944,0.402778,0.317918,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018343,0.020055,0.009436,0.001154,0.114840,0.011756,0.006019,0.507125,0.661621,819664
956,0.036881,0.193907,0.388889,0.453129,0.0,0.0,0.0,0.0,0.0,0.0,...,0.023257,0.025068,0.024010,0.003055,0.045488,0.017634,0.010192,0.361558,0.627514,1071132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2138,0.017034,0.144698,0.277778,0.880815,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003599,0.036919,0.008688,0.001697,0.029828,0.034533,0.003250,0.477840,0.562051,1008812
846,0.027192,0.100762,0.402778,0.415828,0.0,0.0,0.0,0.0,0.0,0.0,...,0.016474,0.015497,0.001962,0.003599,0.066368,0.014695,0.006380,0.376247,0.484177,1301834
404,0.027192,0.165593,0.388889,0.247902,0.0,0.0,0.0,0.0,0.0,0.0,...,0.015020,0.025068,0.027186,0.005975,0.120060,0.022777,0.018218,0.316340,0.532522,1352069
1450,0.220034,0.099590,0.472222,0.612138,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021665,0.002279,0.066984,0.014598,0.021626,0.007348,0.094017,0.361377,0.534283,3005687


In [466]:
test

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
768,0.018753,0.183363,0.388889,0.386205,0.0,0.0,0.0,0.0,0.0,0.0,...,0.015920,0.014585,0.011584,0.002920,0.035048,0.007348,0.003852,0.438354,0.565035,568761
1569,0.051258,0.154267,0.375000,0.652692,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018620,0.007748,0.002242,0.012765,0.014169,0.014695,0.002688,0.466186,0.607051,1044961
906,0.023285,0.201718,0.430556,0.436848,0.0,0.0,0.0,0.0,0.0,0.0,...,0.019312,0.022789,0.018124,0.003599,0.067860,0.027186,0.012399,0.381143,0.537028,2049103
1771,0.010783,0.020894,0.472222,0.730066,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004153,0.016864,0.000654,0.000339,0.102163,0.026451,0.003090,0.274947,0.517391,1545239
2441,0.038287,0.452060,0.291667,0.991823,0.0,0.0,0.0,0.0,0.0,0.0,...,0.089707,0.014585,0.006072,0.023085,0.030574,0.008817,0.051563,0.575818,0.652228,5666038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,0.015471,0.168912,0.000000,0.326855,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009483,0.014585,0.001121,0.012357,0.035794,0.014695,0.004695,0.276723,0.603364,556163
2151,0.009845,0.145675,0.430556,0.885575,0.0,0.0,0.0,0.0,0.0,0.0,...,0.016682,0.005014,0.011491,0.001222,0.031320,0.005878,0.004093,0.376975,0.585841,1587090
2321,0.018128,0.147042,0.319444,0.948173,0.0,0.0,0.0,0.0,0.0,0.0,...,0.007476,0.030538,0.022702,0.005228,0.023117,0.012491,0.026885,0.422481,0.571080,1177013
1208,0.019378,0.109939,0.430556,0.531754,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004845,0.037830,0.007380,0.000407,0.052946,0.015430,0.007865,0.446239,0.589231,1197677


In [467]:
result.columns

Index(['comments', 'duration', 'languages', 'published_date', '3d printing',
       'AI', 'AIDS', 'Addiction', 'Africa', 'Alzheimer's',
       ...
       'Fascinating', 'Unconvincing', 'Persuasive', 'Jaw-dropping', 'OK',
       'Obnoxious', 'Inspiring', 'polarity', 'subjectivity', 'views'],
      dtype='object', length=444)

In [468]:
for c in result.columns:
    print(c)

comments
duration
languages
published_date
3d printing
AI
AIDS
Addiction
Africa
Alzheimer's
Anthropocene
Asia
Autism spectrum disorder
Bioethics
Blindness
Brand
Brazil
Buddhism
CRISPR
Christianity
Criminal Justice
DNA
Debate
Egypt
Europe
Foreign Policy
Gender equality
Gender spectrum
God
Google
Guns
HIV
Human body
Internet
Iran
Islam
LGBT
MacArthur grant
Mars
Middle East
Moon
NASA
Natural resources
New York
Nobel prize
PTSD
Planets
Senses
Slavery
South America
String theory
Surgery
Surveillance
Syria
TED Books
TED Brain Trust
TED Fellows
TED Prize
TED Residency
TED en Español
TED-Ed
TEDMED
TEDNYC
TEDYouth
TEDx
Transgender
United States
Vaccines
activism
adventure
advertising
aging
agriculture
aircraft
algorithm
alternative energy
ancient world
animals
animation
anthropology
ants
apes
archaeology
architecture
art
asteroid
astrobiology
astronomy
atheism
augmented reality
bacteria
beauty
bees
behavioral economics
big bang
big problems
biodiversity
biology
biomechanics
biomimicry
biosphere

In [469]:
tags_result = result[result.columns.difference(['views','subjectivity','polarity','Inspiring','Obnoxious','OK',
                                  'Jaw-dropping','Persuasive','Unconvincing','Fascinating','Informative','Confusing',
                                  'Longwinded','Courageous','Ingenious','Beautiful','Funny','event_class_Yearly TED Conference',
                                  'event_class_TEDx','event_class_TEDWomen','event_class_TEDSalon','event_class_TEDNYC',
                                 'event_class_TEDNYC','event_class_TEDGlobal','event_class_Other','comments','duration',
                                  'languages','published_date'])]

other_than_tags = result[['views','subjectivity','polarity','Inspiring','Obnoxious','OK',
                                  'Jaw-dropping','Persuasive','Unconvincing','Fascinating','Informative','Confusing',
                                  'Longwinded','Courageous','Ingenious','Beautiful','Funny','event_class_Yearly TED Conference',
                                  'event_class_TEDx','event_class_TEDWomen','event_class_TEDSalon','event_class_TEDNYC',
                                 'event_class_TEDNYC','event_class_TEDGlobal','event_class_Other','comments','duration',
                                  'languages','published_date']]


tags_result.head(10)
print(len(tags_result.columns))



416


In [470]:
pca = PCA(n_components=90)
principalComponents = pca.fit_transform(tags_result)


# print (pca.explained_variance_)
# print (pca.explained_variance_ratio_)
print (pca.explained_variance_ratio_.cumsum())

feat_cols = ['feature'+str(i) for i in range(principalComponents.shape[1])]
normalised_data_tags = pd.DataFrame(principalComponents,columns=feat_cols)
normalised_data_tags.tail()

[0.04941927 0.0864253  0.11892303 0.14657868 0.16996053 0.1915126
 0.21287596 0.23392454 0.25381705 0.27150194 0.28681082 0.30040861
 0.31347148 0.32604259 0.33787567 0.34866472 0.35927266 0.36942054
 0.37918666 0.38857986 0.39758052 0.40626188 0.41478881 0.42306389
 0.43101307 0.4388885  0.44670051 0.45428082 0.46162648 0.46882604
 0.47563144 0.48234699 0.48880128 0.49519303 0.50152683 0.50781881
 0.51404346 0.52010231 0.52611389 0.53191079 0.5376323  0.54312887
 0.54860237 0.55395517 0.55920709 0.56432669 0.56936201 0.57418328
 0.57898525 0.58372442 0.58833913 0.59291825 0.59744133 0.6017618
 0.60605623 0.61028004 0.61443458 0.61854363 0.62263962 0.62669095
 0.6306634  0.63457757 0.63841332 0.64220463 0.64589889 0.64953958
 0.65312004 0.65666764 0.66016896 0.66360369 0.6669992  0.67034151
 0.67365629 0.67690739 0.68012537 0.68331148 0.68639018 0.68942337
 0.69243649 0.69537736 0.69829361 0.70118783 0.70402754 0.70684652
 0.70963679 0.71238846 0.71510265 0.71777741 0.72035006 0.722894

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature80,feature81,feature82,feature83,feature84,feature85,feature86,feature87,feature88,feature89
2462,0.708198,-0.42118,0.980707,0.379221,-0.223008,0.384034,-0.325996,-0.295267,0.079615,0.125157,...,0.154324,-0.105653,-0.028975,0.145845,-0.251759,0.004124,-0.022317,0.128487,-0.240444,0.024994
2463,-0.476938,0.654497,0.594643,1.109452,0.080289,0.505985,-0.144903,0.040767,0.045402,-0.323904,...,-0.088202,0.230342,-0.338996,0.001301,-0.094064,0.186425,-0.182981,0.0837,-0.187949,0.092067
2464,0.478794,0.768709,1.130858,0.079673,-0.913137,-0.016905,1.214056,-0.142552,-0.60854,0.44147,...,0.266922,0.094141,-0.177737,-0.040821,0.265956,-0.037515,-0.254343,-0.055688,0.094262,-0.011328
2465,-0.331273,-0.112308,-0.425093,0.212219,0.151791,0.1555,0.218054,-0.452961,0.153857,0.266228,...,-0.001875,0.004935,0.105817,-0.092882,-0.121378,0.045468,0.005001,0.133921,-0.028162,0.022499
2466,-0.167881,-0.462061,-0.144781,0.210303,0.255199,-0.27411,0.250321,0.362215,-0.04925,-0.403997,...,0.037269,-0.096484,0.025187,-0.019905,-0.023869,0.035538,0.03068,0.092917,0.061391,0.07714


In [474]:
# result = other_than_tags.join(normalised_data_tags)

# views = result['views']
# result = result.loc[:, result.columns != 'views']

pca = PCA(n_components=43)
principalComponents = pca.fit_transform(result)


# print (pca.explained_variance_)
# print (pca.explained_variance_ratio_)
print (pca.explained_variance_ratio_.cumsum())

feat_cols = ['feature'+str(i) for i in range(principalComponents.shape[1])]
normalised_data = pd.DataFrame(principalComponents,columns=feat_cols)
result.tail()

[0.06727693 0.12568633 0.17272412 0.21341415 0.25276734 0.28853163
 0.31623532 0.34360502 0.36869606 0.39333101 0.41631532 0.43723955
 0.455278   0.47196968 0.48756278 0.50256325 0.51639861 0.52919217
 0.54156531 0.55378348 0.56558608 0.57697721 0.58764746 0.59799078
 0.60807443 0.61787172 0.62739172 0.63678111 0.64598866 0.65490219
 0.66366074 0.67223972 0.68053624 0.68848428 0.69621121 0.70369578
 0.71101298 0.71830111 0.72541716 0.73244817 0.73927653 0.74597141
 0.75245604]


Unnamed: 0,subjectivity,polarity,Inspiring,Obnoxious,OK,Jaw-dropping,Persuasive,Unconvincing,Fascinating,Informative,...,feature80,feature81,feature82,feature83,feature84,feature85,feature86,feature87,feature88,feature89
2462,0.453431,0.286259,0.010232,0.002939,0.019389,0.000747,0.009623,0.004102,0.008306,0.008992,...,0.154324,-0.105653,-0.028975,0.145845,-0.251759,0.004124,-0.022317,0.128487,-0.240444,0.024994
2463,0.692199,0.399582,0.007303,0.002939,0.022371,0.001426,0.006913,0.005014,0.001523,0.005109,...,-0.088202,0.230342,-0.338996,0.001301,-0.094064,0.186425,-0.182981,0.0837,-0.187949,0.092067
2464,0.676884,0.484953,0.007744,0.002204,0.016406,0.000679,0.005138,0.0,0.00263,0.005007,...,0.266922,0.094141,-0.177737,-0.040821,0.265956,-0.037515,-0.254343,-0.055688,0.094262,-0.011328
2465,0.64035,0.365977,0.002408,0.006613,0.0,0.00163,0.000841,0.0,0.002976,0.001328,...,-0.001875,0.004935,0.105817,-0.092882,-0.121378,0.045468,0.005001,0.133921,-0.028162,0.022499
2466,0.52496,0.323856,0.006862,0.002204,0.024609,0.000339,0.001121,0.005925,0.003669,0.005109,...,0.037269,-0.096484,0.025187,-0.019905,-0.023869,0.035538,0.03068,0.092917,0.061391,0.07714


In [481]:
# Labels are the values we want to predict
labels =views

In [482]:
# Saving result names for later use
result_list = list(result.columns)

In [483]:
# Convert to numpy array
result = np.array(result)

In [484]:
# Split the data into training and testing sets
train_set, test_set, train_labels, test_labels = train_test_split(result, labels, test_size = 0.25, random_state = 42)

In [487]:
print('Training result Shape:', train_set.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing result Shape:', test_set.shape)
print('Testing Labels Shape:', test_labels.shape)

Training result Shape: (1850, 118)
Training Labels Shape: (1850,)
Testing result Shape: (617, 118)
Testing Labels Shape: (617,)


In [489]:
scaler_stand = MinMaxScaler() 
data_scaled_stand = scaler_stand.fit_transform(train_set)

In [490]:
scaler_stand = MinMaxScaler() 
data_scaled_stand = scaler_stand.fit_transform(test_set)

In [493]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_set, train_labels);

In [494]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_set)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 495074.72 degrees.


In [495]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 65.42 %.
