In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import ast
from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor




df_ted_main = pd.read_csv("data/ted_main.csv")
df_ted_transcripts = df = pd.read_csv("data/transcripts.csv")

In [4]:
############### PREPROCESSING OF THE DATA ###############
# - removing columns that are not needed
# - one hot encoding columns
# - adding sentiment analysis of transcripts 
# - combining columns together to create a new structure of the data frame

In [5]:
# ----------------------------------------------------------
# 1. remove unwanted columns
# ----------------------------------------------------------

df_ted_main.drop(['description', 'film_date', 'main_speaker', 'name', 'num_speaker', 'related_talks', 'speaker_occupation', 'title', 'url'], axis=1, inplace=True)
df_ted_main.head()

Unnamed: 0,comments,duration,event,languages,published_date,ratings,tags,views
0,4553,1164,TED2006,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","['children', 'creativity', 'culture', 'dance',...",47227110
1,265,977,TED2006,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","['alternative energy', 'cars', 'climate change...",3200520
2,124,1286,TED2006,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","['computers', 'entertainment', 'interface desi...",1636292
3,200,1116,TED2006,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","['MacArthur grant', 'activism', 'business', 'c...",1697550
4,593,1190,TED2006,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","['Africa', 'Asia', 'Google', 'demo', 'economic...",12005869


In [6]:
# ----------------------------------------------------------
# 2. one hot encoding
# ----------------------------------------------------------

# one hot encoding tags

# iterate trough the data set
for i, row in df_ted_main.iterrows():
    # change string formed list into normal list
    element_list_form = ast.literal_eval(df_ted_main.at[i,'tags'])
    element_list_form = [i.strip() for i in element_list_form]
    # finish converting by replacing string version of a list for the list type
    df_ted_main.at[i,'tags'] = element_list_form

# one hot encoding with column named "tags"
# since "tags" is a Series on each row, we need to split them first and then we can do one hot encoding
df_ted_main = df_ted_main.drop('tags', 1).join(
    pd.get_dummies(
        pd.DataFrame(df_ted_main.tags.tolist()).stack()
    ).astype(int).sum(level=0)
)

df_ted_main.head()

Unnamed: 0,comments,duration,event,languages,published_date,ratings,views,3d printing,AI,AIDS,...,wikipedia,wind energy,women,women in business,work,work-life balance,world cultures,writing,wunderkind,youth
0,4553,1164,TED2006,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",47227110,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,265,977,TED2006,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",3200520,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,124,1286,TED2006,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",1636292,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200,1116,TED2006,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",1697550,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,593,1190,TED2006,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",12005869,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df_ted_main['event_class'] = 'Other'

In [8]:
#Loop over every talk and assign event category based on name of event
for ii in range(len(df_ted_main)):
    if df_ted_main['event'][ii].count('TED20') >0:
        df_ted_main['event_class'][ii] = 'Yearly TED Conference'
    elif df_ted_main['event'][ii].count('TED19') >0:
        df_ted_main['event_class'][ii] = 'Yearly TED Conference'
    elif df_ted_main['event'][ii].count('TEDx') >0:
        df_ted_main['event_class'][ii] = 'TEDx'
    elif df_ted_main['event'][ii].count('TEDGlobal') >0:
        df_ted_main['event_class'][ii] = 'TEDGlobal'
    elif df_ted_main['event'][ii].count('TEDWomen') >0:
        df_ted_main['event_class'][ii] = 'TEDWomen'
    if df_ted_main['event'][ii].count('TEDSalon') >0:
        df_ted_main['event_class'][ii] = 'TEDSalon'
    if df_ted_main['event'][ii].count('TEDNYC') >0:
        df_ted_main['event_class'][ii] = 'TEDNYC'
    else:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

In [9]:
df_ted_main['event_class'].unique()

array(['Yearly TED Conference', 'TEDGlobal', 'TEDSalon', 'Other', 'TEDx',
       'TEDWomen', 'TEDNYC'], dtype=object)

In [10]:
del df_ted_main['event']

In [11]:
# one hot encoding "event" column

def one_hot_encode(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res)

df_ted_main = one_hot_encode(df_ted_main, 'event_class')
df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,ratings,views,3d printing,AI,AIDS,Addiction,...,writing,wunderkind,youth,event_class_Other,event_class_TEDGlobal,event_class_TEDNYC,event_class_TEDSalon,event_class_TEDWomen,event_class_TEDx,event_class_Yearly TED Conference
0,4553,1164,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",47227110,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,265,977,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",3200520,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,124,1286,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",1636292,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,200,1116,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",1697550,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,593,1190,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",12005869,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# filtered_df = df_ted_main[['comments','duration','languages','event_class_TEDGlobal','views','event_class_TEDSalon','event_class_TEDWomen','event_class_Yearly TED Conference']]

# pca = PCA(n_components=2)
# principalComponents = pca.fit_transform(filtered_df)


# print (pca.explained_variance_)
# print (pca.explained_variance_ratio_)
# print (pca.explained_variance_ratio_.cumsum())

# feat_cols = ['feature'+str(i) for i in range(principalComponents.shape[1])]
# normalised_data = pd.DataFrame(principalComponents,columns=feat_cols)
# normalised_data.tail()



In [13]:
#rating analysis

df_ted_main['ratings']=df_ted_main['ratings'].str.replace("'",'"')

df_ted_main=df_ted_main.merge(df_ted_main.ratings.apply(lambda x: pd.Series(pd.read_json(x)['count'].values,index=pd.read_json(x)['name'])), 
            left_index=True, right_index=True)

df_ted_main.drop('ratings', axis=1, inplace=True)

df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Longwinded,Confusing,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,387,242,7346,10581,300,10704,4439,1174,209,24924
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,113,62,443,132,258,268,116,203,131,413
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,78,27,395,166,104,230,54,146,142,230
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,53,32,380,132,36,460,230,85,35,1070
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,110,72,5433,4606,67,2542,3736,248,61,2893


In [14]:
# sentiment analysis of transcript
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

df_ted_transcripts['polarity'] = df_ted_transcripts['transcript'].apply(pol)
df_ted_transcripts['subjectivity'] = df_ted_transcripts['transcript'].apply(sub)
df_ted_transcripts

Unnamed: 0,transcript,url,polarity,subjectivity
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,0.146452,0.462051
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,0.157775,0.424101
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,0.136579,0.475229
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,0.082928,0.439165
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,0.096483,0.450631
...,...,...,...,...
2462,"So, Ma was trying to explain something to me a...",https://www.ted.com/talks/duarte_geraldino_wha...,0.038878,0.352668
2463,This is a picture of a sunset on Mars taken by...,https://www.ted.com/talks/armando_azua_bustos_...,0.127087,0.538377
2464,"In my early days as a graduate student, I went...",https://www.ted.com/talks/radhika_nagpal_what_...,0.193538,0.526466
2465,I took a cell phone and accidentally made myse...,https://www.ted.com/talks/theo_e_j_wilson_a_bl...,0.100929,0.498050


In [15]:
# merging both datasets and delete the rows that don't have a transcipt
result = pd.merge(df_ted_main, df_ted_transcripts, left_index=True, right_index=True)

In [16]:
result

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,transcript,url,polarity,subjectivity
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,300,10704,4439,1174,209,24924,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,0.146452,0.462051
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,258,268,116,203,131,413,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,0.157775,0.424101
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,104,230,54,146,142,230,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,0.136579,0.475229
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,36,460,230,85,35,1070,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,0.082928,0.439165
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,67,2542,3736,248,61,2893,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,0.096483,0.450631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2462,35,920,20,1496156399,1079698,0,1,0,0,0,...,9,103,11,26,4,258,"So, Ma was trying to explain something to me a...",https://www.ted.com/talks/duarte_geraldino_wha...,0.038878,0.352668
2463,34,452,17,1496242092,957803,0,0,0,0,0,...,11,74,21,30,4,185,This is a picture of a sunset on Mars taken by...,https://www.ted.com/talks/armando_azua_bustos_...,0.127087,0.538377
2464,18,1230,14,1496327443,874817,0,0,0,0,1,...,0,55,10,22,3,196,"In my early days as a graduate student, I went...",https://www.ted.com/talks/radhika_nagpal_what_...,0.193538,0.526466
2465,10,885,8,1496400857,946815,0,0,0,0,0,...,0,9,24,0,9,63,I took a cell phone and accidentally made myse...,https://www.ted.com/talks/theo_e_j_wilson_a_bl...,0.100929,0.498050


In [17]:
# deleting the transcript and url columns because we don't need them.
del result['transcript']
del result['url']

In [18]:
for c in result.columns:
    print(c)

comments
duration
languages
published_date
views
3d printing
AI
AIDS
Addiction
Africa
Alzheimer's
Anthropocene
Asia
Autism spectrum disorder
Bioethics
Blindness
Brand
Brazil
Buddhism
CRISPR
Christianity
Criminal Justice
DNA
Debate
Egypt
Europe
Foreign Policy
Gender equality
Gender spectrum
God
Google
Guns
HIV
Human body
Internet
Iran
Islam
LGBT
MacArthur grant
Mars
Middle East
Moon
NASA
Natural resources
New York
Nobel prize
PTSD
Planets
Senses
Slavery
South America
String theory
Surgery
Surveillance
Syria
TED Books
TED Brain Trust
TED Fellows
TED Prize
TED Residency
TED en Español
TED-Ed
TEDMED
TEDNYC
TEDYouth
TEDx
Transgender
United States
Vaccines
activism
adventure
advertising
aging
agriculture
aircraft
algorithm
alternative energy
ancient world
animals
animation
anthropology
ants
apes
archaeology
architecture
art
asteroid
astrobiology
astronomy
atheism
augmented reality
bacteria
beauty
bees
behavioral economics
big bang
big problems
biodiversity
biology
biomechanics
biomimicry
bio

In [19]:
result

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,7346,10581,300,10704,4439,1174,209,24924,0.146452,0.462051
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,443,132,258,268,116,203,131,413,0.157775,0.424101
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,395,166,104,230,54,146,142,230,0.136579,0.475229
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,380,132,36,460,230,85,35,1070,0.082928,0.439165
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,5433,4606,67,2542,3736,248,61,2893,0.096483,0.450631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2462,35,920,20,1496156399,1079698,0,1,0,0,0,...,88,120,9,103,11,26,4,258,0.038878,0.352668
2463,34,452,17,1496242092,957803,0,0,0,0,0,...,50,22,11,74,21,30,4,185,0.127087,0.538377
2464,18,1230,14,1496327443,874817,0,0,0,0,1,...,49,38,0,55,10,22,3,196,0.193538,0.526466
2465,10,885,8,1496400857,946815,0,0,0,0,0,...,13,43,0,9,24,0,9,63,0.100929,0.498050


In [51]:
scaler = MinMaxScaler() 
views = result['views']
result = result.loc[:, result.columns != 'views']
scaled_values = scaler.fit_transform(result) 
result.loc[:,:] = scaled_values
scaled_values

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [50]:
result

array([[7.10736053e-01, 2.00937317e-01, 8.33333333e-01, ...,
        4.24459857e-01, 5.94065902e-01, 4.72271100e+07],
       [4.06313486e-02, 1.64421012e-01, 5.97222222e-01, ...,
        4.39006705e-01, 5.45273071e-01, 3.20052000e+06],
       [1.85966557e-02, 2.24760789e-01, 3.61111111e-01, ...,
        4.11776127e-01, 6.11009128e-01, 1.63629200e+06],
       ...,
       [2.03156743e-03, 2.13825425e-01, 1.94444444e-01, ...,
        4.84952547e-01, 6.76884358e-01, 8.74817000e+05],
       [7.81372089e-04, 1.46455770e-01, 1.11111111e-01, ...,
        3.65976883e-01, 6.40350309e-01, 9.46815000e+05],
       [2.34411627e-03, 1.19312634e-01, 2.22222222e-01, ...,
        3.23855907e-01, 5.24959834e-01, 1.32323500e+06]])

In [22]:
views

0       47227110
1        3200520
2        1636292
3        1697550
4       12005869
          ...   
2462     1079698
2463      957803
2464      874817
2465      946815
2466     1323235
Name: views, Length: 2467, dtype: int64

In [23]:
result['views'] = views

In [24]:
result

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
0,0.710736,0.200937,0.833333,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.732401,0.136737,1.000000,0.301399,0.875466,0.153564,1.000000,0.424460,0.594066,47227110
1,0.040631,0.164421,0.597222,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009137,0.117593,0.025037,0.007876,0.151380,0.096253,0.016452,0.439007,0.545273,3200520
2,0.018597,0.224761,0.361111,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011490,0.047402,0.021487,0.003666,0.108874,0.104335,0.009109,0.411776,0.611009,1636292
3,0.030474,0.191564,0.486111,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009137,0.016408,0.042975,0.015617,0.063386,0.025716,0.042815,0.342850,0.564641,1697550
4,0.091889,0.206014,0.666667,0.000213,0.0,0.0,0.0,0.0,1.0,0.0,...,0.318821,0.030538,0.237481,0.253666,0.184937,0.044820,0.115966,0.360264,0.579382,12005869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2462,0.004688,0.153290,0.277778,0.999242,0.0,1.0,0.0,0.0,0.0,0.0,...,0.008306,0.004102,0.009623,0.000747,0.019389,0.002939,0.010232,0.286259,0.453431,1079698
2463,0.004532,0.061902,0.236111,0.999490,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001523,0.005014,0.006913,0.001426,0.022371,0.002939,0.007303,0.399582,0.692199,957803
2464,0.002032,0.213825,0.194444,0.999737,0.0,0.0,0.0,0.0,1.0,0.0,...,0.002630,0.000000,0.005138,0.000679,0.016406,0.002204,0.007744,0.484953,0.676884,874817
2465,0.000781,0.146456,0.111111,0.999950,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002976,0.000000,0.000841,0.001630,0.000000,0.006613,0.002408,0.365977,0.640350,946815


In [25]:
#60% - train set, 20% - validation set, 20% - test set
train, validate, test = np.split(result.sample(frac=1), [int(.6*len(result)), int(.8*len(result))])

In [26]:
train

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
796,0.009064,0.011716,0.472222,0.395218,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001938,0.010939,0.000747,0.001562,0.035794,0.031594,0.003892,0.409452,0.605026,621361
655,0.056728,0.305214,0.000000,0.344634,0.0,0.0,0.0,0.0,0.0,0.0,...,0.022634,0.046946,0.014387,0.010253,0.078300,0.071271,0.007062,0.411867,0.562490,449161
1901,0.004376,0.115602,0.347222,0.784159,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002630,0.000912,0.005045,0.001086,0.044743,0.001470,0.006541,0.391115,0.676556,976621
177,0.005470,0.187854,0.305556,0.142008,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005676,0.019599,0.003830,0.000475,0.061894,0.014695,0.004334,0.385733,0.527538,585052
1256,0.020784,0.204843,0.347222,0.544509,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011075,0.004102,0.007474,0.001697,0.017897,0.001470,0.005578,0.398831,0.620835,572281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,0.018440,0.169303,0.236111,0.373435,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000831,0.010027,0.009996,0.005432,0.008949,0.003674,0.000522,0.488697,0.685197,220099
803,0.129551,0.056044,0.500000,0.397966,0.0,0.0,0.0,0.0,0.0,0.0,...,0.007545,0.042388,0.030643,0.002852,0.082774,0.003674,0.005016,0.415317,0.623907,543551
388,0.025473,0.187659,0.430556,0.242394,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017028,0.006837,0.043348,0.014666,0.019389,0.008817,0.031379,0.444616,0.627208,1139732
597,0.004688,0.050576,0.333333,0.326363,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003807,0.005014,0.001028,0.000747,0.032811,0.005143,0.002247,0.381981,0.607677,437800


In [27]:
validate

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
368,0.013127,0.141379,0.305556,0.233630,0.0,0.0,0.0,0.0,0.0,0.0,...,0.008445,0.021878,0.015695,0.001358,0.069351,0.010287,0.006460,0.445201,0.569708,930647
2354,0.013596,0.149190,0.250000,0.962696,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002007,0.012306,0.011678,0.001494,0.009694,0.005878,0.005818,0.385101,0.593791,899560
863,0.013127,0.042570,0.486111,0.421586,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005953,0.010483,0.002709,0.006586,0.031320,0.012491,0.012359,0.382542,0.590704,599444
539,0.013127,0.045108,0.333333,0.303323,0.0,0.0,0.0,0.0,0.0,0.0,...,0.015643,0.013218,0.001214,0.011067,0.058911,0.006613,0.001043,0.327154,0.436565,748862
2159,0.017972,0.188244,0.347222,0.887856,0.0,0.0,0.0,0.0,0.0,0.0,...,0.013913,0.037830,0.028587,0.006043,0.035794,0.013226,0.043979,0.340165,0.505508,1399811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1055,0.028598,0.144894,0.416667,0.490694,0.0,1.0,0.0,0.0,0.0,0.0,...,0.035786,0.017320,0.006820,0.011814,0.039523,0.007348,0.005297,0.319381,0.580452,720940
1934,0.006251,0.040031,0.402778,0.796419,0.0,0.0,0.0,0.0,0.0,0.0,...,0.008998,0.023701,0.005419,0.000407,0.086503,0.009552,0.012600,0.466220,0.610808,1550060
1633,0.090170,0.143527,0.597222,0.678743,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028933,0.031449,0.061846,0.004210,0.125280,0.016165,0.122868,0.450239,0.629800,5893321
23,0.086107,0.248389,0.458333,0.017778,0.0,0.0,0.0,0.0,0.0,0.0,...,0.145774,0.113491,0.141349,0.014055,0.293811,0.081558,0.043337,0.505705,0.636020,9260764


In [28]:
test

Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
2149,0.004845,0.092755,0.416667,0.885070,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018966,0.003191,0.006913,0.007672,0.011931,0.001470,0.014205,0.384170,0.666599,901880
952,0.020316,0.164616,0.416667,0.452119,0.0,0.0,0.0,0.0,0.0,0.0,...,0.007545,0.056062,0.021207,0.001290,0.080537,0.055841,0.011476,0.414350,0.582823,681873
49,0.030317,0.012302,0.666667,0.042568,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005884,0.027803,0.030923,0.004210,0.099180,0.010287,0.005939,0.398076,0.572443,2299265
1052,0.013752,0.040812,0.597222,0.489940,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017789,0.024613,0.000280,0.009302,0.087248,0.023512,0.003210,0.438725,0.682586,1042789
746,0.014690,0.078891,0.347222,0.379188,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003322,0.014129,0.014667,0.001018,0.075317,0.013226,0.011075,0.383800,0.489600,799855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,0.010783,0.143527,0.333333,0.244157,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011698,0.007293,0.000841,0.002784,0.041014,0.005143,0.006059,0.458786,0.685115,582812
197,0.010158,0.052138,0.361111,0.152981,0.0,0.0,0.0,0.0,0.0,0.0,...,0.024711,0.032361,0.002429,0.020777,0.053691,0.029390,0.008266,0.378353,0.615253,1034064
260,0.122832,0.282171,0.375000,0.187807,0.0,0.0,0.0,0.0,0.0,0.0,...,0.020350,0.162261,0.038397,0.010049,0.081283,0.116091,0.043618,0.339829,0.609067,1532675
1389,0.010783,0.051943,0.458333,0.587849,0.0,0.0,0.0,0.0,1.0,0.0,...,0.005468,0.004558,0.014948,0.000543,0.029828,0.002939,0.003892,0.494352,0.632476,926948


In [29]:
result.columns

Index(['comments', 'duration', 'languages', 'published_date', '3d printing',
       'AI', 'AIDS', 'Addiction', 'Africa', 'Alzheimer's',
       ...
       'Fascinating', 'Unconvincing', 'Persuasive', 'Jaw-dropping', 'OK',
       'Obnoxious', 'Inspiring', 'polarity', 'subjectivity', 'views'],
      dtype='object', length=444)

In [30]:
for c in result.columns:
    print(c)

comments
duration
languages
published_date
3d printing
AI
AIDS
Addiction
Africa
Alzheimer's
Anthropocene
Asia
Autism spectrum disorder
Bioethics
Blindness
Brand
Brazil
Buddhism
CRISPR
Christianity
Criminal Justice
DNA
Debate
Egypt
Europe
Foreign Policy
Gender equality
Gender spectrum
God
Google
Guns
HIV
Human body
Internet
Iran
Islam
LGBT
MacArthur grant
Mars
Middle East
Moon
NASA
Natural resources
New York
Nobel prize
PTSD
Planets
Senses
Slavery
South America
String theory
Surgery
Surveillance
Syria
TED Books
TED Brain Trust
TED Fellows
TED Prize
TED Residency
TED en Español
TED-Ed
TEDMED
TEDNYC
TEDYouth
TEDx
Transgender
United States
Vaccines
activism
adventure
advertising
aging
agriculture
aircraft
algorithm
alternative energy
ancient world
animals
animation
anthropology
ants
apes
archaeology
architecture
art
asteroid
astrobiology
astronomy
atheism
augmented reality
bacteria
beauty
bees
behavioral economics
big bang
big problems
biodiversity
biology
biomechanics
biomimicry
biosphere

In [31]:
tags_result = result[result.columns.difference(['views','subjectivity','polarity','Inspiring','Obnoxious','OK',
                                  'Jaw-dropping','Persuasive','Unconvincing','Fascinating','Informative','Confusing',
                                  'Longwinded','Courageous','Ingenious','Beautiful','Funny','event_class_Yearly TED Conference',
                                  'event_class_TEDx','event_class_TEDWomen','event_class_TEDSalon','event_class_TEDNYC',
                                 'event_class_TEDNYC','event_class_TEDGlobal','event_class_Other','comments','duration',
                                  'languages','published_date'])]

other_than_tags = result[['views','subjectivity','polarity','Inspiring','Obnoxious','OK',
                                  'Jaw-dropping','Persuasive','Unconvincing','Fascinating','Informative','Confusing',
                                  'Longwinded','Courageous','Ingenious','Beautiful','Funny','event_class_Yearly TED Conference',
                                  'event_class_TEDx','event_class_TEDWomen','event_class_TEDSalon','event_class_TEDNYC',
                                 'event_class_TEDNYC','event_class_TEDGlobal','event_class_Other','comments','duration',
                                  'languages','published_date']]


tags_result.head(10)
print(len(tags_result.columns))



416


In [32]:
pca = PCA(n_components=90)
principalComponents = pca.fit_transform(tags_result)


# print (pca.explained_variance_)
# print (pca.explained_variance_ratio_)
print (pca.explained_variance_ratio_.cumsum())

feat_cols = ['feature'+str(i) for i in range(principalComponents.shape[1])]
normalised_data_tags = pd.DataFrame(principalComponents,columns=feat_cols)
normalised_data_tags.tail()

[0.04941927 0.0864253  0.11892303 0.14657868 0.16996053 0.1915126
 0.21287596 0.23392454 0.25381705 0.27150194 0.28681082 0.30040861
 0.31347148 0.32604259 0.33787567 0.34866471 0.35927263 0.36942048
 0.37918659 0.38857979 0.39758046 0.40626183 0.41478876 0.42306397
 0.43101321 0.43888866 0.44670068 0.45428085 0.46162649 0.4688257
 0.47563095 0.48234593 0.4887999  0.49519181 0.50152603 0.50781741
 0.51404227 0.5201     0.52611134 0.53190704 0.53762789 0.54312474
 0.54859738 0.55395025 0.55919581 0.56431469 0.56935082 0.57416997
 0.57896683 0.5837061  0.58832558 0.59290349 0.59742039 0.60174008
 0.60602649 0.61025998 0.61440744 0.6185237  0.62261514 0.62667197
 0.63064531 0.63454769 0.63837003 0.64215883 0.64584374 0.64948816
 0.65305736 0.65660263 0.66006908 0.66351247 0.66693763 0.67025459
 0.67350759 0.67673485 0.67993986 0.68309634 0.68617658 0.68921909
 0.69222612 0.69522692 0.69814487 0.70102256 0.7038772  0.7066455
 0.70939722 0.71211998 0.71482396 0.71746376 0.72006316 0.7226504

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature80,feature81,feature82,feature83,feature84,feature85,feature86,feature87,feature88,feature89
2462,0.708199,-0.42118,0.980703,0.379241,-0.223035,0.384038,-0.325968,-0.295249,0.079644,0.125205,...,0.059031,0.034824,0.206162,-0.008302,0.125527,0.045221,-0.321999,0.158417,-0.210596,-0.01277
2463,-0.47694,0.654498,0.594641,1.109455,0.080309,0.505936,-0.144932,0.040784,0.04536,-0.323996,...,0.158096,-0.144416,-0.430785,0.215645,-0.090495,0.047176,-0.192129,0.009249,0.075502,0.00869
2464,0.478795,0.768711,1.130882,0.079705,-0.913176,-0.016842,1.214099,-0.142576,-0.608533,0.44173,...,0.072009,-0.030734,0.215995,0.104601,-0.127224,0.218991,-0.094073,0.30769,0.111474,-0.05439
2465,-0.331272,-0.112308,-0.425088,0.212216,0.151791,0.155464,0.218068,-0.452984,0.153851,0.266245,...,0.117258,-0.081642,-0.023793,-0.019539,0.116513,-0.076836,-0.117515,-0.002043,-0.061289,-0.046772
2466,-0.16788,-0.462059,-0.14478,0.210313,0.255193,-0.27415,0.250323,0.362207,-0.049303,-0.403977,...,0.067304,-0.013608,0.051007,-0.040739,-0.016778,-0.024308,-0.052595,-0.090388,0.039766,0.072711


In [33]:
# result = other_than_tags.join(normalised_data_tags)

# views = result['views']
# result = result.loc[:, result.columns != 'views']

pca = PCA(n_components=43)
principalComponents = pca.fit_transform(result)


# print (pca.explained_variance_)
# print (pca.explained_variance_ratio_)
print (pca.explained_variance_ratio_.cumsum())

feat_cols = ['feature'+str(i) for i in range(principalComponents.shape[1])]
normalised_data = pd.DataFrame(principalComponents,columns=feat_cols)
result.tail()

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


Unnamed: 0,comments,duration,languages,published_date,3d printing,AI,AIDS,Addiction,Africa,Alzheimer's,...,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity,views
2462,0.004688,0.15329,0.277778,0.999242,0.0,1.0,0.0,0.0,0.0,0.0,...,0.008306,0.004102,0.009623,0.000747,0.019389,0.002939,0.010232,0.286259,0.453431,1079698
2463,0.004532,0.061902,0.236111,0.99949,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001523,0.005014,0.006913,0.001426,0.022371,0.002939,0.007303,0.399582,0.692199,957803
2464,0.002032,0.213825,0.194444,0.999737,0.0,0.0,0.0,0.0,1.0,0.0,...,0.00263,0.0,0.005138,0.000679,0.016406,0.002204,0.007744,0.484953,0.676884,874817
2465,0.000781,0.146456,0.111111,0.99995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002976,0.0,0.000841,0.00163,0.0,0.006613,0.002408,0.365977,0.64035,946815
2466,0.002344,0.119313,0.222222,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003669,0.005925,0.001121,0.000339,0.024609,0.002204,0.006862,0.323856,0.52496,1323235


In [34]:
# Labels are the values we want to predict
labels =views

In [35]:
# Saving result names for later use
result_list = list(result.columns)

In [36]:
# Convert to numpy array
result = np.array(result)

In [37]:
# Split the data into training and testing sets
train_set, test_set, train_labels, test_labels = train_test_split(result, labels, test_size = 0.25, random_state = 42)

In [38]:
print('Training result Shape:', train_set.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing result Shape:', test_set.shape)
print('Testing Labels Shape:', test_labels.shape)

Training result Shape: (1850, 444)
Training Labels Shape: (1850,)
Testing result Shape: (617, 444)
Testing Labels Shape: (617,)


In [39]:
scaler_stand = MinMaxScaler() 
data_scaled_stand = scaler_stand.fit_transform(train_set)

In [40]:
scaler_stand = MinMaxScaler() 
data_scaled_stand = scaler_stand.fit_transform(test_set)

In [41]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 50, random_state = 42)

# Train the model on training data
rf.fit(train_set, train_labels);

In [42]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_set)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 10689.16 degrees.


In [43]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.68 %.
