In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import ast
from textblob import TextBlob
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics


df_ted_main = pd.read_csv("data/ted_main.csv")
df_ted_transcripts = df = pd.read_csv("data/transcripts.csv")

In [2]:
df_ted_main.drop(['description', 'film_date', 'main_speaker', 'name', 'event', 'num_speaker', 'related_talks', 'speaker_occupation', 'title', 'url'], axis=1, inplace=True)
df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,ratings,tags,views
0,4553,1164,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","['children', 'creativity', 'culture', 'dance',...",47227110
1,265,977,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","['alternative energy', 'cars', 'climate change...",3200520
2,124,1286,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","['computers', 'entertainment', 'interface desi...",1636292
3,200,1116,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","['MacArthur grant', 'activism', 'business', 'c...",1697550
4,593,1190,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","['Africa', 'Asia', 'Google', 'demo', 'economic...",12005869


In [3]:
# ----------------------------------------------------------
# 2. one hot encoding
# ----------------------------------------------------------

# one hot encoding tags

# iterate trough the data set
for i, row in df_ted_main.iterrows():
    # change string formed list into normal list
    element_list_form = ast.literal_eval(df_ted_main.at[i,'tags'])
    element_list_form = [i.strip() for i in element_list_form]
    # finish converting by replacing string version of a list for the list type
    df_ted_main.at[i,'tags'] = element_list_form

# one hot encoding with column named "tags"
# since "tags" is a Series on each row, we need to split them first and then we can do one hot encoding
df_ted_main = df_ted_main.drop('tags', 1).join(
    pd.get_dummies(
        pd.DataFrame(df_ted_main.tags.tolist()).stack()
    ).astype(int).sum(level=0)
)

df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,ratings,views,3d printing,AI,AIDS,Addiction,...,wikipedia,wind energy,women,women in business,work,work-life balance,world cultures,writing,wunderkind,youth
0,4553,1164,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",47227110,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,265,977,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",3200520,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,124,1286,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",1636292,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200,1116,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",1697550,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,593,1190,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",12005869,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# # one hot encoding "event" column

# def one_hot_encode(original_dataframe, feature_to_encode):
#     dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
#     res = pd.concat([original_dataframe, dummies], axis=1)
#     res = res.drop([feature_to_encode], axis=1)
#     return(res)

# df_ted_main = one_hot_encode(df_ted_main, 'event')
# df_ted_main.head()

In [5]:
#rating analysis

df_ted_main['ratings']=df_ted_main['ratings'].str.replace("'",'"')

df_ted_main=df_ted_main.merge(df_ted_main.ratings.apply(lambda x: pd.Series(pd.read_json(x)['count'].values,index=pd.read_json(x)['name'])), 
            left_index=True, right_index=True)

df_ted_main.drop('ratings', axis=1, inplace=True)

df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Longwinded,Confusing,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,387,242,7346,10581,300,10704,4439,1174,209,24924
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,113,62,443,132,258,268,116,203,131,413
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,78,27,395,166,104,230,54,146,142,230
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,53,32,380,132,36,460,230,85,35,1070
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,110,72,5433,4606,67,2542,3736,248,61,2893


In [6]:
# sentiment analysis of transcript
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

df_ted_transcripts['polarity'] = df_ted_transcripts['transcript'].apply(pol)
df_ted_transcripts['subjectivity'] = df_ted_transcripts['transcript'].apply(sub)
df_ted_transcripts.head()

Unnamed: 0,transcript,url,polarity,subjectivity
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,0.146452,0.462051
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,0.157775,0.424101
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,0.136579,0.475229
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,0.082928,0.439165
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,0.096483,0.450631


In [7]:
# merging both datasets and delete the rows that don't have a transcipt
df_ted_talks = pd.merge(df_ted_main, df_ted_transcripts, left_index=True, right_index=True)

In [8]:
# deleting the transcript and url columns because we don't need them.
del df_ted_talks['transcript']
del df_ted_talks['url']
df_ted_talks.head()

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,7346,10581,300,10704,4439,1174,209,24924,0.146452,0.462051
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,443,132,258,268,116,203,131,413,0.157775,0.424101
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,395,166,104,230,54,146,142,230,0.136579,0.475229
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,380,132,36,460,230,85,35,1070,0.082928,0.439165
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,5433,4606,67,2542,3736,248,61,2893,0.096483,0.450631


In [9]:
# from sklearn.preprocessing import StandardScaler

# scaled_features = StandardScaler().fit_transform(result.values)

# result = pd.DataFrame(scaled_features, index=result.index, columns=result.columns)
# result

In [10]:
# # shuffle data set before splitting
# result = result.sample(frac=1).reset_index(drop=True)
# result

In [11]:
# from sklearn.model_selection import StratifiedShuffleSplit

# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(df_ted_talks, df_ted_talks["views"]):
#     df_train_set = df_ted_talks.loc[train_index]
#     df_test_set = df_ted_talks.loc[test_index]

# df_test_set["views"].value_counts() / len(df_test_set)

In [12]:
#We will shuffle the whole dataset first (df.sample(frac=1)) and then split our data set into the following parts:
# 60% - train set
# 20% - validation set
# 20% - test set

# df_train, df_val, df_test = np.split(df_ted_talks.sample(frac=1), [int(.6*len(df_ted_talks)), int(.8*len(df_ted_talks))])

In [13]:
# from sklearn.preprocessing import StandardScaler


# df_train_stand = StandardScaler().fit_transform(df_train.values)
# df_val_stand = StandardScaler().fit_transform(df_val.values)
# df_test_stand = StandardScaler().fit_transform(df_test.values)



# df_train = pd.DataFrame(df_train_stand, index=df_train.index, columns=df_train.columns)
# df_val = pd.DataFrame(df_val_stand, index=df_val.index, columns=df_val.columns)
# df_test = pd.DataFrame(df_test_stand, index=df_test.index, columns=df_test.columns)

In [14]:
# Splitting
labels = df_ted_talks['views']
train_set, test_set, train_labels, test_labels = train_test_split(df_ted_talks, labels, test_size = 1/7.0, random_state = 42)

In [15]:
# Standardization

from sklearn.preprocessing import StandardScaler

# Fit on training set only.
# https://stats.stackexchange.com/questions/174823/how-to-apply-standardization-normalization-to-train-and-testset-if-prediction-i
scaler = StandardScaler()
scaler.fit(train_set)

# Apply transform to both the training set and the test set.
df_train_set_scaled = scaler.transform(train_set)
df_test_set_scaled = scaler.transform(test_set)

# convert to data frames
df_train_set = pd.DataFrame(df_train_set_scaled, index=train_set.index, columns=train_set.columns)
df_test_set = pd.DataFrame(df_test_set_scaled, index=test_set.index, columns=test_set.columns)



In [16]:
# PCA preprocessing - getting tag columns

# columns without PCA
columns_without_pca = ['views','subjectivity','polarity','Inspiring','Obnoxious','OK',
                                  'Jaw-dropping','Persuasive','Unconvincing','Fascinating','Informative','Confusing',
                                  'Longwinded','Courageous','Ingenious','Beautiful','Funny','comments','duration',
                                  'languages','published_date']

# get tag columns for both, training and test sets
tag_columns_train_set = df_train_set[df_train_set.columns.difference(columns_without_pca)]
tag_columns_test_set = df_test_set[df_test_set.columns.difference(columns_without_pca)]

# saving indexes of training and test sets for the next step.
# PCA will convert dataframe into numpy array and indexes will be lost. 
# this way I will keep them stored
tag_columns_train_set_indexes = tag_columns_train_set.index
tag_columns_test_set_indexes = tag_columns_test_set.index


##### DELETION #####
# all tag columns as a lists
tag_columns_train_set_list = list(tag_columns_train_set)
tag_columns_test_set_list = list(tag_columns_test_set)

# remove tag columns from original sets
df_train_set.drop(tag_columns_train_set_list, axis=1, inplace=True)
df_test_set.drop(tag_columns_test_set_list, axis=1, inplace=True)


# display all column names in the dataframe without tags
# list(df_train_set)

In [17]:
# PCA
pca = PCA(.80)

# fit PCA only on training set of tag columns
pca.fit(tag_columns_train_set)

# use PCA to transform both sets of tag columns
tag_columns_train_set = pca.transform(tag_columns_train_set)
tag_columns_test_set = pca.transform(tag_columns_test_set)


In [18]:
# combine: 
# PCA transformed training and test sets of tag columns, 
# with original training and test sets

# training set
feat_cols_train = ['feature'+str(i) for i in range(tag_columns_train_set.shape[1])]
# creating a dataframe from PCA results
# df_pca_train = pd.DataFrame(tag_columns_train_set, index=tag_columns_train_set_indexes)
df_pca_train = pd.DataFrame(tag_columns_train_set, columns=feat_cols_train, index=tag_columns_train_set_indexes)


# test set
feat_cols_test = ['feature'+str(i) for i in range(tag_columns_test_set.shape[1])]
# creating a dataframe from PCA results
# df_pca_test = pd.DataFrame(tag_columns_test_set,columns=tag_columns_test_set_indexes)
df_pca_test = pd.DataFrame(tag_columns_test_set, columns=feat_cols_test, index=tag_columns_test_set_indexes)

# results of combining
df_train_set = df_train_set.join(df_pca_train, how="left")
df_test_set = df_test_set.join(df_pca_test, how="left")


In [19]:
df_train_set.info()
df_test_set.info()

# print('Training result Shape:', df_train_set.shape)
# print('Testing result Shape:', df_test_set.shape)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2114 entries, 2240 to 860
Columns: 247 entries, comments to feature225
dtypes: float64(247)
memory usage: 4.1 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 1711 to 1987
Columns: 247 entries, comments to feature225
dtypes: float64(247)
memory usage: 693.9 KB


In [20]:
df_train_set

Unnamed: 0,comments,duration,languages,published_date,views,Funny,Beautiful,Ingenious,Courageous,Longwinded,...,feature216,feature217,feature218,feature219,feature220,feature221,feature222,feature223,feature224,feature225
2240,-0.488876,-0.542353,-0.098925,1.397547,-0.322920,-0.248032,-0.300960,-0.192644,0.021828,-0.416349,...,-0.798559,1.603121,-0.379399,0.049373,0.609996,-0.387291,-0.633508,-1.144576,0.453698,-0.599351
2115,-0.553355,0.262518,0.341574,1.232969,0.277369,-0.203612,-0.021918,-0.136158,-0.194311,-0.206030,...,-1.648164,0.415784,-0.954074,-1.984280,2.369547,-0.686033,1.899612,-0.412546,1.287202,1.003407
1241,-0.325784,0.252031,-0.539424,-0.008371,-0.476259,-0.254377,-0.398202,-0.508297,-0.216143,0.004289,...,0.315670,0.270713,-0.606480,0.502964,-0.553205,-1.640681,-0.024057,0.713324,-0.073705,0.316816
25,-0.037528,0.807838,0.121325,-1.955519,-0.242211,-0.049731,-0.218516,0.372209,-0.172478,0.261345,...,0.611486,-0.860212,-0.508229,0.820453,0.895390,-0.323604,1.129766,0.120096,-0.700810,0.398685
497,0.793104,0.726564,-0.319174,-0.965832,-0.445751,-0.252791,-0.400316,0.083137,-0.185578,0.144502,...,0.286345,0.487553,0.923323,-0.102922,-0.195832,-0.197039,0.164758,0.407234,0.284895,0.162588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,0.224178,0.065888,0.121325,0.516429,-0.329690,-0.244859,-0.332670,0.016684,-0.192127,-0.533192,...,0.019402,0.515199,-0.269387,0.674941,-0.360185,-0.891136,0.282791,0.830921,0.491677,-0.745556
1095,0.641390,0.760646,0.451699,-0.155188,-0.011158,-0.241686,-0.298846,-0.209257,-0.334037,2.691696,...,-0.569927,1.320780,-0.725084,-0.997317,0.826329,-1.064077,-1.723156,-0.213031,0.830875,1.112829
1130,-0.496462,-0.390293,-0.098925,-0.113079,-0.544435,-0.254377,-0.385519,-0.089641,-0.347136,-0.626667,...,0.351567,0.338216,-0.042636,0.312027,-0.198456,-0.250696,1.477059,-0.140380,-0.003877,-0.416479
1294,-0.481291,0.042292,-0.759673,0.046817,-0.440722,-0.255964,-0.381291,0.106396,-0.323121,-0.673405,...,-0.066068,0.059129,-0.101180,-0.221717,0.224442,-0.264615,0.132827,-0.101856,0.079495,0.135885


In [21]:
df_test_set

Unnamed: 0,comments,duration,languages,published_date,views,Funny,Beautiful,Ingenious,Courageous,Longwinded,...,feature216,feature217,feature218,feature219,feature220,feature221,feature222,feature223,feature224,feature225
1711,-0.090628,-0.610518,0.451699,0.623953,0.548063,-0.221063,1.068885,-0.465102,1.740022,-0.416349,...,0.528620,2.601715,-0.236989,1.581045,0.032494,2.061871,-0.592463,1.522558,1.178092,-1.438776
1557,-0.120971,-0.492540,0.341574,0.395724,-0.255468,-0.251204,-0.286162,-0.092964,0.010912,-0.790249,...,0.058177,0.693813,-0.558967,0.300258,1.085488,-0.947930,-0.273464,-0.681352,-0.191981,0.785101
1870,-0.344748,0.427687,0.011200,0.846619,-0.103269,-0.240100,-0.100134,-0.455134,-0.024020,-0.579930,...,-0.209801,-0.521573,-0.196326,-0.556653,0.317521,-0.166903,-1.644995,3.035983,-0.043845,-0.952574
1703,0.080050,0.364765,0.231450,0.613680,-0.372856,-0.238513,-0.368607,-0.465102,0.078592,-0.416349,...,-0.520387,0.400676,0.275621,0.753712,0.914029,0.650657,0.475686,1.148817,-1.564900,-0.380353
1857,-0.454741,-0.644601,0.561824,0.828866,0.197733,-0.179816,-0.036715,-0.345486,-0.122264,-0.065817,...,-0.674349,0.518669,0.167597,0.214485,0.233701,-0.294256,0.138164,-0.052792,-0.211388,0.354174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1624,-0.257513,-0.146472,0.451699,0.495849,-0.338506,-0.255964,-0.402430,-0.428553,-0.307838,-0.650036,...,0.122965,0.237100,-0.874231,-0.190443,0.274259,0.708804,-0.771384,-0.187505,0.448029,0.107789
1486,-0.257513,-1.024752,-3.072293,0.304988,-0.591864,-0.060836,-0.355923,-0.292324,-0.358052,-0.392980,...,-0.428790,0.460388,-1.750798,1.923307,1.114681,-0.670168,1.221850,0.697245,-0.367337,0.107978
1607,0.049707,-0.364076,0.011200,0.474304,-0.200598,-0.252791,-0.385519,-0.475070,-0.344953,-0.579930,...,0.167086,0.281821,0.348690,-0.009450,-0.546721,0.409849,0.033548,0.128989,0.163001,-0.087245
124,0.311413,-0.733740,0.671949,-1.666848,0.646860,3.645009,-0.053627,2.438905,-0.279456,1.032514,...,-0.660720,-0.213431,-0.116629,1.287174,-1.080889,0.371869,0.305815,0.712231,0.041865,0.685467


In [22]:
df_train_set.shape

(2114, 247)

In [23]:
df_test_set.shape

(353, 247)

In [24]:
df_train_set.isnull().values.any()

False

In [25]:
df_test_set.isnull().values.any()

False

In [65]:
rf = RandomForestRegressor(criterion='mae',max_depth=25, max_features=70, n_estimators=70, min_samples_leaf=2, min_samples_split=2, 
                           random_state=42)
rf.fit(train_set, train_labels)
pred_labels = rf.predict(train_set)
test_pred_labels = rf.predict(test_set)

errors = abs(test_pred_labels - test_labels)

print('Training MAE: {:0.2f}'.format(metrics.mean_absolute_error(train_labels, pred_labels)))
print('Test MAE: {:0.2f}'.format(metrics.mean_absolute_error(test_labels, test_pred_labels)), 'degrees')
print('Views mean: {:0.2f}'.format(labels.mean()))
print('Views std: {:0.2f}'.format(labels.std()))

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Training MAE: 134140.26
Test MAE: 261944.90 degrees
Views mean: 1724277.02
Views std: 2534604.53
Accuracy: 84.7 %.


In [54]:
# max_features=70 max_depth=25 = %84.7 - 

In [62]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [63]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
