In [556]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import ast
from textblob import TextBlob
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

df_ted_main = pd.read_csv("data/ted_main.csv")
df_ted_transcripts = df = pd.read_csv("data/transcripts.csv")

In [557]:
df_ted_main.drop(['description', 'film_date', 'main_speaker', 'name', 'event', 'num_speaker', 'related_talks', 'speaker_occupation', 'title', 'url'], axis=1, inplace=True)
df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,ratings,tags,views
0,4553,1164,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","['children', 'creativity', 'culture', 'dance',...",47227110
1,265,977,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","['alternative energy', 'cars', 'climate change...",3200520
2,124,1286,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","['computers', 'entertainment', 'interface desi...",1636292
3,200,1116,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","['MacArthur grant', 'activism', 'business', 'c...",1697550
4,593,1190,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","['Africa', 'Asia', 'Google', 'demo', 'economic...",12005869


In [558]:
# ----------------------------------------------------------
# 2. one hot encoding
# ----------------------------------------------------------

# one hot encoding tags

# iterate trough the data set
for i, row in df_ted_main.iterrows():
    # change string formed list into normal list
    element_list_form = ast.literal_eval(df_ted_main.at[i,'tags'])
    element_list_form = [i.strip() for i in element_list_form]
    # finish converting by replacing string version of a list for the list type
    df_ted_main.at[i,'tags'] = element_list_form

# one hot encoding with column named "tags"
# since "tags" is a Series on each row, we need to split them first and then we can do one hot encoding
df_ted_main = df_ted_main.drop('tags', 1).join(
    pd.get_dummies(
        pd.DataFrame(df_ted_main.tags.tolist()).stack()
    ).astype(int).sum(level=0)
)

df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,ratings,views,3d printing,AI,AIDS,Addiction,...,wikipedia,wind energy,women,women in business,work,work-life balance,world cultures,writing,wunderkind,youth
0,4553,1164,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",47227110,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,265,977,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",3200520,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,124,1286,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",1636292,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200,1116,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",1697550,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,593,1190,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",12005869,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [559]:
# # one hot encoding "event" column

# def one_hot_encode(original_dataframe, feature_to_encode):
#     dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
#     res = pd.concat([original_dataframe, dummies], axis=1)
#     res = res.drop([feature_to_encode], axis=1)
#     return(res)

# df_ted_main = one_hot_encode(df_ted_main, 'event')
# df_ted_main.head()

In [560]:
#rating analysis

df_ted_main['ratings']=df_ted_main['ratings'].str.replace("'",'"')

df_ted_main=df_ted_main.merge(df_ted_main.ratings.apply(lambda x: pd.Series(pd.read_json(x)['count'].values,index=pd.read_json(x)['name'])), 
            left_index=True, right_index=True)

df_ted_main.drop('ratings', axis=1, inplace=True)

df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Longwinded,Confusing,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,387,242,7346,10581,300,10704,4439,1174,209,24924
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,113,62,443,132,258,268,116,203,131,413
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,78,27,395,166,104,230,54,146,142,230
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,53,32,380,132,36,460,230,85,35,1070
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,110,72,5433,4606,67,2542,3736,248,61,2893


In [561]:
# sentiment analysis of transcript
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

df_ted_transcripts['polarity'] = df_ted_transcripts['transcript'].apply(pol)
df_ted_transcripts['subjectivity'] = df_ted_transcripts['transcript'].apply(sub)
df_ted_transcripts.head()

Unnamed: 0,transcript,url,polarity,subjectivity
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,0.146452,0.462051
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,0.157775,0.424101
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,0.136579,0.475229
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,0.082928,0.439165
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,0.096483,0.450631


In [562]:
# merging both datasets and delete the rows that don't have a transcipt
df_ted_talks = pd.merge(df_ted_main, df_ted_transcripts, left_index=True, right_index=True)

In [563]:
# deleting the transcript and url columns because we don't need them.
del df_ted_talks['transcript']
del df_ted_talks['url']
df_ted_talks.head()

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,7346,10581,300,10704,4439,1174,209,24924,0.146452,0.462051
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,443,132,258,268,116,203,131,413,0.157775,0.424101
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,395,166,104,230,54,146,142,230,0.136579,0.475229
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,380,132,36,460,230,85,35,1070,0.082928,0.439165
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,5433,4606,67,2542,3736,248,61,2893,0.096483,0.450631


In [564]:
# from sklearn.preprocessing import StandardScaler

# scaled_features = StandardScaler().fit_transform(result.values)

# result = pd.DataFrame(scaled_features, index=result.index, columns=result.columns)
# result

In [565]:
# # shuffle data set before splitting
# result = result.sample(frac=1).reset_index(drop=True)
# result

In [566]:
# from sklearn.model_selection import StratifiedShuffleSplit

# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(df_ted_talks, df_ted_talks["views"]):
#     df_train_set = df_ted_talks.loc[train_index]
#     df_test_set = df_ted_talks.loc[test_index]

# df_test_set["views"].value_counts() / len(df_test_set)

In [567]:
#We will shuffle the whole dataset first (df.sample(frac=1)) and then split our data set into the following parts:
# 60% - train set
# 20% - validation set
# 20% - test set

# df_train, df_val, df_test = np.split(df_ted_talks.sample(frac=1), [int(.6*len(df_ted_talks)), int(.8*len(df_ted_talks))])

In [568]:
# from sklearn.preprocessing import StandardScaler


# df_train_stand = StandardScaler().fit_transform(df_train.values)
# df_val_stand = StandardScaler().fit_transform(df_val.values)
# df_test_stand = StandardScaler().fit_transform(df_test.values)



# df_train = pd.DataFrame(df_train_stand, index=df_train.index, columns=df_train.columns)
# df_val = pd.DataFrame(df_val_stand, index=df_val.index, columns=df_val.columns)
# df_test = pd.DataFrame(df_test_stand, index=df_test.index, columns=df_test.columns)

In [569]:
# Splitting
train_set, test_set, train_labels, test_labels = train_test_split(df_ted_talks, df_ted_talks['views'], test_size = 0.2, random_state = 42)


In [570]:
# Standardization

from sklearn.preprocessing import StandardScaler

# Fit on training set only.
# https://stats.stackexchange.com/questions/174823/how-to-apply-standardization-normalization-to-train-and-testset-if-prediction-i
scaler.fit(train_set)

# Apply transform to both the training set and the test set.
df_train_set_scaled = scaler.transform(train_set)
df_test_set_scaled = scaler.transform(test_set)

# convert to data frames
df_train_set = pd.DataFrame(df_train_set_scaled, index=train_set.index, columns=train_set.columns)
df_test_set = pd.DataFrame(df_test_set_scaled, index=test_set.index, columns=test_set.columns)



In [571]:
# PCA preprocessing - getting tag columns

# columns without PCA
columns_without_pca = ['views','subjectivity','polarity','Inspiring','Obnoxious','OK',
                                  'Jaw-dropping','Persuasive','Unconvincing','Fascinating','Informative','Confusing',
                                  'Longwinded','Courageous','Ingenious','Beautiful','Funny','comments','duration',
                                  'languages','published_date']

# get tag columns for both, training and test sets
tag_columns_train_set = df_train_set[df_train_set.columns.difference(columns_without_pca)]
tag_columns_test_set = df_test_set[df_test_set.columns.difference(columns_without_pca)]

# saving indexes of training and test sets for the next step.
# PCA will convert dataframe into numpy array and indexes will be lost. 
# this way I will keep them stored
tag_columns_train_set_indexes = tag_columns_train_set.index
tag_columns_test_set_indexes = tag_columns_test_set.index


##### DELETION #####
# all tag columns as a lists
tag_columns_train_set_list = list(tag_columns_train_set)
tag_columns_test_set_list = list(tag_columns_test_set)

# remove tag columns from original sets
df_train_set.drop(tag_columns_train_set_list, axis=1, inplace=True)
df_test_set.drop(tag_columns_test_set_list, axis=1, inplace=True)


# display all column names in the dataframe without tags
# list(df_train_set)

In [572]:
# PCA
pca = PCA(.80)

# fit PCA only on training set of tag columns
pca.fit(tag_columns_train_set)

# use PCA to transform both sets of tag columns
tag_columns_train_set = pca.transform(tag_columns_train_set)
tag_columns_test_set = pca.transform(tag_columns_test_set)


In [573]:
# combine: 
# PCA transformed training and test sets of tag columns, 
# with original training and test sets

# training set
feat_cols_train = ['feature'+str(i) for i in range(tag_columns_train_set.shape[1])]
# creating a dataframe from PCA results
# df_pca_train = pd.DataFrame(tag_columns_train_set, index=tag_columns_train_set_indexes)
df_pca_train = pd.DataFrame(tag_columns_train_set, columns=feat_cols_train, index=tag_columns_train_set_indexes)


# test set
feat_cols_test = ['feature'+str(i) for i in range(tag_columns_test_set.shape[1])]
# creating a dataframe from PCA results
# df_pca_test = pd.DataFrame(tag_columns_test_set,columns=tag_columns_test_set_indexes)
df_pca_test = pd.DataFrame(tag_columns_test_set, columns=feat_cols_test, index=tag_columns_test_set_indexes)

# results of combining
df_train_set = df_train_set.join(df_pca_train, how="left")
df_test_set = df_test_set.join(df_pca_test, how="left")


In [574]:
df_train_set.info()
df_test_set.info()

# print('Training result Shape:', df_train_set.shape)
# print('Testing result Shape:', df_test_set.shape)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1973 entries, 1124 to 860
Columns: 244 entries, comments to feature222
dtypes: float64(244)
memory usage: 3.8 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 494 entries, 1711 to 1316
Columns: 244 entries, comments to feature222
dtypes: float64(244)
memory usage: 965.5 KB


In [575]:
df_train_set

Unnamed: 0,comments,duration,languages,published_date,views,Funny,Beautiful,Ingenious,Courageous,Longwinded,...,feature213,feature214,feature215,feature216,feature217,feature218,feature219,feature220,feature221,feature222
1124,-0.489040,-0.099535,0.455016,-0.113729,-0.449341,-0.238921,-0.272665,-0.478708,-0.138321,-0.552694,...,0.304571,0.095355,-1.249724,-0.608259,0.008785,-0.573317,0.580175,0.635650,-0.105556,-0.420863
856,-0.236517,-1.136071,1.116015,-0.457430,-0.164055,0.925606,-0.196375,-0.190301,-0.155339,-0.437095,...,0.325494,0.638003,-0.154831,-0.555155,-0.623465,-0.790688,0.715285,-0.801524,0.008567,-1.486012
1807,0.332603,-1.109763,0.455016,0.769812,-0.194887,-0.235836,-0.355141,-0.386942,-0.278720,-0.113419,...,0.161927,-0.291780,-0.319869,-0.000184,0.193258,0.471257,-0.087844,0.240809,0.167648,0.441202
219,-0.187520,1.415808,0.675349,-1.405273,0.282213,0.607868,0.453123,0.048946,-0.034086,5.527794,...,-0.037144,-0.224580,0.149397,0.612761,-0.014009,-0.394012,-0.229462,-0.404349,0.375397,0.287378
445,0.268530,0.566059,0.455016,-1.044982,0.095662,0.200669,-0.264417,0.124325,0.080786,0.649532,...,0.928488,3.010525,1.243017,1.924844,-0.168631,0.915021,0.814252,-1.689825,-1.558535,0.077125
808,-0.455119,0.581844,-0.426316,-0.530679,-0.540516,-0.255888,-0.398441,-0.511482,-0.344665,0.950088,...,-0.597887,-0.117148,-0.500722,0.438446,-1.023885,-0.887399,0.310683,0.352749,-0.136112,0.798662
1874,-0.590803,0.500289,-0.205983,0.862792,-0.258616,-0.223497,-0.190189,-0.498372,0.008459,-0.529574,...,0.070513,-0.229511,0.254740,-1.865139,0.592683,0.160542,0.205104,-0.916890,-1.273051,1.545335
1912,-0.620955,-1.207103,0.234683,0.927584,-0.252130,-0.243548,-0.274727,-0.350891,-0.353174,-0.645173,...,2.055547,1.164976,0.864283,-0.756510,-1.456978,2.501854,-0.103899,0.700974,-0.443997,-1.764332
303,-0.579496,-1.443875,1.005848,-1.254240,-0.382346,-0.117070,-0.101527,-0.187024,-0.329774,-0.552694,...,-0.427240,0.236992,0.100124,0.316699,-0.064147,-0.009469,0.169558,-0.310511,0.806039,0.938234
2171,0.528591,-0.041657,-0.977148,1.316291,-0.241555,-0.249718,0.826327,-0.472154,2.393115,-0.529574,...,1.608842,-0.091988,-0.813707,0.426650,-1.123887,0.540785,-0.168871,0.307411,1.186258,-0.034317


In [576]:
df_test_set

Unnamed: 0,comments,duration,languages,published_date,views,Funny,Beautiful,Ingenious,Courageous,Longwinded,...,feature213,feature214,feature215,feature216,feature217,feature218,feature219,feature220,feature221,feature222
1711,-0.093295,-0.617803,0.455016,0.631769,0.530702,-0.220412,1.038703,-0.459044,1.691120,-0.413975,...,0.092724,-1.210037,0.510059,2.967095,2.594836,1.460633,0.510280,0.702349,0.087644,1.582705
1557,-0.123447,-0.499417,0.344849,0.402668,-0.254255,-0.249718,-0.282974,-0.091980,0.006332,-0.783891,...,1.296262,-0.623870,0.429073,0.442885,0.376661,-1.131674,-0.516259,-1.026629,-0.164418,-2.182903
1870,-0.345818,0.423995,0.014350,0.855287,-0.105574,-0.238921,-0.101527,-0.449212,-0.027704,-0.575814,...,-0.376203,-1.509046,0.738552,1.068179,-0.229730,2.163780,-0.484309,-1.705047,0.311476,0.288345
1703,0.076310,0.360856,0.234683,0.621457,-0.368928,-0.237379,-0.363388,-0.459044,0.072277,-0.413975,...,0.447316,-0.651771,-1.001307,1.902289,-1.448486,0.101784,-0.002294,-0.275054,0.103527,-0.722486
1857,-0.455119,-0.652003,0.565182,0.837466,0.188470,-0.180309,-0.039670,-0.341059,-0.123431,-0.067180,...,-0.115853,-0.393911,-0.210898,0.052586,-0.084352,0.361776,0.059720,0.530695,0.213564,-0.002073
756,1.406769,0.673922,-0.316149,-0.596654,-0.306533,-0.229667,-0.353079,-0.367278,-0.244684,0.187137,...,0.796614,0.183270,0.050791,-0.156226,-0.807044,2.496438,0.072523,-0.445729,0.419431,-0.980633
727,0.652968,0.950156,0.014350,-0.643572,-0.431672,-0.251261,-0.313903,-0.475431,0.344566,0.557053,...,-1.704591,-0.531217,-0.682617,2.740445,0.395005,-0.693305,0.343430,-1.754710,-0.329790,-0.626086
1050,-0.334511,-0.673050,0.344849,-0.194510,-0.306549,-0.240464,-0.363388,-0.055929,-0.306375,-0.668292,...,0.305930,-1.094527,-0.643487,0.330911,-1.228102,-1.415171,-0.732950,0.675230,-2.048451,-0.671862
498,-0.202596,0.600259,0.014350,-0.963156,-0.232928,-0.251261,0.042806,0.402901,-0.302120,0.325856,...,0.760642,0.365627,1.279955,0.465673,-1.381453,2.226387,-0.737594,-0.251185,-0.557643,0.829149
237,-0.571958,-1.178164,-3.070310,-1.364100,-0.625870,-0.254345,0.275800,-0.495095,-0.321265,-0.622053,...,0.033888,0.950097,-0.085120,1.259464,1.241663,0.579299,-0.404309,-0.232001,0.455444,-0.512673


In [577]:
df_train_set.shape

(1973, 244)

In [578]:
df_test_set.shape

(494, 244)

In [579]:
df_train_set.isnull().values.any()

False

In [580]:
df_test_set.isnull().values.any()

False

In [581]:
# Instantiate model with 50 decision trees
rf = RandomForestRegressor(n_estimators = 50, random_state = 42)

# Train the model on training data
rf.fit(df_train_set, train_labels);

In [582]:
# Use the forest's predict method on the test data
predictions = rf.predict(df_test_set)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')


Mean Absolute Error: 26639.82 degrees.


In [583]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.46 %.


In [584]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(random_state=0)
gbr.fit(df_train_set, train_labels)

gbr.predict(df_test_set)

array([ 3113029.90977243,  1046982.09249947,  1484890.14114954,
         742507.51542414,  2193354.2436755 ,   896595.85872561,
         589470.01176732,   894708.75424675,  1120196.52926736,
          99642.95736049,   742814.63123655,  1044375.77151774,
         957204.58667594,  1890475.07142503,   742056.61138057,
         522825.5579331 ,   419297.23752721,  2459081.10724626,
         333728.00463503,   131352.30311646,  1151286.34094861,
         304231.70025018,  5426722.74691305,   788449.26406751,
        1663199.03922358,  1229936.1556724 ,  1043989.67248361,
        1196643.09489582,  1046007.93703231,  1232278.15292801,
         541457.06046612,   971660.78794638,  1021546.25940705,
        2819005.77229547,  1022975.4758066 ,   895081.30464523,
        1116684.54305897,   418409.55779842,  2370704.5548685 ,
         898731.68525192,  1535925.04073899, 13398710.11505088,
        2724503.05185273,   301266.07866191,  1540341.25660025,
         956915.71960196,  1044781.58902

In [585]:
gbr.score(df_test_set, test_labels)

0.9997261441115408

In [592]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(hidden_layer_sizes=(5,), max_iter=500, activation='relu').fit(df_train_set, train_labels)
mlp.predict(df_test_set)




array([4.46633513e+00, 4.46633513e+00, 2.80620525e+02, 4.46633513e+00,
       1.28068294e+03, 4.46633513e+00, 4.29886544e+01, 4.46633513e+00,
       4.46633513e+00, 4.46633513e+00, 4.46633513e+00, 4.46633513e+00,
       1.07522040e+02, 2.45207335e+02, 4.46633513e+00, 4.46633513e+00,
       3.90036875e+01, 4.72830264e+03, 4.46633513e+00, 4.46633513e+00,
       6.09454246e+01, 2.15341056e+03, 4.79545504e+03, 3.93565863e+02,
       1.16007309e+03, 4.46633513e+00, 4.46633513e+00, 4.46633513e+00,
       4.46633513e+00, 4.46633513e+00, 8.91637070e+01, 4.46633513e+00,
       3.27327347e+03, 4.46633513e+00, 4.46633513e+00, 4.46633513e+00,
       4.46428989e+00, 4.46633513e+00, 2.47935607e+03, 4.46633513e+00,
       1.21459890e+03, 8.28507942e+03, 2.21349104e+02, 4.46633513e+00,
       9.62650631e+02, 4.52295526e+00, 4.46633513e+00, 4.35855668e+03,
       4.46367293e+00, 4.46633513e+00, 4.46633513e+00, 7.31287404e+03,
       4.46633513e+00, 4.46633513e+00, 4.46633513e+00, 1.34552163e+02,
      

In [593]:
mlp.score(df_test_set, test_labels)

-0.6640832470344304