In [492]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import ast
from textblob import TextBlob
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the talk metadata and the transcripts from the local data directory.
df_ted_main = pd.read_csv("data/ted_main.csv")
df_ted_transcripts = pd.read_csv("data/transcripts.csv")
# `df` is a second name for the very same transcripts frame (not a copy).
df = df_ted_transcripts

In [493]:
# Columns that are not used as features in the rest of the notebook.
unused_columns = [
    'description', 'film_date', 'main_speaker', 'name', 'event',
    'num_speaker', 'related_talks', 'speaker_occupation', 'title', 'url',
]
df_ted_main = df_ted_main.drop(unused_columns, axis=1)
df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,ratings,tags,views
0,4553,1164,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","['children', 'creativity', 'culture', 'dance',...",47227110
1,265,977,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","['alternative energy', 'cars', 'climate change...",3200520
2,124,1286,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","['computers', 'entertainment', 'interface desi...",1636292
3,200,1116,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","['MacArthur grant', 'activism', 'business', 'c...",1697550
4,593,1190,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","['Africa', 'Asia', 'Google', 'demo', 'economic...",12005869


In [494]:
# ----------------------------------------------------------
# 2. one hot encoding
# ----------------------------------------------------------

# one hot encoding tags

# Each `tags` cell holds the string representation of a Python list
# (e.g. "['children', 'creativity', ...]"). Parse every cell into a real list
# and strip surrounding whitespace from each tag. The parsed lists are kept in
# a separate Series, so df_ted_main is not modified until the final
# assignment below.
tag_lists = df_ted_main['tags'].apply(
    lambda raw_tags: [tag.strip() for tag in ast.literal_eval(raw_tags)]
)

# Spread the lists over columns (one tag per cell) and stack them into one long
# Series indexed by (talk, position); padding cells of shorter lists are dropped
# by stack(). Passing the original index keeps the dummies aligned with
# df_ted_main even if its index is not a plain 0..n-1 range.
stacked_tags = pd.DataFrame(tag_lists.tolist(), index=tag_lists.index).stack()

# One indicator column per distinct tag, summed back to one row per talk.
# groupby(level=0).sum() replaces the removed Series/DataFrame.sum(level=0).
tag_dummies = pd.get_dummies(stacked_tags).astype(int).groupby(level=0).sum()

# Swap the raw `tags` column for the indicator columns.
# `axis=1` is passed by keyword: the positional form drop('tags', 1) is no
# longer accepted by current pandas.
df_ted_main = df_ted_main.drop('tags', axis=1).join(tag_dummies)

df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,ratings,views,3d printing,AI,AIDS,Addiction,...,wikipedia,wind energy,women,women in business,work,work-life balance,world cultures,writing,wunderkind,youth
0,4553,1164,60,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...",47227110,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,265,977,43,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...",3200520,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,124,1286,26,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...",1636292,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200,1116,35,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...",1697550,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,593,1190,48,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...",12005869,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [495]:
# # one hot encoding "event" column

# def one_hot_encode(original_dataframe, feature_to_encode):
#     dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
#     res = pd.concat([original_dataframe, dummies], axis=1)
#     res = res.drop([feature_to_encode], axis=1)
#     return(res)

# df_ted_main = one_hot_encode(df_ted_main, 'event')
# df_ted_main.head()

In [496]:
# rating analysis

def _rating_counts(raw_ratings):
    """Convert one raw `ratings` cell into a Series of counts keyed by rating name.

    Parameters
    ----------
    raw_ratings : str
        String representation of a list of dicts, each carrying at least the
        keys 'name' and 'count'
        (e.g. "[{'id': 7, 'name': 'Funny', 'count': 19645}, ...]").

    Returns
    -------
    pandas.Series
        The 'count' values indexed by the matching 'name' values, in the order
        they appear in the cell.
    """
    # literal_eval parses the Python-literal text directly and only once.
    # This avoids rewriting single quotes into double quotes to fake JSON
    # (which breaks on any apostrophe inside a value) and avoids parsing the
    # same cell twice with pd.read_json.
    parsed = ast.literal_eval(raw_ratings)
    return pd.Series(
        [rating['count'] for rating in parsed],
        index=[rating['name'] for rating in parsed],
    )


# One column per rating name, one row per talk (same index as df_ted_main).
rating_counts = df_ted_main['ratings'].apply(_rating_counts)

# Replace the raw `ratings` column with the expanded count columns.
df_ted_main = df_ted_main.drop('ratings', axis=1).merge(
    rating_counts, left_index=True, right_index=True
)

df_ted_main.head()

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Longwinded,Confusing,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,387,242,7346,10581,300,10704,4439,1174,209,24924
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,113,62,443,132,258,268,116,203,131,413
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,78,27,395,166,104,230,54,146,142,230
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,53,32,380,132,36,460,230,85,35,1070
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,110,72,5433,4606,67,2542,3736,248,61,2893


In [497]:
# sentiment analysis of transcript

def pol(x):
    """Return the TextBlob sentiment polarity of the text `x`."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob sentiment subjectivity of the text `x`."""
    return TextBlob(x).sentiment.subjectivity


# Analyse every transcript exactly once and read both scores from the same
# result, instead of building a TextBlob (and re-running the analysis) once for
# polarity and a second time for subjectivity.
transcript_sentiments = [
    TextBlob(text).sentiment for text in df_ted_transcripts['transcript']
]

df_ted_transcripts['polarity'] = [s.polarity for s in transcript_sentiments]
df_ted_transcripts['subjectivity'] = [s.subjectivity for s in transcript_sentiments]
df_ted_transcripts.head()

Unnamed: 0,transcript,url,polarity,subjectivity
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,0.146452,0.462051
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,0.157775,0.424101
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,0.136579,0.475229
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,0.082928,0.439165
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,0.096483,0.450631


In [498]:
# Combine the talk features with the transcript sentiment scores, pairing rows
# by index label. The inner join keeps only labels present in both frames.
# NOTE(review): the two CSVs are matched purely by row position here. If
# transcripts.csv is missing talks from the middle of ted_main.csv (and not only
# from its tail), scores end up attached to the wrong talks -- confirm, and
# consider joining on `url` instead (that column is dropped from df_ted_main
# in an earlier cell).
df_ted_talks = df_ted_main.merge(
    df_ted_transcripts,
    how="inner",
    left_index=True,
    right_index=True,
)

In [499]:
# The raw transcript text and the url are not used as features; only the
# derived polarity/subjectivity scores are kept.
df_ted_talks = df_ted_talks.drop(['transcript', 'url'], axis=1)
df_ted_talks.head()

Unnamed: 0,comments,duration,languages,published_date,views,3d printing,AI,AIDS,Addiction,Africa,...,Informative,Fascinating,Unconvincing,Persuasive,Jaw-dropping,OK,Obnoxious,Inspiring,polarity,subjectivity
0,4553,1164,60,1151367060,47227110,0,0,0,0,0,...,7346,10581,300,10704,4439,1174,209,24924,0.146452,0.462051
1,265,977,43,1151367060,3200520,0,0,0,0,0,...,443,132,258,268,116,203,131,413,0.157775,0.424101
2,124,1286,26,1151367060,1636292,0,0,0,0,0,...,395,166,104,230,54,146,142,230,0.136579,0.475229
3,200,1116,35,1151367060,1697550,0,0,0,0,0,...,380,132,36,460,230,85,35,1070,0.082928,0.439165
4,593,1190,48,1151440680,12005869,0,0,0,0,1,...,5433,4606,67,2542,3736,248,61,2893,0.096483,0.450631


In [500]:
# from sklearn.preprocessing import StandardScaler

# scaled_features = StandardScaler().fit_transform(result.values)

# result = pd.DataFrame(scaled_features, index=result.index, columns=result.columns)
# result

In [501]:
# # shuffle data set before splitting
# result = result.sample(frac=1).reset_index(drop=True)
# result

In [502]:
# from sklearn.model_selection import StratifiedShuffleSplit

# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(df_ted_talks, df_ted_talks["views"]):
#     df_train_set = df_ted_talks.loc[train_index]
#     df_test_set = df_ted_talks.loc[test_index]

# df_test_set["views"].value_counts() / len(df_test_set)

In [503]:
#We will shuffle the whole dataset first (df.sample(frac=1)) and then split our data set into the following parts:
# 60% - train set
# 20% - validation set
# 20% - test set

# df_train, df_val, df_test = np.split(df_ted_talks.sample(frac=1), [int(.6*len(df_ted_talks)), int(.8*len(df_ted_talks))])

In [504]:
# from sklearn.preprocessing import StandardScaler


# df_train_stand = StandardScaler().fit_transform(df_train.values)
# df_val_stand = StandardScaler().fit_transform(df_val.values)
# df_test_stand = StandardScaler().fit_transform(df_test.values)



# df_train = pd.DataFrame(df_train_stand, index=df_train.index, columns=df_train.columns)
# df_val = pd.DataFrame(df_val_stand, index=df_val.index, columns=df_val.columns)
# df_test = pd.DataFrame(df_test_stand, index=df_test.index, columns=df_test.columns)

In [505]:
# Hold out one seventh of the talks as the test set; the fixed random_state
# makes the split reproducible.
# Note: the feature frames returned here still contain the 'views' column,
# which is also what the label Series are built from.
train_set, test_set, train_labels, test_labels = train_test_split(
    df_ted_talks,
    df_ted_talks['views'],
    test_size=1/7.0,
    random_state=42,
)



In [506]:
# Standardization

from sklearn.preprocessing import StandardScaler

# Create the scaler before using it. The original cell called scaler.fit()
# without ever defining `scaler`, which raises NameError on a fresh kernel.
scaler = StandardScaler()

# Fit on training set only, so no statistics of the test set leak into the
# transformation.
# https://stats.stackexchange.com/questions/174823/how-to-apply-standardization-normalization-to-train-and-testset-if-prediction-i
scaler.fit(train_set)

# Apply the transform learned from the training set to both sets.
df_train_set_scaled = scaler.transform(train_set)
df_test_set_scaled = scaler.transform(test_set)

# transform() returns plain arrays; wrap them back into data frames that carry
# the original row labels and column names.
df_train_set = pd.DataFrame(df_train_set_scaled, index=train_set.index, columns=train_set.columns)
df_test_set = pd.DataFrame(df_test_set_scaled, index=test_set.index, columns=test_set.columns)




In [507]:
# PCA preprocessing - separate the tag columns from everything else

# Columns that are NOT fed into PCA; every other column is treated as a tag.
columns_without_pca = [
    'views', 'subjectivity', 'polarity',
    'Inspiring', 'Obnoxious', 'OK', 'Jaw-dropping', 'Persuasive',
    'Unconvincing', 'Fascinating', 'Informative', 'Confusing', 'Longwinded',
    'Courageous', 'Ingenious', 'Beautiful', 'Funny',
    'comments', 'duration', 'languages', 'published_date',
]

# Names of the tag columns in each set (Index.difference returns them sorted).
tag_columns_train_set_list = list(df_train_set.columns.difference(columns_without_pca))
tag_columns_test_set_list = list(df_test_set.columns.difference(columns_without_pca))

# The tag columns themselves, for both the training and the test set.
tag_columns_train_set = df_train_set[tag_columns_train_set_list]
tag_columns_test_set = df_test_set[tag_columns_test_set_list]

# Remember the row labels: PCA returns plain numpy arrays, so the index has to
# be re-attached afterwards.
tag_columns_train_set_indexes = tag_columns_train_set.index
tag_columns_test_set_indexes = tag_columns_test_set.index

# Remove the tag columns from the original sets; they come back later in
# PCA-transformed form.
df_train_set = df_train_set.drop(tag_columns_train_set_list, axis=1)
df_test_set = df_test_set.drop(tag_columns_test_set_list, axis=1)

In [508]:
# PCA: a float n_components asks for as many components as are needed to
# explain that share (here 80%) of the variance.
pca = PCA(n_components=.80)

# Learn the components from the training tag columns only.
pca.fit(tag_columns_train_set)

# Project both sets with the components learned above. From here on the two
# names hold numpy arrays instead of data frames.
tag_columns_train_set = pca.transform(tag_columns_train_set)
tag_columns_test_set = pca.transform(tag_columns_test_set)



In [509]:
# Re-attach the PCA-transformed tag features to the remaining (non-tag)
# columns of the training and test sets.

# One generated column name per principal component: feature0, feature1, ...
feat_cols_train = ['feature{}'.format(n) for n in range(tag_columns_train_set.shape[1])]
feat_cols_test = ['feature{}'.format(n) for n in range(tag_columns_test_set.shape[1])]

# Wrap the PCA arrays in data frames that carry the saved row labels.
df_pca_train = pd.DataFrame(
    tag_columns_train_set,
    columns=feat_cols_train,
    index=tag_columns_train_set_indexes,
)
df_pca_test = pd.DataFrame(
    tag_columns_test_set,
    columns=feat_cols_test,
    index=tag_columns_test_set_indexes,
)

# Join on the row labels to get the final feature tables.
df_train_set = df_train_set.join(df_pca_train, how="left")
df_test_set = df_test_set.join(df_pca_test, how="left")



In [510]:
# Print a structural summary (index, column count, dtypes, memory) of the
# training set first, then of the test set.
for split_frame in (df_train_set, df_test_set):
    split_frame.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 2114 entries, 2240 to 860
Columns: 247 entries, comments to feature225
dtypes: float64(247)
memory usage: 4.1 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 1711 to 1987
Columns: 247 entries, comments to feature225
dtypes: float64(247)
memory usage: 693.9 KB


In [511]:
# Preview only the first rows rather than dumping the whole 2114-row frame.
df_train_set.head(10)

Unnamed: 0,comments,duration,languages,published_date,views,Funny,Beautiful,Ingenious,Courageous,Longwinded,...,feature216,feature217,feature218,feature219,feature220,feature221,feature222,feature223,feature224,feature225
2240,-0.488876,-0.542353,-0.098925,1.397547,-0.322920,-0.248032,-0.300960,-0.192644,0.021828,-0.416349,...,-0.798559,1.603121,-0.379399,0.049373,0.609996,-0.387291,-0.633508,-1.144576,0.453698,-0.599351
2115,-0.553355,0.262518,0.341574,1.232969,0.277369,-0.203612,-0.021918,-0.136158,-0.194311,-0.206030,...,-1.648164,0.415784,-0.954074,-1.984280,2.369547,-0.686033,1.899612,-0.412546,1.287202,1.003407
1241,-0.325784,0.252031,-0.539424,-0.008371,-0.476259,-0.254377,-0.398202,-0.508297,-0.216143,0.004289,...,0.315670,0.270713,-0.606480,0.502964,-0.553205,-1.640681,-0.024057,0.713324,-0.073705,0.316816
25,-0.037528,0.807838,0.121325,-1.955519,-0.242211,-0.049731,-0.218516,0.372209,-0.172478,0.261345,...,0.611486,-0.860212,-0.508229,0.820453,0.895390,-0.323604,1.129766,0.120096,-0.700810,0.398685
497,0.793104,0.726564,-0.319174,-0.965832,-0.445751,-0.252791,-0.400316,0.083137,-0.185578,0.144502,...,0.286345,0.487553,0.923323,-0.102922,-0.195832,-0.197039,0.164758,0.407234,0.284895,0.162588
2000,-0.450948,-1.163703,1.442822,1.040215,0.168478,-0.243272,1.062543,-0.345486,-0.351502,-0.322874,...,0.987684,1.465059,-0.591414,-0.155809,1.240035,1.937287,-0.640568,0.154760,-2.029443,0.334812
643,-0.458534,1.159149,-0.209049,-0.754137,-0.383308,-0.249618,-0.216402,-0.415262,-0.325304,1.943896,...,-0.779010,-0.004684,0.305438,0.274685,-0.000725,0.244291,-0.013654,-0.101985,0.231043,-0.016825
1530,-0.325784,0.595477,-0.319174,0.358307,-0.151495,-0.252791,-0.343239,0.000071,-0.283823,-0.720142,...,0.130093,-0.463643,-1.414902,-1.001348,1.119291,-0.590662,0.555061,0.536819,-1.198337,0.074596
1902,-0.048907,-0.550218,0.561824,0.902740,0.359439,-0.232168,0.937820,-0.498329,2.213780,0.121133,...,-0.430703,3.076006,-1.135359,0.275992,-0.199384,0.535078,0.225553,0.948863,1.777803,-0.374686
1362,-0.348541,0.026562,-0.209049,0.133812,-0.399165,-0.084632,-0.410886,-0.375390,-0.336220,-0.416349,...,0.308187,0.483555,0.023321,0.676435,0.365249,0.812422,2.607250,0.558192,0.497855,0.665199


In [513]:
# Preview only the first rows rather than dumping the whole 353-row frame.
df_test_set.head(10)

Unnamed: 0,comments,duration,languages,published_date,views,Funny,Beautiful,Ingenious,Courageous,Longwinded,...,feature216,feature217,feature218,feature219,feature220,feature221,feature222,feature223,feature224,feature225
1711,-0.090628,-0.610518,0.451699,0.623953,0.548063,-0.221063,1.068885,-0.465102,1.740022,-0.416349,...,0.528620,2.601715,-0.236989,1.581045,0.032494,2.061871,-0.592463,1.522558,1.178092,-1.438776
1557,-0.120971,-0.492540,0.341574,0.395724,-0.255468,-0.251204,-0.286162,-0.092964,0.010912,-0.790249,...,0.058177,0.693813,-0.558967,0.300258,1.085488,-0.947930,-0.273464,-0.681352,-0.191981,0.785101
1870,-0.344748,0.427687,0.011200,0.846619,-0.103269,-0.240100,-0.100134,-0.455134,-0.024020,-0.579930,...,-0.209801,-0.521573,-0.196326,-0.556653,0.317521,-0.166903,-1.644995,3.035983,-0.043845,-0.952574
1703,0.080050,0.364765,0.231450,0.613680,-0.372856,-0.238513,-0.368607,-0.465102,0.078592,-0.416349,...,-0.520387,0.400676,0.275621,0.753712,0.914029,0.650657,0.475686,1.148817,-1.564900,-0.380353
1857,-0.454741,-0.644601,0.561824,0.828866,0.197733,-0.179816,-0.036715,-0.345486,-0.122264,-0.065817,...,-0.674349,0.518669,0.167597,0.214485,0.233701,-0.294256,0.138164,-0.052792,-0.211388,0.354174
756,1.418923,0.676751,-0.319174,-0.599790,-0.308984,-0.230581,-0.358037,-0.372068,-0.246708,0.191239,...,-0.217133,-0.391988,0.519012,0.208716,0.721857,0.073568,-0.784684,1.220663,-0.226123,0.910623
727,0.660354,0.952033,0.011200,-0.646530,-0.437084,-0.252791,-0.317872,-0.481715,0.358044,0.565139,...,0.472344,1.810249,0.348473,1.370671,1.813977,0.831911,1.040847,0.300528,0.586678,-0.798694
1050,-0.333370,-0.665575,0.341574,-0.199179,-0.309000,-0.241686,-0.368607,-0.056415,-0.310021,-0.673405,...,-0.792294,-1.102482,0.020955,-0.878991,-0.039329,1.316617,-0.631117,-0.664384,-0.126760,2.172429
498,-0.200620,0.603343,0.011200,-0.964897,-0.233637,-0.252791,0.047843,0.408758,-0.305655,0.331452,...,-1.380611,1.073855,0.867867,0.224523,0.388908,0.781867,-0.529762,1.992814,-0.156673,0.184747
237,-0.572319,-1.168947,-3.072293,-1.364313,-0.635877,-0.255964,0.286720,-0.501651,-0.325304,-0.626667,...,0.252472,-0.540996,0.280875,0.949929,-1.018626,-0.221639,-0.431315,-0.842262,-0.689501,-0.365359


In [514]:
# (rows, columns) of the final training set.
df_train_set.shape

(2114, 247)

In [515]:
# (rows, columns) of the final test set; the column count should match the training set.
df_test_set.shape

(353, 247)

In [517]:
# Sanity check: True if any cell of the training set is missing (e.g. a join left gaps).
df_train_set.isnull().values.any()

False

In [518]:
# Sanity check: True if any cell of the test set is missing (e.g. a join left gaps).
df_test_set.isnull().values.any()

False