In [204]:
import pandas as pd
import numpy as np
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [138]:
def is_show(tags):
    entertainment_tags = ['entertainment', 'guitar', 'music', 'performance']
    
    for x in entertainment_tags:
        if x in tags: return True
    return False

def tag_list_to_df(tags, tag_list):
    row = []
    for tag in tag_list:
        row.append(1 if tag in tags else 0)
    return pd.Series(row)

def ted_events():
    return {
            'TED20': 'Yearly TED Conference',
            'TED19': 'Yearly TED Conference',
            'TEDx': 'TEDx',
            'TEDGlobal': 'TEDGlobal',
            'TEDWomen': 'TEDWomen',
            'TEDSalon': 'TEDSalon',
            'TEDNYC': 'TEDNYC'
    }

def transform_event(x):
    for event, transformation in ted_events().items():
        if x.count(event) > 0:
            return transformation
    return 'Other'

In [257]:
def generate_X_y():
    ted_main = pd.read_csv('./ted-talks/ted_main.csv')
    transcripts = pd.read_csv('./ted-talks/transcripts.csv')

    tag_ocurrences = ted_main['tags'].apply(lambda x: pd.eval(x, engine='python')).apply(pd.Series).stack().value_counts()
    tag_list = tag_ocurrences[tag_ocurrences > 150].index.values
    tags = pd.DataFrame(
        ted_main['tags'].apply(lambda x: tag_list_to_df(x, tag_list)))
    tags.columns = tag_list
    temp_X = ted_main.drop('tags', axis=1).join(tags)

    temp_X['published_date_read'] = pd.to_datetime(temp_X['published_date'],unit='s')
    temp_X['pub_month'] = temp_X['published_date_read'].apply(lambda x: x.month)
    temp_X['pub_weekday'] = temp_X['published_date_read'].apply(lambda x: x.weekday())  # Monday: 0, Sunday: 6
    
    initial_data = datetime.datetime(2017, 9, 23)
    temp_X['published_date_read'] = pd.to_datetime(temp_X['published_date_read'])
    temp_X['days_published'] = (initial_data - temp_X['published_date_read']).dt.days
    
    temp_X['event'] = temp_X['event'].apply(lambda x: transform_event(x))
    
    temp_X['duration'] = temp_X['duration'] / 60
    
    temp_X['title-description'] = temp_X['title'].str.cat(temp_X['description'], sep=' ').apply(lambda x: x.lower())

    speakers = list(set(temp_X['main_speaker'].values))
    extra_stop_words = ['ted', 'talk', 'talks', 'don', 'tedx', 'll', 've']

    vec = TfidfVectorizer(min_df=0.02 ,max_df=0.04, stop_words=extra_stop_words)
    title_words = vec.fit_transform(temp_X['title-description'])
    title_df = pd.DataFrame(data= title_words.todense(), columns = vec.get_feature_names())
    repeated_tags = [tag for tag in temp_X.columns if tag in title_df.columns]
    temp_X = temp_X.drop(repeated_tags + ['title', 'description'], axis=1).join(title_df)
        
    temp_X = temp_X.join(pd.get_dummies(temp_X['event'], drop_first=True, prefix='_'))
    
    dropable_columns=['views', 'comments', 'film_date', 'main_speaker', 'name', 'published_date',
                     'ratings', 'related_talks', 'url', 'speaker_occupation', 'event', 'TEDx']
    X = temp_X.drop(dropable_columns, axis=1)
    y = temp_X['views']
    
    return X, y

In [258]:
X, y = generate_X_y()

In [260]:
pd.to_pickle(X, 'X.pkl')
pd.to_pickle(y, 'y.pkl')

Unnamed: 0,duration,languages,num_speaker,technology,science,global issues,design,business,entertainment,innovation,...,would,year,yet,young,__TEDGlobal,__TEDNYC,__TEDSalon,__TEDWomen,__TEDx,__Yearly TED Conference
0,19.4,60,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
1,16.283333,43,1,1,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
2,21.433333,26,1,1,0,0,1,0,1,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
3,18.6,35,1,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
4,19.833333,48,1,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
