In [142]:
import ast
import datetime
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [124]:
def is_show(tags):
    """Tell whether ``tags`` mentions any entertainment-related tag.

    The check uses ``in`` against ``tags``: that is element membership when
    ``tags`` is a list and a substring match when it is a string.

    Returns:
        bool: True as soon as one of the entertainment tags is found.
    """
    return any(
        keyword in tags
        for keyword in ('entertainment', 'guitar', 'music', 'performance')
    )

def tag_list_to_df(tags, tag_list):
    """Encode ``tags`` as a 0/1 indicator Series, one entry per item of ``tag_list``.

    Args:
        tags: Container tested with ``in`` (element membership for a list,
            substring match for a string).
        tag_list: Ordered iterable of candidate tags.

    Returns:
        pd.Series: 1 where the candidate is found in ``tags``, else 0, in the
        order of ``tag_list`` and with a default integer index.
    """
    flags = [1 if candidate in tags else 0 for candidate in tag_list]
    return pd.Series(flags)

def ted_events():
    """Return the mapping from event-name fragment to event category label.

    Keys are inserted in a fixed order ('TED20', 'TED19', then the named
    series), so code that iterates the dict sees the fragments in that order.
    """
    yearly_label = 'Yearly TED Conference'
    mapping = {'TED20': yearly_label, 'TED19': yearly_label}
    # The remaining event series map to themselves.
    for series in ('TEDx', 'TEDGlobal', 'TEDWomen', 'TEDSalon', 'TEDNYC'):
        mapping[series] = series
    return mapping

def transform_event(x):
    """Map a raw event name to its category label.

    The fragments from ``ted_events()`` are tried in the dict's iteration
    order; the label of the first fragment contained in ``x`` is returned.

    Returns:
        str: The matching category label, or 'Other' when no fragment matches.
    """
    matching_labels = (
        label for fragment, label in ted_events().items() if fragment in x
    )
    return next(matching_labels, 'Other')

def load_data(main_path='./ted-talks/ted_main.csv',
              transcripts_path='./ted-talks/transcripts.csv'):
    """Load the main talks table and attach the ``transcript`` column to it.

    Args:
        main_path: CSV file with one row per talk. Defaults to the location
            the notebook has always used.
        transcripts_path: CSV file that has a ``transcript`` column. Defaults
            to the location the notebook has always used.

    Returns:
        pd.DataFrame: Every row and column of the main table plus a
        ``transcript`` column; rows whose index has no counterpart in the
        transcripts file get NaN there.
    """
    transcripts = pd.read_csv(transcripts_path)
    talks = pd.read_csv(main_path)

    # NOTE(review): DataFrame.join aligns on the row index, i.e. the n-th
    # transcript is attached to the n-th talk. If the two files are not in the
    # same row order / of the same length, transcripts end up on the wrong
    # talks -- confirm, and merge on a shared key column instead if one exists.
    return talks.join(transcripts['transcript'])

def process_tags(data, min_occurrences=100):
    """Replace the ``tags`` column with 0/1 indicator columns for frequent tags.

    Each ``tags`` cell is the string form of a Python list (e.g.
    ``"['art', 'music']"``). The cell is parsed into a real list *before*
    membership is tested, so a tag only counts when it is an actual element of
    the list. Testing against the raw string would be a substring match, and
    'art' would wrongly fire for a talk tagged 'smart cities'.

    Args:
        data: DataFrame with a ``tags`` column. It is not modified.
        min_occurrences: A tag gets its own column only when it occurs
            strictly more than this many times over the whole frame.

    Returns:
        pd.DataFrame: ``data`` without ``tags``, joined with one int column
        per frequent tag (most frequent first).
    """
    # literal_eval only accepts Python literals, so nothing in the cell can be
    # executed. Cells that already hold a sequence are taken as they are.
    parsed_tags = [
        ast.literal_eval(raw) if isinstance(raw, str) else list(raw)
        for raw in data['tags']
    ]

    tag_occurrences = Counter(tag for tags in parsed_tags for tag in tags)
    tag_list = [
        tag for tag, count in tag_occurrences.most_common()
        if count > min_occurrences
    ]

    # Sets make the per-row membership test O(1) per candidate tag.
    tag_sets = [set(tags) for tags in parsed_tags]
    indicators = pd.DataFrame(
        {tag: [1 if tag in tags else 0 for tags in tag_sets] for tag in tag_list},
        index=data.index,  # keep rows aligned with ``data`` for the join
    )

    return data.drop('tags', axis=1).join(indicators)
def process_dates(data, reference_date=None):
    """Derive calendar features from the ``published_date`` epoch-seconds column.

    Adds, in this order:
      * ``published_date_read`` - ``published_date`` parsed as a datetime
      * ``pub_month``           - month number of publication (1-12)
      * ``pub_weekday``         - weekday of publication (Monday: 0, Sunday: 6)
      * ``days_published``      - whole days between publication and
                                  ``reference_date``

    Args:
        data: DataFrame with a ``published_date`` column holding seconds since
            the Unix epoch. The frame is not modified; a copy is returned.
        reference_date: ``datetime`` from which ``days_published`` is counted.
            Defaults to 2017-09-23, the date this notebook has always used.

    Returns:
        pd.DataFrame: Copy of ``data`` with the four columns above added.
    """
    if reference_date is None:
        reference_date = datetime.datetime(2017, 9, 23)

    # Work on a copy so re-running a cell never sees a half-processed frame.
    result = data.copy()
    published = pd.to_datetime(result['published_date'], unit='s')

    result['published_date_read'] = published
    result['pub_month'] = published.dt.month
    result['pub_weekday'] = published.dt.weekday  # Monday: 0, Sunday: 6
    result['days_published'] = (reference_date - published).dt.days

    return result
def process_text(data):
    """Replace the free-text columns with bag-of-words count columns.

    A ``CountVectorizer`` is fitted on ``data['title-description-transcript']``.
    Only terms whose document frequency lies between 2% and 2.5% of the rows
    are kept (``min_df`` / ``max_df``), minus a hand-picked stop-word list.

    Any column of ``data`` whose name equals a kept term is dropped, so that
    name refers to the word count in the result. ``title``, ``description``
    and ``transcript`` are dropped as well.

    Args:
        data: DataFrame with ``title``, ``description``, ``transcript`` and
            ``title-description-transcript`` columns.

    Returns:
        pd.DataFrame: ``data`` minus the dropped columns, joined with one
        count column per kept term.
    """
    extra_stop_words = ['ted', 'talk', 'talks', 'don', 'tedx', 'll', 've', 'is', 'it', 'its','1960s',
                        '1990', '1998', '1999', '2002', '2016', '2050', '20s', '29', '37', '48', '50s',
                        '55', '900']

    vec = CountVectorizer(min_df=0.02, max_df=0.025, stop_words=extra_stop_words)
    word_counts = vec.fit_transform(data['title-description-transcript'])

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support whichever is available.
    if hasattr(vec, 'get_feature_names_out'):
        vocabulary = list(vec.get_feature_names_out())
    else:
        vocabulary = vec.get_feature_names()

    # Reuse data's index so the join below lines rows up even when ``data``
    # does not carry a default 0..n-1 index. toarray() yields a plain ndarray
    # rather than the np.matrix returned by todense().
    title_df = pd.DataFrame(word_counts.toarray(), columns=vocabulary, index=data.index)

    # Existing columns that clash with a term name must go, otherwise the join
    # fails on overlapping column names.
    repeated_tags = [column for column in data.columns if column in title_df.columns]

    return data.drop(repeated_tags + ['title', 'description', 'transcript'], axis=1).join(title_df)
def drop_columns_and_split(data):
    """Drop unused columns, filter rows, and split features from the target.

    Rows are kept only when ``duration`` is strictly between 5 and 25 and the
    year of ``film_date`` (epoch seconds) is 1984 or later. ``film_date`` is
    dropped after filtering.

    Args:
        data: Fully processed DataFrame containing every column listed below
            plus ``duration``, ``film_date`` and ``views``.

    Returns:
        tuple: ``(X, y)`` where ``y`` is the ``views`` column of the kept rows
        and ``X`` is everything else.
    """
    unused_columns = [
        'comments', 'main_speaker', 'name', 'published_date', 'published_date_read',
        'ratings', 'related_talks', 'url', 'speaker_occupation', 'event', 'TEDx',
        'title-description-transcript',
    ]
    features = data.drop(columns=unused_columns)

    # Duration window, both bounds exclusive.
    duration = features['duration']
    features = features.loc[duration.gt(5) & duration.lt(25)]

    # Filming-year cut-off, evaluated on the rows that survived the step above.
    film_year = pd.to_datetime(features['film_date'], unit='s').dt.year
    features = features.loc[film_year >= 1984].drop(columns='film_date')

    target = features['views']
    return features.drop(columns='views'), target

In [174]:
def generate_X_y():
    """Run the whole feature pipeline and return the model inputs.

    Steps: load the CSVs, expand tags into indicator columns, derive date
    features, bucket and one-hot encode ``event``, rescale ``duration``, build
    a combined lower-cased text field, count '(laughter)' / '(applause)'
    tokens in it, vectorise the text, then drop unused columns and split off
    the target.

    Returns:
        tuple: ``(X, y)`` as produced by ``drop_columns_and_split``.
    """
    ted_main = load_data()

    temp_X = process_tags(ted_main)
    temp_X = process_dates(temp_X)

    temp_X['event'] = temp_X['event'].apply(transform_event)
    temp_X = temp_X.join(pd.get_dummies(temp_X['event'], drop_first=True, prefix='_'))

    # NOTE(review): presumably converts seconds to minutes -- confirm the unit
    # of the raw column. Downstream filtering compares this value to 5 and 25.
    temp_X['duration'] = temp_X['duration'] / 60

    # na_rep='' is essential: without it str.cat yields NaN for any row that
    # lacks a title, description or transcript, and the whole text of that
    # talk would degrade to the string 'nan'.
    temp_X['title-description-transcript'] = temp_X['title'].str.cat(
        temp_X['description'].str.cat(temp_X['transcript'], sep=' ', na_rep=''),
        sep=' ', na_rep=''
    ).str.lower()

    # Count whitespace-delimited tokens that are exactly the audience marker.
    temp_X['laughter'] = temp_X['title-description-transcript'].apply(
        lambda text: text.split(' ').count('(laughter)'))
    temp_X['applause'] = temp_X['title-description-transcript'].apply(
        lambda text: text.split(' ').count('(applause)'))

    temp_X = process_text(temp_X)

    X, y = drop_columns_and_split(temp_X)

    return X, y

In [175]:
# Run the full pipeline once: X holds the features, y the `views` target.
X, y = generate_X_y()

In [179]:
# Persist the features and the target in the working directory for later cells/notebooks.
X.to_pickle('X.pkl')
y.to_pickle('y.pkl')