In [262]:
import pandas as pd
import numpy as np
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [333]:
def is_show(tags):
    entertainment_tags = ['entertainment', 'guitar', 'music', 'performance']
    
    for x in entertainment_tags:
        if x in tags: return True
    return False

def tag_list_to_df(tags, tag_list):
    row = []
    for tag in tag_list:
        row.append(1 if tag in tags else 0)
    return pd.Series(row)

def ted_events():
    return {
            'TED20': 'Yearly TED Conference',
            'TED19': 'Yearly TED Conference',
            'TEDx': 'TEDx',
            'TEDGlobal': 'TEDGlobal',
            'TEDWomen': 'TEDWomen',
            'TEDSalon': 'TEDSalon',
            'TEDNYC': 'TEDNYC'
    }

def transform_event(x):
    for event, transformation in ted_events().items():
        if x.count(event) > 0:
            return transformation
    return 'Other'

def load_data():
    return pd.read_csv('./ted-talks/ted_main.csv')

def process_tags(data):
    tag_ocurrences = data['tags'].apply(lambda x: pd.eval(x, engine='python')).apply(pd.Series).stack().value_counts()
    tag_list = tag_ocurrences[tag_ocurrences > 150].index.values
    tags = pd.DataFrame(
        ted_main['tags'].apply(lambda x: tag_list_to_df(x, tag_list)))
    tags.columns = tag_list
    
    return ted_main.drop('tags', axis=1).join(tags)
def process_dates(data):
    data['published_date_read'] = pd.to_datetime(data['published_date'],unit='s')
    data['pub_month'] = data['published_date_read'].apply(lambda x: x.month)
    data['pub_weekday'] = data['published_date_read'].apply(lambda x: x.weekday())  # Monday: 0, Sunday: 6
    
    initial_data = datetime.datetime(2017, 9, 23)
    data['published_date_read'] = pd.to_datetime(data['published_date_read'])
    data['days_published'] = (initial_data - data['published_date_read']).dt.days
    
    return data
def process_text(data):
    speakers = list(set(data['main_speaker'].values))
    extra_stop_words = ['ted', 'talk', 'talks', 'don', 'tedx', 'll', 've']

    vec = TfidfVectorizer(min_df=0.02 ,max_df=0.04, stop_words=extra_stop_words)
    title_words = vec.fit_transform(data['title-description'])
    title_df = pd.DataFrame(data= title_words.todense(), columns = vec.get_feature_names())
    repeated_tags = [tag for tag in data.columns if tag in title_df.columns]
    
    return  data.drop(repeated_tags + ['title', 'description'], axis=1).join(title_df)
def drop_columns_and_split(data):
    dropable_columns=['views', 'comments', 'film_date', 'main_speaker', 'name', 'published_date',
                     'ratings', 'related_talks', 'url', 'speaker_occupation', 'event', 'TEDx']

    X = data.drop(dropable_columns, axis=1)
    X = X[(X['duration'] < 25) & (X['duration'] > 5)]
    X = X[pd.to_datetime(X['film_date'], unit='s').dt.year >= 1984]
    
    y = data['views']
    
    return X, y

In [334]:
def generate_X_y():
    ted_main = load_data()
    
    temp_X = process_tags(ted_main)
    temp_X = process_dates(temp_X)
    
    temp_X['event'] = temp_X['event'].apply(lambda x: transform_event(x))
    temp_X = temp_X.join(pd.get_dummies(temp_X['event'], drop_first=True, prefix='_'))
    
    temp_X['duration'] = temp_X['duration'] / 60
    temp_X['title-description'] = temp_X['title'].str.cat(temp_X['description'], sep=' ').apply(lambda x: x.lower())

    
    temp_X = process_text(temp_X)
    
    
    X, y = drop_columns_and_split(temp_X)
    
    return X, y

In [335]:
X, y = generate_X_y()

KeyError: 'film_date'

In [323]:
X.head()

Unnamed: 0,duration,languages,num_speaker,technology,science,global issues,design,business,entertainment,innovation,...,were,while,without,women,working,works,would,year,yet,young
0,19.4,60,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16.283333,43,1,1,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,21.433333,26,1,1,0,0,1,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.6,35,1,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19.833333,48,1,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [268]:
pd.to_pickle(X, 'X.pkl')
pd.to_pickle(y, 'y.pkl')