In [1]:
import pandas as pd
import numpy as np
import joblib
import json
import string
import gensim
import urllib.request as ur
import nltk, re
import pickle
import cv2
import gensim.downloader as api

from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from yolov4.tf import YOLOv4
from tqdm import tqdm
from copy import deepcopy

from gensim.models import Word2Vec

from pathlib import Path
from utils import read_config

import warnings
warnings.filterwarnings("ignore")

# Directory holding the pickled intermediate results of this checkpoint;
# created (with parents) if it does not exist yet.
cache_dir = Path('data/cache/chkp5')
cache_dir.mkdir(exist_ok=True, parents=True)

# Project configuration. 'dataset_path' is wrapped in Path so it can be joined
# with the `/` operator when building image paths further down.
config = read_config("config.json")
config['dataset_path'] = Path(config['dataset_path'])

In [2]:
# Word vectors obtained through gensim's downloader API; used further down to
# embed the `objects` and `tags` columns.
# NOTE(review): the model name suggests 300-dimensional Google News vectors and
# loading is presumably slow/large — confirm before relying on either.
wv = api.load('word2vec-google-news-300')

In [3]:
def load_pickle(path):
    """Read a single pickled object from ``path`` and return it.

    The path is echoed to stdout first, so cache hits show up in the cell output.
    NOTE: unpickling can execute arbitrary code; only use this on files that
    were written by this project.
    """
    print("Loading from:", path)
    with open(path, mode='rb') as stream:
        payload = pickle.load(stream)
    return payload

def save_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path``, replacing any existing content.

    The destination is echoed to stdout before writing. Returns None.
    """
    print("Saving to:", path)
    with open(path, mode='wb') as stream:
        pickle.dump(obj, stream)
    return None

Removed cols - no data:
 'trending_date_day',
 'trending_date_month',
 'trending_date_year',
 'trending_date_weekday',
 'trending_date_quarter',
 'trending_span_days',
 'publish_to_trending_span_in_hours',
 'likes_first',
 'likes_last',
 'likes_diff',
 'dislikes_first',
 'dislikes_last',
 'dislikes_diff',
 'views_first',
 'views_last',
 'views_diff',
 'comment_count_first',
 'comment_count_last',
 'comment_count_diff',
 'votes_last',
 'likes_ratio_last',
 'dislike_ratio_last',
 'votes_first',
 'likes_ratio_first',
 'dislike_ratio_first',
 'is_us',
 'is_gb',
 'comments_disabled',
 'ratings_disabled',
 'video_error_or_removed',

Not explainable:
 'description_embedding_0',
 'description_embedding_1',
 'description_embedding_2',
 'description_embedding_3',
 'description_embedding_4',
 'description_embedding_5',
 'description_embedding_6',
 'description_embedding_7',
 'description_embedding_8',
 'description_embedding_9',
 'description_embedding_10',
 'description_embedding_11',
 'description_embedding_12',
 'description_embedding_13',
 'description_embedding_14',
 'title_embedding_0',
 'title_embedding_1',
 'title_embedding_2',
 'title_embedding_3',
 'title_embedding_4',
 'title_embedding_5',
 'title_embedding_6',
 'title_embedding_7',
 'title_embedding_8',
 'title_embedding_9',
 'title_embedding_10',
 'title_embedding_11',
 'title_embedding_12',
 'title_embedding_13',
 'title_embedding_14',
 'feature_vector_0',
 'feature_vector_1',
 'feature_vector_2',
 'feature_vector_3',
 'feature_vector_4',
 'feature_vector_5',
 'feature_vector_6',
 'feature_vector_7',
 'feature_vector_8',
 'feature_vector_9',
 'feature_vector_10',
 'feature_vector_11',
 'feature_vector_12',
 'feature_vector_13',
 'feature_vector_14',
 'hist_reds_0',
 'hist_reds_1',
 'hist_reds_2',
 'hist_reds_3',
 'hist_reds_4',
 'hist_reds_5',
 'hist_reds_6',
 'hist_reds_7',
 'hist_reds_8',
 'hist_reds_9',
 'hist_reds_10',
 'hist_reds_11',
 'hist_reds_12',
 'hist_reds_13',
 'hist_reds_14',
 'hist_greens_0',
 'hist_greens_1',
 'hist_greens_2',
 'hist_greens_3',
 'hist_greens_4',
 'hist_greens_5',
 'hist_greens_6',
 'hist_greens_7',
 'hist_greens_8',
 'hist_greens_9',
 'hist_greens_10',
 'hist_greens_11',
 'hist_greens_12',
 'hist_greens_13',
 'hist_greens_14',
 'hist_blues_0',
 'hist_blues_1',
 'hist_blues_2',
 'hist_blues_3',
 'hist_blues_4',
 'hist_blues_5',
 'hist_blues_6',
 'hist_blues_7',
 'hist_blues_8',
 'hist_blues_9',
 'hist_blues_10',
 'hist_blues_11',
 'hist_blues_12',
 'hist_blues_13',
 'hist_blues_14',
 'fisher_vector_0',
 'fisher_vector_1',
 'fisher_vector_2',
 'fisher_vector_3',
 'fisher_vector_4',
 'fisher_vector_5',
 'fisher_vector_6',
 'fisher_vector_7',
 'fisher_vector_8',
 'fisher_vector_9',
 'fisher_vector_10',
 'fisher_vector_11',
 'fisher_vector_12',
 'fisher_vector_13',
 'fisher_vector_14',
 'image_text_0',
 'image_text_1',
 'image_text_2',
 'image_text_3',
 'image_text_4',
 'image_text_5',
 'image_text_6',
 'image_text_7',
 'image_text_8',
 'image_text_9',
 'image_text_10',
 'image_text_11',
 'image_text_12',
 'image_text_13',
 'image_text_14',
 'objects_0',
 'objects_1',
 'objects_2',
 'objects_3',
 'objects_4',
 'objects_5',
 'objects_6',
 'objects_7',
 'objects_8',
 'objects_9',
 'objects_10',
 'objects_11',
 'objects_12',
 'objects_13',
 'objects_14',
 'euler_number',
 'flood_0',
 'flood_1',
 'flood_2',
 'flood_3',
 'flood_4',
 'flood_5',
 'flood_6',
 'flood_7',
 'shannon_entropy',
'tags_0',
 'tags_1',
 'tags_2',
 'tags_3',
 'tags_4',
 'tags_5',
 'tags_6',
 'tags_7',
 'tags_8',
 'tags_9',
 'tags_10',
 'tags_11',
 'tags_12',
 'tags_13',
 'tags_14',

Explainable:

'views',
'likes',
'dislikes',
'comment_count',
'publish_time_day',
'publish_time_month',
'publish_time_year',
'publish_time_weekday',
'publish_time_quarter',
'publish_time_hour',
'publish_time_minute',
'publish_time_second',
'title_punctuation_count',
'description_punctuation_count',
'channel_title_punctuation_count',
'title_attention_count',
'description_attention_count',
'channel_title_attention_count',
'title_len',
'description_len',
'channel_title_len',
'title_letter_count',
'description_letter_count',
'channel_title_letter_count',
'title_information_ratio',
'description_information_ratio',
'channel_title_information_ratio',
'title_attention_ratio',
'description_attention_ratio',
'channel_title_attention_ratio',
'title_word_count',
'description_word_count',
'channel_title_word_count',
'description_url_count',
'description_url_ratio',
'shouting_ratio_title',
'shouting_ratio_description',
'shouting_ratio_channel_title',
'channel_title_hash',
'people_amount',
'max_prob_object',
'most_occurence_object',
'face_count',
'count_angry',
'count_sad',
'count_neutral',
'count_surprise',
'count_fear',
'count_happy',
'mean_red',
'median_red',
'iqr_red',
'mean_green',
'median_green',
'iqr_green',
'mean_blue',
'median_blue',
'iqr_blue',
'category_id',


In [4]:
# df_pre = pd.read_pickle("data/chkp3.pkl")
# list(df_pre.columns)

In [5]:
# Load the raw video metadata for this checkpoint and preview the first rows.
df = pd.read_json('data/chkp5/data.json')
df.head(3)

Unnamed: 0,video_id,publishedAt,channelId,title,description,thumbnails,channelTitle,tags,categoryId,liveBroadcastContent,localized,defaultAudioLanguage,viewCount,likeCount,dislikeCount,favoriteCount,commentCount,defaultLanguage
0,Y8Ceu9YBR1I,2019-01-02T14:00:03Z,UCIiBf-JbtCazHSFqXV4JgoA,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,Grace Helbig,"[grace, helbig, grace helbig, gracehelbig, gra...",22,none,{'title': 'GRACE N MICHELLE REUNITE AFTER 6 YE...,en,134287,9284,27,0,910,
1,St6aYO0Gz5U,2019-02-09T05:35:24Z,UCIRYBXDze5krPDzAEOxFGVA,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,{'default': {'url': 'https://i.ytimg.com/vi/St...,Guardian News,"[matthew whitaker, whitaker, whitaker testimon...",25,none,{'title': 'Matthew Whitaker testifies before H...,en-GB,31382,149,30,0,18,
2,k8JuFit-j38,2018-09-06T16:00:13Z,UC1A_Hq-N1dHhAvwg0QWC7Sw,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",{'default': {'url': 'https://i.ytimg.com/vi/k8...,Levi Niha,"[Beat Making, Making A Beat, Music, Music Maki...",10,none,{'title': 'Making Music But Everything Is A Ch...,,230180,6358,122,0,469,


In [6]:
# Raw column names as delivered in the JSON file (before selecting/renaming).
df.columns

Index(['video_id', 'publishedAt', 'channelId', 'title', 'description',
       'thumbnails', 'channelTitle', 'tags', 'categoryId',
       'liveBroadcastContent', 'localized', 'defaultAudioLanguage',
       'viewCount', 'likeCount', 'dislikeCount', 'favoriteCount',
       'commentCount', 'defaultLanguage'],
      dtype='object')

# Explainable

'views',
'likes',
'dislikes',
'comment_count',
'publish_time_day',
'publish_time_month',
'publish_time_year',
'publish_time_weekday',
'publish_time_quarter',
'publish_time_hour',
'publish_time_minute',
'publish_time_second',
'title_punctuation_count',
'description_punctuation_count',
'channel_title_punctuation_count',
'title_attention_count',
'description_attention_count',
'channel_title_attention_count',
'title_len',
'description_len',
'channel_title_len',
'title_letter_count',
'description_letter_count',
'channel_title_letter_count',
'title_information_ratio',
'description_information_ratio',
'channel_title_information_ratio',
'title_attention_ratio',
'description_attention_ratio',
'channel_title_attention_ratio',
'title_word_count',
'description_word_count',
'channel_title_word_count',
'description_url_count',
'description_url_ratio',
'shouting_ratio_title',
'shouting_ratio_description',
'shouting_ratio_channel_title',
'channel_title_hash',
'people_amount',
'max_prob_object',
'most_occurence_object',
'face_count',
'count_angry',
'count_sad',
'count_neutral',
'count_surprise',
'count_fear',
'count_happy',
'mean_red',
'median_red',
'iqr_red',
'mean_green',
'median_green',
'iqr_green',
'mean_blue',
'median_blue',
'iqr_blue',
'category_id',


In [7]:
# Keep only the raw fields needed downstream and rename them to snake_case.
# A single old -> new mapping keeps every pair together, so the selection and
# the renaming cannot drift out of step (the earlier version relied on two
# parallel lists being in the same order). `rename` also returns a fresh frame
# instead of overwriting `.columns` on a slice of the original.
column_names = {
    'video_id': 'video_id',
    'publishedAt': 'publish_time',
    'title': 'title',
    'description': 'description',
    'channelTitle': 'channel_title',
    'categoryId': 'category_id',
    'tags': 'tags',
    'viewCount': 'views',
    'likeCount': 'likes',
    'dislikeCount': 'dislikes',
    'commentCount': 'comment_count',
    'thumbnails': 'thumbnails',
}
# dict preserves insertion order, so the resulting column order is unchanged.
df = df[list(column_names)].rename(columns=column_names)
df.head(3)

Unnamed: 0,video_id,publish_time,title,description,channel_title,category_id,tags,views,likes,dislikes,comment_count,thumbnails
0,Y8Ceu9YBR1I,2019-01-02T14:00:03Z,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,"[grace, helbig, grace helbig, gracehelbig, gra...",134287,9284,27,910,{'default': {'url': 'https://i.ytimg.com/vi/Y8...
1,St6aYO0Gz5U,2019-02-09T05:35:24Z,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,"[matthew whitaker, whitaker, whitaker testimon...",31382,149,30,18,{'default': {'url': 'https://i.ytimg.com/vi/St...
2,k8JuFit-j38,2018-09-06T16:00:13Z,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,"[Beat Making, Making A Beat, Music, Music Maki...",230180,6358,122,469,{'default': {'url': 'https://i.ytimg.com/vi/k8...


#### Handling publish date

In [8]:
# def handle_dates(df, cols):
#     for col in cols:
#         df[col+'_day'] = df[col].dt.day
#         df[col+'_month'] = df[col].dt.month
#         df[col+'_year'] = df[col].dt.year
#         df[col+'_weekday'] = df[col].dt.dayofweek
#         df[col+'_quarter'] = df[col].dt.quarter
#         if col == 'publish_time':
#             df[col+'_hour'] = df[col].dt.hour
#             df[col+'_minute'] = df[col].dt.minute
#             df[col+'_second'] = df[col].dt.second
#     return df

# df['publish_time'] = pd.to_datetime(df['publish_time'])
# df = handle_dates(df, ['publish_time'])
# df.drop('publish_time', axis=1, inplace=True)
# df.head(3)

#### Handling text features

In [9]:
# text_features = ['title', 'description', 'channel_title']
# attention_symbols = '!?$#'

# def punctutation_count(text):
#     return len([c for c in str(text) if c in string.punctuation])

# def attention_count(text):
#     return len([c for c in str(text) if c in attention_symbols])

# def text_len(text):
#     return len(str(text))

# def letter_count(text):
#     return len(list(filter(str.isalpha, str(text))))

# def information_ratio(df):
#     for name in text_features:
#         df[name+'_information_ratio'] = (df[name+'_letter_count'] / df[name+'_len']).fillna(0)
#     return df

# def attention_ratio(df):
#     for name in text_features:
#         df[name+'_attention_ratio'] = (df[name+'_attention_count'] / df[name+'_letter_count']).fillna(0)
#     return df

# def word_count(df):
#     for name in text_features:
#         df[name+'_word_count'] = df[name].str.count(r"[\w\-_#\/\\\+\:$?]+")
#     return df

# def url_count(df):
#     df['description_url_count'] = df.description.str.count(r"(https?:\/\/)?(\w+\.\w+)[\?=\&\w_\-.\/.]*")
#     return df

# def url_ratio(df):
#     df['description_url_ratio'] = (df['description_url_count'] / df['description_word_count']).fillna(0)
#     return df

# def shouting_ratio(text):
#   """Returns ratio of upper letters to all letters. Ignores non-letters in summary"""
#   if not isinstance(text, str):
#     return np.nan

#   letters = ''.join(filter(str.isalpha, text))
#   if letters:
#     uppers = ''.join(filter(str.isupper, letters))
#     return len(uppers) / len(letters)
#   else:
#     return 0

# df[[n+'_punctuation_count' for n in text_features]] = df[text_features].applymap(punctutation_count)
# df[[n+'_attention_count' for n in text_features]] = df[text_features].applymap(attention_count)
# df[[n+'_len' for n in text_features]] = df[text_features].applymap(text_len)
# df[[n+'_letter_count' for n in text_features]] = df[text_features].applymap(letter_count)
# df = information_ratio(df)
# df = attention_ratio(df)
# df = word_count(df)
# df = url_count(df)
# df = url_ratio(df)
# df[[f'shouting_ratio_{f}' for f in text_features]] = df[text_features].applymap(shouting_ratio)

# df.head(3)

In [10]:
# def _hash(object, trim):
#     obj = hash(object)
#     return np.float32(str(obj)[:trim])

# for col, trim in zip(['channel_title'], [12]):
#     df[col+'_hash'] = df[col].apply(_hash, trim=trim).astype(np.float32)

# df.head(3)

#### Handling thumbnails


In [11]:
def get_tbnl_url(x):
    """Return the URL stored under the 'high' entry of the thumbnails mapping ``x``.

    Raises KeyError if ``x`` has no 'high' entry or that entry has no 'url'.
    """
    high_entry = x['high']
    return high_entry['url']

# Store the 'high' thumbnail URL of every video in its own column.
df['thumbnail_link'] = df['thumbnails'].apply(get_tbnl_url)
df.head(3)

Unnamed: 0,video_id,publish_time,title,description,channel_title,category_id,tags,views,likes,dislikes,comment_count,thumbnails,thumbnail_link
0,Y8Ceu9YBR1I,2019-01-02T14:00:03Z,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,"[grace, helbig, grace helbig, gracehelbig, gra...",134287,9284,27,910,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg
1,St6aYO0Gz5U,2019-02-09T05:35:24Z,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,"[matthew whitaker, whitaker, whitaker testimon...",31382,149,30,18,{'default': {'url': 'https://i.ytimg.com/vi/St...,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg
2,k8JuFit-j38,2018-09-06T16:00:13Z,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,"[Beat Making, Making A Beat, Music, Music Maki...",230180,6358,122,469,{'default': {'url': 'https://i.ytimg.com/vi/k8...,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg


In [12]:
# Download every thumbnail to <dataset_path>/images/<video_id>.jpg and count
# the downloads that failed.
images_dir = config['dataset_path'] / 'images'
# urlretrieve does not create missing directories; without this every download
# would fail and merely be counted as "unknown" below.
images_dir.mkdir(parents=True, exist_ok=True)

count_unknown = 0
# Only two columns are needed, so iterate over them directly instead of
# materialising a Series per row with iterrows().
for video_id, link in zip(df['video_id'], df['thumbnail_link']):
    path = images_dir / '{}.jpg'.format(video_id)
    try:
        ur.urlretrieve(link, path)
    except Exception:
        # Best effort: a failed download is counted, not fatal. `Exception`
        # (rather than a bare except) lets KeyboardInterrupt/SystemExit through
        # so a long download run can still be interrupted.
        count_unknown += 1
count_unknown

0

In [13]:
# TODO
# - YOLO
# - colors
# - faces with emotions

In [14]:
def mod_img(df, func, colname='', *args):
    """Compute per-row values with ``func`` and store them in ``df[colname]``.

    ``func`` is called as ``func(index_label, *args)`` for every index label of
    ``df``. The list of results is cached as a pickle in ``cache_dir`` under a
    file name derived from ``colname`` only; when that file exists it is loaded
    and ``func`` is not called at all.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and also returned.
    func : callable
        Takes an index label followed by ``*args``.
    colname : str or list of str
        Target column(s). With a list, the cache file name joins the names
        with '_' and a failed row is recorded as a tuple of NaNs, one per name.
    *args
        Extra positional arguments forwarded to ``func``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the new column(s) assigned.
    """
    if isinstance(colname, list):
        fname = '_'.join(colname) + '.pkl'
    else:
        fname = colname + '.pkl'
    cache_path = cache_dir / fname

    if cache_path.exists():
        # NOTE(review): the cache key ignores the contents of df, so a cache
        # written for a different frame is reused as-is — delete the file to
        # force a recompute.
        col_val_list = load_pickle(cache_path)
    else:
        # Placeholder stored for rows where func fails. `np.nan` is used
        # because the `np.NaN` alias no longer exists in NumPy 2.0; evaluating
        # it inside the except handler would itself raise there.
        if isinstance(colname, list):
            missing = tuple(np.nan for _ in colname)
        else:
            missing = np.nan

        col_val_list = []
        # Only the index label is needed, so iterate the index directly.
        for index in tqdm(df.index, total=df.shape[0]):
            try:
                col_val_list.append(func(index, *args))
            except Exception:
                # Best effort per row; KeyboardInterrupt/SystemExit still
                # propagate so a long run can be aborted.
                col_val_list.append(missing)
        save_pickle(col_val_list, cache_path)

    print(df.shape, len(col_val_list))
    df[colname] = col_val_list
    return df

#### YOLO


In [15]:
# Build the global object detector used by apply_yolo: class names, network
# configuration and weights are all read from data/yolo_data/ (the file names
# indicate the "tiny" YOLOv4 variant with COCO class names).
yolo = YOLOv4()
yolo.config.parse_names("data/yolo_data/coco.names")
yolo.config.parse_cfg("data/yolo_data/yolov4-tiny.cfg")
yolo.make_model()
yolo.load_weights("data/yolo_data/yolov4-tiny.weights", weights_type="yolo")

def apply_yolo(index, prob_thresh=0.25):
    """Detect objects on the thumbnail of video ``index`` with the global ``yolo`` model.

    Parameters
    ----------
    index : str
        Video id; the image is read from ``data/images/<index>.jpg``.
    prob_thresh : float
        Passed to ``yolo.predict``; a detection is kept only if its
        probability is strictly greater than this value.

    Returns
    -------
    tuple
        ``(people_amount, objects, max_prob_object, most_common)`` where
        ``people_amount`` is the number of 'person' detections, ``objects`` is
        the list of kept labels in prediction order, ``max_prob_object`` is the
        ``(label, prob)`` pair of the highest-scoring kept detection
        (``(nan, -1)`` if none) and ``most_common`` is the most frequent label
        (``nan`` if none).

    Raises
    ------
    FileNotFoundError
        If the image is missing or cannot be decoded.
    """
    # NOTE(review): the download cell writes to config['dataset_path'] / 'images';
    # this hard-coded path only matches when dataset_path is 'data' — confirm.
    image_path = "data/images/{}.jpg".format(index)
    frame = cv2.imread(image_path)
    if frame is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise FileNotFoundError("Cannot read image: {}".format(image_path))

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pred = yolo.predict(frame_rgb, prob_thresh=prob_thresh)

    objects = []
    best_label, best_prob = np.nan, -1
    for detection in pred:
        # Only the class id and the probability of each detection are used.
        cx, cy, hw, hh, label_id, prob = detection
        if prob > prob_thresh:
            label = yolo.config.names[label_id]
            objects.append(label)
            if prob > best_prob:
                best_label, best_prob = label, prob

    counts = Counter(objects)
    # most_common(1) is empty when nothing was detected.
    most_common = counts.most_common(1)[0][0] if objects else np.nan
    return counts['person'], objects, (best_label, best_prob), most_common

# Index the frame by video id: mod_img passes each index label to apply_yolo,
# which uses it to locate the image file. The guard keeps the cell re-runnable
# (once video_id is the index it is no longer a column and set_index would
# raise KeyError); reassignment replaces the former inplace=True.
if 'video_id' in df.columns:
    df = df.set_index('video_id')
df = mod_img(df, apply_yolo, colname=['people_amount', 'objects', 'max_prob_object', 'most_occurence_object'])
df.head(5)

Loading from: data\cache\chkp5\people_amount_objects_max_prob_object_most_occurence_object.pkl
(10, 12) 10


Unnamed: 0_level_0,publish_time,title,description,channel_title,category_id,tags,views,likes,dislikes,comment_count,thumbnails,thumbnail_link,people_amount,objects,max_prob_object,most_occurence_object
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Y8Ceu9YBR1I,2019-01-02T14:00:03Z,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,"[grace, helbig, grace helbig, gracehelbig, gra...",134287,9284,27,910,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg,4,"[person, person, person, person]","(person, 0.9443111)",person
St6aYO0Gz5U,2019-02-09T05:35:24Z,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,"[matthew whitaker, whitaker, whitaker testimon...",31382,149,30,18,{'default': {'url': 'https://i.ytimg.com/vi/St...,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg,8,"[tie, person, person, person, person, person, ...","(person, 0.9942337)",person
k8JuFit-j38,2018-09-06T16:00:13Z,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,"[Beat Making, Making A Beat, Music, Music Maki...",230180,6358,122,469,{'default': {'url': 'https://i.ytimg.com/vi/k8...,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg,0,[],"(nan, -1)",
_CzomU7kgFE,2020-10-26T20:30:01Z,Doctor Fate Character Details & Powers Reveale...,Welcome to episode #3 of the DC Universe Annot...,HN Entertainment,24,"[Hybrid Network, HN Entertainment, HNE, Doctor...",1925,83,4,12,{'default': {'url': 'https://i.ytimg.com/vi/_C...,https://i.ytimg.com/vi/_CzomU7kgFE/hqdefault.jpg,1,"[dog, person]","(person, 0.60851365)",dog
s1bKD0kqPzk,2020-12-02T17:00:15Z,Stephen Fry & Bill Bailey Hilariously Read The...,A little throwback to Stephen Fry & Bill Baile...,The Graham Norton Show,24,"[Graham Norton, Graham Norton Show, The Graham...",756989,8855,164,344,{'default': {'url': 'https://i.ytimg.com/vi/s1...,https://i.ytimg.com/vi/s1bKD0kqPzk/hqdefault.jpg,2,"[person, person]","(person, 0.7665016)",person


#### emotions

In [16]:
# predicted_emotions = dict(idx=[], emotion=[])

# BATCH_SIZE = 256

# def chunks(lst, n):
#     """Yield successive n-sized chunks from lst."""
#     for i in range(0, len(lst), n):
#         yield lst[i:i + n]


# emotions_path = cache_dir / 'emotions.pkl'
# for batch in chunks(unrolled_faces, BATCH_SIZE):
#     predictions = model.predict(np.array([b for _, b in batch]))
#     predictions = [idx_to_emotions[x] for x in np.argmax(predictions, axis=1)]
#     for (idx, _), emotion in zip(batch, predictions):
#         predicted_emotions['idx'].append(idx)
#         predicted_emotions['emotion'].append(emotion)

# save_pickle(predicted_emotions, emotions_path)

# len(predicted_emotions['idx'])

In [17]:
# Target "explainable" features, grouped by the kind of source they come from.
cols_todo = [
    # engagement counters
    'views', 'likes', 'dislikes', 'comment_count',
    # publish timestamp parts
    'publish_time_day', 'publish_time_month', 'publish_time_year',
    'publish_time_weekday', 'publish_time_quarter',
    'publish_time_hour', 'publish_time_minute', 'publish_time_second',
    # text statistics (title / description / channel title)
    'title_punctuation_count', 'description_punctuation_count', 'channel_title_punctuation_count',
    'title_attention_count', 'description_attention_count', 'channel_title_attention_count',
    'title_len', 'description_len', 'channel_title_len',
    'title_letter_count', 'description_letter_count', 'channel_title_letter_count',
    'title_information_ratio', 'description_information_ratio', 'channel_title_information_ratio',
    'title_attention_ratio', 'description_attention_ratio', 'channel_title_attention_ratio',
    'title_word_count', 'description_word_count', 'channel_title_word_count',
    'description_url_count', 'description_url_ratio',
    'shouting_ratio_title', 'shouting_ratio_description', 'shouting_ratio_channel_title',
    'channel_title_hash',
    # thumbnail: detected objects
    'people_amount', 'max_prob_object', 'most_occurence_object',
    # thumbnail: faces and emotions
    'face_count',
    'count_angry', 'count_sad', 'count_neutral',
    'count_surprise', 'count_fear', 'count_happy',
    # thumbnail: colour statistics
    'mean_red', 'median_red', 'iqr_red',
    'mean_green', 'median_green', 'iqr_green',
    'mean_blue', 'median_blue', 'iqr_blue',
    # category
    'category_id',
]

In [18]:
# Target features that are not present in df yet.
[name for name in cols_todo if name not in df.columns]

['publish_time_day',
 'publish_time_month',
 'publish_time_year',
 'publish_time_weekday',
 'publish_time_quarter',
 'publish_time_hour',
 'publish_time_minute',
 'publish_time_second',
 'title_punctuation_count',
 'description_punctuation_count',
 'channel_title_punctuation_count',
 'title_attention_count',
 'description_attention_count',
 'channel_title_attention_count',
 'title_len',
 'description_len',
 'channel_title_len',
 'title_letter_count',
 'description_letter_count',
 'channel_title_letter_count',
 'title_information_ratio',
 'description_information_ratio',
 'channel_title_information_ratio',
 'title_attention_ratio',
 'description_attention_ratio',
 'channel_title_attention_ratio',
 'title_word_count',
 'description_word_count',
 'channel_title_word_count',
 'description_url_count',
 'description_url_ratio',
 'shouting_ratio_title',
 'shouting_ratio_description',
 'shouting_ratio_channel_title',
 'channel_title_hash',
 'face_count',
 'count_angry',
 'count_sad',
 'count_n

In [19]:
# Columns currently in df that are not among the target features.
[name for name in df.columns if name not in cols_todo]

['publish_time',
 'title',
 'description',
 'channel_title',
 'tags',
 'thumbnails',
 'thumbnail_link',
 'objects']

# Embeddings

#### Description and title

In [20]:
# Description and title embeddings: load the two Word2Vec models stored under models/.
model_description = gensim.models.Word2Vec.load('models/model_description.model')
model_title = gensim.models.Word2Vec.load('models/model_title.model')

# English stop words from NLTK; clear_sentence drops tokens found in this list.
stopwords_list = list(stopwords.words('english'))   

def clear_sentence(sentence):
    """Turn raw text into a list of cleaned, lower-case tokens.

    URLs (``http...``) are removed, every run of non-letters becomes a single
    space, and the text is tokenised. Tokens that are stop words, have at most
    two characters, or are numeric are discarded.

    Returns an empty list when a TypeError is raised during processing
    (e.g. ``sentence`` is not a string, such as NaN or None).
    """
    try:
        without_urls = re.sub(r"http\S+", "", sentence)
        letters_only = re.sub('[^A-Za-z]+', ' ', without_urls.lower())
        return [
            token
            for token in word_tokenize(letters_only)
            if token not in stopwords_list
            and len(token) > 2
            and not token.isnumeric()
        ]
    except TypeError:
        return []

# Tokenise every description and title row by row; `.values` yields one token
# list per video (clear_sentence returns [] for non-string values).
descriptions = df.apply(lambda w: clear_sentence(w['description']), axis=1).values
titles = df.apply(lambda w: clear_sentence(w['title']), axis=1).values

def word_averaging(wv, sequence):
    """Return the unit-normalised mean vector of the tokens of ``sequence`` known to ``wv``.

    Parameters
    ----------
    wv : gensim keyed vectors
        Must support ``token in wv``, ``wv.get_vector(token)`` and
        ``wv.vector_size``.
    sequence : iterable of str
        Tokens to average; tokens not contained in ``wv`` are ignored.

    Returns
    -------
    numpy.ndarray
        1-D float32 array of length ``wv.vector_size``; all zeros when no
        token of ``sequence`` is contained in ``wv``.
    """
    # `token in wv` works on both gensim 3.x and 4.x keyed vectors, whereas
    # the `wv.vocab` attribute exists only on 3.x.
    vectors = [wv.get_vector(token) for token in sequence if token in wv]
    if not vectors:
        # Size the fallback from the model instead of hard-coding it, and keep
        # it float32 so stacking it with real embeddings does not upcast the
        # whole matrix to float64.
        return np.zeros(wv.vector_size, dtype=np.float32)
    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, sequences):
    """Average each token sequence with ``word_averaging`` and stack the
    resulting vectors into one 2-D array (one row per sequence)."""
    rows = []
    for tokens in sequences:
        rows.append(word_averaging(wv, tokens))
    return np.vstack(rows)

# Replace the token lists by one averaged embedding per video, using the
# matching Word2Vec model for each text field.
descriptions = word_averaging_list(model_description.wv, descriptions)
titles = word_averaging_list(model_title.wv, titles)

# list(...) splits the 2-D matrix into one vector per row so that each cell of
# the new column holds a whole embedding.
df['description_embedding'] = list(descriptions)
df['title_embedding'] = list(titles)
df.head(3)

Unnamed: 0_level_0,publish_time,title,description,channel_title,category_id,tags,views,likes,dislikes,comment_count,thumbnails,thumbnail_link,people_amount,objects,max_prob_object,most_occurence_object,description_embedding,title_embedding
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Y8Ceu9YBR1I,2019-01-02T14:00:03Z,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,"[grace, helbig, grace helbig, gracehelbig, gra...",134287,9284,27,910,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg,4,"[person, person, person, person]","(person, 0.9443111)",person,"[0.025118731, -0.024444254, 0.18426603, -0.098...","[-0.045307893, -0.06995849, -0.039730128, -0.0..."
St6aYO0Gz5U,2019-02-09T05:35:24Z,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,"[matthew whitaker, whitaker, whitaker testimon...",31382,149,30,18,{'default': {'url': 'https://i.ytimg.com/vi/St...,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg,8,"[tie, person, person, person, person, person, ...","(person, 0.9942337)",person,"[0.03625319, 0.041347872, 0.09603844, -0.06204...","[-0.096591875, 0.049942244, 0.095870174, 0.000..."
k8JuFit-j38,2018-09-06T16:00:13Z,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,"[Beat Making, Making A Beat, Music, Music Maki...",230180,6358,122,469,{'default': {'url': 'https://i.ytimg.com/vi/k8...,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg,0,[],"(nan, -1)",,"[0.029597504, -0.043384492, 0.1575317, -0.0225...","[-0.059564322, 0.004785954, 0.028052565, 0.027..."


#### Objects and tags

In [23]:
# Columns holding lists of words that get embedded with the `wv` vectors below.
# TODO: also process 'image_text' (not part of this list yet).
text_to_process = ['objects', 'tags']  #!!!!!!!!!!!!!!!! 'image_text' - TODO

df[text_to_process].head(3)

Unnamed: 0_level_0,objects,tags
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Y8Ceu9YBR1I,"[person, person, person, person]","[grace, helbig, grace helbig, gracehelbig, gra..."
St6aYO0Gz5U,"[tie, person, person, person, person, person, ...","[matthew whitaker, whitaker, whitaker testimon..."
k8JuFit-j38,[],"[Beat Making, Making A Beat, Music, Music Maki..."


In [24]:
# Work on a copy of the word-list columns; embedding_model writes its results
# into this global frame.
df_temp = deepcopy(df[text_to_process])

def word_averaging(wv, sequence):
    """Return the unit-normalised mean vector of the tokens of ``sequence`` known to ``wv``.

    Parameters
    ----------
    wv : gensim keyed vectors
        Must support ``token in wv``, ``wv.get_vector(token)`` and
        ``wv.vector_size``.
    sequence : iterable of str
        Tokens to average; tokens not contained in ``wv`` are ignored.

    Returns
    -------
    numpy.ndarray
        1-D float32 array of length ``wv.vector_size``; all zeros when no
        token of ``sequence`` is contained in ``wv``.
    """
    # `token in wv` works on both gensim 3.x and 4.x keyed vectors, whereas
    # the `wv.vocab` attribute exists only on 3.x.
    vectors = [wv.get_vector(token) for token in sequence if token in wv]
    if not vectors:
        # The fallback length follows the model (no hard-coded dimension), and
        # float32 keeps the stacked matrix from being upcast to float64.
        return np.zeros(wv.vector_size, dtype=np.float32)
    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, sequences):
    """Average each token sequence with ``word_averaging`` and stack the
    resulting vectors into one 2-D array (one row per sequence)."""
    rows = []
    for tokens in sequences:
        rows.append(word_averaging(wv, tokens))
    return np.vstack(rows)

def embedding_model(text, name):
    """Embed every token sequence in ``text`` with the global ``wv`` vectors and
    overwrite ``df_temp[name]`` with the result (one vector per row)."""
    averaged = word_averaging_list(wv, text)
    # Split the matrix into its rows so each cell holds one embedding vector.
    df_temp[name] = [row for row in averaged]

# Embed each word-list column inside df_temp ...
for col in text_to_process:
    embedding_model(df_temp[col].values, col)

# ... then overwrite the original word lists in df with their embeddings.
df[text_to_process] = df_temp
df[text_to_process].head(3)

Unnamed: 0_level_0,objects,tags
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Y8Ceu9YBR1I,"[0.12082629650831223, -0.10840089619159698, 0....","[0.07456114888191223, 0.031295161694288254, -0..."
St6aYO0Gz5U,"[0.11479440331459045, -0.11055154353380203, 0....","[-0.00980058778077364, -0.012048429809510708, ..."
k8JuFit-j38,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.009094622917473316, -0.08706275373697281, ..."


#### Transforming other columns

In [26]:
# Replace the most frequent object label by the code assigned by an encoder
# that was fitted earlier and stored on disk.
# NOTE(review): presumably a scikit-learn LabelEncoder (only `.transform` is
# used here) — confirm; labels it has not seen will likely raise.
to_encode = ['most_occurence_object']
le_moo = load_pickle('data/chkp5/le_most_occurence_object.pkl')
df[to_encode] = le_moo.transform(df[to_encode].values.ravel())

Loading from: data/chkp5/le_most_occurence_object.pkl


In [27]:
# max_prob_object holds (label, probability) tuples: keep only the label
# ('none' for non-tuple cells and, via fillna, for NaN labels), then encode it
# with the stored encoder.
to_untuple = ['max_prob_object']
lee_mpo = load_pickle('data/chkp5/le_max_prob_object.pkl')
df[to_untuple] = lee_mpo.transform(df[to_untuple].\
    applymap(lambda x: x[0] if isinstance(x, tuple) else 'none').fillna('none').values.ravel())

Loading from: data/chkp5/le_max_prob_object.pkl


In [29]:
# Preview the frame after embedding and label encoding.
df.head(3)

Unnamed: 0_level_0,publish_time,title,description,channel_title,category_id,tags,views,likes,dislikes,comment_count,thumbnails,thumbnail_link,people_amount,objects,max_prob_object,most_occurence_object,description_embedding,title_embedding
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Y8Ceu9YBR1I,2019-01-02T14:00:03Z,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,"[0.07456114888191223, 0.031295161694288254, -0...",134287,9284,27,910,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg,4,"[0.12082629650831223, -0.10840089619159698, 0....",46,46,"[0.025118731, -0.024444254, 0.18426603, -0.098...","[-0.045307893, -0.06995849, -0.039730128, -0.0..."
St6aYO0Gz5U,2019-02-09T05:35:24Z,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,"[-0.00980058778077364, -0.012048429809510708, ...",31382,149,30,18,{'default': {'url': 'https://i.ytimg.com/vi/St...,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg,8,"[0.11479440331459045, -0.11055154353380203, 0....",46,46,"[0.03625319, 0.041347872, 0.09603844, -0.06204...","[-0.096591875, 0.049942244, 0.095870174, 0.000..."
k8JuFit-j38,2018-09-06T16:00:13Z,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,"[-0.009094622917473316, -0.08706275373697281, ...",230180,6358,122,469,{'default': {'url': 'https://i.ytimg.com/vi/k8...,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",43,77,"[0.029597504, -0.043384492, 0.1575317, -0.0225...","[-0.059564322, 0.004785954, 0.028052565, 0.027..."


In [21]:
# Target "not explainable" features: component columns of the embeddings,
# histograms and other vector-valued features. Generated instead of spelled
# out; the order matches the hand-written list this replaces.
cols_todo = (
    ['{}_{}'.format(prefix, i)
     for prefix in ['description_embedding', 'title_embedding', 'feature_vector',
                    'hist_reds', 'hist_greens', 'hist_blues',
                    'fisher_vector', 'image_text', 'objects']
     for i in range(15)]
    + ['euler_number']
    + ['flood_{}'.format(i) for i in range(8)]
    + ['shannon_entropy']
    + ['tags_{}'.format(i) for i in range(15)]
)

In [22]:
# Target features that are not present in df yet.
[name for name in cols_todo if name not in df.columns]

['description_embedding_0',
 'description_embedding_1',
 'description_embedding_2',
 'description_embedding_3',
 'description_embedding_4',
 'description_embedding_5',
 'description_embedding_6',
 'description_embedding_7',
 'description_embedding_8',
 'description_embedding_9',
 'description_embedding_10',
 'description_embedding_11',
 'description_embedding_12',
 'description_embedding_13',
 'description_embedding_14',
 'title_embedding_0',
 'title_embedding_1',
 'title_embedding_2',
 'title_embedding_3',
 'title_embedding_4',
 'title_embedding_5',
 'title_embedding_6',
 'title_embedding_7',
 'title_embedding_8',
 'title_embedding_9',
 'title_embedding_10',
 'title_embedding_11',
 'title_embedding_12',
 'title_embedding_13',
 'title_embedding_14',
 'feature_vector_0',
 'feature_vector_1',
 'feature_vector_2',
 'feature_vector_3',
 'feature_vector_4',
 'feature_vector_5',
 'feature_vector_6',
 'feature_vector_7',
 'feature_vector_8',
 'feature_vector_9',
 'feature_vector_10',
 'featur