In [80]:
import pandas as pd
import numpy as np
import joblib
import json
import string
import gensim
import urllib.request as ur
import cv2

import nltk, re
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize

from gensim.models import Word2Vec

from pathlib import Path
from utils import read_config

import warnings

# --------------------------- BARTEK --------------------------- 
from skimage import exposure
from skimage import transform
from skimage import color
from scipy.stats import iqr
import pickle
import keras_ocr
from tqdm import tqdm
from tensorflow.keras.models import load_model
from collections import Counter
import face_recognition
import skimage
from itertools import product

model = load_model('data/model_v6_23.hdf5')

emotions_to_idx = {'Angry': 0, 'Sad': 5, 'Neutral': 4, 'Disgust': 1, 'Surprise': 6, 'Fear': 2, 'Happy': 3}
idx_to_emotions = {v:k for k,v in emotions_to_idx.items()}

cache_dir = Path('data/chkp5/cache')
cache_dir.mkdir(exist_ok=True, parents=True)


def load_pickle(path):
    print("Loading from:", path)
    with open(path, 'rb') as fh:
        return pickle.load(fh)

def save_pickle(obj, path):
    print("Saving to:", path)
    with open(path, 'wb') as fh:
        return pickle.dump(obj, fh)


# ----------------------------- END ----------------------------- 

    
warnings.filterwarnings("ignore")

config = read_config("config.json")
config['dataset_path'] = Path(config['dataset_path'])

Removed cols - no data:
 'trending_date_day',
 'trending_date_month',
 'trending_date_year',
 'trending_date_weekday',
 'trending_date_quarter',
 'trending_span_days',
 'publish_to_trending_span_in_hours',
 'likes_first',
 'likes_last',
 'likes_diff',
 'dislikes_first',
 'dislikes_last',
 'dislikes_diff',
 'views_first',
 'views_last',
 'views_diff',
 'comment_count_first',
 'comment_count_last',
 'comment_count_diff',
 'votes_last',
 'likes_ratio_last',
 'dislike_ratio_last',
 'votes_first',
 'likes_ratio_first',
 'dislike_ratio_first',
 'is_us',
 'is_gb',
 'comments_disabled',
 'ratings_disabled',
 'video_error_or_removed',

Not explainable:
 'description_embedding_0',
 'description_embedding_1',
 'description_embedding_2',
 'description_embedding_3',
 'description_embedding_4',
 'description_embedding_5',
 'description_embedding_6',
 'description_embedding_7',
 'description_embedding_8',
 'description_embedding_9',
 'description_embedding_10',
 'description_embedding_11',
 'description_embedding_12',
 'description_embedding_13',
 'description_embedding_14',
 'title_embedding_0',
 'title_embedding_1',
 'title_embedding_2',
 'title_embedding_3',
 'title_embedding_4',
 'title_embedding_5',
 'title_embedding_6',
 'title_embedding_7',
 'title_embedding_8',
 'title_embedding_9',
 'title_embedding_10',
 'title_embedding_11',
 'title_embedding_12',
 'title_embedding_13',
 'title_embedding_14',
 'feature_vector_0',
 'feature_vector_1',
 'feature_vector_2',
 'feature_vector_3',
 'feature_vector_4',
 'feature_vector_5',
 'feature_vector_6',
 'feature_vector_7',
 'feature_vector_8',
 'feature_vector_9',
 'feature_vector_10',
 'feature_vector_11',
 'feature_vector_12',
 'feature_vector_13',
 'feature_vector_14',
 'hist_reds_0',
 'hist_reds_1',
 'hist_reds_2',
 'hist_reds_3',
 'hist_reds_4',
 'hist_reds_5',
 'hist_reds_6',
 'hist_reds_7',
 'hist_reds_8',
 'hist_reds_9',
 'hist_reds_10',
 'hist_reds_11',
 'hist_reds_12',
 'hist_reds_13',
 'hist_reds_14',
 'hist_greens_0',
 'hist_greens_1',
 'hist_greens_2',
 'hist_greens_3',
 'hist_greens_4',
 'hist_greens_5',
 'hist_greens_6',
 'hist_greens_7',
 'hist_greens_8',
 'hist_greens_9',
 'hist_greens_10',
 'hist_greens_11',
 'hist_greens_12',
 'hist_greens_13',
 'hist_greens_14',
 'hist_blues_0',
 'hist_blues_1',
 'hist_blues_2',
 'hist_blues_3',
 'hist_blues_4',
 'hist_blues_5',
 'hist_blues_6',
 'hist_blues_7',
 'hist_blues_8',
 'hist_blues_9',
 'hist_blues_10',
 'hist_blues_11',
 'hist_blues_12',
 'hist_blues_13',
 'hist_blues_14',
 'fisher_vector_0',
 'fisher_vector_1',
 'fisher_vector_2',
 'fisher_vector_3',
 'fisher_vector_4',
 'fisher_vector_5',
 'fisher_vector_6',
 'fisher_vector_7',
 'fisher_vector_8',
 'fisher_vector_9',
 'fisher_vector_10',
 'fisher_vector_11',
 'fisher_vector_12',
 'fisher_vector_13',
 'fisher_vector_14',
 'image_text_0',
 'image_text_1',
 'image_text_2',
 'image_text_3',
 'image_text_4',
 'image_text_5',
 'image_text_6',
 'image_text_7',
 'image_text_8',
 'image_text_9',
 'image_text_10',
 'image_text_11',
 'image_text_12',
 'image_text_13',
 'image_text_14',
 'objects_0',
 'objects_1',
 'objects_2',
 'objects_3',
 'objects_4',
 'objects_5',
 'objects_6',
 'objects_7',
 'objects_8',
 'objects_9',
 'objects_10',
 'objects_11',
 'objects_12',
 'objects_13',
 'objects_14',
 'euler_number',
 'flood_0',
 'flood_1',
 'flood_2',
 'flood_3',
 'flood_4',
 'flood_5',
 'flood_6',
 'flood_7',
 'shannon_entropy',
'tags_0',
 'tags_1',
 'tags_2',
 'tags_3',
 'tags_4',
 'tags_5',
 'tags_6',
 'tags_7',
 'tags_8',
 'tags_9',
 'tags_10',
 'tags_11',
 'tags_12',
 'tags_13',
 'tags_14',

Explainable:

'views',
'likes',
'dislikes',
'comment_count',
'publish_time_day',
'publish_time_month',
'publish_time_year',
'publish_time_weekday',
'publish_time_quarter',
'publish_time_hour',
'publish_time_minute',
'publish_time_second',
'title_punctuation_count',
'description_punctuation_count',
'channel_title_punctuation_count',
'title_attention_count',
'description_attention_count',
'channel_title_attention_count',
'title_len',
'description_len',
'channel_title_len',
'title_letter_count',
'description_letter_count',
'channel_title_letter_count',
'title_information_ratio',
'description_information_ratio',
'channel_title_information_ratio',
'title_attention_ratio',
'description_attention_ratio',
'channel_title_attention_ratio',
'title_word_count',
'description_word_count',
'channel_title_word_count',
'description_url_count',
'description_url_ratio',
'shouting_ratio_title',
'shouting_ratio_description',
'shouting_ratio_channel_title',
'channel_title_hash',
'people_amount',
'max_prob_object',
'most_occurence_object',
'face_count',
'count_angry',
'count_sad',
'count_neutral',
'count_surprise',
'count_fear',
'count_happy',
'mean_red',
'median_red',
'iqr_red',
'mean_green',
'median_green',
'iqr_green',
'mean_blue',
'median_blue',
'iqr_blue',
'category_id',


In [81]:
# df_pre = pd.read_pickle("data/chkp3.pkl")
# list(df_pre.columns)

In [82]:
df = pd.read_json('data/chkp5/data.json')
print(df.info())
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39265 entries, 0 to 39264
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   video_id              39265 non-null  object 
 1   publishedAt           39265 non-null  object 
 2   channelId             39265 non-null  object 
 3   title                 39265 non-null  object 
 4   description           39265 non-null  object 
 5   thumbnails            39265 non-null  object 
 6   channelTitle          39265 non-null  object 
 7   tags                  36361 non-null  object 
 8   categoryId            39265 non-null  int64  
 9   liveBroadcastContent  39265 non-null  object 
 10  localized             39265 non-null  object 
 11  defaultAudioLanguage  25665 non-null  object 
 12  viewCount             39256 non-null  float64
 13  likeCount             38978 non-null  float64
 14  dislikeCount          38978 non-null  float64
 15  favoriteCount      

Unnamed: 0,video_id,publishedAt,channelId,title,description,thumbnails,channelTitle,tags,categoryId,liveBroadcastContent,localized,defaultAudioLanguage,viewCount,likeCount,dislikeCount,favoriteCount,commentCount,defaultLanguage
0,Y8Ceu9YBR1I,2019-01-02T14:00:03Z,UCIiBf-JbtCazHSFqXV4JgoA,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,Grace Helbig,"[grace, helbig, grace helbig, gracehelbig, gra...",22,none,{'title': 'GRACE N MICHELLE REUNITE AFTER 6 YE...,en,134287.0,9284.0,27.0,0,910.0,
1,St6aYO0Gz5U,2019-02-09T05:35:24Z,UCIRYBXDze5krPDzAEOxFGVA,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,{'default': {'url': 'https://i.ytimg.com/vi/St...,Guardian News,"[matthew whitaker, whitaker, whitaker testimon...",25,none,{'title': 'Matthew Whitaker testifies before H...,en-GB,31382.0,149.0,30.0,0,18.0,
2,k8JuFit-j38,2018-09-06T16:00:13Z,UC1A_Hq-N1dHhAvwg0QWC7Sw,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",{'default': {'url': 'https://i.ytimg.com/vi/k8...,Levi Niha,"[Beat Making, Making A Beat, Music, Music Maki...",10,none,{'title': 'Making Music But Everything Is A Ch...,,230180.0,6358.0,122.0,0,469.0,


In [83]:
df.columns

Index(['video_id', 'publishedAt', 'channelId', 'title', 'description',
       'thumbnails', 'channelTitle', 'tags', 'categoryId',
       'liveBroadcastContent', 'localized', 'defaultAudioLanguage',
       'viewCount', 'likeCount', 'dislikeCount', 'favoriteCount',
       'commentCount', 'defaultLanguage'],
      dtype='object')

# Explainable

'views',
'likes',
'dislikes',
'comment_count',
'publish_time_day',
'publish_time_month',
'publish_time_year',
'publish_time_weekday',
'publish_time_quarter',
'publish_time_hour',
'publish_time_minute',
'publish_time_second',
'title_punctuation_count',
'description_punctuation_count',
'channel_title_punctuation_count',
'title_attention_count',
'description_attention_count',
'channel_title_attention_count',
'title_len',
'description_len',
'channel_title_len',
'title_letter_count',
'description_letter_count',
'channel_title_letter_count',
'title_information_ratio',
'description_information_ratio',
'channel_title_information_ratio',
'title_attention_ratio',
'description_attention_ratio',
'channel_title_attention_ratio',
'title_word_count',
'description_word_count',
'channel_title_word_count',
'description_url_count',
'description_url_ratio',
'shouting_ratio_title',
'shouting_ratio_description',
'shouting_ratio_channel_title',
'channel_title_hash',
'people_amount',
'max_prob_object',
'most_occurence_object',
'face_count',
'count_angry',
'count_sad',
'count_neutral',
'count_surprise',
'count_fear',
'count_happy',
'mean_red',
'median_red',
'iqr_red',
'mean_green',
'median_green',
'iqr_green',
'mean_blue',
'median_blue',
'iqr_blue',
'category_id',


In [84]:
df = df[[
    'video_id', 'publishedAt', 'title', 'description',
    'channelTitle', 'categoryId',
    'viewCount', 'likeCount', 'dislikeCount',
    'commentCount', 'thumbnails'
]]
df.columns = [
    'video_id', 'publish_time', 'title', 'description',
    'channel_title', 'category_id',
    'views', 'likes', 'dislikes',
    'comment_count', 'thumbnails'
]
df.head(3)

Unnamed: 0,video_id,publish_time,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails
0,Y8Ceu9YBR1I,2019-01-02T14:00:03Z,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...
1,St6aYO0Gz5U,2019-02-09T05:35:24Z,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...
2,k8JuFit-j38,2018-09-06T16:00:13Z,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...


#### Handling publish date

In [85]:
def handle_dates(df, cols):
    for col in cols:
        df[col+'_day'] = df[col].dt.day
        df[col+'_month'] = df[col].dt.month
        df[col+'_year'] = df[col].dt.year
        df[col+'_weekday'] = df[col].dt.dayofweek
        df[col+'_quarter'] = df[col].dt.quarter
        if col == 'publish_time':
            df[col+'_hour'] = df[col].dt.hour
            df[col+'_minute'] = df[col].dt.minute
            df[col+'_second'] = df[col].dt.second
    return df

df['publish_time'] = pd.to_datetime(df['publish_time'])
df = handle_dates(df, ['publish_time'])
df.drop('publish_time', axis=1, inplace=True)
df.head(3)

Unnamed: 0,video_id,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,publish_time_day,publish_time_month,publish_time_year,publish_time_weekday,publish_time_quarter,publish_time_hour,publish_time_minute,publish_time_second
0,Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,2,1,2019,2,1,14,0,3
1,St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...,9,2,2019,5,1,5,35,24
2,k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...,6,9,2018,3,3,16,0,13


#### Handling text features

In [86]:
text_features = ['title', 'description', 'channel_title']
attention_symbols = '!?$#'

def punctutation_count(text):
    return len([c for c in str(text) if c in string.punctuation])

def attention_count(text):
    return len([c for c in str(text) if c in attention_symbols])

def text_len(text):
    return len(str(text))

def letter_count(text):
    return len(list(filter(str.isalpha, str(text))))

def information_ratio(df):
    for name in text_features:
        df[name+'_information_ratio'] = (df[name+'_letter_count'] / df[name+'_len']).fillna(0)
    return df

def attention_ratio(df):
    for name in text_features:
        df[name+'_attention_ratio'] = (df[name+'_attention_count'] / df[name+'_letter_count']).fillna(0)
    return df

def word_count(df):
    for name in text_features:
        df[name+'_word_count'] = df[name].str.count(r"[\w\-_#\/\\\+\:$?]+")
    return df

def url_count(df):
    df['description_url_count'] = df.description.str.count(r"(https?:\/\/)?(\w+\.\w+)[\?=\&\w_\-.\/.]*")
    return df

def url_ratio(df):
    df['description_url_ratio'] = (df['description_url_count'] / df['description_word_count']).fillna(0)
    return df

def shouting_ratio(text):
  """Returns ratio of upper letters to all letters. Ignores non-letters in summary"""
  if not isinstance(text, str):
    return np.nan

  letters = ''.join(filter(str.isalpha, text))
  if letters:
    uppers = ''.join(filter(str.isupper, letters))
    return len(uppers) / len(letters)
  else:
    return 0

df[[n+'_punctuation_count' for n in text_features]] = df[text_features].applymap(punctutation_count)
df[[n+'_attention_count' for n in text_features]] = df[text_features].applymap(attention_count)
df[[n+'_len' for n in text_features]] = df[text_features].applymap(text_len)
df[[n+'_letter_count' for n in text_features]] = df[text_features].applymap(letter_count)
df = information_ratio(df)
df = attention_ratio(df)
df = word_count(df)
df = url_count(df)
df = url_ratio(df)
df[[f'shouting_ratio_{f}' for f in text_features]] = df[text_features].applymap(shouting_ratio)

df.head(3)

Unnamed: 0,video_id,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,...,description_attention_ratio,channel_title_attention_ratio,title_word_count,description_word_count,channel_title_word_count,description_url_count,description_url_ratio,shouting_ratio_title,shouting_ratio_description,shouting_ratio_channel_title
0,Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,...,0.007291,0.0,10,253,2,25,0.098814,0.785714,0.143578,0.181818
1,St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...,...,0.0,0.0,9,82,2,8,0.097561,0.048387,0.064125,0.166667
2,k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...,...,0.004878,0.0,7,60,2,7,0.116667,0.194444,0.178049,0.25


In [87]:
def _hash(object, trim):
    obj = hash(object)
    return np.float32(str(obj)[:trim])

for col, trim in zip(['channel_title'], [12]):
    df[col+'_hash'] = df[col].apply(_hash, trim=trim).astype(np.float32)

df.head(3)

Unnamed: 0,video_id,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,...,channel_title_attention_ratio,title_word_count,description_word_count,channel_title_word_count,description_url_count,description_url_ratio,shouting_ratio_title,shouting_ratio_description,shouting_ratio_channel_title,channel_title_hash
0,Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,...,0.0,10,253,2,25,0.098814,0.785714,0.143578,0.181818,-84794600000.0
1,St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...,...,0.0,9,82,2,8,0.097561,0.048387,0.064125,0.166667,594503700000.0
2,k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...,...,0.0,7,60,2,7,0.116667,0.194444,0.178049,0.25,913237200000.0


#### Handling thumbnails


In [88]:
def get_tbnl_url(x):
    return x['high']['url']

df['thumbnail_link'] = df['thumbnails'].apply(get_tbnl_url)
df.head(3)

Unnamed: 0,video_id,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,...,title_word_count,description_word_count,channel_title_word_count,description_url_count,description_url_ratio,shouting_ratio_title,shouting_ratio_description,shouting_ratio_channel_title,channel_title_hash,thumbnail_link
0,Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,...,10,253,2,25,0.098814,0.785714,0.143578,0.181818,-84794600000.0,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg
1,St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...,...,9,82,2,8,0.097561,0.048387,0.064125,0.166667,594503700000.0,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg
2,k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...,...,7,60,2,7,0.116667,0.194444,0.178049,0.25,913237200000.0,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg


In [90]:
count_unknown = 0
df = df.head(100) # TODO

for index, row in tqdm(df.iterrows()):
    path = config['dataset_path'] / 'images/{}.jpg'.format(row['video_id'])
    link = row['thumbnail_link']
    try:
        ur.urlretrieve(link, path)
    except:
        count_unknown += 1
count_unknown

100it [00:11,  8.55it/s]


0

In [11]:
# TODO
# - YOLO
# - colors
# - faces with emotions

## BARTEK

In [92]:
def mod_img(df, func, cache=True, as_df=True, colname='', *args):
    if isinstance(colname, list):
        fname = '_'.join(colname)+'.pkl'
    else:
        fname = colname+'.pkl'

    if (cache_dir / fname).exists() and cache:
        col_val_list = load_pickle(cache_dir / fname)
    else:
        col_val_list = []
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            try:
                col_val_list.append(func(index, *args))
            except Exception as ex:
                print(ex)
                col_val_list.append(tuple(np.NaN for _ in colname) if isinstance(colname, list) else np.NaN)
        if cache:
            save_pickle(col_val_list, cache_dir / fname)

    if len(col_val_list) > 0 and isinstance(col_val_list[0], dict) and as_df:
        new_cols = pd.DataFrame(col_val_list, index=df.index)
        df = pd.concat([df, new_cols], axis=1)
    else:
        df[colname] = col_val_list
    return df

In [54]:
videos_df = df.set_index('video_id').head()

def apply_hist(index):
    try:
        frame = cv2.imread("data/images/{}.jpg".format(index))    
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        res = {'hist_'+c: exposure.histogram(img[:,:,idx]/255.0, nbins=256)[0] / (img.shape[0]*img.shape[1]) for idx, c in enumerate(['reds', 'greens', 'blues'])}
        return res
    except:
        return {'hist_'+k: np.nan for k in ['reds', 'greens', 'blues']}
    
    
videos_df = mod_img(videos_df, apply_hist, cache=False, colname='image_histograms')
videos_df.head(5)

100%|██████████| 5/5 [00:00<00:00, 86.44it/s]


Unnamed: 0_level_0,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,publish_time_day,...,description_url_count,description_url_ratio,shouting_ratio_title,shouting_ratio_description,shouting_ratio_channel_title,channel_title_hash,thumbnail_link,hist_reds,hist_greens,hist_blues
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,2,...,25,0.098814,0.785714,0.143578,0.181818,-84794600000.0,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg,"[0.24558449074074074, 0.0314525462962963, 0.01...","[0.27406828703703706, 0.03357060185185185, 0.0...","[0.2353009259259259, 0.015289351851851853, 0.0..."
St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...,9,...,8,0.097561,0.048387,0.064125,0.166667,594503700000.0,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg,"[0.09430555555555556, 0.01068287037037037, 0.0...","[0.09864583333333334, 0.010162037037037037, 0....","[0.11651041666666667, 0.017864583333333333, 0...."
k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...,6,...,7,0.116667,0.194444,0.178049,0.25,913237200000.0,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg,"[0.23684606481481482, 0.009982638888888888, 0....","[0.24790509259259258, 0.009265046296296296, 0....","[0.24179398148148148, 0.006394675925925926, 0...."
_CzomU7kgFE,Doctor Fate Character Details & Powers Reveale...,Welcome to episode #3 of the DC Universe Annot...,HN Entertainment,24,1925.0,83.0,4.0,12.0,{'default': {'url': 'https://i.ytimg.com/vi/_C...,26,...,3,0.018182,0.307692,0.088915,0.2,896644300000.0,https://i.ytimg.com/vi/_CzomU7kgFE/hqdefault.jpg,"[0.2644328703703704, 0.02208912037037037, 0.00...","[0.25609953703703703, 0.022002314814814815, 0....","[0.2673668981481481, 0.016655092592592593, 0.0..."
s1bKD0kqPzk,Stephen Fry & Bill Bailey Hilariously Read The...,A little throwback to Stephen Fry & Bill Baile...,The Graham Norton Show,24,756989.0,8855.0,164.0,344.0,{'default': {'url': 'https://i.ytimg.com/vi/s1...,2,...,3,0.04918,0.179487,0.103627,0.210526,908887800000.0,https://i.ytimg.com/vi/s1bKD0kqPzk/hqdefault.jpg,"[0.22884837962962962, 0.012233796296296296, 0....","[0.3511226851851852, 0.04990740740740741, 0.02...","[0.3956134259259259, 0.05303240740740741, 0.04..."


In [58]:
def apply_image_statistics(index):
    try:
        frame = cv2.imread("data/images/{}.jpg".format(index))    
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        color_vectors = image.reshape(-1, 3).T
        output = {}
        for c, n in zip(color_vectors, ['red', 'green', 'blue']):
            output['mean_'+n] = np.mean(c)
            output['median_'+n] = np.median(c)
            output['iqr_'+n] = iqr(c)
        return output
    except Exception as ex:
        print(ex)
        return {f'{k}_{v}': np.nan for k, v in product(['mean_','median_','iqr_'], ['reds', 'greens', 'blues'])}

videos_df = mod_img(videos_df, apply_image_statistics, cache=False, colname='image_statistics')
videos_df.head(5)

100%|██████████| 5/5 [00:00<00:00, 69.68it/s]


Unnamed: 0_level_0,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,publish_time_day,...,iqr__blues,mean_red,median_red,iqr_red,mean_green,median_green,iqr_green,mean_blue,median_blue,iqr_blue
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,2,...,,105.192043,111.0,181.0,97.269959,91.0,186.0,114.923362,133.0,201.0
St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...,9,...,,51.380862,27.0,61.0,39.219421,20.0,44.0,33.633513,16.0,39.0
k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...,6,...,,124.358455,122.0,224.0,98.976933,64.0,198.0,95.308605,58.0,206.0
_CzomU7kgFE,Doctor Fate Character Details & Powers Reveale...,Welcome to episode #3 of the DC Universe Annot...,HN Entertainment,24,1925.0,83.0,4.0,12.0,{'default': {'url': 'https://i.ytimg.com/vi/_C...,26,...,,56.090816,30.0,72.0,52.913067,23.0,72.0,44.556221,16.0,50.0
s1bKD0kqPzk,Stephen Fry & Bill Bailey Hilariously Read The...,A little throwback to Stephen Fry & Bill Baile...,The Graham Norton Show,24,756989.0,8855.0,164.0,344.0,{'default': {'url': 'https://i.ytimg.com/vi/s1...,2,...,,123.488328,151.0,220.0,57.686701,10.0,94.0,57.444323,3.0,88.0


In [121]:
videos_df = df.set_index('video_id').head(100)

def process_face(img):
    face_img = transform.resize(img, (48,48,3))
    face_img = color.rgb2gray(face_img)[..., np.newaxis]
    return skimage.img_as_ubyte(face_img)


def apply_face_detection(index):
    try:
        frame = cv2.imread("data/images/{}.jpg".format(index))    
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        faces = face_recognition.face_locations(img)
        detected_faces = [process_face(img[top:bottom, left:right]) for top, right, bottom, left in faces]
        
        if len(faces) == 0:
            result = {'count_'+k.lower(): 0 for k in emotions_to_idx.keys()}
            result['face_count'] = 0
            return result
        else:
            predictions = model.predict(np.stack(detected_faces))
            predictions = [idx_to_emotions[x] for x in np.argmax(predictions, axis=1)]
            
            c = Counter(predictions)
            result = {'count_'+k.lower(): c.get(k, 0) for k in emotions_to_idx.keys()}
            result['face_count'] = len(faces)
            return result
        
    except Exception as ex:
        print(ex)
        result = {'count_'+k.lower(): 0 for k in emotions_to_idx.keys()}
        result['face_count'] = 0
        return result
    
videos_df = mod_img(videos_df, apply_face_detection, cache=False, colname='face_detection')
videos_df.head(5)

100%|██████████| 100/100 [00:11<00:00,  9.03it/s]


Unnamed: 0_level_0,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,publish_time_day,...,channel_title_hash,thumbnail_link,count_angry,count_sad,count_neutral,count_disgust,count_surprise,count_fear,count_happy,face_count
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,2,...,-84794600000.0,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg,0,0,0,0,0,2,2,4
St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...,9,...,594503700000.0,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg,1,0,0,0,0,1,0,2
k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...,6,...,913237200000.0,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg,0,0,0,0,0,0,0,0
_CzomU7kgFE,Doctor Fate Character Details & Powers Reveale...,Welcome to episode #3 of the DC Universe Annot...,HN Entertainment,24,1925.0,83.0,4.0,12.0,{'default': {'url': 'https://i.ytimg.com/vi/_C...,26,...,896644300000.0,https://i.ytimg.com/vi/_CzomU7kgFE/hqdefault.jpg,1,0,0,0,0,0,0,1
s1bKD0kqPzk,Stephen Fry & Bill Bailey Hilariously Read The...,A little throwback to Stephen Fry & Bill Baile...,The Graham Norton Show,24,756989.0,8855.0,164.0,344.0,{'default': {'url': 'https://i.ytimg.com/vi/s1...,2,...,908887800000.0,https://i.ytimg.com/vi/s1bKD0kqPzk/hqdefault.jpg,0,0,0,0,0,2,0,2


In [122]:
pipeline = keras_ocr.pipeline.Pipeline()

def apply_text_detection(index):
    try:
        frame = cv2.imread("data/images/{}.jpg".format(index))    
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        words = pipeline.recognize([img])[0]
        
        kears_ocr_text = [word for word, _ in words if len(word) > 3]
        return {'image_text': kears_ocr_text}
                
    except Exception as ex:
        print(ex)
        return {'image_text': []}
    
videos_df = mod_img(videos_df, apply_text_detection, cache=False, colname='image_text')
videos_df.head(5)

Looking for /home/hylomorph/.keras-ocr/craft_mlt_25k.h5
Looking for /home/hylomorph/.keras-ocr/crnn_kurapan.h5


100%|██████████| 100/100 [04:33<00:00,  2.73s/it]


Unnamed: 0_level_0,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,publish_time_day,...,thumbnail_link,count_angry,count_sad,count_neutral,count_disgust,count_surprise,count_fear,count_happy,face_count,image_text
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287.0,9284.0,27.0,910.0,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,2,...,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg,0,0,0,0,0,2,2,4,[reunion]
St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382.0,149.0,30.0,18.0,{'default': {'url': 'https://i.ytimg.com/vi/St...,9,...,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg,1,0,0,0,0,1,0,2,[]
k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180.0,6358.0,122.0,469.0,{'default': {'url': 'https://i.ytimg.com/vi/k8...,6,...,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg,0,0,0,0,0,0,0,0,"[percussion, only, percussion, even, melody, p..."
_CzomU7kgFE,Doctor Fate Character Details & Powers Reveale...,Welcome to episode #3 of the DC Universe Annot...,HN Entertainment,24,1925.0,83.0,4.0,12.0,{'default': {'url': 'https://i.ytimg.com/vi/_C...,26,...,https://i.ytimg.com/vi/_CzomU7kgFE/hqdefault.jpg,1,0,0,0,0,0,0,1,[gho32]
s1bKD0kqPzk,Stephen Fry & Bill Bailey Hilariously Read The...,A little throwback to Stephen Fry & Bill Baile...,The Graham Norton Show,24,756989.0,8855.0,164.0,344.0,{'default': {'url': 'https://i.ytimg.com/vi/s1...,2,...,https://i.ytimg.com/vi/s1bKD0kqPzk/hqdefault.jpg,0,0,0,0,0,2,0,2,"[graham, stephenfry, watching, weekpl, norton,..."


#### emotions

In [None]:
# predicted_emotions = dict(idx=[], emotion=[])

# BATCH_SIZE = 256

# def chunks(lst, n):
#     """Yield successive n-sized chunks from lst."""
#     for i in range(0, len(lst), n):
#         yield lst[i:i + n]


# emotions_path = cache_dir / 'emotions.pkl'
# for batch in chunks(unrolled_faces, BATCH_SIZE):
#     predictions = model.predict(np.array([b for _, b in batch]))
#     predictions = [idx_to_emotions[x] for x in np.argmax(predictions, axis=1)]
#     for (idx, _), emotion in zip(batch, predictions):
#         predicted_emotions['idx'].append(idx)
#         predicted_emotions['emotion'].append(emotion)

# save_pickle(predicted_emotions, emotions_path)

# len(predicted_emotions['idx'])

In [13]:
cols_todo = ['views', 'likes', 'dislikes', 'comment_count', 'publish_time_day', 'publish_time_month', 'publish_time_year', 'publish_time_weekday', 'publish_time_quarter', 'publish_time_hour', 'publish_time_minute', 'publish_time_second', 'title_punctuation_count', 'description_punctuation_count', 'channel_title_punctuation_count', 'title_attention_count', 'description_attention_count', 'channel_title_attention_count', 'title_len', 'description_len', 'channel_title_len', 'title_letter_count', 'description_letter_count', 'channel_title_letter_count', 'title_information_ratio', 'description_information_ratio', 'channel_title_information_ratio', 'title_attention_ratio', 'description_attention_ratio', 'channel_title_attention_ratio', 'title_word_count', 'description_word_count', 'channel_title_word_count', 'description_url_count', 'description_url_ratio', 'shouting_ratio_title', 'shouting_ratio_description', 'shouting_ratio_channel_title', 'channel_title_hash', 'people_amount', 'max_prob_object', 'most_occurence_object', 'face_count', 'count_angry', 'count_sad', 'count_neutral', 'count_surprise', 'count_fear', 'count_happy', 'mean_red', 'median_red', 'iqr_red', 'mean_green', 'median_green', 'iqr_green', 'mean_blue', 'median_blue', 'iqr_blue', 'category_id']

In [14]:
[i for i in cols_todo if i not in df.columns]

['people_amount',
 'max_prob_object',
 'most_occurence_object',
 'face_count',
 'count_angry',
 'count_sad',
 'count_neutral',
 'count_surprise',
 'count_fear',
 'count_happy',
 'mean_red',
 'median_red',
 'iqr_red',
 'mean_green',
 'median_green',
 'iqr_green',
 'mean_blue',
 'median_blue',
 'iqr_blue']

In [15]:
[i for i in df.columns if i not in cols_todo]

['video_id',
 'title',
 'description',
 'channel_title',
 'thumbnails',
 'thumbnail_link']

# Embeddings

In [23]:
#  Desciption and title embeddings
model_description = gensim.models.Word2Vec.load('models/model_description.model')
model_title = gensim.models.Word2Vec.load('models/model_title.model')

stopwords_list = list(stopwords.words('english'))   

def clear_sentence(sentence):
    try:
        sentence = re.sub(r"http\S+", "", sentence)
        sentence = word_tokenize(re.sub('[^A-Za-z]+', ' ', sentence.lower()))
        sentence_temp = []
        for word in sentence:
            if (word not in stopwords_list) and (len(word) > 2) and not (word.isnumeric()):
                sentence_temp.append(word)
        return sentence_temp
    except TypeError:
        return []

descriptions = df.apply(lambda w: clear_sentence(w['description']), axis=1).values
titles = df.apply(lambda w: clear_sentence(w['title']), axis=1).values

def word_averaging(wv, sequence):
    mean = []
    for word in sequence:
        if word in wv.vocab: mean.append(wv.get_vector(word))
    if not mean:
        return np.zeros(200,)
    return gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, sequences):
    return np.vstack([word_averaging(wv, w) for w in sequences])

descriptions = word_averaging_list(model_description.wv, descriptions)
titles = word_averaging_list(model_title.wv, titles)

df['description_embedding'] = list(descriptions)
df['title_embedding'] = list(titles)
df.head(3)

Unnamed: 0,video_id,title,description,channel_title,category_id,views,likes,dislikes,comment_count,thumbnails,...,channel_title_word_count,description_url_count,description_url_ratio,shouting_ratio_title,shouting_ratio_description,shouting_ratio_channel_title,channel_title_hash,thumbnail_link,description_embedding,title_embedding
0,Y8Ceu9YBR1I,GRACE N MICHELLE REUNITE AFTER 6 YEARS!!!! // ...,Michelle and I reunited AT LAST! And we did a ...,Grace Helbig,22,134287,9284,27,910,{'default': {'url': 'https://i.ytimg.com/vi/Y8...,...,2,25,0.098814,0.785714,0.143578,0.181818,-65171230000.0,https://i.ytimg.com/vi/Y8Ceu9YBR1I/hqdefault.jpg,"[0.025118731, -0.024444254, 0.18426603, -0.098...","[-0.045307893, -0.06995849, -0.039730128, -0.0..."
1,St6aYO0Gz5U,Matthew Whitaker testifies before House judici...,US acting attorney general Matthew Whitaker te...,Guardian News,25,31382,149,30,18,{'default': {'url': 'https://i.ytimg.com/vi/St...,...,2,8,0.097561,0.048387,0.064125,0.166667,-60165530000.0,https://i.ytimg.com/vi/St6aYO0Gz5U/hqdefault.jpg,"[0.03625319, 0.041347872, 0.09603844, -0.06204...","[-0.096591875, 0.049942244, 0.095870174, 0.000..."
2,k8JuFit-j38,Making Music But Everything Is A Challenge,"I take a plugin which generates ""random beat m...",Levi Niha,10,230180,6358,122,469,{'default': {'url': 'https://i.ytimg.com/vi/k8...,...,2,7,0.116667,0.194444,0.178049,0.25,-53066620000.0,https://i.ytimg.com/vi/k8JuFit-j38/hqdefault.jpg,"[0.029597504, -0.043384492, 0.1575317, -0.0225...","[-0.059564322, 0.004785954, 0.028052565, 0.027..."
