In [62]:
import tensorflow_hub as hub
import tensorflow as tf

import urllib.request as ur
import pandas as pd
import numpy as np
import pickle
import glob
import json
import ast
import re

from sklearn.manifold import TSNE
from pandas.io.json import json_normalize
from utils import read_config
from pathlib import Path
from copy import deepcopy

In [3]:
config = read_config("config.json")
config['dataset_path'] = Path(config['dataset_path'])

In [4]:
def load_csv(path, origin):
    df = pd.read_csv(path, sep=';', parse_dates=['publish_time'])
    df.columns = list(map(str.strip, df.columns))
    df['trending_date'] = pd.to_datetime(df.trending_date, format='%y.%d.%m')
    df['origin'] = origin
    return df


us_videos_df = load_csv(config['dataset_path'] / 'US_videos_5p.csv', 'US')
gb_videos_df = load_csv(config['dataset_path'] / 'GB_videos_5p_utf8.csv', 'GB')

videos_df = pd.concat([us_videos_df, gb_videos_df]).reset_index(drop=True)

def load_missing_ids(path):
    with open(path, 'rb') as fh:
        return{k: v['items'][0]['id']['videoId'] for k, v in pickle.load(fh).items()}

us_missing = load_missing_ids(config['dataset_path'] / 'us_missing_id.pkl')
gb_missing = load_missing_ids(config['dataset_path'] / 'gb_missing_id.pkl')

for (title, channel), idx in {**us_missing, **gb_missing}.items():
    videos_df.loc[(videos_df.title == title) & (videos_df.channel_title == channel), 'video_id'] = idx 

videos_df.head(3)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,origin
0,2kyS6SvSYSE,2017-11-14,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,,2017-11-13 17:13:01+00:00,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,US
1,1ZAPwfrtAFY,2017-11-14,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,,2017-11-13 07:30:00+00:00,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",US
2,5qpjK5DgCt4,2017-11-14,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,,2017-11-12 19:05:24+00:00,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ SUBSCRIBE ► https:/...,US


In [5]:
videos_df[videos_df.video_id == '#NAZWA?']

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,origin


In [6]:
videos_df = videos_df[['video_id', 'thumbnail_link']]
videos_df.drop_duplicates(inplace=True)
videos_df.set_index('video_id', inplace=True)
print(videos_df.shape)
videos_df.head(3)

(8610, 1)


Unnamed: 0_level_0,thumbnail_link
video_id,Unnamed: 1_level_1
2kyS6SvSYSE,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg
1ZAPwfrtAFY,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg
5qpjK5DgCt4,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg


### Download images

Wyjątki - Są to filmy, które nie są dostępne (np. są prywatne), przykładem kanału z takim filmem jest The View, który postuje filmy z różnych programów telewizyjnych. Takie filmy mogą być obiektem roszczeń o prawa autorskie, pomimo tego, że były trending i ich thumbnaile będą niedostępne.

In [7]:
# count_unknown = 0
# for index, row in videos_df.iterrows():
#     path = config['dataset_path'] / 'images/{}.jpg'.format(row['video_id'])
#     link = re.sub(r'(default)', r'hq\1', row['thumbnail_link'])
#     try:
#         ur.urlretrieve(link, path)
#     except:
#         
#         count_unknown += 1
# count_unknown

### Thumbnail embedding

In [32]:
videos_df_temp = deepcopy(videos_df.head(5))

def mod_img(func, colname='', max_dim=512, *args):
    # images = [str(p) for p in Path('data/images/').glob('*.jpg')]
    count=0
    col_val_list = []
    for index, row in videos_df.iterrows():
        print(count)
        count+=1
        if count > 5: break # test
        # Read img
        try:
            img = tf.io.read_file('data/images/{}.jpg'.format(index))
            img = tf.image.decode_image(img, channels=3)
            # Preprocess img (scale to max_dim)
            img = tf.image.convert_image_dtype(img, tf.float32)
            shape = tf.cast(tf.shape(img)[:-1], tf.float32)
            new_shape = tf.cast(shape * (max_dim / max(shape)), tf.int32) # shape * scale
            img = tf.image.resize(img, new_shape)
            img = img[tf.newaxis, :]
            # Apply function - TODO modification for list of functions
            col_val_list.append(func(img, *args))
        except UnicodeDecodeError:
            col_val_list.append(np.NaN)
    # Add to dataframe
    videos_df_temp[colname] = col_val_list

![](https://1.bp.blogspot.com/-oNSfIOzO8ko/XO3BtHnUx0I/AAAAAAAAEKk/rJ2tHovGkzsyZnCbwVad-Q3ZBnwQmCFsgCEwYBhgL/s640/image3.png)

In [16]:
feature_ex_model = hub.load("https://tfhub.dev/tensorflow/efficientnet/b7/feature-vector/1")

In [49]:
def fv_func(img, *args):
    return np.squeeze(feature_ex_model(img).numpy())

mod_img(fv_func, colname='feature_vector')
videos_df_temp.head(3)

0
1
2
3
4
5


Unnamed: 0_level_0,thumbnail_link,feature_vector
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2kyS6SvSYSE,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,"[0.038101573, -0.07571, 0.0931569, -0.10862432..."
1ZAPwfrtAFY,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,"[0.10854018, 0.076120146, 0.24488439, -0.13733..."
5qpjK5DgCt4,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,


In [50]:
tensor = videos_df_temp['feature_vector'].values

In [54]:
tensor

array([array([ 0.03810157, -0.07571   ,  0.0931569 , ...,  0.04092859,
        0.07770374, -0.12705919], dtype=float32),
       array([ 0.10854018,  0.07612015,  0.24488439, ...,  0.25569355,
        0.00531475, -0.14011018], dtype=float32),
       nan,
       array([0.0579176 , 0.30172202, 0.564285  , ..., 0.07848703, 0.13500798,
       0.24005534], dtype=float32),
       array([ 0.5283228 ,  0.19460933,  0.42413977, ...,  0.08944044,
       -0.0156542 , -0.08639332], dtype=float32)], dtype=object)

### t-SNE projection

In [60]:
tsne_temp = videos_df_temp['feature_vector'].dropna().values
tsne_temp = np.stack(tsne_temp, axis=0)
tsne_temp.shape

(4, 2560)

In [61]:
tsne_temp

array([[ 0.03810157, -0.07571   ,  0.0931569 , ...,  0.04092859,
         0.07770374, -0.12705919],
       [ 0.10854018,  0.07612015,  0.24488439, ...,  0.25569355,
         0.00531475, -0.14011018],
       [ 0.0579176 ,  0.30172202,  0.564285  , ...,  0.07848703,
         0.13500798,  0.24005534],
       [ 0.5283228 ,  0.19460933,  0.42413977, ...,  0.08944044,
        -0.0156542 , -0.08639332]], dtype=float32)

In [63]:
tsne = TSNE(n_components=2, verbose=1).fit_transform(tsne_temp)

[t-SNE] Computing 3 nearest neighbors...
[t-SNE] Indexed 4 samples in 0.000s...
[t-SNE] Computed neighbors for 4 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 4 / 4
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 45.041618
[t-SNE] KL divergence after 400 iterations: 0.048725


In [None]:
cmap = cm.get_cmap('tab20')
fig, ax = plt.subplots(figsize=(8,8))
num_categories = 10
for lab in range(num_categories):
    indices = test_predictions==lab
    ax.scatter(tsne_proj[indices,0],tsne_proj[indices,1], c=np.array(cmap(lab)).reshape(1,4), label = lab ,alpha=0.5)
ax.legend(fontsize='large', markerscale=2)
plt.show()