In [1]:
import urllib.request as ur
import pandas as pd
import numpy as np
import pickle
import json
import ast
import re

from pandas.io.json import json_normalize
from utils import read_config
from pathlib import Path

In [2]:
# Load the notebook settings from config.json through the project helper `read_config`.
config = read_config("config.json")
# Wrap the dataset location in a Path so the cells below can join file names with `/`.
config['dataset_path'] = Path(config['dataset_path'])

In [3]:
def load_csv(path, origin):
    """Read one semicolon-separated videos CSV and tag every row with its origin.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.
    origin : str
        Value stored in the added ``origin`` column (the notebook passes
        ``'US'`` and ``'GB'``).

    Returns
    -------
    pandas.DataFrame
        The file contents with whitespace-stripped column names,
        ``publish_time`` parsed as dates by ``read_csv``, ``trending_date``
        converted to datetime, and an extra ``origin`` column.
    """
    frame = pd.read_csv(path, sep=';', parse_dates=['publish_time'])
    # Strip surrounding whitespace from the header names before using them.
    frame.columns = [name.strip() for name in frame.columns]
    # trending_date is written as two-digit year, then day, then month.
    frame['trending_date'] = pd.to_datetime(frame['trending_date'], format='%y.%d.%m')
    frame['origin'] = origin
    return frame


# Read the US and GB samples, labelling each row with its country of origin.
us_videos_df = load_csv(config['dataset_path'] / 'US_videos_5p.csv', 'US')
gb_videos_df = load_csv(config['dataset_path'] / 'GB_videos_5p_utf8.csv', 'GB')

# Stack both frames (US first) under one fresh 0..n-1 index.
videos_df = pd.concat([us_videos_df, gb_videos_df], ignore_index=True)

def load_missing_ids(path):
    """Load a pickled lookup and reduce every entry to its first item's video id.

    The pickle must hold a mapping whose values can be indexed as
    ``value['items'][0]['id']['videoId']``. The keys are passed through
    unchanged (the notebook unpacks them as ``(title, channel)`` pairs).

    Parameters
    ----------
    path : str or path-like
        Location of the pickle file.

    Returns
    -------
    dict
        Original key -> ``videoId`` of the first entry in ``'items'``.

    Raises
    ------
    IndexError, KeyError
        If an entry has an empty ``'items'`` list or lacks the expected keys.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(path, 'rb') as fh:
        responses = pickle.load(fh)

    video_ids = {}
    for key, response in responses.items():
        video_ids[key] = response['items'][0]['id']['videoId']
    return video_ids

us_missing = load_missing_ids(config['dataset_path'] / 'us_missing_id.pkl')
gb_missing = load_missing_ids(config['dataset_path'] / 'gb_missing_id.pkl')

# Merge both lookups (GB entries win on duplicate keys) and write the looked-up
# id into every row whose title and channel match the key.
recovered_ids = {**us_missing, **gb_missing}
for (title, channel), video_id in recovered_ids.items():
    same_video = (videos_df['title'] == title) & (videos_df['channel_title'] == channel)
    videos_df.loc[same_video, 'video_id'] = video_id

videos_df.head(3)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,origin
0,2kyS6SvSYSE,2017-11-14,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,,2017-11-13 17:13:01+00:00,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,US
1,1ZAPwfrtAFY,2017-11-14,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,,2017-11-13 07:30:00+00:00,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",US
2,5qpjK5DgCt4,2017-11-14,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,,2017-11-12 19:05:24+00:00,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ SUBSCRIBE ► https:/...,US


In [6]:
# Sanity check: list rows whose id is still the '#NAZWA?' placeholder
# (looks like a spreadsheet error marker - TODO confirm); expected to be empty.
videos_df.loc[videos_df['video_id'] == '#NAZWA?']

Unnamed: 0,video_id,thumbnail_link


In [7]:
# Keep only the columns needed to fetch thumbnails and collapse repeated
# (video_id, thumbnail_link) pairs. Chaining drop_duplicates() avoids mutating
# a frame derived by column selection in place (the SettingWithCopy pattern).
videos_df = videos_df[['video_id', 'thumbnail_link']].drop_duplicates()
print(videos_df.shape)
videos_df.head(3)

(8610, 2)


Unnamed: 0,video_id,thumbnail_link
0,2kyS6SvSYSE,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg
1,1ZAPwfrtAFY,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg
2,5qpjK5DgCt4,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg


### Download images

In [8]:
# Download the high-quality variant of every thumbnail into <dataset_path>/images.
images_dir = config['dataset_path'] / 'images'
# urlretrieve does not create directories; without this every download would fail
# and be miscounted below as an unavailable thumbnail.
images_dir.mkdir(parents=True, exist_ok=True)

count_unknown = 0
failed_downloads = {}  # video_id -> reason, kept so failures can be inspected afterwards
for row in videos_df.itertuples(index=False):
    if not isinstance(row.thumbnail_link, str):
        # No usable link (e.g. NaN): nothing to download, count it as unknown.
        failed_downloads[row.video_id] = 'missing thumbnail_link'
        count_unknown += 1
        continue

    path = images_dir / '{}.jpg'.format(row.video_id)
    # Turn '.../default.jpg' into '.../hqdefault.jpg'.
    link = re.sub(r'(default)', r'hq\1', row.thumbnail_link)
    try:
        ur.urlretrieve(link, path)
    except (OSError, ValueError) as err:
        # OSError covers urllib's URLError/HTTPError and socket failures; ValueError
        # covers malformed URLs. Anything else (KeyboardInterrupt, programming
        # errors) is deliberately allowed to propagate.
        #
        # These are videos that are no longer available (e.g. they are private).
        # An example is the channel The View, which posts clips from various TV
        # shows. Such videos can be subject to copyright claims even though they
        # were trending, and their thumbnails are then unavailable.
        failed_downloads[row.video_id] = err
        count_unknown += 1
count_unknown

805

### Thumbnail embedding