# YouTube trends
https://www.kaggle.com/datasnaek/youtube-new

In [None]:
# Two-letter country codes used to locate the per-country input files.
# The order matters: a code's position in this list becomes its `country_id`
# when the per-country frames are tagged further down.
country_codes = [
    'CA', 'DE', 'FR', 'GB', 'IN',
    'JP', 'KR', 'MX', 'RU', 'US',
]


In [None]:
import pandas as pd
from pandas import DataFrame
import json
import csv

In [None]:
# Per-country lookup tables filled by the loop below:
#   categories[cc] -> {entry['id']: entry['snippet']['title']} from the JSON file
#   data[cc]       -> DataFrame read from that country's video CSV
categories = {}
data = {}

for country_code in country_codes:
    # Progress output: one line per country being loaded.
    print(country_code)

    category_fn = f'raw/{country_code}_category_id.json'

    # JSON text is UTF-8, so decode it explicitly instead of relying on the
    # platform's default locale encoding.
    with open(category_fn, 'r', encoding='utf-8') as f:
        cats = json.load(f)

    categories[country_code] = {
        entry['id']: entry['snippet']['title'] for entry in cats['items']
    }

    video_fn = f'raw/{country_code}videos.csv'

    # Best effort: a CSV that cannot be read is reported and skipped, which
    # leaves `data` without an entry for that country. Later cells index
    # `data` by every code in `country_codes`, so a skipped country surfaces
    # there as a KeyError.
    try:
        data[country_code] = pd.read_csv(video_fn, encoding='utf-8')
    except Exception as e:
        print(f"Error for {country_code} : {e}")


## Prepare categories

In [None]:
# Merge the per-country category tables into one global id -> name map.
# Every id is expected to carry the same name in all countries; the assert
# below checks exactly that before the first collected name is taken.
global_cats = {}

for country_cats in categories.values():
    for cat_id, cat_name in country_cats.items():
        global_cats.setdefault(cat_id, []).append(cat_name)

# Ids whose collected names are not all identical (must be empty).
conflicting = {
    cat_id: len(set(names))
    for cat_id, names in global_cats.items()
    if len(set(names)) > 1
}
assert len(conflicting) == 0, 'NOT unique and same names for categories for all countries'

global_categories = {cat_id: names[0] for cat_id, names in global_cats.items()}

In [None]:
# Persist the merged category map as JSON in the working directory.
with open('categories.json', 'w') as out_file:
    json.dump(global_categories, out_file)

In [None]:
# Notebook cell output: display the merged category map for a visual check.
global_categories

## Prepare data

In [None]:
# Tag every per-country frame with its origin and resolve category names.
for i, cc in enumerate(country_codes):
    frame = data[cc]  # same object as data[cc]; columns are added in place
    frame['country_code'] = cc
    # Numeric id = position of the code in `country_codes`.
    frame['country_id'] = i

    # Look the name up per value of the `category_id` column instead of using
    # apply(axis=1), which materialises a full row object for every record.
    # The id is stringified before the lookup, and an id missing from
    # `global_categories` still raises KeyError.
    frame['category_name'] = frame['category_id'].map(
        lambda category_id: global_categories[str(category_id)]
    )

In [None]:
# Sanity check before merging: per country, print the column count, the total
# number of cells (`DataFrame.size`) and the column names, and add up the row
# counts across all countries.
sum_ = 0
for cc in country_codes:
    frame = data[cc]
    columns = frame.columns
    print(f"{cc} ({len(columns)}) ({frame.size}): {columns}")
    sum_ += len(frame)

print(sum_)

In [None]:
# Stack all per-country frames into one. join='inner' keeps only the columns
# shared by every frame; ignore_index=True gives the result a fresh 0..n-1 index.
frames = list(data.values())
dfs = pd.concat(frames, join='inner', ignore_index=True)
len(dfs)

In [None]:
import time
import datetime

def convert_to_unix_timestamp(input_str: str, input_format: str) -> int:
    """Parse a date/time string and return it as whole Unix seconds.

    Args:
        input_str: Text to parse.
        input_format: `strptime` format describing `input_str`.

    Returns:
        Seconds since the epoch, truncated to an int. The parsed value is
        interpreted as local time, because `time.mktime` is used.

    Raises:
        ValueError: if `input_str` does not match `input_format`.
    """
    parsed = datetime.datetime.strptime(input_str, input_format)
    seconds = time.mktime(parsed.timetuple())
    return int(seconds)


In [None]:
# Convert `trending_date` (parsed with the year.day.month format "%y.%d.%m")
# into a Unix timestamp. Only this one column is needed, so map over it
# directly rather than building a row object per record with apply(axis=1).
dfs['trending_timestamp'] = dfs['trending_date'].map(
    lambda value: convert_to_unix_timestamp(value, input_format="%y.%d.%m")
)

In [None]:
# Convert `publish_time` into a Unix timestamp. Only this one column is
# needed, so map over it directly rather than using apply(axis=1).
# NOTE(review): the literal "Z" suffix in the format suggests the source
# values are UTC, while convert_to_unix_timestamp interprets the parsed time
# as local time (time.mktime) - confirm this is intended.
dfs['publish_timestamp'] = dfs['publish_time'].map(
    lambda value: convert_to_unix_timestamp(value, input_format="%Y-%m-%dT%H:%M:%S.000Z")
)

In [None]:
def _trend_duration_days(row) -> int:
    """Return the calendar-day difference between trending and publish date.

    Both timestamps are turned back into local dates first, so the time of
    day is ignored and only whole days are counted.
    """
    trending_day = datetime.datetime.fromtimestamp(row['trending_timestamp']).date()
    publish_day = datetime.datetime.fromtimestamp(row['publish_timestamp']).date()
    return (trending_day - publish_day).days


dfs['trend_duration'] = dfs.apply(_trend_duration_days, axis=1)

In [None]:
# Persist the combined frame to the working directory. With pandas' default
# `index=True` the row index is written as an extra leading column, which
# shows up again as a regular column when the file is read back with a
# plain `pd.read_csv`.
# NOTE(review): confirm whether that extra column is wanted downstream.
dfs.to_csv('videos.csv')

## Upload preprocessed data

In [None]:
import pandas as pd
from pandas import DataFrame

# Reload the preprocessed file written by the preparation section; this
# rebinds `dfs` to the contents of videos.csv.
dfs: DataFrame = pd.read_csv('videos.csv')

In [None]:
# Notebook cell output: number of rows loaded from videos.csv.
len(dfs)

In [None]:
import requests
# Bearer token sent in the Authorization header of every gateway request.
# NOTE(review): this is a credential hard-coded in the source; it should be
# supplied from outside the file (environment / secret store) and rotated.
# NOTE(review): the token payload appears to carry a `valid_until` of
# 2021-05-05, so it has presumably expired - confirm and replace before use.
jwt = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VybmFtZSI6InJlZ3VsYXJAaXRlYy5hYXUuYXQiLCJjcmVhdGVkX2F0IjoiMjAyMS0wNS0wNCAxMjoyNzo1Ny4yOTQxNzMiLCJ2YWxpZF91bnRpbCI6IjIwMjEtMDUtMDUgMTI6Mjc6NTcuMjk0MTczIn0.Mdvi-dy_PshPfoqujIcKzJLux-g3pMPfhM2ZmP6JeBY"


def send_transaction_to_rest_gateway(transaction: dict, timeout: float = 60):
    """POST one transaction to the trace endpoint of the REST gateway.

    Args:
        transaction: JSON-serialisable payload, sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block indefinitely on an unresponsive
            server, which would stall a long upload loop silently.

    Returns:
        The `requests.Response` of the call. The HTTP status is not checked
        here; callers decide how to treat failures.

    Raises:
        requests.exceptions.Timeout: if the gateway does not answer in time.
    """
    res = requests.post(
        url='https://articonf1.itec.aau.at:30401/api/trace',
        json=transaction,
        # The module-level `jwt` is sent as bearer token.
        headers={"Authorization": f"Bearer {jwt}"},
        # NOTE(review): certificate verification is switched off to ignore
        # SSL errors, so the server's identity is not checked. Re-enable it
        # once the gateway presents a valid certificate.
        verify=False,
        timeout=timeout,
    )

    return res

In [None]:
import csv

# Both metadata fields currently carry the same identifier.
use_case = 'community-prediction-youtube'
table_name = 'community-prediction-youtube'

# Upload the rows one by one: each row becomes a dict of its column values,
# extended by the two metadata fields, and is sent as its own request.
for idx, entry in dfs.iterrows():
    row = {
        **entry.to_dict(),
        'ApplicationType': use_case,
        'docType': table_name,
    }

    res = send_transaction_to_rest_gateway(row)
    # Progress output: row index and the response object of the request.
    print(idx, res)
    