# Data generation for 360-degree videos

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import glob
from datetime import datetime as datetime
import time

In [None]:
# Configure pandas to show all columns when a DataFrame is displayed
pd.set_option('display.max_columns', None)

### Merging all the traces found

In [None]:
# Directory that is searched for *.json trace files.
path = '../../traces' # use your path

# glob returns matches in arbitrary order; sorting keeps the row order of the
# merged frame reproducible between runs and machines.
all_files = sorted(glob.glob(path + "/*.json"))

if not all_files:
    # Without this check pd.concat fails with "No objects to concatenate",
    # which hides the real cause (wrong path / no traces downloaded).
    raise FileNotFoundError(f"No trace files (*.json) found in '{path}'")

# Load every trace file into its own DataFrame ...
li = [pd.read_json(filename) for filename in all_files]

# ... and stack them into a single frame with a fresh 0..n-1 index.
frame = pd.concat(li, ignore_index=True)


In [None]:
# Convert upload date (yyyymmdd) to "days since upload"

SECONDS_PER_DAY = 86400

current_time = datetime.now().timestamp()


def _days_since_upload(raw_date):
    """Return the age in days of a yyyymmdd upload date.

    Values that cannot be parsed as yyyymmdd are returned unchanged.
    """
    try:
        upload_time = datetime.strptime(str(raw_date), '%Y%m%d').timestamp()
    except (ValueError, OverflowError, OSError):
        # ValueError: missing or malformed date (e.g. NaN).
        # OverflowError/OSError: a date the platform cannot turn into a timestamp.
        # Either way the original value is kept, as the cell did before.
        return raw_date
    # Both timestamps are in seconds, so the difference is seconds since upload.
    return (current_time - upload_time) / SECONDS_PER_DAY


# Assign the whole column at once. Writing cell by cell through
# frame['upload_date'][i] = ... is chained assignment, which pandas does not
# guarantee to write back into `frame`.
frame['upload_date'] = frame['upload_date'].apply(_days_since_upload)

In [None]:
# One row per (video, category), restricted to the listed columns.
gen_frame = (
    frame.copy()
    .explode('categories')
    .loc[:, ['upload_date','categories','duration', 'view_count','like_count','dislike_count', 'formats']]
)

# Keep the rows categorised as "Music", then expand to one row per format.
music_frame = gen_frame.loc[gen_frame['categories'] == "Music"].explode('formats')
music_frame

In [None]:
# Display the current contents of music_frame.
music_frame

## Convert format objects to data columns

Format objects are JSON objects containing trace information. The properties (keys) of these objects must be unpacked into columns of the dataframe. This makes a row go from containing a column for one format object, to containing many columns; one for each property.

In [None]:
# Turn every format object into a row whose columns are the object's keys.
df = pd.DataFrame(music_frame['formats'].tolist())

# reset_index() gives music_frame a 0..n-1 index (the old index is kept as a
# column) so its rows line up with `df` when the two are placed side by side.
music_frame = pd.concat([music_frame.reset_index(), df], axis='columns')

music_frame

### Filter out audio formats

We are only interested in video streaming. Audio-only formats are discarded to reduce runtime complexity and to obtain accurate results.

In [None]:
#music_frame.reset_index(inplace=True)

# Drop every row whose video codec is the literal string "none".
music_frame = music_frame.loc[music_frame["vcodec"].ne("none")]
music_frame

### Drop irrelevant columns

To reduce runtime complexity and increase the accuracy of the results, we drop the columns that are not relevant to the analysis and keep only the ones listed below.

In [None]:
clean_df = (
    music_frame[['upload_date','duration', 'view_count','like_count','dislike_count', 'acodec', 'filesize', 'format' ,'fps', 'vcodec', 'protocol', 'container']]
    .reset_index(drop=True)
    # Missing like/dislike counts become 0; a missing container becomes "none".
    .fillna({"like_count": 0, "container": "none", "dislike_count": 0})
    # Counts and upload_date are cast to int64; fps is stored as generic objects.
    .astype({
        "like_count": "int64",
        "dislike_count": "int64",
        "fps": "object",
        "upload_date": "int64",
    })
)


clean_df.dtypes

In [None]:
# Display the cleaned table.
clean_df

## Drop non-primitive (or string) data columns

We do this because the TGAN network cannot evaluate or generate objects embedded in table cells.

In [None]:
# tmp_frame = music_frame.convert_dtypes()
# li = []
# for col in tmp_frame:
#     print(tmp_frame[col].dtype)
#     if tmp_frame[col].dtype == 'object':
#         li.append(col)
# for item in li:
#     music_frame = music_frame.drop(item, axis=1)

In [None]:
# Display the cleaned table that is passed to model.fit() below.
clean_df



In [None]:
# from sdv.tabular import CTGAN
# model = CTGAN()
# model.fit(clean_df)

# Fit a CopulaGAN model (from the sdv package) on the cleaned table;
# the fitted `model` is sampled from in a later cell.
from sdv.tabular import CopulaGAN
model = CopulaGAN()
model.fit(clean_df)

# new_data = model.sample(50)
# new_data.describe()

In [None]:
# Request 2000 samples from the fitted model.
new_data = model.sample(2000)


In [None]:
# Sanity check: show sampled rows whose view_count is negative.
new_data[new_data['view_count'] < 0]

In [None]:
# Summary statistics of the sampled data.
new_data.describe()

In [None]:
# Display the cleaned input table (the data the model was fitted on).
clean_df

### Inspect data

In [None]:
# Show the first two rows of the merged traces.
frame.head(2)

In [None]:
# Number of distinct values in the `id` column.
len(frame.id.unique())

In [None]:
# Summary statistics of the merged traces.
frame.describe()

In [None]:
# Print the column names of `df`, i.e. of whatever was last assigned to that
# name (the unpacked format columns if the cells above were run in order).
print(df.columns.tolist())

In [None]:
# Number of distinct category combinations. Each entry is sorted and turned
# into a tuple first, so the order of categories within an entry is ignored.
len(frame['categories'].apply(sorted).transform(tuple).unique())

In [None]:
# The distinct (sorted) category combinations, collected in a DataFrame.
categories = pd.DataFrame(frame['categories'].apply(sorted).transform(tuple).unique())

In [None]:
# Display the distinct category combinations.
categories

In [None]:
# Group the traces by the raw `categories` column.
# NOTE(review): `categories` appears to hold lists (it is exploded and sorted
# elsewhere in this notebook); lists are unhashable, so this grouping may fail
# once it is evaluated - confirm, or group on a tuple version instead.
grouped = frame.groupby("categories")

In [None]:
# Per row: the categories sorted and converted to a tuple.
newFrame = frame['categories'].apply(sorted).transform(tuple)

In [None]:
# Store the tuple form of the categories as a new column of `frame`.
frame["tupleCat"] = newFrame

In [None]:
# Display the tuple-valued category column.
frame["tupleCat"]

In [None]:
# Count of non-null `id` values per category combination.
frame.groupby("tupleCat").agg("count")["id"]

Note: make a bar chart of category video count

## Plotting relations between categories and other characteristics

### Upload date

In [None]:
fig, ax = plt.subplots(figsize=(20, 10))
# explode() repeats a video once for each of its categories, giving one box per category.
frame.explode('categories').boxplot(column=['upload_date'], by='categories', grid=False, ax=ax)

# Red horizontal guide lines, with ticks at multiples of 365 days (1 to 5 years).
ax.yaxis.grid(color='red', linewidth='0.5', linestyle='-', which='major')
ax.yaxis.set_ticks([365 * years for years in range(1, 6)])

### Popularity

#### Plot popularity

Note: like/dislike ratio is not being used for calculating the popularity of a video (see paragraph under graphs).

In [None]:
# For each entry, calculate its popularity
frame['popularity'] = 0.0 # initial float value
frame['like_dislike_ratio'] = 0.0

for i in frame.index:
    view_count = frame['view_count'][i]
    days_since_upload = frame['upload_date'][i]
    like_count = frame['like_count'][i]
    dislike_count = frame['dislike_count'][i]

    like_dislike_ratio = like_count / dislike_count

    popularity = (view_count / float(days_since_upload))# * like_dislike_ratio

    frame['like_dislike_ratio'][i] = like_dislike_ratio
    frame['popularity'][i] =  popularity

In [None]:
# Plot popularity

fig, ax = plt.subplots(figsize=(20, 10))
# One box per category; outliers are hidden (showfliers=False).
frame.explode('categories').boxplot(
    column=['popularity'],
    by='categories',
    showfliers=False,
    grid=False,
    ax=ax,
)

# Red horizontal guide lines at the major y ticks.
ax.yaxis.grid(color='red', linewidth='0.5', linestyle='-', which='major')

#### Plot like/dislike ratio

In [None]:
fig, ax = plt.subplots(figsize=(20, 10))
# One box per category; outliers are hidden (showfliers=False).
frame.explode('categories').boxplot(
    column=['like_dislike_ratio'],
    by='categories',
    showfliers=False,
    grid=False,
    ax=ax,
)

# Red horizontal guide lines, drawn only at ratios 1 and 10.
ax.yaxis.grid(color='red', linewidth='0.5', linestyle='-', which='major')
ax.yaxis.set_ticks([1, 10])

In the boxplot above, we find that the ratio between likes and dislikes on a video is hardly ever below 1. Therefore, we may conclude that viewers are more inclined to indicate which videos they like than to indicate which videos they dislike. And indeed, content creators usually encourage their audience to like their videos. Therefore, the ratio between likes and dislikes seems to be positively dominated by a relatively high number of likes.

This means that the like/dislike ratio may overrepresent the positive perception and we must be careful with using this metric.

### Available representations

#### Number of representations

Not solved yet.

In [None]:
# Build df1 with one row per (video, format): every video's list of format
# objects is unpacked into columns and the video-level columns are attached.

# Number of formats listed per video. It is taken from the list itself,
# before unpacking, because the unpacked frame has no 'formats' column
# (looking it up on df1 raised a KeyError).
# NOTE(review): this counts audio-only formats as well - confirm that is the
# intended definition of "available representations".
video_columns = (frame.drop(columns='formats')
                      .assign(available_representations=frame['formats'].str.len()))

# `frame` itself is left untouched (no pop), so this cell can be re-run.
df1 = (pd.concat({i: pd.DataFrame(x) for i, x in frame['formats'].items()})
         .reset_index(level=1, drop=True)          # keep only the video's index label
         .join(video_columns, rsuffix='_shared')   # clashing video-level columns get '_shared'
         .reset_index(drop=True))

# Keep the formats that carry video. copy() makes df1 an independent frame so
# that columns added to it later are not written to a slice.
df1 = df1[df1.vcodec != "none"].copy()

fig, ax = plt.subplots(figsize=(20, 10))
df1.explode('categories').boxplot(by='categories', column=['available_representations'], ax=ax, grid=False, showfliers=False)

ax.yaxis.grid(which='major', linestyle='-', linewidth='0.5', color='red')

In [None]:
# df1 = (pd.concat({i: pd.DataFrame(x) for i, x in frame.pop('formats').items()})
#          .reset_index(level=1, drop=True)
#          .join(frame, rsuffix='_shared')
#          .reset_index(drop=True))

# df1 = df1[df1.vcodec != "none"]

#### Average bitrate

In [None]:
# Create column for average bitrate (kbps)
df1['average_bitrate'] = 0.0

for i in df1.index:
    try:
        file_size_bytes = df1['filesize'][i]
        file_size_bits = file_size_bytes * 8
        
        duration = df1['duration'][i]
        
        average_bitrate = (file_size_bits / float(duration)) / 1000 # average bitrate in kbps
        
        df1['average_bitrate'][i] =  average_bitrate
    except:
        pass

In [None]:
fig, ax = plt.subplots(figsize=(20, 10))
# One box per category; outliers are hidden (showfliers=False).
df1.explode('categories').boxplot(
    column=['average_bitrate'],
    by='categories',
    showfliers=False,
    grid=False,
    ax=ax,
)

# Red horizontal guide lines at the major y ticks.
ax.yaxis.grid(color='red', linewidth='0.5', linestyle='-', which='major')

#### File types

In [None]:
# Create a GroupBy over `categories`; the result is not assigned or aggregated.
# NOTE(review): `categories` appears to hold lists (it is exploded elsewhere in
# this notebook); grouping on unhashable lists may fail once the groups are
# evaluated - consider df1.explode('categories') first.
df1.groupby('categories')

In [None]:
# Count of non-null `id` values per value of `ext`.
df1.groupby('ext').agg('count')["id"]

#### Framerates

In [None]:
fig, ax = plt.subplots(figsize=(20, 10))
# One box per category; outliers are shown here (showfliers=True).
df1.explode('categories').boxplot(
    column=['fps'],
    by='categories',
    showfliers=True,
    grid=False,
    ax=ax,
)

# Red horizontal guide lines at the major y ticks.
ax.yaxis.grid(color='red', linewidth='0.5', linestyle='-', which='major')

#### Durations

In [None]:
fig, ax = plt.subplots(figsize=(20, 10))
# One box per category; outliers are hidden (showfliers=False).
df1.explode('categories').boxplot(
    column=['duration'],
    by='categories',
    showfliers=False,
    grid=False,
    ax=ax,
)

# Red horizontal guide lines at the major y ticks.
ax.yaxis.grid(color='red', linewidth='0.5', linestyle='-', which='major')