In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import glob
from datetime import datetime as datetime
import time
import os

In [2]:
# Select category of videos
VIDEO_CAT = "Travel & Events"
# Configure panda to show all columns
pd.set_option('display.max_columns', None)

# Prepare input dataframe

Merging all the available traces 

In [3]:
path = '../../traces' # use your path
all_files = glob.glob(path + "/*.json")

li = []

for filename in all_files:
    df = pd.read_json(filename)
    li.append(df)

df = pd.concat(li, ignore_index=True)

Convert upload date (yyymmdd) to "days since upload"

In [4]:
current_time = datetime.now().timestamp()

for i in df.index:
    try:
        upload_time = datetime.strptime(str(df['upload_date'][i]), '%Y%m%d').timestamp()
        seconds_since_upload = current_time - upload_time
        days_since_upload = seconds_since_upload / 86400 # 86400 seconds in a day
        
        df['upload_date'][i] =  days_since_upload
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['upload_date'][i] =  days_since_upload


Explode categories and remove irrelevant columns

In [5]:
df = df.explode('categories')
df = df[['upload_date','categories','duration', 'view_count','like_count','dislike_count', 'formats']]

Filter on video category

In [6]:
cat_df = df[df['categories'] == VIDEO_CAT]

Explode on formats

In [7]:
cat_df = cat_df.explode('formats')

Convert format objects to data columns. Format objects are JSON objects containing trace information. The properties (keys) of these objects must be unpacked into columns of the datadf. This makes a row go from containing a column for one format object, to containing many columns; one for each property.

In [8]:
formats_df = pd.DataFrame(cat_df['formats'].values.tolist())
cat_df = pd.concat([cat_df.reset_index(), formats_df], axis=1)

Filter out audio formats

In [9]:
cat_df = cat_df[cat_df["vcodec"] != "none"]

Drop irrelevant columns

In [10]:
clean_df = cat_df[['upload_date','duration', 'view_count','like_count','dislike_count', 'acodec', 'filesize', 'format' ,'fps', 'vcodec', 'protocol', 'container']].reset_index(drop=True)

# Fill empty cells with default value
clean_df["like_count"] = clean_df["like_count"].fillna(0)
clean_df["container"] = clean_df["container"].fillna("none")
clean_df["dislike_count"] = clean_df["dislike_count"].fillna(0)

# Convert datat types
clean_df['like_count'] = clean_df['like_count'].astype("int64")
clean_df['dislike_count'] = clean_df['dislike_count'].astype("int64")
clean_df['fps'] = clean_df['fps'].astype("object")
clean_df['upload_date'] = clean_df['upload_date'].astype("int64")

Print clean input dataframe for debugging

In [11]:
clean_df.head()

Unnamed: 0,upload_date,duration,view_count,like_count,dislike_count,acodec,filesize,format,fps,vcodec,protocol,container
0,1216,265,11706236,61117,6686,none,2902073.0,278 - 256x144 (144s),30,vp9,https,webm
1,1216,265,11706236,61117,6686,none,2459632.0,160 - 256x144 (144s),30,avc1.4d400c,https,none
2,1216,265,11706236,61117,6686,none,6165460.0,242 - 424x240 (240s),30,vp9,https,none
3,1216,265,11706236,61117,6686,none,5342931.0,133 - 424x240 (240s),30,avc1.4d4015,https,none
4,1216,265,11706236,61117,6686,none,11625305.0,243 - 640x360 (360s),30,vp9,https,none


# Prepare Generative Adversarial Network models

Prepare CTGAN model

In [12]:
from sdv.tabular import CTGAN
modelCTGAN = CTGAN()
modelCTGAN.fit(clean_df)

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


Prepare CopulaGAN model

In [13]:
from sdv.tabular import CopulaGAN
modelCopulaGAN = CopulaGAN()
modelCopulaGAN.fit(clean_df)

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


In [14]:
new_dataCT = modelCTGAN.sample(2000)
new_dataCopula = modelCopulaGAN.sample(2000)

In [15]:
new_dataCT.head()

Unnamed: 0,upload_date,duration,view_count,like_count,dislike_count,acodec,filesize,format,fps,vcodec,protocol,container
0,81,88,60306,-358,3,none,33593030.0,133 - 424x240 (240s),30,vp9,https,none
1,1210,670,12653915,825,39,none,96200290.0,248 - 1920x1080 (1080s),30,avc1.4d401f,https,none
2,438,409,-20139,-9,-4,none,,138 - 7680x7680 (DASH video),30,vp9,http_dash_segments,webm_dash
3,35,60,15886,385,-38,none,27891380.0,272 - 6400x3200 (2880s),60,avc1.4d401e,https,none
4,76,237,6558235,451,5070,mp4a.40.2,347523800.0,18 - 640x360 (360s),30,avc1.42001E,https,none


In [16]:
new_dataCopula.head()

Unnamed: 0,upload_date,duration,view_count,like_count,dislike_count,acodec,filesize,format,fps,vcodec,protocol,container
0,643,192,8228858,69494,9641,none,519062.9,278 - 256x144 (144s),30,vp9,https,webm
1,649,770,265,4,0,none,7880943.0,133 - 426x240 (240s),30,vp9,https,none
2,1479,151,4904563,41,107,none,1940671.0,278 - 256x144 (144s),25,vp9,https,webm
3,1326,419,137335,648,71,none,82318000.0,397 - 854x480 (480s),25,vp9,http_dash_segments,webm_dash
4,1019,283,368635,10828,71,none,1653502.0,278 - 256x144 (144s),30,vp9,https,webm


# Save GAN models

In [17]:
if not os.path.exists('models'):
        os.makedirs('models')

In [18]:
ct_model_path = os.path.join('models', "{}_ct.pkl".format(VIDEO_CAT))
copula_model_path = os.path.join('models', "{}_copula.pkl".format(VIDEO_CAT))

modelCTGAN.save(ct_model_path)
modelCopulaGAN.save(copula_model_path)

# Save generated output

In [19]:
if not os.path.exists('output'):
        os.makedirs('output')

In [20]:
ct_data_path = os.path.join('output', "{}_ct.csv".format(VIDEO_CAT))
copula_data_path = os.path.join('output', "{}_copula.csv".format(VIDEO_CAT))

new_dataCT.to_csv(ct_data_path)
new_dataCopula.to_csv(copula_data_path)