In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import glob
from datetime import datetime as datetime
import time
import os

In [2]:
# Select category of videos
VIDEO_CAT = "Gaming"
# Configure panda to show all columns
pd.set_option('display.max_columns', None)

# Prepare input datadf

Merging all the available traces 

In [3]:
path = '../../traces' # use your path
all_files = glob.glob(path + "/*.json")

li = []

for filename in all_files:
    df = pd.read_json(filename)
    li.append(df)

df = pd.concat(li, ignore_index=True)

Convert upload date (yyymmdd) to "days since upload"

In [4]:
current_time = datetime.now().timestamp()

for i in df.index:
    try:
        upload_time = datetime.strptime(str(df['upload_date'][i]), '%Y%m%d').timestamp()
        seconds_since_upload = current_time - upload_time
        days_since_upload = seconds_since_upload / 86400 # 86400 seconds in a day
        
        df['upload_date'][i] =  days_since_upload
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['upload_date'][i] =  days_since_upload


Explode categories and remove irrelevant columns

In [5]:
df = df.explode('categories')
df = df[['upload_date','categories','duration', 'view_count','like_count','dislike_count', 'formats']]

Filter on video category

In [6]:
cat_df = df[df['categories'] == VIDEO_CAT]

Explode on formats

In [7]:
cat_df = cat_df.explode('formats')

Convert format objects to data columns. Format objects are JSON objects containing trace information. The properties (keys) of these objects must be unpacked into columns of the datadf. This makes a row go from containing a column for one format object, to containing many columns; one for each property.

In [8]:
formats_df = pd.DataFrame(cat_df['formats'].values.tolist())
cat_df = pd.concat([cat_df.reset_index(), formats_df], axis=1)

Filter out audio formats

In [9]:
cat_df = cat_df[cat_df["vcodec"] != "none"]

Drop irrelevant columns

In [10]:
clean_df = cat_df[['upload_date','duration', 'view_count','like_count','dislike_count', 'acodec', 'filesize', 'format' ,'fps', 'vcodec', 'protocol', 'container']].reset_index(drop=True)

# Fill empty cells with default value
clean_df["like_count"] = clean_df["like_count"].fillna(0)
clean_df["container"] = clean_df["container"].fillna("none")
clean_df["dislike_count"] = clean_df["dislike_count"].fillna(0)

# Convert datat types
clean_df['like_count'] = clean_df['like_count'].astype("int64")
clean_df['dislike_count'] = clean_df['dislike_count'].astype("int64")
clean_df['fps'] = clean_df['fps'].astype("object")
clean_df['upload_date'] = clean_df['upload_date'].astype("int64")

Print clean input dataframe for debugging

In [11]:
clean_df.head()

Unnamed: 0,upload_date,duration,view_count,like_count,dislike_count,acodec,filesize,format,fps,vcodec,protocol,container
0,42,245,9401,1337,6,none,2724473.0,278 - 256x144 (144s),30,vp9,https,webm
1,42,245,9401,1337,6,none,2433581.0,160 - 256x144 (144s),30,avc1.4d400c,https,none
2,42,245,9401,1337,6,none,5281822.0,242 - 426x240 (240s),30,vp9,https,none
3,42,245,9401,1337,6,none,5554575.0,133 - 426x240 (240s),30,avc1.4d4015,https,none
4,42,245,9401,1337,6,none,9256482.0,243 - 640x360 (360s),30,vp9,https,none


# Prepare Generative Adversarial Network models

Prepare CTGAN model

In [12]:
from sdv.tabular import CTGAN
modelCTGAN = CTGAN()
modelCTGAN.fit(clean_df)

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


Prepare CopulaGAN model

In [13]:
from sdv.tabular import CopulaGAN
modelCopulaGAN = CopulaGAN()
modelCopulaGAN.fit(clean_df)

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


In [14]:
new_dataCT = modelCTGAN.sample(2000)
new_dataCopula = modelCopulaGAN.sample(2000)

In [15]:
new_dataCT.head()

Unnamed: 0,upload_date,duration,view_count,like_count,dislike_count,acodec,filesize,format,fps,vcodec,protocol,container
0,1911,237,250876,-5408,278,none,63906270.0,22 - 1280x640 (720s),25,avc1.64001f,http_dash_segments,none
1,1216,225,10406818,41750,137,none,-26020470.0,248 - 1920x1080 (1080s),30,vp9,https,none
2,68,271,1059419,9007,-387,mp4a.40.2,-2402583.0,160 - 256x144 (144s),30,avc1.42001E,https,none
3,1461,217,594966,24215,-1022,mp4a.40.2,45551930.0,243 - 640x320 (DASH video),30,avc1.42001E,https,none
4,1682,170,504506,7128,2274,none,63658020.0,278 - 256x144 (144s),30,vp9,https,webm


In [16]:
new_dataCopula.head()

Unnamed: 0,upload_date,duration,view_count,like_count,dislike_count,acodec,filesize,format,fps,vcodec,protocol,container
0,895,286,57896,5,639,none,31782970.0,264 - 2560x1440 (1440s),30,avc1.64001F,https,none
1,357,370,513,4,38,none,548681800.0,278 - 256x144 (144s),30,avc1.640033,https,none
2,964,489,8843345,7731,33,mp4a.40.2,46564570.0,18 - 640x360 (360p),30,avc1.42001E,https,none
3,631,382,389,868,2,none,3707904.0,278 - 256x144 (144s),30,vp9,https,none
4,1590,185,61740,98,2493,mp4a.40.2,830288700.0,247 - 1280x720 (720s),30,avc1.64002a,https,none


# Save GAN models

In [22]:
if not os.path.exists('models'):
        os.makedirs('models')

In [23]:
ct_model_path = os.path.join('models', "{}_ct.pkl".format(VIDEO_CAT))
copula_model_path = os.path.join('models', "{}_copula.pkl".format(VIDEO_CAT))

modelCTGAN.save(ct_model_path)
modelCopulaGAN.save(copula_model_path)

# Save generated output

In [24]:
if not os.path.exists('output'):
        os.makedirs('output')

In [25]:
ct_data_path = os.path.join('output', "{}_ct.csv".format(VIDEO_CAT))
copula_data_path = os.path.join('output', "{}_copula.csv".format(VIDEO_CAT))

new_dataCT.to_csv(ct_data_path)
new_dataCopula.to_csv(copula_data_path)