In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import glob
from datetime import datetime as datetime
import time
import os

In [None]:
# Video category whose traces are used to train the generative models
VIDEO_CAT = "Travel & Events"
# Let pandas display every column instead of truncating wide dataframes
pd.options.display.max_columns = None

# Prepare input dataframe

Merging all the available traces 

In [None]:
# Directory holding the JSON trace files; adjust to your own layout.
path = '../../traces'

# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of the merged dataframe reproducible across runs and machines.
all_files = sorted(glob.glob(os.path.join(path, "*.json")))

# Fail early with a readable message: pd.concat on an empty list would only
# raise an opaque "No objects to concatenate" error.
if not all_files:
    raise FileNotFoundError("No .json trace files found in '{}'".format(path))

# Read each trace file into its own dataframe, then merge them all under a
# fresh 0..n-1 index.
li = [pd.read_json(filename) for filename in all_files]
df = pd.concat(li, ignore_index=True)

Convert upload date (yyyymmdd) to "days since upload"

In [None]:
# Reference point: the current local time as a naive timestamp, so it can be
# subtracted directly from the naive dates parsed below.
now = pd.Timestamp.now()

# Parse every upload date (YYYYMMDD) in one vectorized call. Values that do
# not match the format become NaT instead of raising, which replaces the old
# bare `except: pass` without hiding unrelated errors.
upload_time = pd.to_datetime(df['upload_date'].astype(str), format='%Y%m%d', errors='coerce')

# Elapsed time expressed in (fractional) days; rows with NaT yield NaN here.
days_since_upload = (now - upload_time).dt.total_seconds() / 86400  # 86400 seconds in a day

# Assign the whole column at once. The previous per-row
# `df['upload_date'][i] = ...` was chained indexing, which may write to a
# temporary copy rather than to `df`. Rows whose date could not be parsed keep
# their original value, as before.
df['upload_date'] = days_since_upload.where(upload_time.notna(), df['upload_date'])

Explode categories and remove irrelevant columns

In [None]:
# Columns needed by the rest of the notebook; every other column is dropped.
kept_columns = ['upload_date', 'categories', 'duration', 'view_count',
                'like_count', 'dislike_count', 'formats']

# Give each category of a video its own row, then trim to the kept columns.
exploded = df.explode('categories')
df = exploded[kept_columns]

Filter on video category

In [None]:
# Boolean mask marking the rows that belong to the selected category
in_selected_category = df['categories'].eq(VIDEO_CAT)
cat_df = df.loc[in_selected_category]

Explode on formats

In [None]:
# Turn each element of a row's list-like 'formats' value into its own row;
# the values of the other columns are repeated for every new row.
cat_df = cat_df.explode('formats')

Convert format objects to data columns. Format objects are JSON objects containing trace information. The properties (keys) of these objects must be unpacked into columns of the dataframe. This turns a row with a single column holding one format object into a row with many columns, one for each property.

In [None]:
# Build one column per format property from the per-row format objects.
format_records = cat_df['formats'].tolist()
formats_df = pd.DataFrame(format_records)

# formats_df has a fresh 0..n-1 index, so cat_df's index is reset to line the
# rows up for the column-wise concat; the old index stays as an 'index' column.
video_rows = cat_df.reset_index()
cat_df = pd.concat([video_rows, formats_df], axis=1)

Filter out audio formats

In [None]:
# Keep every row whose vcodec is anything other than the string "none"
has_video_codec = cat_df["vcodec"].ne("none")
cat_df = cat_df[has_video_codec]

Drop irrelevant columns

In [None]:
# Columns handed to the generative models; everything else is dropped here.
model_columns = ['upload_date', 'duration', 'view_count', 'like_count', 'dislike_count',
                 'acodec', 'filesize', 'format', 'fps', 'vcodec', 'protocol', 'container']

clean_df = (
    cat_df[model_columns]
    .reset_index(drop=True)
    # Fill empty cells with a default value
    .fillna({"like_count": 0, "container": "none", "dislike_count": 0})
    # Convert data types
    .astype({
        'like_count': "int64",
        'dislike_count': "int64",
        'fps': "object",
        'upload_date': "int64",
    })
)

Print clean input dataframe for debugging

In [None]:
# Show the first rows of the cleaned training dataframe
clean_df.head()

# Prepare Generative Adversarial Network models

Prepare CTGAN model

In [None]:
from sdv.tabular import CTGAN
# Constructed without arguments, so the library's default settings apply
modelCTGAN = CTGAN()
# Train the model on the cleaned dataframe
modelCTGAN.fit(clean_df)

Prepare CopulaGAN model

In [None]:
from sdv.tabular import CopulaGAN
# Constructed without arguments, so the library's default settings apply
modelCopulaGAN = CopulaGAN()
# Train the model on the same cleaned dataframe used for the CTGAN model
modelCopulaGAN.fit(clean_df)

In [None]:
# Number of synthetic rows drawn from each trained model
SAMPLE_ROWS = 2000

new_dataCT = modelCTGAN.sample(SAMPLE_ROWS)
new_dataCopula = modelCopulaGAN.sample(SAMPLE_ROWS)

In [None]:
# Show the first rows sampled from the CTGAN model
new_dataCT.head()

In [None]:
# Show the first rows sampled from the CopulaGAN model
new_dataCopula.head()

# Save GAN models

In [None]:
# Create the directory for the saved models. exist_ok=True makes this a no-op
# when the directory is already there, so the cell can be re-run safely and
# there is no gap between checking for the directory and creating it.
os.makedirs('models', exist_ok=True)

In [None]:
# Model files live under models/ and are named after the selected category.
ct_model_path, copula_model_path = (
    os.path.join('models', name_template.format(VIDEO_CAT))
    for name_template in ("{}_ct.pkl", "{}_copula.pkl")
)

# Persist each trained model to its own file
modelCTGAN.save(ct_model_path)
modelCopulaGAN.save(copula_model_path)

# Save generated output

In [None]:
# Create the directory for the generated data. exist_ok=True makes this a
# no-op when the directory is already there, so the cell can be re-run safely
# and there is no gap between checking for the directory and creating it.
os.makedirs('output', exist_ok=True)

In [None]:
# CSV files live under output/ and are named after the selected category.
ct_data_path, copula_data_path = (
    os.path.join('output', name_template.format(VIDEO_CAT))
    for name_template in ("{}_ct.csv", "{}_copula.csv")
)

# Write each sampled dataframe to its own CSV file
new_dataCT.to_csv(ct_data_path)
new_dataCopula.to_csv(copula_data_path)