In [2]:
from itertools import cycle, permutations, repeat
from math import floor
from os import makedirs, mkdir, path
from random import choices
from time import mktime
from uuid import uuid4

import ciso8601 as fasttime
import numpy as np
import pandas as pd

# Create the CSV output directory before any cell writes into it. makedirs also
# creates a missing "./database" parent (plain mkdir raises FileNotFoundError
# in that case) and is a no-op when the directory already exists.
makedirs("./database/csv", exist_ok=True)

# Shared NumPy generator used by the data-generation cells below. It is
# unseeded, so every fresh run produces a different dataset.
rng = np.random.default_rng()

## Time-Dependent Random Generator

In [5]:
def time_dependent_random(independent_time, dependent_time, random_func, offset=0, rng=None):
    """Randomly link dependent records to independent records that already existed.

    For every dependent timestamp, the candidate pool is the set of independent
    records whose timestamp is less than or equal to it. A dependent record with
    an empty pool produces no rows.

    Parameters
    ----------
    independent_time : array-like of numbers
        Timestamps of the independent records. Any shape is accepted and is
        flattened; the values do not need to be sorted.
    dependent_time : array-like of numbers
        Timestamps of the dependent records (flattened, need not be sorted).
    random_func : callable
        Called once as ``random_func(size=k)`` and must return ``k`` numbers.
        The number of links drawn for a dependent record is
        ``round(value + 1)`` clipped to ``[0, pool size]``.
    offset : int, default 0
        Added to the dependent positions in the second output column, so that
        batch-local positions can be turned into global row ids.
    rng : numpy.random.Generator, optional
        Generator used to draw the links. A fresh unseeded generator is created
        when omitted; pass a seeded one for reproducible output.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_links, 2)``. Column 0 is the position of the
        chosen record in ``independent_time``; column 1 is the position of the
        dependent record in ``dependent_time`` plus ``offset``. Rows are grouped
        by dependent position in ascending order. Links are drawn with
        replacement, so a pair can appear more than once. The array has shape
        ``(0, 2)`` when nothing can be linked.
    """
    generator = np.random.default_rng() if rng is None else rng

    independent = np.asarray(independent_time).ravel()
    dependent = np.asarray(dependent_time).ravel()
    no_links = np.empty((0, 2), dtype=int)
    if independent.size == 0 or dependent.size == 0:
        return no_links

    # Rank the independent records by time so that "the first k" are exactly the
    # ones that exist at a given moment; creation_order maps ranks back to the
    # caller's positions.
    creation_order = np.argsort(independent, kind="stable")
    # Pool size per dependent record = number of independent timestamps <= its own.
    available = np.searchsorted(independent[creation_order], dependent, side="right")

    eligible = np.flatnonzero(available > 0)
    if eligible.size == 0:
        return no_links
    pool_sizes = available[eligible]

    link_counts = np.clip(
        np.round(np.asarray(random_func(size=eligible.size)) + 1),
        0,
        pool_sizes,
    ).astype(int)
    if link_counts.sum() == 0:
        return no_links

    # One uniform draw per link from [0, pool size of the owning dependent record).
    picks = generator.integers(0, np.repeat(pool_sizes, link_counts))

    return np.column_stack((
        creation_order[picks],
        np.repeat(eligible, link_counts) + offset,
    )).astype(int)

## Status Reference Table (21 rows)

In [42]:
if not path.exists("./database/csv/status_reference.csv"):
    # Lookup table of moderation statuses, one row per status, written once.
    # Each tuple is (status_id, name, category, description).
    # Ids 0-6 are image statuses, 7-11 user statuses, 12-15 comment statuses
    # and 16-20 tag statuses; the other cells draw status ids from these ranges.
    status_reference = [
        (0, "image_pending", "actionable_timed", "Image under review."),
        (1, "image_appealed", "actionable_timed", "Image to be reassessed."),
        (2, "image_reported", "actionable_untimed", "Image reported, pending review."),
        (3, "image_accepted", "decision", "Image approved for display."),
        (4, "image_rejected", "decision", "Image rejected."),
        (5, "image_marked_deletion", "decision", "Image marked for deletion, will automatically delete after 7 days."),
        (6, "image_deleted", "decision_final", "Image deleted, do not display."),
        (7, "user_acceptable", "decision", "User fine, no action needed."),
        (8, "user_reported", "actionable_untimed", "User reported, pending review."),
        (9, "user_muted", "decision", "User muted, cannot post comments or upload images."),
        (10, "user_mute_appeal", "actionable_untimed", "User to be reassessed."),
        (11, "user_banned", "decision_final", "User banned, account access restricted."),
        (12, "comment_acceptable", "decision", "Comment fine, no action needed."),
        (13, "comment_reported", "actionable_untimed", "Comment reported, pending review."),
        (14, "comment_hidden", "decision", "Comment hidden from general view."),
        (15, "comment_deleted", "decision_final", "Comment deleted, do not display."),
        (16, "tag_pending", "actionable_timed", "Tag under review."),
        (17, "tag_appealed", "actionable_timed", "Tag to be reassessed."),
        (18, "tag_reported", "actionable_untimed", "Tag reported, pending review."),
        (19, "tag_accepted", "decision", "Tag approved for use."),
        (20, "tag_rejected", "decision", "Tag rejected for use."),
    ]

    pd.DataFrame.from_records(
        status_reference,
        columns=["status_id", "name", "category", "description"],
    ).to_csv("./database/csv/status_reference.csv", index=False)
    del status_reference

## Permission Reference Table (4 rows)

In [43]:
# Lookup table of user permission levels, written once.
# The guard must test the same file name that is written below; it previously
# checked a misspelled name and therefore never detected the existing file.
if not path.exists("./database/csv/permission_reference.csv"):
    permission_reference = [
        {"permission_id": 0, "name": "basic", "description": "Basic user, no special permissions."},
        {"permission_id": 1, "name": "premium", "description": "Premium user, able to view deleted images."},
        {"permission_id": 2, "name": "moderator", "description": "Moderator, able to act on reports."},
        {"permission_id": 3, "name": "admin", "description": "Administrator."},
    ]

    pd.DataFrame.from_records(permission_reference).to_csv("./database/csv/permission_reference.csv", index=False)
    del permission_reference

## Tags Table (400 rows)

In [44]:
if not path.exists("./database/csv/tags_table.csv"):
    # Generates the tags table: 400 rows with sequential ids, a name sampled
    # from the vocabulary below, a random creation time and a status.
    tag_count = 400

    # Vocabulary the tag names are drawn from. Sampling is done WITH
    # replacement, so the same name can be given to several tags.
    tag_vocabulary = [
        "Apple", "Air", "Conditioner", "Airport",
        "Ambulance", "Aircraft", "Apartment",
        "Arrow", "Antlers", "Apron", "Alligator",
        "Architect", "Ankle", "Armchair", "Aunt",
        "Ball", "Bermudas", "Beans", "Balloon",
        "Bear", "Blouse", "Bed", "Bow", "Bread",
        "Black", "Board", "Bones", "Bill",
        "Bitterness", "Boxers", "Belt", "Brain",
        "Buffalo", "Bird", "Baby", "Book", "Back",
        "Butter", "Bulb", "Buckles", "Bat", "Bank",
        "Bag", "Bra", "Boots", "Blazer", "Bikini",
        "Bookcase", "Bookstore", "Bus stop", "Brass",
        "Brother", "Boy", "Blender", "Bucket",
        "Bakery", "Bow", "Bridge", "Boat", "Car",
        "Cow", "Cap", "Cooker", "Cheeks", "Cheese",
        "Credenza", "Carpet", "Crow", "Crest",
        "Chest", "Chair", "Candy", "Cabinet", "Cat",
        "Coffee", "Children", "Cookware",
        "Chaise longue", "Chicken", "Casino",
        "Cabin", "Castle", "Church", "Cafe",
        "Cinema", "Choker", "Cravat", "Cane",
        "Costume", "Cardigan", "Chocolate", "Crib",
        "Couch", "Cello", "Cashier", "Composer",
        "Cave", "Country", "Computer", "Canoe",
        "Clock", "Dog", "Deer", "Donkey", "Desk",
        "Desktop", "Dress", "Dolphin", "Doctor",
        "Dentist", "Drum", "Dresser", "Designer",
        "Detective", "Daughter", "Egg", "Elephant",
        "Earrings", "Ears", "Eyes", "Estate",
        "Finger", "Fox", "Frock", "Frog", "Fan",
        "Freezer", "Fish", "Film", "Foot",
        "Flag", "Factory", "Father", "Farm",
        "Forest", "Flower", "Fruit", "Fork",
        "Grapes", "Goat", "Gown", "Garlic",
        "Ginger", "Giraffe", "Gauva", "Grains",
        "Gas station", "Garage", "Gloves",
        "Glasses", "Gift", "Galaxy", "Guitar",
        "Grandmother", "Grandfather", "Governor",
        "Girl", "Guest", "Hamburger", "Hand",
        "Head", "Hair", "Heart", "House", "Horse",
        "Hen", "Horn", "Hat", "Hammer", "Hostel",
        "Hospital", "Hotel", "Heels", "Herbs",
        "Host", "Jacket", "Jersey", "Jewelry",
        "Jaw", "Jumper", "Judge", "Juicer",
        "Keyboard", "Kid", "Kangaroo", "Koala",
        "Knife", "Lemon", "Lion", "Leggings",
        "Leg", "Laptop", "Library", "Lamb",
        "London", "Lips", "Lung", "Lighter",
        "Luggage", "Lamp", "Lawyer", "Mouse",
        "Monkey", "Mouth", "Mango", "Mobile",
        "Milk", "Music", "Mirror", "Musician",
        "Mother", "Man", "Model", "Mall",
        "Museum", "Market", "Moonlight",
        "Medicine", "Microscope", "Newspaper",
        "Nose", "Notebook", "Neck", "Noodles",
        "Nurse", "Necklace", "Noise", "Ocean",
        "Ostrich", "Oil", "Orange", "Onion",
        "Oven", "Owl", "Paper", "Panda",
        "Pants", "Palm", "Pasta", "Pumpkin",
        "Pharmacist", "Potato", "Parfume",
        "Panther", "Pad", "Pencil", "Pipe",
        "Police", "Pen", "Pharmacy",
        "Petrol station", "Police station",
        "Parrot", "Plane", "Pigeon", "Phone",
        "Peacock", "Pencil", "Pig", "Pouch",
        "Pagoda", "Pyramid", "Purse", "Pancake",
        "Popcorn", "Piano", "Physician",
        "Photographer", "Professor", "Painter",
        "Park", "Plant", "Parfume", "Radio",
        "Razor", "Ribs", "Rainbow", "Ring",
        "Rabbit", "Rice", "Refrigerator",
        "Remote", "Restaurant", "Road",
        "Surgeon", "Scale", "Shampoo", "Sink",
        "Salt", "Shark", "Sandals", "Shoulder",
        "Spoon", "Soap", "Sand", "Sheep",
        "Sari", "Stomach", "Stairs", "Soup",
        "Shoes",  "Scissors", "Sparrow",
        "Shirt", "Suitcase", "Stove",
        "Stairs", "Snowman", "Shower", "Swan",
        "Suit", "Sweater", "Smoke", "Skirt",
        "Sofa", "Socks", "Stadium", "Skyscraper",
        "School", "Sunglasses", "Sandals",
        "Slippers", "Shorts", "Sandwich",
        "Strawberry", "Spaghetti", "Shrimp",
        "Saxophone", "Sister", "Son", "Singer",
        "Senator", "Street", "Supermarket",
        "Swimming pool", "Star", "Sky", "Sun",
        "Spoon", "Ship", "Smile", "Table",
        "Turkey", "Tie", "Toes", "Truck",
        "Train", "Taxi", "Tiger", "Trousers",
        "Tongue", "Television", "Teacher",
        "Turtle", "Tablet", "Train station",
        "Toothpaste", "Tail", "Theater",
        "Trench coat", "Tea", "Tomato", "Teen",
        "Tunnel", "Temple", "Town", "Toothbrush",
        "Tree", "Toy", "Tissue", "Telephone",
        "Underwear", "Uncle", "Umbrella", "Vest",
        "Voice", "Veterinarian", "Villa", "Violin",
        "Village", "Vehicle", "Vase", "Wallet",
        "Wolf", "Waist", "Wrist", "Water melon",
        "Whale", "Water", "Wings", "Whisker",
        "Watch", "Woman", "Washing machine",
        "Wheelchair", "Waiter", "Wound",
        "Xylophone", "Zebra", "Zoo"
    ]
    tag_names = rng.choice(tag_vocabulary, size=tag_count)

    # Creation times are uniform over the site's lifetime and sorted, so that
    # tag_id order equals creation order.
    creation_timestamps = np.sort(rng.integers(
        low=int(mktime(fasttime.parse_datetime("2009-05-14").timetuple())),
        high=int(mktime(fasttime.parse_datetime("2023-12-28").timetuple())),
        size=tag_count
    ))

    # Tags created before the cut-off are all accepted (19); newer tags get a
    # random tag status. The cut-off is parsed once, not once per row.
    status_cutoff = mktime(fasttime.parse_datetime("2021-04-03").timetuple())
    temp_status_id = rng.choice(
        a=[16, 17, 18, 19, 20],
        size=tag_count,
        p=[0.15, 0.01, 0.01, 0.71, 0.12]
    )
    tag_status = np.where(creation_timestamps < status_cutoff, 19, temp_status_id)

    # Rows are already in creation order, so no additional sort is required.
    pd.DataFrame({
        "tag_id": np.arange(tag_count),
        "type_category": "general",
        "name": tag_names,
        "description": "",
        "status_id": tag_status,
        "creation_timestamp": creation_timestamps,
    }).to_csv("./database/csv/tags_table.csv", index=False)

    del tag_vocabulary, tag_names, temp_status_id, creation_timestamps, tag_status, tag_count, status_cutoff

## Users Table (50,000 rows)

In [45]:
if not path.exists("./database/csv/users_table.csv"):
    # Generates the users table: sequential ids, random 14-character usernames,
    # sorted creation times, a status and a permission level.
    user_count = 50_000
    preregistered_count = 120  # users that all share the launch timestamp
    string = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_"

    # Date boundaries are parsed once instead of once per generated row.
    launch_time = int(mktime(fasttime.parse_datetime("2009-05-14").timetuple()))
    last_time = int(mktime(fasttime.parse_datetime("2023-12-28").timetuple()))
    status_cutoff = mktime(fasttime.parse_datetime("2023-12-22").timetuple())

    # Usernames are random and not checked for uniqueness.
    usernames = [
        "".join(characters)
        for characters in rng.choice(a=list(string), size=(user_count, 14))
    ]

    # The preregistered users come first, followed by the remaining users in
    # ascending creation order, so user_id order equals creation order.
    user_timestamps = np.concatenate([
        np.full(preregistered_count, launch_time),
        np.sort(rng.integers(low=launch_time, high=last_time, size=user_count-preregistered_count)),
    ])

    # Users created before the cut-off are all "user_acceptable" (7); newer
    # users get a random user status.
    temp_status_id = rng.choice(
        a=range(7, 12),
        size=user_count,
        p=[0.88, 0.01, 0.02, 0.01, 0.08]
    )
    status_id = np.where(user_timestamps < status_cutoff, 7, temp_status_id)

    permission_level = rng.choice(
        a=range(4),
        size=user_count,
        p=[0.832388, 0.166522, 0.001064, 0.000026]
    )

    pd.DataFrame({
        "user_id": np.arange(user_count),
        "username": usernames,
        "creation_timestamp": user_timestamps,
        "status_id": status_id,
        "permission_id": permission_level,
    }).to_csv("./database/csv/users_table.csv", index=False)
    del (
        string, usernames, user_timestamps, temp_status_id, status_id, permission_level,
        user_count, preregistered_count, launch_time, last_time, status_cutoff
    )

## Images Table (1,000,000 rows)

In [48]:
if not path.exists("./database/csv/images_table.csv"):
    # Generates the images table in batches and streams each batch to CSV.
    # Every batch covers its own slice of the overall time range, so rows are
    # in ascending upload order across the whole file.

    # Constants
    batch_size = 100_000
    batch_count = 1_000_000//batch_size
    # Always 0: time_dependent_random adds 1, so each image is linked to one user.
    random_func = lambda size: np.zeros(shape=size)
    start_time = mktime(fasttime.parse_datetime("2009-05-20").timetuple())
    end_time = mktime(fasttime.parse_datetime("2023-12-28").timetuple())
    batch_timediff = (end_time - start_time)/batch_count
    # Images uploaded before this moment are all "image_accepted" (3).
    # Parsed once here instead of once per generated row.
    status_cutoff = int(mktime(fasttime.parse_datetime("2023-12-21").timetuple()))
    user_timestamps = np.transpose(pd.read_csv("./database/csv/users_table.csv", usecols=["creation_timestamp"]).to_numpy())[0]

    # Image Resolution constants
    common_aspect_ratios = [(1, 1), (3, 2), (5, 4), (1, 2), (2, 1)]
    custom_uncommon_ratios = list(permutations(range(3, 11), r=2))

    image_sizes = ["800x600", "1080x1080", "1350x1080", "1280x720", "1240x1754", "1960x1080", "3840x2160"]
    pixelart_heights = [64, 128, 256, 512]
    custom_image_heights = [*range(300, 2050, 50)]

    custom_common_image_sizes = [
        f"{ratio[0]*height}x{ratio[1]*height}"
        for ratio in common_aspect_ratios
        for height in custom_image_heights
    ]
    custom_uncommon_image_sizes = [
        f"{ratio[0]*height}x{ratio[1]*height}"
        for ratio in custom_uncommon_ratios
        for height in custom_image_heights
    ]
    custom_pixelart_sizes = [
        f"{ratio[0]*height}x{ratio[1]*height}"
        for ratio in common_aspect_ratios
        for height in pixelart_heights
    ]

    # Candidate shapes and their probabilities; identical for every batch.
    # The fixed sizes carry 87% of the mass; the three generated groups share
    # 8%, 4% and 1% uniformly among their members.
    shape_choices = [
        *image_sizes,
        *custom_common_image_sizes,
        *custom_uncommon_image_sizes,
        *custom_pixelart_sizes,
    ]
    shape_weights = [
        0.03, 0.06, 0.10, 0.06, 0.03, 0.39, 0.20,
        *repeat(0.08/len(custom_common_image_sizes), len(custom_common_image_sizes)),
        *repeat(0.04/len(custom_uncommon_image_sizes), len(custom_uncommon_image_sizes)),
        *repeat(0.01/len(custom_pixelart_sizes), len(custom_pixelart_sizes)),
    ]

    # Description and blob uuid columns are left empty; the uuids are inserted
    # later by another script.
    empty_field = repeat("")

    # The guard above guarantees the file does not exist yet, so plain "w" is enough.
    with open("./database/csv/images_table.csv", "w") as output_file:
        output_file.write("image_id,source_url,blob_storage_uuid,shape,upload_timestamp,upload_date,status_id,description,uploader_id,likes,dislikes\n")

        for idx in range(batch_count):
            print(f"Batch {idx+1}")
            index = np.arange(batch_size*idx, batch_size*(idx+1))

            # Image URLs: four random numbers in 0-255 as host name labels
            urls = np.array([
                f"https://www.{a}.{b}.{c}.{d}"
                for a, b, c, d in rng.integers(0, 256, size=(batch_size, 4))
            ])

            # Image Resolutions
            image_shape = rng.choice(shape_choices, size=batch_size, p=shape_weights)

            # Timestamps, sorted so that image_id order equals upload order
            image_timestamps = np.sort(rng.integers(
                low=floor(start_time+batch_timediff*idx),
                high=floor(start_time+batch_timediff*(idx+1)),
                size=batch_size
            ))

            # Upload date = the timestamp's UTC calendar day passed through
            # mktime (which reads it as local time). Only the distinct days of
            # the batch are parsed; day_index maps them back onto the rows.
            utc_days = image_timestamps.astype("datetime64[s]").astype("datetime64[D]")
            unique_days, day_index = np.unique(utc_days, return_inverse=True)
            image_dates = np.array([
                int(mktime(fasttime.parse_datetime(str(day)).timetuple()))
                for day in unique_days
            ])[day_index]

            # Status ID: random only for images uploaded after the cut-off
            temp_status_id = rng.choice(
                a=range(7),
                size=batch_size,
                p=[0.05, 0.01, 0.02, 0.74, 0.03, 0.01, 0.14]
            )
            image_status = np.where(image_timestamps < status_cutoff, 3, temp_status_id)

            # Uploader ID: one user per image, chosen among users created no later than it
            uploader_id = time_dependent_random(independent_time=user_timestamps, dependent_time=image_timestamps, random_func=random_func, offset=0)[:, 0]
            if uploader_id.shape[0] != batch_size:
                # Images with no eligible user are the earliest ones of the batch
                # (timestamps are sorted); mark them with -1 at the front.
                uploader_id = np.concatenate([np.full(shape=batch_size-uploader_id.shape[0], fill_value=-1), uploader_id])

            # Likes and Dislikes
            likes = rng.geometric(0.01, size=batch_size)
            dislikes = rng.geometric(0.02, size=batch_size)

            output_file.writelines(
                ",".join(row)+"\n"
                for row
                in zip(
                    index.astype(str), urls.astype(str), empty_field,
                    image_shape.astype(str), image_timestamps.astype(str),
                    image_dates.astype(str), image_status.astype(str),
                    empty_field, uploader_id.astype(str), likes.astype(str),
                    dislikes.astype(str)
                )
            )

    del (
        batch_size, batch_count, common_aspect_ratios, custom_uncommon_ratios, image_sizes, pixelart_heights,
        custom_image_heights, custom_pixelart_sizes, custom_common_image_sizes, custom_uncommon_image_sizes,
        shape_choices, shape_weights, empty_field, idx, urls, image_shape, image_timestamps, utc_days,
        unique_days, day_index, image_dates, temp_status_id, image_status, uploader_id, likes, dislikes,
        index, start_time, end_time, batch_timediff, status_cutoff, user_timestamps, output_file
    )

Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
Batch 8
Batch 9
Batch 10


## Comments Table (2,500,000 rows)

In [49]:
if not path.exists("./database/csv/comments_table.csv"):
    # Generates the comments table in batches and streams each batch to CSV.
    # Every batch covers its own slice of the overall time range, so rows are
    # in ascending creation order across the whole file.
    batch_size = 100_000
    batch_count = 2_500_000//batch_size
    start_time = mktime(fasttime.parse_datetime("2009-05-20").timetuple())
    end_time = mktime(fasttime.parse_datetime("2023-12-28").timetuple())
    batch_timediff = (end_time - start_time)/batch_count
    # Comments created before this moment are all "comment_acceptable" (12).
    # Parsed once here instead of once per generated row.
    status_cutoff = int(mktime(fasttime.parse_datetime("2023-12-21").timetuple()))

    string = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

    # The guard above guarantees the file does not exist yet, so plain "w" is enough.
    with open("./database/csv/comments_table.csv", "w") as file:
        file.write("comment_id,status_id,content,creation_timestamp,edited,likes,dislikes\n")

        for idx in range(batch_count):
            print(f"Batch {idx+1}")
            index = np.arange(batch_size*idx, batch_size*(idx+1)).astype(str)

            # Random text: the inner pass joins the 6 characters along axis 1
            # into words, the outer pass joins the resulting 8 words with spaces.
            # The alphabet has no commas, so the CSV needs no quoting.
            comment_content = np.apply_along_axis(
                func1d=lambda x: " ".join(x),
                axis=1,
                arr=np.apply_along_axis(
                    func1d=lambda x:"".join(x),
                    axis=1,
                    arr=rng.choice(a=list(string), size=(batch_size, 6, 8))
                )
            )

            # Integer bounds via floor(), matching the images cell; the batch
            # boundaries themselves are floats.
            comment_timestamps = np.sort(rng.integers(
                low=floor(start_time+batch_timediff*idx),
                high=floor(start_time+batch_timediff*(idx+1)),
                size=batch_size
            ))

            # Status: random only for comments created after the cut-off.
            temp_status_id = rng.choice(
                a=range(12, 16),
                size=batch_size,
                p=[0.97, 0.0015, 0.0085, 0.02]
            )
            comment_status = np.where(comment_timestamps < status_cutoff, 12, temp_status_id)

            comment_edited = rng.integers(low=0, high=2, size=batch_size)
            comment_likes = rng.geometric(0.1, size=batch_size)
            comment_dislikes = rng.geometric(0.2, size=batch_size)

            # Write the cut-off-adjusted status; the raw random draw was being
            # written here before, which left the adjusted status unused.
            file.writelines(
                ",".join(row)+"\n"
                for row
                in zip(
                    index,
                    comment_status.astype(str), comment_content.astype(str), comment_timestamps.astype(str),
                    comment_edited.astype(str), comment_likes.astype(str), comment_dislikes.astype(str)
                )
            )

    del (
        batch_size, batch_count, file, idx, temp_status_id,
        comment_status, comment_content, comment_timestamps, comment_edited,
        comment_likes, comment_dislikes, index, string, start_time, end_time,
        batch_timediff, status_cutoff
    )

Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
Batch 8
Batch 9
Batch 10
Batch 11
Batch 12
Batch 13
Batch 14
Batch 15
Batch 16
Batch 17
Batch 18
Batch 19
Batch 20
Batch 21
Batch 22
Batch 23
Batch 24
Batch 25


## Image Tag Junction Table 

In [49]:
if not path.exists("./database/csv/image_tag_junction.csv"):
    # Builds the image <-> tag junction table. Each output row is
    # "tag_index,image_index" with no header line; an image is only linked to
    # tags whose creation timestamp is not later than its upload timestamp.
    batch_size = 100_000
    # Flattened to 1-D: to_numpy() on a single-column frame returns shape (n, 1).
    tag_timestamps = pd.read_csv("./database/csv/tags_table.csv", usecols=["creation_timestamp"]).to_numpy().ravel()
    # Extra tags per image; time_dependent_random adds 1 and rounds the value.
    random_func = lambda size: np.random.lognormal(mean=np.log(3), sigma=0.5, size=size)

    # Stream the images table in chunks of batch_size rows. The earlier loop
    # mixed `for line in file` with file.readline(), which silently discarded
    # one row per batch and shifted every later image id.
    image_batches = pd.read_csv(
        "./database/csv/images_table.csv",
        usecols=["upload_timestamp"],
        chunksize=batch_size
    )

    with open("./database/csv/image_tag_junction.csv", "w") as output_file:
        for batch_idx, image_batch in enumerate(image_batches):
            print(f"Batch {batch_idx+1}")

            image_timestamps = image_batch["upload_timestamp"].to_numpy().astype(int)

            # The images table is written with sequential ids starting at 0, so
            # a row's id is its position in the chunk plus the chunk offset.
            result = time_dependent_random(
                independent_time=tag_timestamps,
                dependent_time=image_timestamps,
                random_func=random_func,
                offset=batch_idx*batch_size
            ).astype(int)
            if result.shape[0] == 0:
                continue

            # Tags are drawn with replacement: drop duplicate pairs, then order
            # rows by image index (stable, so tag indices stay ascending per image).
            result = np.unique(result, axis=0)
            result = result[np.argsort(result[:, 1], kind="stable")].astype(str)

            output_file.writelines(
                ",".join(row)+"\n"
                for row
                in result
            )
    del batch_idx, batch_size, image_batch, image_batches, image_timestamps, output_file, tag_timestamps, result

Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
Batch 8
Batch 9
Batch 10


## User Comments Junction Table

In [3]:
# Builds the user <-> comment junction table. Each output row is
# "user_index,comment_index" with no header line; a comment is only linked to a
# user whose creation timestamp is not later than the comment's.
# The guard must test the same file name that is written below; it previously
# checked "user_comments_junction.csv", which is never created, so every re-run
# appended a second copy of the data.
if not path.exists("./database/csv/user_comment_junction.csv"):
    batch_size = 100_000
    user_timestamps = np.transpose(pd.read_csv("./database/csv/users_table.csv", usecols=["creation_timestamp"]).to_numpy())[0]
    # Always 0: time_dependent_random adds 1, so each comment is linked to one user.
    random_func = lambda size: np.zeros(shape=size)

    # Stream the comments table in chunks of batch_size rows. The earlier loop
    # mixed `for line in file` with file.readline(), which silently discarded
    # one row per batch and shifted every later comment id.
    comment_batches = pd.read_csv(
        "./database/csv/comments_table.csv",
        usecols=["creation_timestamp"],
        chunksize=batch_size
    )

    with open("./database/csv/user_comment_junction.csv", "w") as output_file:
        for batch_idx, comment_batch in enumerate(comment_batches):
            print(f"Batch {batch_idx+1}")

            comment_timestamps = comment_batch["creation_timestamp"].to_numpy().astype(int)

            # Draw the links once and write exactly that draw. The comments
            # table is written with sequential ids starting at 0, so a row's id
            # is its position in the chunk plus the chunk offset.
            result = time_dependent_random(
                independent_time=user_timestamps,
                dependent_time=comment_timestamps,
                random_func=random_func,
                offset=batch_idx*batch_size
            )

            output_file.writelines(
                ",".join(row)+"\n"
                for row
                in result.astype(str)
            )


Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
Batch 8
Batch 9
Batch 10
Batch 11
Batch 12
Batch 13
Batch 14
Batch 15
Batch 16
Batch 17
Batch 18
Batch 19
Batch 20
Batch 21
Batch 22
Batch 23
Batch 24
Batch 25
