In [None]:
import gc
import os
import random
import sys
from zipfile import ZipFile

import enlighten
import numpy as np
import pandas as pd
import psutil

# Directory whose entries are loaded as the raw input pickle files.
DATA_DIR = "ICWSM19_data"
# Output of the first stage: one pickle per calendar day (named YYYY-MM-DD.pkl).
BY_DAY_OUTPUT_DIR = "BY_DAY"
# Output of the second stage: one sub-directory per day holding numbered .tsv chunks.
BY_DAY_CHUNKED_OUTPUT_DIR = "BY_DAY_CHUNKED"
# Output of the third stage: one sub-directory per day holding the zipped chunk files.
BY_DAY_ZIPPED_OUTPUT_DIR = "BY_DAY_ZIPPED"

In [None]:
# Stage 1: re-bucket every raw pickle in DATA_DIR into one pickle per calendar day.
# The output directory must not exist yet: the loop below appends to day files it
# finds, so running on top of an earlier run's output would store rows twice.
os.makedirs(BY_DAY_OUTPUT_DIR, exist_ok=False)

pickle_files = os.listdir(DATA_DIR)

with enlighten.get_manager() as manager:
    pickle_bar = manager.counter(total=len(pickle_files), desc="Loading pickle files")

    dfs_by_day = {}
    for pickle_file in pickle_files:
        # Safety valve: stop before loading another file once RAM is nearly full.
        mem_percent = psutil.virtual_memory().percent
        print("Memory usage: ", mem_percent)
        if mem_percent > 90:
            print("Memory usage too high, exiting")
            sys.exit(138)

        print(pickle_file)
        source_path = os.path.join(DATA_DIR, pickle_file)
        df = pd.read_pickle(source_path)
        df["created_at"] = pd.to_datetime(df["created_at"], format="ISO8601")
        # Group rows by the date part of their timestamp (time of day zeroed out).
        day_keys = pd.DatetimeIndex(df["created_at"]).normalize()
        by_day = df.groupby(day_keys)
        del df, day_keys

        print(f"Days in {pickle_file}: {len(by_day)}")
        for day, day_df in by_day:
            day_pickle = os.path.join(BY_DAY_OUTPUT_DIR, day.strftime("%Y-%m-%d.pkl"))
            if os.path.exists(day_pickle):
                # An earlier input file already contributed rows for this day:
                # keep them first and append the new ones after.
                day_df = pd.concat([pd.read_pickle(day_pickle), day_df])
            day_df.to_pickle(day_pickle)
            del day_df

        gc.collect()

        pickle_bar.update()

In [None]:
# Stage 2: cut every per-day pickle into randomly sized TSV chunks, written to
# BY_DAY_CHUNKED_OUTPUT_DIR/<day>/<chunk number>.tsv.
# exist_ok=False is deliberate: chunk sizes are random, so leftovers from an earlier
# run would end up mixed with the files written by this one.
os.makedirs(BY_DAY_CHUNKED_OUTPUT_DIR, exist_ok=False)

# Inclusive bounds for the random size of each chunk. The last chunk of a day takes
# whatever rows remain, so it may be smaller than MIN_CHATS_PER_CHUNK.
# NOTE(review): `random` is not seeded, so chunk boundaries differ between runs.
MIN_CHATS_PER_CHUNK = 40
MAX_CHATS_PER_CHUNK = 100

by_day_pickle_files = os.listdir(BY_DAY_OUTPUT_DIR)

TAB_CHAR = "\t"

for pickle_file in by_day_pickle_files:
    df = pd.read_pickle(os.path.join(BY_DAY_OUTPUT_DIR, pickle_file))
    original_len = len(df)
    print(f"Total chats in {pickle_file}: {len(df)}, unique indices: {len(df.index.unique())}")

    # Remove the columns that are not exported once, on the whole frame, instead of
    # mutating every slice in place (which triggers SettingWithCopyWarning).
    df = df.drop(columns=["commenter_type", "fragments", "offset", "updated_at"])

    # Walk through the frame in consecutive, randomly sized row windows.
    chunks = []
    start = 0
    while start < original_len:
        chunk_size = random.randint(MIN_CHATS_PER_CHUNK, MAX_CHATS_PER_CHUNK)
        chunks.append(df.iloc[start:start + chunk_size])
        start += chunk_size
    print(f"Total chunks: {len(chunks)}")
    assert sum(len(chunk) for chunk in chunks) == original_len, "chunking lost or duplicated rows"

    if chunks:
        # Directory is named after the pickle file without its extension.
        pkl_dir_name = os.path.join(BY_DAY_CHUNKED_OUTPUT_DIR, os.path.splitext(pickle_file)[0])
        os.makedirs(pkl_dir_name, exist_ok=True)
        for i, chunk in enumerate(chunks):
            chunk.to_csv(os.path.join(pkl_dir_name, f"{i}.tsv"), sep=TAB_CHAR, index=False)
    

In [None]:
# Stage 3: bundle each day's TSV chunks into a fixed number of zip archives, written to
# BY_DAY_ZIPPED_OUTPUT_DIR/<day>/<day>_<n>.zip.
# exist_ok=False makes a re-run fail instead of writing into an existing output tree.
os.makedirs(BY_DAY_ZIPPED_OUTPUT_DIR, exist_ok=False)

# NOTE: Should be EQUAL to the MAX_CONCURRENCY as set in settings.toml
ZIPS_PER_DAY = 4

dirs = os.listdir(BY_DAY_CHUNKED_OUTPUT_DIR)

# Loop/context variables are deliberately not called `dir` or `zip`: in a notebook they
# live in the global namespace and would overwrite the builtins for every later cell.
for day_dir in dirs:
    print(day_dir)
    chunked_dir = os.path.join(BY_DAY_CHUNKED_OUTPUT_DIR, day_dir)
    files = os.listdir(chunked_dir)

    zipped_dir = os.path.join(BY_DAY_ZIPPED_OUTPUT_DIR, day_dir)
    os.makedirs(zipped_dir, exist_ok=True)

    # array_split always returns ZIPS_PER_DAY groups (some possibly empty), so every
    # day gets the same number of archives regardless of how many chunks it has.
    split = np.array_split(files, ZIPS_PER_DAY)
    for i, zip_members in enumerate(split):
        zip_path = os.path.join(zipped_dir, f"{day_dir}_{i}.zip")
        with ZipFile(zip_path, "w") as archive:
            for file_name in zip_members:
                # Store each file flat inside the archive, under its bare file name.
                archive.write(os.path.join(chunked_dir, file_name), file_name)