# Map uids
* Creates integer uid encodings for the username and mediaid fields, and records the timestamp range (min/max) for the timestamp field

In [None]:
import DataFrames: DataFrame
import Dates
import Glob
import ProgressMeter
import DataFrames
import CSV
import Random
import JupyterFormatter

In [None]:
# Enable JupyterFormatter's autoformat mode for this notebook session
# (presumably reformats code cells automatically — confirm against the package docs).
JupyterFormatter.enable_autoformat();

## Timestamps

In [None]:
# Directory holding the raw `user_<media>_list.*.csv` shards read by this notebook.
source_dir = "../../data/raw_data"
# Directory where the generated timestamp and uid-mapping CSV files are written.
outdir = "../../data/processed_data";

In [None]:
"""
    save_timestamps()

Scan the `updated_at` column of every raw manga and anime list shard under `source_dir`
and write the timestamp range to `timestamps.csv` inside `outdir`.

The output file has two rows, `min_timestamp,<int>` and `max_timestamp,<int>`. The minimum
is fixed at the Unix time of 2000-01-01; the maximum is the largest `updated_at` value
found across all shards. Missing `updated_at` entries and empty shards are ignored.

Throws an error when no timestamp is found at all, or when the largest timestamp does not
exceed the fixed minimum.
"""
function save_timestamps()
    # no rating site existed before then
    min_timestamp = Dates.datetime2unix(Dates.DateTime(2000, 1, 1))
    max_timestamp = -Inf
    # Use the shared `source_dir` so this agrees with `get_unique_values`.
    files = reduce(
        vcat,
        [
            Glob.glob("$source_dir/user_$(media)_list.[0-9]*.csv") for
            media in ["manga", "anime"]
        ],
    )
    ProgressMeter.@showprogress for file in files
        df = CSV.read(file, DataFrame, select = [:updated_at])
        # `skipmissing` keeps a single missing entry from turning the running maximum
        # into `missing`; `init` makes an empty (or all-missing) shard a no-op.
        shard_max = maximum(skipmissing(df.updated_at); init = -Inf)
        max_timestamp = max(max_timestamp, shard_max)
    end
    # Explicit checks instead of `@assert`, which may be compiled out.
    isfinite(max_timestamp) ||
        error("no finite `updated_at` value found in the list files under $source_dir")
    max_timestamp > min_timestamp ||
        error("max timestamp $max_timestamp does not exceed min timestamp $min_timestamp")
    open(joinpath(outdir, "timestamps.csv"), "w") do f
        write(f, "min_timestamp,$(Int(min_timestamp))\n")
        write(f, "max_timestamp,$(Int(max_timestamp))\n")
    end
end;

## User and media ids

In [None]:
"""
    get_unique_values(media, col)

Collect the distinct values of column `col` across every
`user_<media>_list.[0-9]*.csv` shard found under `source_dir`.

# Arguments
- `media`: media name interpolated into the file pattern (the notebook uses `"manga"`
  and `"anime"`).
- `col`: name of the column to read from each shard.

# Returns
A `Set` of the distinct values. The set starts out as `Set{String}` and its element type
is widened with the same promotion rule `union` applies, so non-string columns are
supported as well.
"""
function get_unique_values(media, col)
    files = sort(Glob.glob("$source_dir/user_$(media)_list.[0-9]*.csv"))
    uniques = Set{String}()
    ProgressMeter.@showprogress for file in files
        # `df[!, col]` accesses the column without the copy that `df[:, col]` makes.
        column = CSV.read(file, DataFrame, select = [col])[!, col]
        if promote_type(eltype(uniques), eltype(column)) == eltype(uniques)
            # Common case: grow the accumulated set in place rather than rebuilding
            # the whole set for every shard.
            union!(uniques, column)
        else
            # The column needs a wider element type; `union` allocates a widened set.
            uniques = union(uniques, column)
        end
    end
    return uniques
end;

In [None]:
"""
    shuffle_usernames(; rng = Random.default_rng())

Assign a random zero-based integer `uid` to every distinct `userid` found in the manga
and anime list files, and write the mapping to `username_to_uid.csv` in `outdir`.

# Keywords
- `rng`: random number generator used for the shuffle. Defaults to the task-local default
  generator, which matches the previous behavior; pass a seeded generator to make the
  shuffle repeatable for the same pre-shuffle ordering.

# Returns
The value returned by `CSV.write`.
"""
function shuffle_usernames(; rng = Random.default_rng())
    all_userids =
        get_unique_values("manga", :userid) ∪ get_unique_values("anime", :userid)
    usernames = DataFrame(userid = collect(all_userids))
    # Shuffling the id column in place and then numbering rows 0..n-1 gives every
    # userid a random uid.
    Random.shuffle!(rng, usernames.userid)
    usernames.uid = 0:(DataFrames.nrow(usernames)-1)
    return CSV.write("$outdir/username_to_uid.csv", usernames, writeheader = true)
end;

In [None]:
"""
    shuffle_media_ids(media; rng = Random.default_rng())

Assign a random zero-based integer `uid` to every distinct `mediaid` found in the list
files for `media`, and write the mapping to `<media>_to_uid.csv` in `outdir`.

# Arguments
- `media`: media name used both to select the input files and to name the output file
  (the notebook uses `"manga"` and `"anime"`).

# Keywords
- `rng`: random number generator used for the shuffle. Defaults to the task-local default
  generator, which matches the previous behavior; pass a seeded generator to make the
  shuffle repeatable for the same pre-shuffle ordering.

# Returns
The value returned by `CSV.write`.
"""
function shuffle_media_ids(media; rng = Random.default_rng())
    items = DataFrame(mediaid = collect(get_unique_values(media, :mediaid)))
    # Shuffling the id column in place and then numbering rows 0..n-1 gives every
    # mediaid a random uid.
    Random.shuffle!(rng, items.mediaid)
    items.uid = 0:(DataFrames.nrow(items)-1)
    return CSV.write("$outdir/$(media)_to_uid.csv", items, writeheader = true)
end;

# Process files

In [None]:
# Write timestamps.csv (min/max timestamp rows) into the output directory.
save_timestamps();

In [None]:
# Write username_to_uid.csv, mapping each userid to a shuffled integer uid.
shuffle_usernames();

In [None]:
# Build one shuffled mediaid -> uid mapping file per media type.
foreach(shuffle_media_ids, ["manga", "anime"]);