# Feature engineering 
## TODO document

In [None]:
using DataFrames

In [None]:
"""
    get_split(split, transpose)

Load the dataset for `split` through the one-argument `get_split` method and
return its adjoint (`'`) when `transpose` is true, or the dataset unchanged
otherwise.
"""
function get_split(split, transpose)
    data = get_split(split)
    if transpose
        return data'
    end
    return data
end;

In [None]:
"""
    get_dep(split)

Return the `rating` field of the dataset loaded for `split` (the dependent
variable).
"""
function get_dep(split)
    data = get_split(split)
    return data.rating
end

"""
    get_indep(split, alphas)

Build the matrix of independent variables for `split`: one row per entry of the
split's `user` field and one column per element of `alphas`. Column `j` is the
`rating` field returned by `get_alpha(alphas[j], split)`.
"""
function get_indep(split, alphas)
    nrows = length(get_split(split).user)
    X = zeros(nrows, length(alphas))
    for (j, alpha) in enumerate(alphas)
        X[:, j] = get_alpha(alpha, split).rating
    end
    return X
end;

In [None]:
"""
    fill_feature(split, transpose, user_feature)

Expand a per-id lookup table into a per-row feature vector for `split`.

Row `j` of the result is `user_feature[id]`, where `id` is the `j`-th entry of
the `user` field of `get_split(split, transpose)`. Ids larger than
`length(user_feature)` keep the default value `0.0`.
"""
function fill_feature(split, transpose, user_feature)
    ids = get_split(split, transpose).user
    nknown = length(user_feature)
    out = zeros(length(ids))
    # Each iteration writes only its own slot of `out`.
    @tprogress Threads.@threads for row = 1:length(ids)
        id = ids[row]
        if id <= nknown
            out[row] = user_feature[id]
        end
    end
    return out
end;

In [None]:
"""
    count_feature(split, transpose)

Count how many training rows each id in the `user` field of
`get_split("training", transpose)` has, then expand those counts to one value
per row of `split` via `fill_feature`.

Per the original author's notes: when `transpose` is true this is the number of
rated items per user; when `transpose` is false it is the number of rated users
per item.
"""
function count_feature(split, transpose)
    users = get_split("training", transpose).user
    counts = zeros(maximum(users))
    # Accumulate serially: several rows share the same id, so doing `+= 1`
    # from multiple threads would race on the same slot and lose increments.
    for u in users
        counts[u] += 1
    end
    return fill_feature(split, transpose, counts)
end;

In [None]:
"""
    bias_feature(split, transpose)

Look up the learned bias for every row of `split`.

Reads the `"UserItemBiases"` parameters, selects entry `"a"` when `transpose`
is true and `"u"` otherwise, and expands it to one value per row via
`fill_feature`.
"""
function bias_feature(split, transpose)
    # The training split is not needed here; only the stored biases are read.
    bias_param = transpose ? "a" : "u"
    user_bias = read_params("UserItemBiases")[bias_param]
    return fill_feature(split, transpose, user_bias)
end;

In [None]:
"""
    std_feature(split, transpose)

Compute, for each id in the training `user` field, the root-mean-square
deviation of its training ratings from its expected rating, then expand the
result to one value per row of `split` via `fill_feature`.

The expected rating of id `u` is its own `"UserItemBiases"` entry plus the mean
of the opposite bias vector (`"a"`/`"u"` when `transpose` is true, `"u"`/`"a"`
otherwise). Ids with no training rows get `0.0`.
"""
function std_feature(split, transpose)
    training = get_split("training", transpose)
    users = training.user
    ratings = training.rating

    biases = read_params("UserItemBiases")
    own, other = transpose ? ("a", "u") : ("u", "a")
    user_means = biases[own] .+ mean(biases[other])

    nusers = maximum(users)
    sq_err = zeros(nusers)
    counts = zeros(nusers)
    # Accumulate serially: rows sharing an id update the same slot, so a
    # threaded `+=` here would be a data race.
    for i in eachindex(users)
        u = users[i]
        # The rating belongs to row `i`; the mean belongs to id `u`.
        sq_err[u] += (ratings[i] - user_means[u])^2
        counts[u] += 1
    end

    # Turn the summed squared error into a standard deviation.
    for u in eachindex(sq_err)
        if counts[u] > 0
            sq_err[u] = sqrt(sq_err[u] / counts[u])
        end
    end
    return fill_feature(split, transpose, sq_err)
end;

In [None]:
"""
    popularity_feature(split, transpose)

Return, for every row of `split`, the average popularity of all items the
row's user has seen in training.

Popularity of an item is its `"UserItemBiases"` entry (`"u"` when `transpose`
is true, `"a"` otherwise). Ids with no training rows get `0.0`.
"""
function popularity_feature(split, transpose)
    training = get_split("training", transpose)
    users = training.user
    items = training.item
    item_means = read_params("UserItemBiases")[transpose ? "u" : "a"]

    nusers = maximum(users)
    totals = zeros(nusers)
    counts = zeros(nusers)
    # Accumulate serially: rows sharing a user update the same slot, so a
    # threaded `+=` here would be a data race.
    for (u, a) in zip(users, items)
        totals[u] += item_means[a]
        counts[u] += 1
    end

    # Convert the per-user sums into averages.
    for u in eachindex(totals)
        if counts[u] > 0
            totals[u] /= counts[u]
        end
    end
    return fill_feature(split, transpose, totals)
end;

In [None]:
"""
    item_feature(split, col; categorical)

Build a per-row feature matrix for `split` from column `col` of the anime
metadata table.

The metadata is read from `anime_to_uid.csv` joined with `anime.csv` on
`"anime_id"`. When `categorical` is true the column is one-hot encoded over its
sorted distinct values (one output column per value); otherwise the raw value
is used as a single column. Item ids beyond the metadata table yield zeros in
both modes.
"""
function item_feature(split, col; categorical)
    # get anime dataframe
    anime_to_uid = DataFrame(CSV.File("../../data/processed_data/anime_to_uid.csv"))
    # presumably shifts 0-based uids to Julia's 1-based indexing — TODO confirm
    anime_to_uid.uid .+= 1
    anime = DataFrame(CSV.File("../../data/raw_data/anime.csv"))
    anime_to_uid = innerjoin(anime_to_uid, anime, on = "anime_id")

    # subset column of interest; uids absent from the join keep the first
    # row's value as a placeholder
    anime_to_col = fill(anime_to_uid[1, col], maximum(anime_to_uid.uid))
    for i = 1:size(anime_to_uid)[1]
        anime_to_col[anime_to_uid[i, "uid"]] = anime_to_uid[i, col]
    end
    nknown = length(anime_to_col)

    # do a 1-hot encoding for categorical variables
    encoding = categorical ? sort(collect(Set(anime_to_col))) : nothing
    ncols = categorical ? length(encoding) : 1
    # Bound exactly once so the threaded loop below captures a fixed binding.
    encoding_fn = if categorical
        item -> item > nknown ? zeros(ncols) : (encoding .== anime_to_col[item])
    else
        # Same out-of-range guard as the categorical path: unknown items
        # contribute 0 instead of raising a BoundsError.
        item -> item > nknown ? 0.0 : anime_to_col[item]
    end

    items = get_split(split).item
    feature = zeros(length(items), ncols)
    # Each iteration writes only its own row of `feature`.
    @tprogress Threads.@threads for j = 1:length(items)
        feature[j, :] .= encoding_fn(items[j])
    end
    return feature
end;

In [None]:
"""
    genre_embedding()

Build a multi-hot genre matrix with one row per item id in `1:num_items()` and
one column per distinct genre (sorted).

Genres are read from the `"genres"` column of `anime.csv` joined with
`anime_to_uid.csv` on `"anime_id"`. Each genre string is parsed by taking the
text between single quotes. Item ids beyond the metadata table get an all-zero
row.
"""
function genre_embedding()
    # load and join the anime metadata
    uid_table = DataFrame(CSV.File("../../data/processed_data/anime_to_uid.csv"))
    uid_table.uid .+= 1
    anime = DataFrame(CSV.File("../../data/raw_data/anime.csv"))
    joined = innerjoin(uid_table, anime, on = "anime_id")
    nrows = size(joined)[1]

    # raw genre string per uid; uids absent from the join keep the first
    # row's value as a placeholder
    col = "genres"
    genres_by_uid = fill(joined[1, col], maximum(joined.uid))
    for r = 1:nrows
        genres_by_uid[joined[r, "uid"]] = joined[r, col]
    end

    # every second token of a split on `'` is the text inside a quote pair
    parse_genres(str) = Base.split(str, "'")[2:2:end]

    # sorted vocabulary of every genre that appears in the joined table
    per_row = [parse_genres(joined[r, "genres"]) for r = 1:nrows]
    vocabulary = sort(collect(Set(vcat(per_row...))))

    # multi-hot row for one item id
    function multi_hot(item)
        row = zeros(length(vocabulary))
        item > length(genres_by_uid) && return row
        for genre in parse_genres(genres_by_uid[item])
            row .+= vocabulary .== genre
        end
        return row
    end

    item_ids = 1:num_items()
    features = zeros(length(item_ids), length(vocabulary))
    # Each iteration writes only its own row of `features`.
    @tprogress Threads.@threads for j = 1:length(item_ids)
        features[j, :] .= multi_hot(item_ids[j])
    end
    return features
end;

In [None]:
"""
    get_xgboost_features(split)

Assemble the full feature matrix for `split` by horizontally concatenating, in
order: the count, bias, std and popularity features (each with `transpose` true
then false), followed by the `"medium"` and `"source"` categorical item
features and the numeric `"num_episodes"` item feature.
"""
function get_xgboost_features(split)
    columns = Any[]

    # features computed for both orientations, transposed first
    for feature_fn in (count_feature, bias_feature, std_feature, popularity_feature),
        flag in (true, false)

        push!(columns, feature_fn(split, flag))
    end

    # item metadata features
    for (col, is_categorical) in
        (("medium", true), ("source", true), ("num_episodes", false))
        push!(columns, item_feature(split, col, categorical = is_categorical))
    end

    return hcat(columns...)
end