# Feature engineering 
## TODO document

In [None]:
using DataFrames
import StatsBase

In [None]:
"""
    get_split(split, transpose)

Load `split` through the one-argument `get_split` method and return it. When
`transpose` is true the postfix `'` operator is applied to the loaded data first.
"""
function get_split(split, transpose)
    df = get_split(split)
    if transpose
        return df'
    end
    return df
end;

In [None]:
"""
    get_dep(split)

Return the dependent variable for `split`: the `rating` field of `get_split(split)`.
"""
function get_dep(split)
    df = get_split(split)
    return df.rating
end

"""
    get_indep(split, alphas)

Build the matrix of independent variables for `split`.

The result has one row per entry of `get_split(split).user` and one column per
element of `alphas`; column `j` holds `get_alpha(alphas[j], split).rating`.
"""
function get_indep(split, alphas)
    nrows = length(get_split(split).user)
    ncols = length(alphas)
    X = zeros(nrows, ncols)
    @showprogress for col in 1:ncols
        X[:, col] = get_alpha(alphas[col], split).rating
    end
    return X
end

"""
    get_errors(split, alphas)

Return the residuals of a linear blend of `alphas` on `split`.

The blend weights are always solved (via `\\`) on the `"validation"` split; the
returned value is `get_dep(split)` minus the blended prediction for `split`.
"""
function get_errors(split, alphas)
    X_validation = get_indep("validation", alphas)
    y_validation = get_dep("validation")
    β = X_validation \ y_validation
    residuals = get_dep(split) - get_indep(split, alphas) * β
    return residuals
end;

In [None]:
"""
    fill_feature(split, transpose, user_feature)

Expand a per-id lookup table into one value per row of `split`.

For every entry `id` of `get_split(split, transpose).user` the result holds
`user_feature[id]`. Ids greater than `length(user_feature)` keep the default `0.0`.
"""
function fill_feature(split, transpose, user_feature)
    ids = get_split(split, transpose).user
    table_size = length(user_feature)
    out = zeros(length(ids))
    @tprogress Threads.@threads for row in 1:length(ids)
        id = ids[row]
        # ids past the end of the lookup table are left at zero
        if id <= table_size
            out[row] = user_feature[id]
        end
    end
    return out
end;

In [None]:
"""
    count_feature(split, transpose)

Count how many training rows each id in the `user` field of
`get_split("training", transpose)` has, then expand those counts onto the rows of
`split` with `fill_feature`.

NOTE(review): the original comment says `transpose = true` gives the number of rated
items per user and `transpose = false` the number of rated users per item; confirm
this against the transpose convention of `get_split`.
"""
function count_feature(split, transpose)
    ids = get_split("training", transpose).user
    # one accumulator column per thread, summed after the loop
    per_thread = zeros(maximum(ids), Threads.nthreads())
    @tprogress Threads.@threads for row in 1:length(ids)
        per_thread[ids[row], Threads.threadid()] += 1
    end
    totals = sum(per_thread, dims = 2)
    return fill_feature(split, transpose, totals)
end;

In [None]:
"""
    bias_feature(split, transpose)

Return, for every row of `split`, the stored bias of that row's id.

The bias vector is read from the `"UserItemBiases"` parameters under key `"a"` when
`transpose` is true and `"u"` otherwise, then expanded onto the rows of `split`
with `fill_feature`.
"""
function bias_feature(split, transpose)
    bias_param = transpose ? "a" : "u"
    # the training split is not needed here: the biases come straight from the params
    user_bias = read_params("UserItemBiases")[bias_param]
    fill_feature(split, transpose, user_bias)
end;

In [None]:
"""
    std_feature(split, transpose)

Return, for every row of `split`, the root-mean-square deviation of the training
ratings that share the row's id from that id's expected rating.

The expected rating of an id is its own bias plus the mean of the opposite bias
vector, both read from the `"UserItemBiases"` parameters (`"a"` is the own key when
`transpose` is true, `"u"` otherwise). Ids with no training rows get `0`.
"""
function std_feature(split, transpose)
    # get sum squared error
    users = get_split("training", transpose).user
    ratings = get_split("training", transpose).rating
    biases = read_params("UserItemBiases")
    own_key, other_key = transpose ? ("a", "u") : ("u", "a")
    user_means = biases[own_key] .+ mean(biases[other_key])

    num_users = maximum(users)
    # one accumulator column per thread, summed after the loop
    sq_err_by_thread = zeros(num_users, Threads.nthreads())
    count_by_thread = zeros(num_users, Threads.nthreads())
    @tprogress Threads.@threads for i = 1:length(users)
        u = users[i]
        # `ratings` is aligned row-by-row with `users`, so it is indexed by the row
        # `i`; indexing it by the id `u` would pick up an unrelated row's rating
        sq_err_by_thread[u, Threads.threadid()] += (ratings[i] - user_means[u])^2
        count_by_thread[u, Threads.threadid()] += 1
    end
    user_to_training_std = sum(sq_err_by_thread, dims = 2)
    user_to_num_training_items = sum(count_by_thread, dims = 2)

    # transform to std
    @tprogress Threads.@threads for u = 1:num_users
        if user_to_num_training_items[u] > 0
            user_to_training_std[u] /= user_to_num_training_items[u]
        end
    end
    user_to_training_std .= sqrt.(user_to_training_std)
    fill_feature(split, transpose, user_to_training_std)
end;

In [None]:
"""
    popularity_feature(split, transpose)

Return, for every row of `split`, the average bias of the items that the row's id
is paired with in the training split.

Item biases are read from the `"UserItemBiases"` parameters under key `"u"` when
`transpose` is true and `"a"` otherwise. Ids with no training rows get `0`.
"""
function popularity_feature(split, transpose)
    users = get_split("training", transpose).user
    items = get_split("training", transpose).item
    item_key = transpose ? "u" : "a"
    item_means = read_params("UserItemBiases")[item_key]

    num_users = maximum(users)
    # one accumulator column per thread, summed after the loop
    pop_by_thread = zeros(num_users, Threads.nthreads())
    count_by_thread = zeros(num_users, Threads.nthreads())
    @tprogress Threads.@threads for row in 1:length(users)
        tid = Threads.threadid()
        pop_by_thread[users[row], tid] += item_means[items[row]]
        count_by_thread[users[row], tid] += 1
    end
    pop_total = sum(pop_by_thread, dims = 2)
    count_total = sum(count_by_thread, dims = 2)

    # turn the sums into averages, leaving ids without training rows at zero
    @tprogress Threads.@threads for id in 1:num_users
        if count_total[id] > 0
            pop_total[id] /= count_total[id]
        end
    end
    return fill_feature(split, transpose, pop_total)
end;

In [None]:
"""
    date_to_year(x)

Convert a date to a fractional year, `year + month / 12`; `missing` maps to `NaN`.

The month contributes `month / 12`, so a December date maps to `year + 1`.
"""
function date_to_year(x)
    ismissing(x) && return NaN
    whole_year = Dates.year(x)
    year_fraction = Dates.month(x) / 12
    return whole_year + year_fraction
end

"""
    get_anime()

Load the anime metadata table joined with its uid mapping.

Reads the uid mapping and the raw anime CSV, inner-joins them on `anime_id`, and
adds `start_year` / `end_year` columns computed with `date_to_year`. The `uid`
column is incremented by one (presumably 0-based to 1-based; confirm against the
file that writes it).
"""
function get_anime()
    uid_map = DataFrame(CSV.File("../../data/processed_data/anime_to_uid.csv"))
    uid_map.uid .+= 1
    raw_anime = DataFrame(CSV.File("../../data/raw_data/anime.csv", ntasks = 1))
    joined = innerjoin(uid_map, raw_anime, on = "anime_id")
    for (target, source) in (("start_year", "start_date"), ("end_year", "end_date"))
        joined[!, target] = date_to_year.(joined[:, source])
    end
    return joined
end

In [None]:
"""
    get_anime_column(anime_to_uid, col)

Return a vector of length `num_items()` holding column `col` of `anime_to_uid`,
indexed by the table's `uid` column.

Values that are `missing`, a `NaN` number, or the string `"NaN"` are treated as
absent. Absent values, and uids with no row in the table, take the most common
non-absent value of the column.
"""
function get_anime_column(anime_to_uid, col)
    #TODO is there a better way of handling missing values?
    is_missing(x) = ismissing(x) || (isa(x, Real) ? isnan(x) : x == "NaN")

    column = anime_to_uid[:, col]
    default = StatsBase.mode(filter(!is_missing, column))
    anime_to_col = fill(default, num_items())
    for (uid, val) in zip(anime_to_uid[:, "uid"], column)
        is_missing(val) || (anime_to_col[uid] = val)
    end
    return anime_to_col
end

In [None]:
"""
    item_feature(split, col; categorical)

Return a matrix with one row per entry of `get_split(split).item`, holding the
anime metadata column `col` for that item.

With `categorical = false` the matrix has a single column containing the raw
value. With `categorical = true` it is a one-hot encoding over the sorted distinct
values of the column; items beyond the end of the metadata vector get an all-zero
row.
"""
function item_feature(split, col; categorical)
    # subset column of interest
    anime_to_col = get_anime_column(get_anime(), col)

    if categorical
        # do a 1-hot encoding for categorical variables
        encoding = sort(unique(anime_to_col))
        @debug "$col categories: $encoding"
        ncols = length(encoding)
        encoding_fn =
            item ->
                item > length(anime_to_col) ? zeros(length(encoding)) :
                encoding .== anime_to_col[item]
    else
        ncols = 1
        encoding_fn = item -> anime_to_col[item]
    end

    items = get_split(split).item
    feature = zeros(length(items), ncols)
    @tprogress Threads.@threads for row in 1:length(items)
        feature[row, :] .= encoding_fn(items[row])
    end
    return feature
end;

In [None]:
"""
    item_count_feature(split, col, to_category = identity)

Return, for every row of `split`, the number of training rows in which the row's
user is paired with an item of the same category as the row's item.

Categories are the distinct values of the anime metadata column `col` after
`to_category` has been applied to each value. Users whose id is larger than the
largest user id in the training split get a count of `0`.
"""
function item_count_feature(split, col, to_category = identity)
    # subset column of interest
    anime_to_uid = get_anime()
    anime_to_uid[!, col] = to_category.(anime_to_uid[:, col])
    anime_to_col = get_anime_column(anime_to_uid, col)

    categories = sort(collect(Set(collect(anime_to_col))))
    @debug "$col categories: $categories"
    # resolve each item's category index once, instead of searching `categories`
    # again for every row of the training and target splits
    item_to_category = [findfirst(==(value), categories) for value in anime_to_col]

    # get the number of items per user-category
    training_users = get_split("training").user
    training_items = get_split("training").item
    num_users = maximum(training_users)
    # one accumulator slab per thread, summed after the loop
    counts_by_thread = zeros(num_users, length(categories), Threads.nthreads())
    @tprogress Threads.@threads for i = 1:length(training_users)
        category = item_to_category[training_items[i]]
        counts_by_thread[training_users[i], category, Threads.threadid()] += 1
    end
    user_to_num_training_items = sum(counts_by_thread, dims = 3)

    users = get_split(split).user
    items = get_split(split).item
    feature = zeros(length(items))
    @tprogress Threads.@threads for i = 1:length(items)
        # users that never appear in training have no counts; leave them at zero,
        # mirroring the bounds guard in fill_feature
        if users[i] <= num_users
            feature[i] = user_to_num_training_items[users[i], item_to_category[items[i]]]
        end
    end
    feature
end;

In [None]:
"""
    get_xgboost_features(split, alphas)

Assemble the full feature matrix for `split` by horizontally concatenating, in
order: the shared count/bias/std/popularity features (each for `transpose = true`
then `false`), the per-item metadata features, and the per-user counts of items
seen in each metadata category.

`alphas` is accepted but not used by this function.
"""
function get_xgboost_features(split, alphas)
    round_to_multiple(x, n) = round(x / n) * n
    bucket_year(x) = round_to_multiple(x, 5)

    # coarser buckets as the episode count grows; NaN passes through unchanged
    function bucket_num_episodes(x)
        if isnan(x)
            x
        elseif x <= 6
            round_to_multiple(x, 1)
        elseif x <= 26
            round_to_multiple(x, 13)
        elseif x <= 52
            round_to_multiple(x, 26)
        else
            100
        end
    end

    function bucket_duration(x)
        step = x <= 1800 ? 600 : 1800
        return round_to_multiple(x, step)
    end

    # shared user/item features
    shared = hcat(
        count_feature(split, true),
        count_feature(split, false),
        bias_feature(split, true),
        bias_feature(split, false),
        std_feature(split, true),
        std_feature(split, false),
        popularity_feature(split, true),
        popularity_feature(split, false),
    )

    # item only features
    item_only = hcat(
        item_feature(split, "start_year", categorical = false),
        item_feature(split, "end_year", categorical = false),
        item_feature(split, "nsfw", categorical = true),
        item_feature(split, "medium", categorical = true),
        item_feature(split, "num_episodes", categorical = false),
        item_feature(split, "source", categorical = true),
        item_feature(split, "average_episode_duration", categorical = false),
    )

    # number of series the user has seen in each category
    seen_counts = hcat(
        item_count_feature(split, "start_year", bucket_year),
        item_count_feature(split, "end_year", bucket_year),
        item_count_feature(split, "nsfw"),
        item_count_feature(split, "medium"),
        item_count_feature(split, "num_episodes", bucket_num_episodes),
        item_count_feature(split, "source"),
        item_count_feature(split, "average_episode_duration", bucket_duration),
    )

    return hcat(shared, item_only, seen_counts)
end;

In [None]:
# Build the augmented feature matrix for `split` (memoized on its arguments):
# the per-alpha predictions from `get_indep`, their linear blend with weights `β`,
# and the engineered features from `get_xgboost_features`, concatenated column-wise.
@memoize function get_augmented_indep(split, alphas, β)
    # load the alpha predictions once and reuse them for the blended column
    X = get_indep(split, alphas)
    hcat(X, X * β, get_xgboost_features(split, alphas))
end;