In [1]:
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple
import numpy as np
pd.options.display.float_format = '{:.5f}'.format

import pickle
def load_obj(name):
    with open("../data/prod/" + name + ".pkl", "rb") as f:
        return pickle.load(f)
    
def save_obj(obj, name):
    with open("../data/prod/" + name + ".pkl", "wb") as f:
        pickle.dump(obj, f, protocol = 2)
        
def load_and_preprocess_vitrine() -> pd.DataFrame:
    vitrine = pd.read_csv(Path("../data/all_features_ns_full/vitrine.csv"))

    vitrine = vitrine[vitrine.content_type_id == 0].reset_index(drop=True)
    vitrine["prior_question_had_explanation"].fillna(False, inplace=True)
    vitrine["prior_question_had_explanation"] = list(
        map(lambda x: int(x), list(vitrine["prior_question_had_explanation"]))
    )

    return vitrine


def aggregate(
    df_i: pd.DataFrame,
    groupby_list: List[str],
    agg_func: List,
    name_list: List,
    target_field: str,
) -> pd.DataFrame:
    df = df_i.copy()
    df = df.sort_values(by=groupby_list + ["timestamp"])
    df = df[groupby_list + [target_field]].groupby(groupby_list).agg(agg_func)
    df.columns = list(map(lambda x: x + target_field, name_list))
    return df.reset_index()


def get_user_last_f(
    df_i: pd.DataFrame, keys: List[str], nec_field: List[str]
) -> pd.DataFrame:
    df = df_i.copy()

    df = df[keys + list(set(nec_field) - set(["timestamp"])) + ["timestamp"]]
    df = df.sort_values(by=["user_id", "timestamp"])
    df = (
        df.drop_duplicates(subset=keys, keep="last")
        .sort_values(by=keys)
        .reset_index(drop=True)
    )

    return df


def get_answered_as_arr(
    vitrine: pd.DataFrame,
):
    features_user = vitrine.sort_values(by=["user_id", "timestamp"]).reset_index(
        drop=True
    )
    features_user = features_user[["user_id", "answered_correctly"]]

    return (
        features_user.groupby("user_id")["answered_correctly"]
        .apply(np.array)
        .to_frame()
    ).reset_index()


def df_features_to_dict(df: pd.DataFrame, index_fields: List[str]) -> Dict:
    df = df.set_index(index_fields)
    return df.to_dict('index')

def get_contnt_prod(
    result_fillna: Dict, 
    df: pd.DataFrame, 
    prename: str,
    keys: List[str],
    target: str = 'answered_correctly'
) -> Tuple[Dict, Dict]:
    df[f'{prename}_mean_{target}'] = df[f'{prename}_sum_{target}'] /\
                                                df[f'{prename}_count_{target}']

    result_fillna[f'{prename}_mean_{target}'] = df[f'{prename}_mean_{target}'].median()
    result_fillna[f'{prename}_sum_{target}'] = df[f'{prename}_sum_{target}'].median()
    result_fillna[f'{prename}_count_{target}'] = df[f'{prename}_count_{target}'].median()
    print(result_fillna)

    df.drop([f'{prename}_mean_{target}'], axis = 1, inplace = True)

    dict_ = df_features_to_dict(
        df,
        keys
    )
    print(list(dict_.items())[:4], len(dict_))
    del df
    return dict_, result_fillna


def get_answered_as_arr(
    vitrine: pd.DataFrame,
    agg_fields: List[str]
):
    vitrine = vitrine.sort_values(by = ['user_id', 'id']).reset_index(drop=True)
    return vitrine.groupby(agg_fields)['answered_correctly'].apply(np.array).to_frame().reset_index()




from functools import reduce
def ema(series):
    N = len(series)
    alpha = 2 / (N + 1)
    return reduce(lambda sum_, curr: (1 - alpha) * sum_ + alpha * curr, series)


def get_set_from_pos(list_):
    cur = 1
    set_ = []
    for el in list_[::-1]:
        if el == 1:
            set_.append(cur)
        cur += 1
    return set(set_)


def encode_binary(set_: set) -> int:
    res = 0
    for el in set_:
        res += 2 ** (el-1)
    return res

def decode_binary(value: int) -> list:
    l = []
    while value > 0:
        residue = value % 2
        value = int((value - residue) / 2)
        l.append(residue)
    return l[::-1]

vitrine = load_and_preprocess_vitrine()

In [2]:
user_he_ucount_part_new = aggregate(
    vitrine,
    groupby_list=["user_id", "prior_question_had_explanation"],
    agg_func=[set],
    name_list=["user_he_ucount_"],
    target_field="part",
)

user_he_ucount_part_new = df_features_to_dict(
    user_he_ucount_part_new,
    ["user_id", "prior_question_had_explanation"]
)

In [3]:
len(user_he_ucount_part_new)

710485

In [4]:
user_he_ucount_part_new

{(115, 0): {'user_he_ucount_part': {1, 2, 3, 4, 5}},
 (115, 1): {'user_he_ucount_part': {1}},
 (124, 0): {'user_he_ucount_part': {1, 2, 3, 4, 5, 6, 7}},
 (2746, 0): {'user_he_ucount_part': {2, 5}},
 (2746, 1): {'user_he_ucount_part': {2}},
 (5382, 0): {'user_he_ucount_part': {1, 2, 5}},
 (5382, 1): {'user_he_ucount_part': {1, 2, 5}},
 (8623, 0): {'user_he_ucount_part': {2, 5}},
 (8623, 1): {'user_he_ucount_part': {1, 2, 5, 7}},
 (8701, 0): {'user_he_ucount_part': {2, 5}},
 (8701, 1): {'user_he_ucount_part': {2}},
 (12741, 0): {'user_he_ucount_part': {1, 5, 6}},
 (12741, 1): {'user_he_ucount_part': {4, 5, 6, 7}},
 (13134, 0): {'user_he_ucount_part': {1, 2, 5}},
 (13134, 1): {'user_he_ucount_part': {1, 2, 3, 4, 5, 6}},
 (24418, 0): {'user_he_ucount_part': {1, 2, 3, 4, 5, 6, 7}},
 (24418, 1): {'user_he_ucount_part': {1, 2, 3, 4, 5, 6, 7}},
 (24600, 0): {'user_he_ucount_part': {1, 2, 3, 4, 5, 6, 7}},
 (24600, 1): {'user_he_ucount_part': {2, 5}},
 (32421, 0): {'user_he_ucount_part': {1, 2, 

In [5]:
user_he_ucount_part_new_encode = dict()
for key, value in user_he_ucount_part_new.items():
    user_he_ucount_part_new_encode[key] = encode_binary(value['user_he_ucount_part'])

In [6]:
user_he_ucount_part_new_encode

{(115, 0): 31,
 (115, 1): 1,
 (124, 0): 127,
 (2746, 0): 18,
 (2746, 1): 2,
 (5382, 0): 19,
 (5382, 1): 19,
 (8623, 0): 18,
 (8623, 1): 83,
 (8701, 0): 18,
 (8701, 1): 2,
 (12741, 0): 49,
 (12741, 1): 120,
 (13134, 0): 19,
 (13134, 1): 63,
 (24418, 0): 127,
 (24418, 1): 127,
 (24600, 0): 127,
 (24600, 1): 18,
 (32421, 0): 127,
 (40828, 0): 127,
 (40828, 1): 18,
 (44331, 0): 50,
 (44331, 1): 63,
 (45001, 0): 127,
 (46886, 0): 18,
 (46886, 1): 18,
 (50132, 0): 18,
 (50132, 1): 18,
 (51285, 0): 18,
 (51285, 1): 18,
 (53842, 0): 127,
 (81002, 0): 18,
 (81002, 1): 16,
 (81429, 0): 127,
 (91216, 0): 127,
 (91216, 1): 63,
 (99521, 0): 17,
 (99521, 1): 18,
 (107002, 0): 127,
 (107002, 1): 127,
 (108310, 0): 16,
 (108310, 1): 51,
 (128919, 0): 127,
 (137455, 0): 127,
 (137455, 1): 18,
 (138650, 0): 127,
 (138650, 1): 127,
 (140969, 0): 127,
 (140969, 1): 16,
 (141455, 0): 127,
 (141455, 1): 127,
 (142896, 0): 17,
 (142896, 1): 16,
 (146023, 0): 127,
 (146023, 1): 16,
 (146403, 0): 127,
 (157207

In [7]:
result_features = load_obj('result_features_no_att_ema')
result_features.keys()

dict_keys(['content_he_mean_answered_correctly', 'content_mean_answered_correctly', 'content_he_part_count_answered_correctly', 'abs_chng_timestamp_1', 'abs_chng_timestamp_2', 'abs_chng_timestamp_3', 'user_mean_answered_correctly', 'user_he_mean_answered_correctly', 'user_he_mean_prior_question_elapsed_time', 'user_he_ucount_part', 'user_ema_answered_correctly', 'user_he_ema_answered_correctly', 'strike', 'strike_bundle', 'lag_part'])

In [8]:
result_features['user_he_ucount_part'] = user_he_ucount_part_new_encode

In [9]:
len(result_features['user_he_ucount_part']), result_features['user_he_ucount_part']

(710485,
 {(115, 0): 31,
  (115, 1): 1,
  (124, 0): 127,
  (2746, 0): 18,
  (2746, 1): 2,
  (5382, 0): 19,
  (5382, 1): 19,
  (8623, 0): 18,
  (8623, 1): 83,
  (8701, 0): 18,
  (8701, 1): 2,
  (12741, 0): 49,
  (12741, 1): 120,
  (13134, 0): 19,
  (13134, 1): 63,
  (24418, 0): 127,
  (24418, 1): 127,
  (24600, 0): 127,
  (24600, 1): 18,
  (32421, 0): 127,
  (40828, 0): 127,
  (40828, 1): 18,
  (44331, 0): 50,
  (44331, 1): 63,
  (45001, 0): 127,
  (46886, 0): 18,
  (46886, 1): 18,
  (50132, 0): 18,
  (50132, 1): 18,
  (51285, 0): 18,
  (51285, 1): 18,
  (53842, 0): 127,
  (81002, 0): 18,
  (81002, 1): 16,
  (81429, 0): 127,
  (91216, 0): 127,
  (91216, 1): 63,
  (99521, 0): 17,
  (99521, 1): 18,
  (107002, 0): 127,
  (107002, 1): 127,
  (108310, 0): 16,
  (108310, 1): 51,
  (128919, 0): 127,
  (137455, 0): 127,
  (137455, 1): 18,
  (138650, 0): 127,
  (138650, 1): 127,
  (140969, 0): 127,
  (140969, 1): 16,
  (141455, 0): 127,
  (141455, 1): 127,
  (142896, 0): 17,
  (142896, 1): 16,
 

In [10]:
save_obj(result_features, 'result_features_ucount')