In [7]:
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple
import numpy as np
pd.options.display.float_format = '{:.5f}'.format

In [3]:
def load_and_preprocess_vitrine(
    path: Path = Path("../data/base/vitrine.csv"),
) -> pd.DataFrame:
    """Load the vitrine CSV and keep only the rows with ``content_type_id == 0``.

    After filtering, the index is reset and ``prior_question_had_explanation``
    is converted to 0/1 integers, with missing values treated as 0.

    Args:
        path: Location of the CSV file. Defaults to the previously hard-coded
            ``../data/base/vitrine.csv``.

    Returns:
        The filtered frame with an integer ``prior_question_had_explanation``
        column.
    """
    vitrine = pd.read_csv(path)

    vitrine = vitrine[vitrine.content_type_id == 0].reset_index(drop=True)
    # Assign the converted column back instead of calling
    # ``fillna(..., inplace=True)`` on a column selection: that chained
    # in-place form is deprecated and does not modify the frame when pandas
    # Copy-on-Write is active, which would leave NaN behind and break the
    # integer conversion.
    vitrine["prior_question_had_explanation"] = (
        vitrine["prior_question_had_explanation"]
        .astype("boolean")
        .fillna(False)
        .astype("int64")
    )

    return vitrine


def aggregate(
    df_i: pd.DataFrame,
    groupby_list: List[str],
    agg_func: List,
    name_list: List,
    target_field: str,
) -> pd.DataFrame:
    """Aggregate ``target_field`` per ``groupby_list`` group.

    Rows are ordered by the group keys and ``timestamp`` before grouping, so
    the input must contain a ``timestamp`` column. Each aggregation in
    ``agg_func`` yields one output column named ``<prefix><target_field>``,
    where the prefixes come from ``name_list`` in the same order.

    Returns:
        A frame with the group keys as regular columns followed by the
        aggregated columns. The input frame is not modified.
    """
    ordered = df_i.copy().sort_values(by=groupby_list + ["timestamp"])
    grouped = ordered[groupby_list + [target_field]].groupby(groupby_list)
    result = grouped.agg(agg_func)
    # agg() with a list produces (field, func) column pairs; flatten them.
    result.columns = [prefix + target_field for prefix in name_list]
    return result.reset_index()


def get_user_last_f(
    df_i: pd.DataFrame, keys: List[str], nec_field: List[str]
) -> pd.DataFrame:
    """Return the most recent row (by ``timestamp``) for every ``keys`` group.

    Args:
        df_i: Input frame; must contain ``keys``, ``nec_field`` and a
            ``timestamp`` column. It is not modified.
        keys: Columns identifying a group.
        nec_field: Extra columns to carry into the result.

    Returns:
        One row per distinct ``keys`` combination, sorted by ``keys``, with
        columns ``keys``, then ``nec_field`` in the given order, then
        ``timestamp``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the output
    # column order is deterministic (a set difference is not) and columns that
    # also appear in ``keys`` are not selected twice.
    value_cols = [
        col
        for col in dict.fromkeys(nec_field)
        if col != "timestamp" and col not in keys
    ]
    columns = list(keys) + value_cols
    if "timestamp" not in columns:
        columns.append("timestamp")

    # Order by the grouping keys themselves rather than a hard-coded
    # ``user_id`` so "last" means "latest timestamp within the group" for any
    # choice of keys.
    sort_by = list(dict.fromkeys(list(keys) + ["timestamp"]))

    # After this sort the surviving rows are already ordered by ``keys``.
    return (
        df_i[columns]
        .sort_values(by=sort_by)
        .drop_duplicates(subset=keys, keep="last")
        .reset_index(drop=True)
    )


def get_answered_as_arr(
    vitrine: pd.DataFrame,
):
    """Collect every user's ``answered_correctly`` values into one array.

    Rows are ordered by ``user_id`` and ``timestamp`` first, so each array
    lists the user's answers in timestamp order.

    Returns:
        A frame with columns ``user_id`` and ``answered_correctly``, one row
        per user, where ``answered_correctly`` holds a NumPy array.
    """
    chronological = vitrine.sort_values(by=["user_id", "timestamp"])
    answers = chronological.reset_index(drop=True)[["user_id", "answered_correctly"]]
    per_user = answers.groupby("user_id")["answered_correctly"].apply(np.array)
    return per_user.to_frame().reset_index()


vitrine = load_and_preprocess_vitrine()

In [5]:
user_ema_answered_correctly = get_answered_as_arr(
    vitrine,
)

In [6]:
user_ema_answered_correctly

Unnamed: 0,user_id,answered_correctly
0,7136928,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,22318239,"[1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, ..."
2,51771056,"[1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, ..."
3,53424674,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,64764301,"[0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0]"
...,...,...
95,2088893187,"[0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, ..."
96,2099100705,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ..."
97,2100847337,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, ..."
98,2143636986,"[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."


In [9]:
# Sum and count of `answered_correctly` for every
# (content_id, prior_question_had_explanation) pair; the result columns are
# `content_he_sum_answered_correctly` and `content_he_count_answered_correctly`.
content_he_mean_answered_correctly = aggregate(
    vitrine,
    groupby_list=["content_id", "prior_question_had_explanation"],
    agg_func=["sum", "count"],
    name_list=["content_he_sum_", "content_he_count_"],
    target_field="answered_correctly",
)

# Sum of `answered_correctly` for every (part, prior_question_had_explanation)
# pair; the result column is `content_he_part_sum_answered_correctly`.
content_he_part_sum_answered_correctly = aggregate(
    vitrine,
    groupby_list=["part", "prior_question_had_explanation"],
    agg_func=["sum"],
    name_list=["content_he_part_sum_"],
    target_field="answered_correctly",
)

In [7]:
content_he_mean_answered_correctly

Unnamed: 0,content_id,prior_question_had_explanation,content_he_sum_answered_correctly,content_he_count_answered_correctly
0,0,True,2,2
1,1,True,2,2
2,2,True,10,14
3,3,True,3,4
4,4,False,2,3
...,...,...,...,...
10447,32312,False,-3,3
10448,32535,False,-1,1
10449,32570,False,-2,2
10450,32625,False,-3,3


In [10]:
content_he_part_sum_answered_correctly

Unnamed: 0,part,prior_question_had_explanation,content_he_part_sum_answered_correctly
0,1,False,77
1,1,True,1436
2,2,False,-7
3,2,True,4211
4,3,False,51
5,3,True,2267
6,4,False,107
7,4,True,2029
8,5,False,37
9,5,True,4570


In [12]:
vitrine[vitrine.answered_correctly == -1]

Unnamed: 0,id,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part,concept,intention,solving_question,starter,kmean_cluster
100,2420478,85453640,51771056,22711,1,-1,,False,6,1.00000,0.00000,0.00000,0.00000,4
131,2420509,150817666,51771056,23815,1,-1,,False,3,1.00000,0.00000,0.00000,0.00000,2
175,2420553,204130046,51771056,32570,1,-1,,False,3,0.00000,0.00000,1.00000,0.00000,2
229,2420607,283146278,51771056,12354,1,-1,,False,6,1.00000,0.00000,0.00000,0.00000,4
230,2420608,286177331,51771056,7949,1,-1,,False,5,1.00000,0.00000,0.00000,0.00000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26835,101109014,1565090408,2145262856,27845,1,-1,,False,5,1.00000,0.00000,0.00000,0.00000,1
26848,101109027,2055709102,2145262856,26921,1,-1,,False,5,1.00000,0.00000,0.00000,0.00000,1
26853,101109032,2056753478,2145262856,31589,1,-1,,False,5,1.00000,0.00000,0.00000,0.00000,1
26874,101109053,2312647679,2145262856,5694,1,-1,,False,2,1.00000,0.00000,0.00000,0.00000,3


In [11]:
features.content_he_mean_answered_correctly.median()

0.6313962801459464

In [12]:
features.content_he_part_sum_answered_correctly.median()

12535037.0

In [9]:
features.abs_chng_timestamp.median()

30825.0

In [10]:
features.user_ema_answered_correctly.median()

0.5820437978734825

In [49]:
vitrine

Unnamed: 0,id,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part,concept,intention,solving_question,starter,kmean_cluster
0,365541,0,7136928,7900,0,0,,,1,,,,,0
1,365542,23989,7136928,7876,0,0,21000.00000,False,1,,,,,13
2,365543,48079,7136928,175,0,0,20000.00000,False,1,,,,,13
3,365544,74835,7136928,1278,0,0,21000.00000,False,2,,,,,11
4,365545,192811,7136928,2063,0,0,20000.00000,False,3,,,,,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26887,101109066,2313772645,2145262856,9106,0,0,18000.00000,True,5,,,,,1
26888,101109067,2313888806,2145262856,4506,0,1,18000.00000,True,5,,,,,16
26889,101109068,2314051785,2145262856,3854,0,0,25000.00000,True,5,,,,,1
26890,101109069,2314404971,2145262856,4405,0,0,56000.00000,True,5,,,,,1


In [9]:
features

Unnamed: 0,id,user_id,timestamp,prior_question_elapsed_time,prior_question_had_explanation,part,kmean_cluster,lect_user_part,lect_user_kmean_cluster,lect_user,...,rel_content_cl_mean_answ,rel_content_he_cl_mean_answ,rel_user_content_mean_answ,rel_user_content_he_mean_answ,rel_user_content_ema_answ,rel_user_content_he_ema_answ,mul_user_content_mean_answ,mul_user_content_ema_answ,mul_user_content_he_mean_answ,mul_user_content_he_ema_answ
0,365548.00000,7136928.00000,290933.00000,38000.00000,0,4.00000,6.00000,0.00000,0.00000,0.00000,...,1.09489,0.61355,0.23810,0.24026,0.28826,0.29088,0.08571,0.10377,0.08494,0.10284
1,365549.00000,7136928.00000,290933.00000,38000.00000,0,4.00000,6.00000,0.00000,0.00000,0.00000,...,1.73894,0.97447,0.66176,0.57812,0.79943,0.69839,0.09444,0.11409,0.10811,0.13060
2,365550.00000,7136928.00000,290933.00000,38000.00000,0,4.00000,6.00000,0.00000,0.00000,0.00000,...,2.51824,1.41117,0.85185,1.05556,0.99219,1.22945,0.05797,0.06752,0.04678,0.05449
3,365551.00000,7136928.00000,370847.00000,31333.00000,0,4.00000,6.00000,0.00000,0.00000,0.00000,...,4.59852,2.57692,1.40000,1.23333,1.57166,1.38456,0.02857,0.03207,0.03243,0.03641
4,365552.00000,7136928.00000,370847.00000,31333.00000,0,4.00000,6.00000,0.00000,0.00000,0.00000,...,2.12239,1.18935,0.58741,0.74747,0.63535,0.80848,0.05628,0.06087,0.04423,0.04784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2151,100998258.00000,2143636986.00000,15189081620.00000,37000.00000,1,5.00000,1.00000,1.00000,1.00000,1.00000,...,1.01221,1.04735,0.80000,1.08271,0.88933,0.99539,0.28800,0.32016,0.36842,0.33871
2152,101109067.00000,2145262856.00000,2313888806.00000,18000.00000,1,5.00000,16.00000,9.00000,0.00000,13.00000,...,0.94184,0.96759,1.11111,1.17824,1.08962,1.12652,0.40000,0.39226,0.40093,0.38333
2153,101109068.00000,2145262856.00000,2314051785.00000,25000.00000,1,5.00000,1.00000,9.00000,9.00000,13.00000,...,1.45758,1.50819,1.60225,1.65185,1.57442,1.58295,0.27817,0.27334,0.28678,0.27482
2154,101109069.00000,2145262856.00000,2314404971.00000,56000.00000,1,5.00000,1.00000,9.00000,9.00000,13.00000,...,1.01221,1.04735,1.10955,1.14359,1.08629,1.09151,0.39944,0.39107,0.41169,0.39294


In [10]:
list(features.columns)

['id',
 'user_id',
 'timestamp',
 'prior_question_elapsed_time',
 'prior_question_had_explanation',
 'part',
 'kmean_cluster',
 'lect_user_part',
 'lect_user_kmean_cluster',
 'lect_user',
 'lect_user_part_concept',
 'lect_user_kmean_cluster_concept',
 'lect_user_concept',
 'lect_user_part_intention',
 'lect_user_kmean_cluster_intention',
 'lect_user_intention',
 'lect_user_part_solving_question',
 'lect_user_kmean_cluster_solving_question',
 'lect_user_solving_question',
 'lect_user_part_starter',
 'lect_user_kmean_cluster_starter',
 'lect_user_starter',
 'user_sum_answered_correctly',
 'user_count_answered_correctly',
 'user_mean_answered_correctly',
 'user_ema_answered_correctly',
 'user_he_sum_answered_correctly',
 'user_he_count_answered_correctly',
 'user_he_mean_answered_correctly',
 'user_he_ema_answered_correctly',
 'user_sum_prior_question_elapsed_time',
 'user_count_prior_question_elapsed_time',
 'user_mean_prior_question_elapsed_time',
 'user_ema_prior_question_elapsed_time'

# Content features

In [34]:
def get_content_last_f(
    df_i: pd.DataFrame,
    keys: List[str],
    nec_field: List[str]
) -> pd.DataFrame:
    """Keep the last row, in input order, of every ``keys`` combination.

    Only the ``keys`` and ``nec_field`` columns are retained. No sorting is
    applied before de-duplication, so "last" refers to the row order of
    ``df_i`` as passed in.

    Returns:
        One row per distinct ``keys`` combination, sorted by ``keys`` with a
        fresh 0..n-1 index. The input frame is not modified.
    """
    selected = df_i.copy()[keys + nec_field]
    last_rows = selected.drop_duplicates(subset=keys, keep='last')
    ordered = last_rows.sort_values(by=keys)
    return ordered.reset_index(drop=True)

In [28]:
nec = features[[
    'content_id', 
    'prior_question_had_explanation',
    'content_he_count_answered_correctly',
    'content_he_sum_answered_correctly',
]]

In [35]:
get_content_last_f(
    features, 
    keys = ['content_id', 'prior_question_had_explanation'], 
    nec_field = [
        'content_he_count_answered_correctly',
        'content_he_sum_answered_correctly',
    ]
) 

Unnamed: 0,content_id,prior_question_had_explanation,content_he_count_answered_correctly,content_he_sum_answered_correctly
0,3.00000,1,4.00000,3.00000
1,4.00000,0,3.00000,2.00000
2,4.00000,1,6.00000,4.00000
3,6.00000,0,2.00000,2.00000
4,6.00000,1,16.00000,5.00000
...,...,...,...,...
1401,13359.00000,1,5.00000,3.00000
1402,13413.00000,1,5.00000,3.00000
1403,13478.00000,1,1.00000,1.00000
1404,13504.00000,1,5.00000,3.00000


In [37]:
get_content_last_f(
    features, 
    keys = ['part', 'prior_question_had_explanation'], 
    nec_field = [
        'content_he_part_sum_answered_correctly',
    ]
)

Unnamed: 0,part,prior_question_had_explanation,content_he_part_sum_answered_correctly
0,1.0,0,136.0
1,1.0,1,1420.0
2,2.0,0,108.0
3,2.0,1,4084.0
4,3.0,0,74.0
5,3.0,1,2261.0
6,4.0,0,146.0
7,4.0,1,2020.0
8,5.0,0,292.0
9,5.0,1,4414.0


In [23]:
# Toy frame for checking how drop_duplicates(keep='last') behaves on
# (content_id, prior_question_had_explanation) pairs; `val` is the row number.
df = pd.DataFrame(
    data={
        'content_id': [1, 1, 1, 2, 2, 2, 2, 2],
        'prior_question_had_explanation': [0, 1, 0, 0, 1, 0, 0, 0],
        'val': list(range(8)),
    }
)

In [24]:
df

Unnamed: 0,content_id,prior_question_had_explanation,val
0,1,0,0
1,1,1,1
2,1,0,2
3,2,0,3
4,2,1,4
5,2,0,5
6,2,0,6
7,2,0,7


In [26]:
df.drop_duplicates(subset=['content_id', 'prior_question_had_explanation'], keep='last').reset_index(drop=True)

Unnamed: 0,content_id,prior_question_had_explanation,val
0,1,1,1
1,1,0,2
2,2,1,4
3,2,0,7


# User features

In [39]:
features[features.user_id == 7136928]

Unnamed: 0,id,user_id,timestamp,prior_question_elapsed_time,prior_question_had_explanation,part,kmean_cluster,lect_user_part,lect_user_kmean_cluster,lect_user,...,rel_content_cl_mean_answ,rel_content_he_cl_mean_answ,rel_user_content_mean_answ,rel_user_content_he_mean_answ,rel_user_content_ema_answ,rel_user_content_he_ema_answ,mul_user_content_mean_answ,mul_user_content_ema_answ,mul_user_content_he_mean_answ,mul_user_content_he_ema_answ
0,365548.0,7136928.0,290933.0,38000.0,0,4.0,6.0,0.0,0.0,0.0,...,1.09489,0.61355,0.2381,0.24026,0.28826,0.29088,0.08571,0.10377,0.08494,0.10284
1,365549.0,7136928.0,290933.0,38000.0,0,4.0,6.0,0.0,0.0,0.0,...,1.73894,0.97447,0.66176,0.57812,0.79943,0.69839,0.09444,0.11409,0.10811,0.1306
2,365550.0,7136928.0,290933.0,38000.0,0,4.0,6.0,0.0,0.0,0.0,...,2.51824,1.41117,0.85185,1.05556,0.99219,1.22945,0.05797,0.06752,0.04678,0.05449
3,365551.0,7136928.0,370847.0,31333.0,0,4.0,6.0,0.0,0.0,0.0,...,4.59852,2.57692,1.4,1.23333,1.57166,1.38456,0.02857,0.03207,0.03243,0.03641
4,365552.0,7136928.0,370847.0,31333.0,0,4.0,6.0,0.0,0.0,0.0,...,2.12239,1.18935,0.58741,0.74747,0.63535,0.80848,0.05628,0.06087,0.04423,0.04784
5,365553.0,7136928.0,370847.0,31333.0,0,4.0,6.0,0.0,0.0,0.0,...,6.89778,3.86538,1.75,3.08333,1.82304,3.21201,0.01587,0.01654,0.00901,0.00938
6,365554.0,7136928.0,451200.0,25666.0,0,4.0,6.0,0.0,0.0,0.0,...,1.42335,0.79762,0.33333,0.30769,0.33432,0.3086,0.07101,0.07122,0.07692,0.07715
7,365555.0,7136928.0,451200.0,25666.0,0,4.0,6.0,0.0,0.0,0.0,...,1.11393,0.62422,0.24224,0.25714,0.23383,0.24822,0.08425,0.08133,0.07937,0.07661
8,365556.0,7136928.0,451200.0,25666.0,0,4.0,6.0,0.0,0.0,0.0,...,2.13503,1.19643,0.43333,0.43636,0.40244,0.40525,0.04103,0.0381,0.04074,0.03784
9,365557.0,7136928.0,478945.0,25666.0,0,5.0,1.0,0.0,0.0,0.0,...,1.15944,0.75476,0.23864,0.24375,0.21315,0.21772,0.06548,0.05848,0.0641,0.05726


In [42]:
def get_user_last_f(
    df_i: pd.DataFrame,
    keys: List[str],
    nec_field: List[str]
) -> pd.DataFrame:
    """Return the most recent row (by ``timestamp``) for every ``keys`` group.

    Args:
        df_i: Input frame; must contain ``keys``, ``nec_field`` and a
            ``timestamp`` column. It is not modified.
        keys: Columns identifying a group.
        nec_field: Extra columns to carry into the result.

    Returns:
        One row per distinct ``keys`` combination, sorted by ``keys``, with
        columns ``keys``, then ``nec_field`` in the given order, then
        ``timestamp``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the output
    # column order is deterministic (a set difference is not) and columns that
    # also appear in ``keys`` are not selected twice.
    value_cols = [
        col
        for col in dict.fromkeys(nec_field)
        if col != 'timestamp' and col not in keys
    ]
    columns = list(keys) + value_cols
    if 'timestamp' not in columns:
        columns.append('timestamp')

    # Order by the grouping keys themselves rather than a hard-coded
    # ``user_id`` so "last" means "latest timestamp within the group" for any
    # choice of keys.
    sort_by = list(dict.fromkeys(list(keys) + ['timestamp']))

    # After this sort the surviving rows are already ordered by ``keys``.
    return (
        df_i[columns]
        .sort_values(by=sort_by)
        .drop_duplicates(subset=keys, keep='last')
        .reset_index(drop=True)
    )

In [43]:
# Toy frame for checking get_user_last_f: two users with increasing timestamps.
df = pd.DataFrame(
    data={
        'user_id': [1, 1, 1, 2, 2, 2, 2, 2],
        'timestamp': list(range(8)),
    }
)

In [44]:
df

Unnamed: 0,user_id,timestamp
0,1,0
1,1,1
2,1,2
3,2,3
4,2,4
5,2,5
6,2,6
7,2,7


In [45]:
get_user_last_f(
    df, 
    keys = ['user_id'], 
    nec_field = ['timestamp']
)

Unnamed: 0,user_id,timestamp
0,1,2
1,2,7


In [46]:
get_user_last_f(
    features, 
    keys = ['user_id'], 
    nec_field = ['timestamp']
)

Unnamed: 0,user_id,timestamp
0,7136928.00000,1381521.00000
1,22318239.00000,418433.00000
2,51771056.00000,1497032674.00000
3,53424674.00000,22975936976.00000
4,64764301.00000,610367939.00000
...,...,...
95,2088893187.00000,918210.00000
96,2099100705.00000,1255716.00000
97,2100847337.00000,12174551507.00000
98,2143636986.00000,15189081620.00000


In [48]:
features[features.user_id == 2099100705]

Unnamed: 0,id,user_id,timestamp,prior_question_elapsed_time,prior_question_had_explanation,part,kmean_cluster,lect_user_part,lect_user_kmean_cluster,lect_user,...,rel_content_cl_mean_answ,rel_content_he_cl_mean_answ,rel_user_content_mean_answ,rel_user_content_he_mean_answ,rel_user_content_ema_answ,rel_user_content_he_ema_answ,mul_user_content_mean_answ,mul_user_content_ema_answ,mul_user_content_he_mean_answ,mul_user_content_he_ema_answ
1297,98944169.0,2099100705.0,162622.0,16000.0,0,1.0,0.0,0.0,0.0,0.0,...,0.9642,0.90435,0.97959,0.85714,0.98104,0.85841,0.75,0.7511,0.85714,0.85841
1298,98944170.0,2099100705.0,196267.0,17000.0,0,1.0,0.0,0.0,0.0,0.0,...,0.84367,0.7913,0.875,0.875,0.8805,0.8805,0.875,0.8805,0.875,0.8805
1299,98944171.0,2099100705.0,224318.0,31000.0,0,1.0,0.0,0.0,0.0,0.0,...,0.84367,0.7913,0.88889,0.88889,0.89758,0.89758,0.88889,0.89758,0.88889,0.89758
1300,98944172.0,2099100705.0,252412.0,19000.0,0,1.0,0.0,0.0,0.0,0.0,...,1.1249,1.05507,1.2,1.35,1.21488,1.36674,0.675,0.68337,0.6,0.60744
1301,98944173.0,2099100705.0,281206.0,22000.0,0,1.0,0.0,0.0,0.0,0.0,...,1.04889,0.98378,1.13022,1.15702,1.1465,1.17369,0.73123,0.74176,0.71429,0.72457
1302,98944174.0,2099100705.0,306664.0,23000.0,0,1.0,0.0,0.0,0.0,0.0,...,0.84367,0.7913,0.83333,0.83333,0.81417,0.81417,0.83333,0.81417,0.83333,0.81417
1303,98944175.0,2099100705.0,333293.0,19000.0,0,1.0,0.0,0.0,0.0,0.0,...,1.1249,1.05507,1.12821,1.26923,1.11313,1.25227,0.63462,0.62614,0.5641,0.55656
1304,98944176.0,2099100705.0,356577.0,22000.0,0,1.0,13.0,0.0,0.0,0.0,...,0.94783,0.53097,1.04762,1.17857,0.9948,1.11915,0.58929,0.55957,0.52381,0.4974
1305,98944177.0,2099100705.0,381570.0,21000.0,0,1.0,13.0,0.0,0.0,0.0,...,0.94783,0.53097,0.97778,0.73333,0.89329,0.66997,0.55,0.50247,0.73333,0.66997
1306,98944178.0,2099100705.0,408774.0,16000.0,0,1.0,13.0,0.0,0.0,0.0,...,1.06631,0.59735,1.125,1.125,1.05364,1.05364,0.5,0.46828,0.5,0.46828


In [78]:
def get_answered_as_arr(
    features: pd.DataFrame,
    vitrine: pd.DataFrame,
):
    """Collect ``answered_correctly`` per user for the rows present in ``features``.

    The ``id``/``user_id`` pairs of ``features`` are left-joined to
    ``vitrine`` on ``id`` to pick up ``answered_correctly``. Rows are then
    ordered by ``user_id`` and ``id`` and gathered into one NumPy array per
    user.

    Returns:
        A single-column frame (``answered_correctly``) indexed by ``user_id``.
    """
    answers = features[['id', 'user_id']].merge(
        vitrine[['id', 'answered_correctly']],
        on='id',
        how='left',
    )
    answers = answers.sort_values(by=['user_id', 'id']).reset_index(drop=True)
    per_user = answers.groupby('user_id')['answered_correctly'].apply(np.array)
    return per_user.to_frame()

In [66]:
# Step-by-step version of the merge used in get_answered_as_arr(features, vitrine),
# printing the shape before and after to confirm the left join adds a column
# without changing the number of rows.
features_user = features[['id', 'user_id']]
print(features_user.shape)
# Attach answered_correctly from vitrine by row id; how='left' keeps every features row.
features_user = pd.merge(
    features_user,
    vitrine[['id', 'answered_correctly']],
    left_on=['id'],
    right_on=['id'],
    how = 'left'
)
# Order rows by user and then by id before they are grouped per user.
features_user = features_user.sort_values(by = ['user_id', 'id']).reset_index(drop=True)
print(features_user.shape)

(2156, 2)
(2156, 3)


In [67]:
features_user[features_user.user_id == 7136928]

Unnamed: 0,id,user_id,answered_correctly
0,365548.0,7136928.0,1
1,365549.0,7136928.0,0
2,365550.0,7136928.0,0
3,365551.0,7136928.0,0
4,365552.0,7136928.0,0
5,365553.0,7136928.0,0
6,365554.0,7136928.0,0
7,365555.0,7136928.0,0
8,365556.0,7136928.0,0
9,365557.0,7136928.0,0


In [59]:
df_2 = features_user.groupby('user_id')['answered_correctly'].apply(np.array).to_frame()

In [77]:
df_2

Unnamed: 0_level_0,answered_correctly
user_id,Unnamed: 1_level_1
7136928.00000,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
22318239.00000,"[1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, ..."
51771056.00000,"[1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, ..."
53424674.00000,"[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
64764301.00000,"[0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0]"
...,...
2088893187.00000,"[0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, ..."
2099100705.00000,"[1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, ..."
2100847337.00000,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2143636986.00000,"[1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, ..."


In [76]:
df_2.loc[df_2.index == 7136928.00000]['answered_correctly']

user_id
7136928.00000    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: answered_correctly, dtype: object

In [68]:
df_2[df_2.index == 7136928].values

array([[array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0])]], dtype=object)

In [69]:
list(features_user[features_user.user_id == 7136928]['answered_correctly'])

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0]

In [80]:
df_2 = get_answered_as_arr(
    features,
    vitrine,
)
df_2[df_2.index == 7136928].values

array([[array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0])]], dtype=object)

# Check

In [46]:
import pickle
def load_obj(name):
    """Unpickle ``../data/base_8_2_none/<name>.pkl`` and return the stored object.

    Only use this on files written by this project: unpickling untrusted data
    can execute arbitrary code.
    """
    pickle_path = "../data/base_8_2_none/" + name + ".pkl"
    with open(pickle_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded
    
def save_obj(obj, name):
    """Pickle ``obj`` to ``../data/base_8_2_none/<name>.pkl`` using protocol 2."""
    pickle_path = "../data/base_8_2_none/" + name + ".pkl"
    with open(pickle_path, "wb") as handle:
        pickle.dump(obj, handle, protocol = 2)

In [22]:
result_features = load_obj('result_features')

In [23]:
result_features.keys()

dict_keys(['user_ema_answered_correctly', 'abs_chng_timestamp', 'content_he_mean_answered_correctly', 'content_he_part_sum_answered_correctly'])

In [31]:
def df_features_to_dict(df: pd.DataFrame, index_fields: List[str]) -> Dict:
    """Turn a feature frame into a nested dict keyed by ``index_fields``.

    The outer key is the value of the index column, or a tuple of values when
    several index fields are given. The inner dict maps every remaining column
    name to that row's value. The input frame is not modified.
    """
    return df.set_index(index_fields).to_dict(orient='index')

from functools import reduce
def ema(series):
    """Exponential moving average of ``series``, folded from first to last.

    The smoothing factor is ``alpha = 2 / (N + 1)`` with ``N = len(series)``.
    The first element seeds the average, and every later element ``x`` updates
    it as ``(1 - alpha) * avg + alpha * x``.

    Args:
        series: A sized iterable of numbers (list, NumPy array, ...).

    Returns:
        The final smoothed value; the single element itself when ``N == 1``;
        ``nan`` when ``series`` is empty.
    """
    N = len(series)
    if N == 0:
        # There is nothing to seed the average with; report "no value" instead
        # of letting the fold fail on an empty sequence.
        return float("nan")

    alpha = 2 / (N + 1)
    values = iter(series)
    smoothed = next(values)
    for current in values:
        smoothed = (1 - alpha) * smoothed + alpha * current
    return smoothed

In [28]:
# Convert each saved feature frame in `result_features` into a lookup dict
# keyed by the frame's identifying column(s).

# Keyed by user_id.
user_ema_answered_correctly = df_features_to_dict(
    result_features['user_ema_answered_correctly'],
    ['user_id']
)

# Keyed by user_id.
abs_chng_timestamp = df_features_to_dict(
    result_features['abs_chng_timestamp'],
    ['user_id']
)

# Keyed by (content_id, prior_question_had_explanation) tuples.
content_he_mean_answered_correctly = df_features_to_dict(
    result_features['content_he_mean_answered_correctly'],
    ['content_id', 'prior_question_had_explanation']
)

# Keyed by (part, prior_question_had_explanation) tuples.
content_he_part_sum_answered_correctly = df_features_to_dict(
    result_features['content_he_part_sum_answered_correctly'],
    ['part', 'prior_question_had_explanation']
)

In [34]:
# Reduce each user's answer array to its EMA plus the number of answers (N).
user_ema = {}
for user_id, record in user_ema_answered_correctly.items():
    answers = record['answered_correctly']
    user_ema[user_id] = {'ema': ema(answers), 'N': len(answers)}

In [35]:
user_ema

{115: {'ema': 0.7067087823362055, 'N': 46},
 124: {'ema': 0.2843183374621782, 'N': 30},
 2746: {'ema': 0.544325039759855, 'N': 19},
 5382: {'ema': 0.7084528563141884, 'N': 125},
 8623: {'ema': 0.7086288985304164, 'N': 109},
 8701: {'ema': 0.6815275642408487, 'N': 17},
 12741: {'ema': 0.5257534816907297, 'N': 265},
 13134: {'ema': 0.7588552851504797, 'N': 1243},
 24418: {'ema': 0.600045187247476, 'N': 6283},
 24600: {'ema': 0.4807412040412167, 'N': 50},
 32421: {'ema': 0.5024358065765372, 'N': 30},
 40828: {'ema': 0.7039005984555675, 'N': 92},
 44331: {'ema': 0.5388371721261588, 'N': 291},
 45001: {'ema': 0.2006265431826075, 'N': 30},
 46886: {'ema': 0.7244898933106227, 'N': 44},
 50132: {'ema': 0.5458006719017459, 'N': 74},
 51285: {'ema': 0.5639598175276652, 'N': 22},
 53842: {'ema': 0.3076069627693955, 'N': 30},
 81002: {'ema': 0.5082154826221165, 'N': 17},
 81429: {'ema': 0.5455866851901097, 'N': 30},
 91216: {'ema': 0.700212301406524, 'N': 1845},
 99521: {'ema': 0.8694870551595347,

In [50]:
# Flat list of every user's EMA, in the iteration order of `user_ema`.
ema_list = [stats['ema'] for stats in user_ema.values()]

In [52]:
df = pd.DataFrame(data = {'ema': ema_list})
df.describe()

Unnamed: 0,ema
count,393656.0
mean,0.5681
std,0.17026
min,0.0
25%,0.45568
50%,0.58905
75%,0.69752
max,1.0


In [37]:
abs_chng_timestamp

{115: {'timestamp': 668090043},
 124: {'timestamp': 571323},
 2746: {'timestamp': 835457},
 5382: {'timestamp': 2101551456},
 8623: {'timestamp': 862338736},
 8701: {'timestamp': 1571291},
 12741: {'timestamp': 4465043002},
 13134: {'timestamp': 18122046414},
 24418: {'timestamp': 14243735782},
 24600: {'timestamp': 1550831},
 32421: {'timestamp': 44680547},
 40828: {'timestamp': 6376896727},
 44331: {'timestamp': 4661118198},
 45001: {'timestamp': 195681},
 46886: {'timestamp': 22050635752},
 50132: {'timestamp': 2308306376},
 51285: {'timestamp': 871989},
 53842: {'timestamp': 723461},
 81002: {'timestamp': 447505},
 81429: {'timestamp': 718663},
 91216: {'timestamp': 18804556370},
 99521: {'timestamp': 941291},
 107002: {'timestamp': 2831179364},
 108310: {'timestamp': 1280931},
 128919: {'timestamp': 646286},
 137455: {'timestamp': 1517630},
 138650: {'timestamp': 43758222307},
 140969: {'timestamp': 937101},
 141455: {'timestamp': 8230340598},
 142896: {'timestamp': 1214727},
 146

In [38]:
# Flatten {user_id: {'timestamp': t}} into {user_id: t}.
abs_chng_timestamp_new = {
    user_id: value['timestamp'] for user_id, value in abs_chng_timestamp.items()
}

In [40]:
abs_chng_timestamp_new

{115: 668090043,
 124: 571323,
 2746: 835457,
 5382: 2101551456,
 8623: 862338736,
 8701: 1571291,
 12741: 4465043002,
 13134: 18122046414,
 24418: 14243735782,
 24600: 1550831,
 32421: 44680547,
 40828: 6376896727,
 44331: 4661118198,
 45001: 195681,
 46886: 22050635752,
 50132: 2308306376,
 51285: 871989,
 53842: 723461,
 81002: 447505,
 81429: 718663,
 91216: 18804556370,
 99521: 941291,
 107002: 2831179364,
 108310: 1280931,
 128919: 646286,
 137455: 1517630,
 138650: 43758222307,
 140969: 937101,
 141455: 8230340598,
 142896: 1214727,
 146023: 618702659,
 146403: 773335,
 157207: 1860463,
 163243: 55877275,
 165081: 2402551612,
 166728: 14685531626,
 174754: 563034,
 176102: 2375014,
 176303: 2432140,
 178445: 4489808711,
 206168: 1378380,
 215672: 2543763343,
 220268: 15628295187,
 238966: 24747864011,
 239323: 1123759,
 246496: 466212984,
 247749: 628647,
 251201: 1090146,
 260489: 83528256,
 275762: 20157987361,
 286187: 41379451419,
 287029: 1331598,
 290191: 1031596,
 297533:

In [43]:
content_he_mean_answered_correctly#[(0, 0)]

{(0, 0): {'content_he_sum_answered_correctly': 299,
  'content_he_count_answered_correctly': 360},
 (0, 1): {'content_he_sum_answered_correctly': 5967,
  'content_he_count_answered_correctly': 6543},
 (1, 0): {'content_he_sum_answered_correctly': 118,
  'content_he_count_answered_correctly': 145},
 (1, 1): {'content_he_sum_answered_correctly': 6471,
  'content_he_count_answered_correctly': 7253},
 (2, 0): {'content_he_sum_answered_correctly': 2465,
  'content_he_count_answered_correctly': 5021},
 (2, 1): {'content_he_sum_answered_correctly': 22425,
  'content_he_count_answered_correctly': 39884},
 (3, 0): {'content_he_sum_answered_correctly': 656,
  'content_he_count_answered_correctly': 956},
 (3, 1): {'content_he_sum_answered_correctly': 17250,
  'content_he_count_answered_correctly': 22017},
 (4, 0): {'content_he_sum_answered_correctly': 8451,
  'content_he_count_answered_correctly': 14909},
 (4, 1): {'content_he_sum_answered_correctly': 11010,
  'content_he_count_answered_correctly

In [44]:
content_he_part_sum_answered_correctly

{(1, 0): {'content_he_part_sum_answered_correctly': 690405},
 (1, 1): {'content_he_part_sum_answered_correctly': 4863491},
 (2, 0): {'content_he_part_sum_answered_correctly': 530985},
 (2, 1): {'content_he_part_sum_answered_correctly': 12752354},
 (3, 0): {'content_he_part_sum_answered_correctly': 466416},
 (3, 1): {'content_he_part_sum_answered_correctly': 5594098},
 (4, 0): {'content_he_part_sum_answered_correctly': 682846},
 (4, 1): {'content_he_part_sum_answered_correctly': 4407838},
 (5, 0): {'content_he_part_sum_answered_correctly': 1414331},
 (5, 1): {'content_he_part_sum_answered_correctly': 23543239},
 (6, 0): {'content_he_part_sum_answered_correctly': 636669},
 (6, 1): {'content_he_part_sum_answered_correctly': 6392894},
 (7, 0): {'content_he_part_sum_answered_correctly': 447687},
 (7, 1): {'content_he_part_sum_answered_correctly': 2821374}}

In [58]:
# Flatten {(part, had_explanation): {'content_he_part_sum_answered_correctly': s}}
# into {(part, had_explanation): s}.
new_content_he_part_sum_answered_correctly = {
    key: value['content_he_part_sum_answered_correctly']
    for key, value in content_he_part_sum_answered_correctly.items()
}
new_content_he_part_sum_answered_correctly

{(1, 0): 690405,
 (1, 1): 4863491,
 (2, 0): 530985,
 (2, 1): 12752354,
 (3, 0): 466416,
 (3, 1): 5594098,
 (4, 0): 682846,
 (4, 1): 4407838,
 (5, 0): 1414331,
 (5, 1): 23543239,
 (6, 0): 636669,
 (6, 1): 6392894,
 (7, 0): 447687,
 (7, 1): 2821374}

In [61]:
# Bundle the four dict-based lookups built in this section under one object so
# they can be pickled together.
new_result_dict = {
    'user_ema': user_ema,
    'abs_chng_timestamp': abs_chng_timestamp_new,
    'content_he_mean_answered_correctly': content_he_mean_answered_correctly,
    'content_he_part_sum_answered_correctly': new_content_he_part_sum_answered_correctly
}

In [62]:
save_obj(new_result_dict, 'result_features_dict')

In [29]:
user_ema_answered_correctly

{115: {'answered_correctly': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
         0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
         0, 1])},
 124: {'answered_correctly': array([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 0])},
 2746: {'answered_correctly': array([0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1])},
 5382: {'answered_correctly': array([1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
         0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
         1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0])},
 8623: {'answered_correctly': array([1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       

In [55]:
# Add the per-(content, had_explanation) accuracy as sum / count to the loaded
# frame (modified in place), then summarise every column.
content_he_stats = result_features['content_he_mean_answered_correctly']
content_he_stats['mean'] = (
    content_he_stats['content_he_sum_answered_correctly']
    / content_he_stats['content_he_count_answered_correctly']
)
content_he_stats.describe()

Unnamed: 0,content_id,prior_question_had_explanation,content_he_sum_answered_correctly,content_he_count_answered_correctly,mean
count,26905.0,26905.0,26905.0,26905.0,26905.0
mean,6743.97119,0.50247,2425.00007,3689.69708,0.69736
std,3898.56889,0.5,4823.91738,8259.37703,0.18714
min,0.0,0.0,0.0,1.0,0.0
25%,3364.0,0.0,91.0,142.0,0.57925
50%,6743.0,1.0,323.0,525.0,0.72211
75%,10117.0,1.0,3183.0,4647.0,0.83746
max,13522.0,1.0,137766.0,168905.0,1.0


In [24]:
result_features['user_ema_answered_correctly']

Unnamed: 0,user_id,answered_correctly
0,115,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, ..."
1,124,"[1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, ..."
2,2746,"[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, ..."
3,5382,"[1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, ..."
4,8623,"[1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, ..."
...,...,...
393651,2147470770,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
393652,2147470777,"[0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, ..."
393653,2147481750,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ..."
393654,2147482216,"[0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, ..."


In [53]:
result_features['abs_chng_timestamp'].describe()

Unnamed: 0,user_id,timestamp
count,393656.0,393656.0
mean,1076358303.90729,5134563231.23565
std,620131874.57678,10562533523.85521
min,115.0,0.0
25%,538759611.75,1120552.25
50%,1077717363.5,366033054.0
75%,1613533216.25,4542858824.75
max,2147482888.0,87425772049.0


In [56]:
result_features['content_he_part_sum_answered_correctly'].describe()

Unnamed: 0,part,prior_question_had_explanation,content_he_part_sum_answered_correctly
count,14.0,14.0,14.0
mean,4.0,0.5,4660330.5
std,2.0755,0.51887,6445627.00789
min,1.0,0.0,447687.0
25%,2.25,0.0,648213.25
50%,4.0,0.5,2117852.5
75%,5.75,1.0,5411446.25
max,7.0,1.0,23543239.0
