In [1]:
import pandas as pd
import numpy as np
import datetime

# Course layout: one row per step (module / lesson / step positions).
structure = pd.read_csv('course-217-structure.csv')

# Raw event log; `time` is parsed as seconds since the epoch into `time_f`.
events = (
    pd.read_csv('course-217-events.csv')
    .assign(time_f=lambda raw: pd.to_datetime(raw['time'], unit='s'))
)

In [2]:
# Preview the course structure, then show the largest position value
# found at each nesting level (module / lesson / step).
position_columns = ['module_position', 'lesson_position', 'step_position']

print(structure.head())
print('\n\n' 'Максимальные значения:')
print(structure[position_columns].max())

   course_id  module_id  module_position  lesson_id  lesson_position  step_id  \
0        217        614                2      13228                2    38842   
1        217        614                2      13228                2    39715   
2        217        614                2      13228                2    39716   
3        217        614                2      13228                2    39717   
4        217        614                2      13228                2    39721   

   step_position    step_type  step_cost  
0              1         text          0  
1              6         code          1  
2              7         code          1  
3              8         code          1  
4              9  free-answer          0  


Максимальные значения:
module_position    10
lesson_position     9
step_position      13
dtype: int64


In [3]:
# Order the steps by their place in the course and give each one a
# consecutive number starting from 0. The original row index is kept as-is.
structure = (
    structure
    .sort_values(by=['module_position', 'lesson_position', 'step_position'])
    .assign(step_num=lambda ordered: range(len(ordered)))
)
structure.head(30)

Unnamed: 0,course_id,module_id,module_position,lesson_id,lesson_position,step_id,step_position,step_type,step_cost,step_num
219,217,635,1,13140,1,38518,1,text,0,0
221,217,635,1,13140,1,38521,2,text,0,1
220,217,635,1,13140,1,38519,3,choice,0,2
229,217,635,1,13140,1,38529,4,code,1,3
227,217,635,1,13140,1,38527,5,text,0,4
222,217,635,1,13140,1,38522,6,text,0,5
223,217,635,1,13140,1,38523,7,text,0,6
224,217,635,1,13140,1,38524,8,text,0,7
225,217,635,1,13140,1,38525,9,text,0,8
228,217,635,1,13140,1,38528,10,text,0,9


In [4]:
# Show the first events followed by the dtype of every column.
events_preview = events.head()
events_dtypes = events.dtypes

print(events_preview, '\n')
print(events_dtypes)

   user_id  action  step_id        time              time_f
0        1  viewed    47435  1465181036 2016-06-06 02:43:56
1        1  viewed    47438  1465181034 2016-06-06 02:43:54
2        1  viewed    47435  1465181021 2016-06-06 02:43:41
3        1  viewed    47436  1465181015 2016-06-06 02:43:35
4        1  viewed    41605  1465180037 2016-06-06 02:27:17 

user_id             int64
action             object
step_id             int64
time                int64
time_f     datetime64[ns]
dtype: object


In [5]:
# Attach the sequential step number (`step_num`) to every event.
#
# Only the raw event columns are taken from `events`, so re-running this cell
# merges onto a frame without `step_num` and cannot produce the
# `step_num_x` / `step_num_y` pair that would break the column selection below.
# `validate='many_to_one'` raises a MergeError if `structure` ever contains a
# repeated `step_id`, instead of silently duplicating event rows.
events = pd.merge(
    left=events[['user_id', 'time', 'step_id']],
    right=structure[['step_id', 'step_num']],
    how='left',
    on='step_id',
    validate='many_to_one',
)

# Keep the same column order as before; `time_f` is intentionally dropped.
events = events[['user_id', 'time', 'step_num', 'step_id']]
print(events.head())

   user_id        time  step_num  step_id
0        1  1465181036       197    47435
1        1  1465181034       198    47438
2        1  1465181021       197    47435
3        1  1465181015       196    47436
4        1  1465180037       200    41605


In [6]:
# Users and steps present in the event log.
unique_user_id = set(events['user_id'])
unique_step_num = set(events['step_num'])

# First and last visit time of every step by every user
# (one row per (user_id, step_num) pair).
step_visits = (
    events
    .groupby(['user_id', 'step_num'])['time']
    .agg(['min', 'max'])
    .rename(columns={'min': 'step_start_time', 'max': 'step_max_time'})
    .reset_index()
)

# Events of the following step (step_num + 1), re-keyed to the step they follow
# so they can be joined to that step's visit window.
next_step_events = (
    events[['user_id', 'step_num', 'time']]
    .rename(columns={'time': 'next_step_time'})
    .assign(step_num=lambda shifted: shifted['step_num'] - 1)
)

# A user "came back" to a step when the next step was opened after the first
# visit of the step and the step was visited again later, i.e. some event of
# the next step lies strictly between the first and the last visit of the step.
candidates = step_visits.merge(next_step_events, on=['user_id', 'step_num'], how='inner')
is_between_visits = (
    (candidates['next_step_time'] > candidates['step_start_time'])
    & (candidates['next_step_time'] < candidates['step_max_time'])
)
comeback_pairs = candidates.loc[is_between_visits, ['user_id', 'step_num']].drop_duplicates()

# Number of users who visited each step and number of those who came back.
users_count = step_visits.groupby('step_num').size()
comeback = comeback_pairs.groupby('step_num').size().reindex(users_count.index, fill_value=0)

step_df = pd.DataFrame(
    {'users_count': users_count, 'comeback': comeback},
    columns=['users_count', 'comeback'],
).astype('float')
# Keep the index unnamed so it cannot be confused with a `step_num` column later.
step_df.index.name = None

step_df['comeback_ratio'] = step_df['comeback'] / step_df['users_count']
step_df.head()

Unnamed: 0,users_count,comeback,comeback_ratio
0,6625.0,2759.0,0.416453
1,6320.0,1634.0,0.258544
2,6228.0,1568.0,0.251766
3,6077.0,2509.0,0.412868
4,5667.0,1839.0,0.32451


In [7]:
# Look up the `step_id` of every step number: the index of `step_df`
# is matched against the `step_num` column of the course structure.
step_lookup = structure[['step_num', 'step_id']]
step_df_m = step_df.merge(
    step_lookup,
    how='left',
    left_index=True,
    right_on='step_num',
)
step_df_m.head()

Unnamed: 0,users_count,comeback,comeback_ratio,step_num,step_id
219,6625.0,2759.0,0.416453,0,38518
221,6320.0,1634.0,0.258544,1,38521
220,6228.0,1568.0,0.251766,2,38519
229,6077.0,2509.0,0.412868,3,38529
227,5667.0,1839.0,0.32451,4,38527


In [8]:
# Ten steps with the highest comeback ratio, followed by their step ids.
top_10 = step_df_m.sort_values(by='comeback_ratio', ascending=False).iloc[:10]

print(top_10)
print(list(top_10['step_id']))

     users_count  comeback  comeback_ratio  step_num  step_id
200        752.0     561.0        0.746011       215    41684
176        877.0     640.0        0.729761       191    41604
76        1589.0    1082.0        0.680931        56    41097
151        898.0     605.0        0.673719       142    41481
203        794.0     529.0        0.666247       214    42593
63        2583.0    1705.0        0.660085        40    38872
202        738.0     485.0        0.657182       216    41686
64        2008.0    1314.0        0.654382        48    39735
65        1826.0    1194.0        0.653888        49    39740
3         3675.0    2374.0        0.645986        22    39717
[41684, 41604, 41097, 41481, 42593, 38872, 41686, 39735, 39740, 39717]
