# Stepik ML contest

Final ML contest of the MOOC "Introduction to Data Science and Machine Learning" https://stepik.org/course/4852/

Having 2 datasets with train data from 2015 to 2018:

1) events_train.csv - data about users actions

1. step_id - "step" can be text lesson, video lesson, page with some info or exercise 
2. user_id - anonymized user id
3. timestamp - action time in unix date format
4. action - type of action: 
    <br>
    a  discovered - user clicked on step
    <br>
    b) viewed - user viewed step
    <br>
    c) started_attempt - (deprecated) - action before solving an exercise (usually some data downloading)
    <br>
    d) passed - step is passed ("next step" button was clicked)

2) submissions_train.csv - data about users submissions

1. step_id - "step" can be text lesson, video lesson, page with some info or exercise
2. timestamp - action time in unix date format
3. submission_status - submission status "correct" or "wrong"
4. user_id - anonymized user id

We are given 2 test datasets, which include data about the first 2 days of 6184 users activity from 2018 to 2019. 

1) submission_data_test.csv
2) events_data_test.csv

We need to predict whether user will complete the course or not. To complete the course is to solve correctly 41+ exercises.


In [1]:
from functools import wraps
import math
import pandas as pd
import numpy as np


def scale_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    df[column] = (df[column] - df[column].min())/(df[column].max() - df[column].min())
    return df


def ensure_data_not_lost(func):
    @wraps(func)
    def inner(*args, **kwargs):
        # contract here - checked df is always first arg (after self)
        df = args[1]
        size_before = df.size
        answer = func(*args, **kwargs)
        size_after = answer.size
        assert size_before <= size_after, f"{func.__name__}: Dataframe size decreased!\
                                            Before: {size_before}, after: {size_after}"
        return answer
    return inner


def check_no_inf_and_nan(columns: list):
    def inner_decorator(func):
        @wraps(func)
        def inner(*args, **kwargs):
            answer = func(*args, **kwargs)
            for column in columns:
                column_to_check = answer[column].replace([np.inf, -np.inf], np.nan)
                assert column_to_check.isna().sum() == 0, f"{func.__name__}, column {column}: \
                                                            found forbidden values!"
            return answer
        return inner
    return inner_decorator


class MOOCMetricsBuilder:
    
    DAYS_IN_PROGRESS = 30
    FIRST_DAYS_CHUNK = 2
    COURSE_PASSED_MIN_POINTS = 41
    
    def __init__(self, events: pd.DataFrame, submissions: pd.DataFrame):
        self.events = events
        self.submissions = submissions
    
    def _get_users_actions_time(self, events: pd.DataFrame) -> pd.DataFrame:
        first_last_action_time = events.groupby('user_id', as_index=False).agg({'timestamp': [np.min, np.max]})
        first_last_action_time.columns = ['user_id', 'min_timestamp', 'max_timestamp']
        return first_last_action_time
    
    def _drop_in_progress_users(self, df: pd.DataFrame, events: pd.DataFrame, 
                                      submissions: pd.DataFrame) -> pd.DataFrame:
        last_action = max(events['timestamp'])
        finished_threshold = last_action - self.DAYS_IN_PROGRESS * 24 * 60 * 60
        finished_users_scores = df[~((df['max_timestamp'] >= finished_threshold) & \
                                     (df['correct_submissions'] < self.COURSE_PASSED_MIN_POINTS))]
        finished_users_scores_results = finished_users_scores.assign(
            passed_course=finished_users_scores.correct_submissions >= self.COURSE_PASSED_MIN_POINTS)
        return finished_users_scores_results
    
    def _get_correct_score_df(self, events_: pd.DataFrame, submissions_: pd.DataFrame) -> pd.DataFrame:
        events_data_ = events_[['user_id']].drop_duplicates()
        submissions_data_ = submissions_[['user_id', 'submission_status', 'step_id']]
        grouped_correct_submissions_ = submissions_data_[submissions_data_.submission_status == 'correct']\
                                                        .groupby('user_id', as_index=False)['step_id']\
                                                        .nunique().rename({'step_id': 'correct_submissions'}, 
                                                                          axis=1)
        all_correct_submissions_ = events_data_.merge(grouped_correct_submissions_, how='outer')
        all_correct_submissions_['correct_submissions'] = all_correct_submissions_['correct_submissions'].fillna(0)
        return all_correct_submissions_
    
    @check_no_inf_and_nan(['correct_submissions'])
    @ensure_data_not_lost
    def _add_correct_submissions(self, df: pd.DataFrame) -> pd.DataFrame:
        grouped_correct_submissions = df[df.submission_status == 'correct']\
                                        .groupby('user_id', as_index=False)['step_id']\
                                        .nunique()\
                                        .rename({'step_id': 'correct_submissions'}, axis=1)
        all_correct_submissions = df.merge(grouped_correct_submissions, how='outer')
        all_correct_submissions['correct_submissions'] = all_correct_submissions['correct_submissions'].fillna(0)
        return all_correct_submissions
    
    @check_no_inf_and_nan(['wrong_submissions'])
    @ensure_data_not_lost
    def _add_wrong_submissions_df(self, df: pd.DataFrame, submissions: pd.DataFrame) -> pd.DataFrame:
        wrong_df = submissions[['user_id', 'submission_status']][submissions.submission_status == 'wrong']\
                   .groupby('user_id', as_index=False).count() \
                   .rename({'submission_status': 'wrong_submissions'}, axis=1)
        metrics = df.merge(wrong_df, how="outer")
        metrics['wrong_submissions'] = metrics['wrong_submissions'].fillna(0)
        return metrics
    
    @check_no_inf_and_nan(['success_rate'])
    @ensure_data_not_lost
    def _add_success_rate(self, df: pd.DataFrame) -> pd.DataFrame:
        df['success_rate'] = df['correct_submissions']/(df['correct_submissions'] + df['wrong_submissions'])
        fill_na = df.success_rate.mean()
        fill_minus_inf = df[df.success_rate != -np.inf].success_rate.min()
        df['success_rate'] = df['success_rate'].fillna(fill_na)
        df['success_rate'] = df['success_rate'].replace(-np.inf, fill_minus_inf)
        return df
    
    @check_no_inf_and_nan(['last_wrong_step_tries'])
    @ensure_data_not_lost
    def _add_last_wrong_step_tries(self, df: pd.DataFrame, submissions: pd.DataFrame) -> pd.DataFrame:
        users_last_steps_all_tries = submissions.merge(submissions \
                                       .groupby('user_id', as_index=False).max('timestamp'), 
                                                       on=['user_id', 'step_id']) \
                                       .query('submission_status == "wrong"') \
                                       [['step_id', 'user_id']]
        last_wrong_step_tries = users_last_steps_all_tries.groupby('user_id', as_index=False)\
                                                          .count()\
                                                          .rename({'step_id': 'last_wrong_step_tries'}, axis=1)
        metrics = df.merge(last_wrong_step_tries, how="outer")
        fill_na_value = metrics.last_wrong_step_tries.mean()
        metrics['last_wrong_step_tries'] = metrics['last_wrong_step_tries'].fillna(fill_na_value)
        return metrics
    
    @check_no_inf_and_nan(['viewed', 'discovered', 'passed'])
    @ensure_data_not_lost
    def _add_events_stats(self,  df: pd.DataFrame, events: pd.DataFrame) -> pd.DataFrame:
        user_viewed = events.query('action == "viewed"')[['user_id', 'step_id']]\
                      .groupby('user_id', as_index=False).count().rename({'step_id': 'viewed'}, axis=1)
        user_discovered = events.query('action == "discovered"')[['user_id', 'step_id']]\
                      .groupby('user_id', as_index=False).count().rename({'step_id': 'discovered'}, axis=1)
        user_passed = events.query('action == "passed"')[['user_id', 'step_id']]\
                      .groupby('user_id', as_index=False).count().rename({'step_id': 'passed'}, axis=1)
        
        metrics = df.merge(user_viewed, how="outer")
        metrics['viewed'] = metrics['viewed'].fillna(0)

        metrics = metrics.merge(user_discovered, how="outer")
        metrics['discovered'] = metrics['discovered'].fillna(0)

        metrics = metrics.merge(user_passed, how="outer")
        metrics['passed'] = metrics['passed'].fillna(0)

        return metrics
    
    @check_no_inf_and_nan(['viewer_coef'])
    @ensure_data_not_lost
    def _add_viewer_coef(self, metrics: pd.DataFrame) -> pd.DataFrame:
        """
        Build new metrics - 'viewer_coef'
        This field indicates if user is viewing video lessons while trying to pass course and how many
        """
        metrics['viewer_coef'] = metrics['viewed']/(metrics['correct_submissions'] + metrics['wrong_submissions'])
        finite_vals = metrics[~metrics.viewer_coef.isin([np.inf, -np.inf, np.nan])].viewer_coef
        fill_na = finite_vals.mean()
        fill_plus_inf = finite_vals.max()
        fill_minus_inf = finite_vals.min()
        metrics['viewer_coef'] = metrics['viewer_coef'].replace(-np.inf, fill_minus_inf)
        metrics['viewer_coef'] = metrics['viewer_coef'].replace(np.inf, fill_plus_inf)
        metrics['viewer_coef'] = metrics['viewer_coef'].replace(np.nan, fill_na)
        return metrics
    
    @check_no_inf_and_nan(['curiosity_coef'])
    @ensure_data_not_lost
    def _add_curiosity_coef(self, metrics: pd.DataFrame) -> pd.DataFrame:
        """
        Build new metrics - 'curiosity_coef'
        This field indicates if the user discovered future steps - possible mark of interest
        """
        metrics['curiosity_coef'] = metrics['discovered']/(metrics['passed'] + metrics['viewed'] 
                                                           + metrics['discovered'])
        finite_vals = metrics[~metrics.curiosity_coef.isin([np.inf, -np.inf, np.nan])].curiosity_coef
        fill_na = finite_vals.mean()
        fill_minus_inf = finite_vals.min()
        metrics['curiosity_coef'] = metrics['curiosity_coef'].replace(-np.inf, fill_minus_inf)
        metrics['curiosity_coef'] = metrics['curiosity_coef'].replace(np.nan, fill_na)
        
        return metrics
    
    @check_no_inf_and_nan(['perfectionist_coef', 'all_correct'])
    @ensure_data_not_lost
    def _add_perfectionist_coef(self, metrics: pd.DataFrame, submissions: pd.DataFrame) -> pd.DataFrame:
        """
        Build new metrics - 'perfectionist coef'
        This field indicates if user not only tried to solve exercise, 
        but also exepriments with several possible solutions
        """
        all_correct = submissions[['user_id', 'submission_status']].query('submission_status == "correct"')\
                                 .groupby('user_id', as_index=False).count()\
                                 .rename({'submission_status': 'all_correct'}, axis=1)
        metrics = metrics.merge(all_correct, how="outer")
        metrics['all_correct'] = metrics['all_correct'].fillna(0)
        metrics['perfectionist_coef'] = metrics['all_correct']/metrics['correct_submissions']
        fill_na = metrics.perfectionist_coef.mean()
        metrics['perfectionist_coef'] = metrics['perfectionist_coef'].fillna(fill_na)
        return metrics
    
    @check_no_inf_and_nan(['max_wrong_tries'])
    @ensure_data_not_lost
    def _add_max_wrong_tries(self, metrics: pd.DataFrame, submissions: pd.DataFrame) -> pd.DataFrame:
        """
        Build new metrics - 'max wrong tries'
        This field indicates how stubborn user could be
        """
        max_wrong_tries = submissions.query('submission_status == "wrong"')\
                              [['user_id', 'step_id', 'submission_status']]\
                              .groupby(['user_id', 'step_id'], as_index=False).count()\
                              .rename({"submission_status": "max_wrong_tries"}, axis=1)\
                              [['user_id', 'max_wrong_tries']].groupby('user_id', as_index=False).max()
        metrics = metrics.merge(max_wrong_tries, how="outer")
        fillna = metrics.max_wrong_tries.mean()
        metrics['max_wrong_tries'] = metrics['max_wrong_tries'].fillna(fillna)
        return metrics
    
    @check_no_inf_and_nan(['max_views_one_step'])
    @ensure_data_not_lost
    def _add_max_views_one_step(self, metrics: pd.DataFrame, events: pd.DataFrame) -> pd.DataFrame:
        """
        Build new metrics - 'max_views_one_step'
        This field indicates how many times can user rewatch a lesson 
        if there is something not clear for him/her
        """
        max_views_one_step = events.query('action == "viewed"')[['user_id', 'step_id', 'action']]\
                             .groupby(['user_id', 'step_id'], as_index=False).count()\
                             .rename({"action": "max_views_one_step"}, axis=1)[['user_id', 'max_views_one_step']]\
                             .groupby('user_id', as_index=False).max()
        metrics = metrics.merge(max_views_one_step, how="outer")
        fillna = metrics.max_views_one_step.mean()
        metrics['max_views_one_step'] = metrics['max_views_one_step'].fillna(fillna)
        return metrics
    
    @ensure_data_not_lost
    def _add_enrolled_date_dayname(self, metrics: pd.DataFrame, events: pd.DataFrame) -> pd.DataFrame:
        """
        Maybe there is difference between users enrolled in course on Monday and the ones enrolled on Saturday
        Add day of week to metrics
        """
        first_last_action_time = self._get_users_actions_time(events)
        metrics = metrics.merge(first_last_action_time[['user_id', 'min_timestamp']])
        metrics['day_name'] = pd.to_datetime(metrics.min_timestamp, unit='s').dt.day_name()
        metrics = pd.get_dummies(metrics)
        metrics = metrics.drop('min_timestamp', axis=1)
        return metrics
    
    @ensure_data_not_lost
    def _add_enrolled_day_hour(self, metrics: pd.DataFrame, events: pd.DataFrame) -> pd.DataFrame:
        """
        Most likely there is a difference between enrolled at 3 AM and at 10 AM
        Most likely there is also a difference between users enrolled in the work time and the others
        """
        first_last_action_time = self._get_users_actions_time(events)
        metrics = metrics.merge(first_last_action_time[['user_id', 'min_timestamp']])
        metrics['start_hour'] = pd.to_datetime(metrics.min_timestamp, unit='s').dt.hour
        
        # Work time from 9 to 18 inclusively
        metrics['work_time'] = ((metrics['start_hour'] // 9) % 2)
        metrics = metrics.drop('min_timestamp', axis=1)
        metrics['start_hour'] = metrics['start_hour'].astype(str)
        metrics = pd.get_dummies(metrics)
        return metrics
    
    @check_no_inf_and_nan(['avg_diff_between_correct_submissions'])
    @ensure_data_not_lost
    def _add_avg_period_between_correct_submissions(self, metrics: pd.DataFrame, 
                                                    submissions: pd.DataFrame) -> pd.DataFrame:
        # How fast does the user solve exercises?
        sorted_subm = submissions.query('submission_status == "correct"')\
                  .groupby(['user_id', 'step_id'], as_index=False).agg({'timestamp': 'min'})\
                  .sort_values(['user_id', 'timestamp'])[['user_id', 'timestamp']]
        sorted_subm['prev_diff'] = sorted_subm.groupby('user_id').timestamp.diff()
        
        user_avg_period = sorted_subm[~sorted_subm.prev_diff.isna()].groupby('user_id', as_index=False)\
                                .agg({'prev_diff': 'sum', 'timestamp': 'count'})\
                                .assign(avg_diff_between_correct_submissions=lambda x: x.prev_diff/x.timestamp)\
                                [['user_id', 'avg_diff_between_correct_submissions']]
        metrics = metrics.merge(user_avg_period, how='outer')
        fill_na_value = user_avg_period.avg_diff_between_correct_submissions.mean()
        metrics['avg_diff_between_correct_submissions'] = metrics['avg_diff_between_correct_submissions']\
                                                          .fillna(fill_na_value)
        metrics = scale_column(metrics, 'avg_diff_between_correct_submissions')
        return metrics
    
    @check_no_inf_and_nan(['diff_between_first_action_and_timestamp'])
    @ensure_data_not_lost
    def _add_period_from_first_action_to_first_submission(self, df: pd.DataFrame, events: pd.DataFrame, 
                                                                submissions: pd.DataFrame) -> pd.DataFrame:
        # Maybe the user has enrolled, but that's it, he/she won't solve exercises
        first_user_event_data = events[['user_id', 'timestamp']].groupby('user_id', as_index=False)\
                                .min().rename({'timestamp': 'first_event_timestamp'}, axis=1)
        first_user_submission_data = submissions[['user_id', 'timestamp']].groupby('user_id', as_index=False)\
                                .min().rename({'timestamp': 'first_submission_timestamp'}, axis=1)
        combined_data = first_user_event_data.merge(first_user_submission_data, how="outer")
        combined_data['diff_between_first_action_and_timestamp'] = combined_data['first_submission_timestamp'] \
                                                                   - combined_data['first_event_timestamp']
        fill_na = combined_data.diff_between_first_action_and_timestamp.mean()
        combined_data['diff_between_first_action_and_timestamp'] = combined_data['diff_between_first_action_and_timestamp'].fillna(fill_na)
        combined_data = combined_data.drop(['first_submission_timestamp', 'first_event_timestamp'], axis=1)
        combined_data = scale_column(combined_data, 'diff_between_first_action_and_timestamp')
        df = df.merge(combined_data, how='outer')
        return df
    
    @check_no_inf_and_nan(['diff_between_last_action_and_timestamp'])
    @ensure_data_not_lost
    def _add_period_from_last_submission_to_last_action(self, df: pd.DataFrame, events: pd.DataFrame,
                                                              submissions: pd.DataFrame) -> pd.DataFrame:
        # Maybe the user doesn't solve exercises for some time, but he/she is still active
        last_user_event_data = events[['user_id', 'timestamp']].groupby('user_id', as_index=False)\
                                .max().rename({'timestamp': 'last_event_timestamp'}, axis=1)
        last_user_submission_data = submissions[['user_id', 'timestamp']].groupby('user_id', as_index=False)\
                                .max().rename({'timestamp': 'last_submission_timestamp'}, axis=1)
        combined_data = last_user_event_data.merge(last_user_submission_data, how="outer")
        combined_data['diff_between_last_action_and_timestamp'] = combined_data['last_submission_timestamp'] \
                                                                  - combined_data['last_event_timestamp']
        fill_na = combined_data.diff_between_last_action_and_timestamp.mean()
        combined_data['diff_between_last_action_and_timestamp'] = combined_data['diff_between_last_action_and_timestamp'].fillna(fill_na)
        combined_data = combined_data.drop(['last_submission_timestamp', 'last_event_timestamp'], axis=1)
        combined_data = scale_column(combined_data, 'diff_between_last_action_and_timestamp')
        df = df.merge(combined_data, how='outer')
        return df
    
    @check_no_inf_and_nan(['success_series_length'])
    @ensure_data_not_lost
    def _add_correct_combo_length(self, df: pd.DataFrame, submissions: pd.DataFrame) -> pd.DataFrame:
        # How many exercisese was solved correctly in a row (starting from the first one)
        submissions_sequence = submissions.merge(submissions.groupby(['user_id', 'step_id'], as_index=False)\
                                                            .agg({'timestamp': 'min'}),
                                                 on=['user_id', 'step_id', 'timestamp'])\
                                          .sort_values(['user_id', 'timestamp'])
        first_wrong_submission = submissions_sequence.query('submission_status == "wrong"')\
                         .groupby('user_id', as_index=False).min('timestamp')\
                         .rename({'timestamp': 'first_wrong_submission_timestamp'}, axis=1)\
                         [['user_id', 'first_wrong_submission_timestamp']]

        merged_first_wrong = submissions_sequence.merge(first_wrong_submission, how="outer")
        fill_na_value = submissions_sequence.timestamp.max() + 1
        merged_first_wrong['first_wrong_submission_timestamp'] = merged_first_wrong['first_wrong_submission_timestamp']\
                                                                 .fillna(fill_na_value)
        user_series_lengths = merged_first_wrong.query('timestamp < first_wrong_submission_timestamp')\
                                [['user_id', 'step_id']].groupby('user_id', as_index=False)\
                                .count().rename({"step_id": "success_series_length"}, axis=1)
        df = df.merge(user_series_lengths, how="outer")
        df['success_series_length'] = df['success_series_length'].fillna(0)
        return df
    
    @check_no_inf_and_nan(['points'])
    @ensure_data_not_lost
    def _add_difficulty_coef(self, df: pd.DataFrame, submissions: pd.DataFrame) -> pd.DataFrame:
        """
        Different exercises have different complexity
        Evaluate the complexity and add this data to the user data
        """
        exercises_rank = submissions.query('submission_status == "correct"')\
                                     [['user_id', 'step_id']]\
                                    .drop_duplicates()\
                                    .groupby('step_id', as_index=False).count()\
                                    .rename({'user_id': 'solved_users'}, axis=1)
        users_correct_submissions = submissions.query('submission_status == "correct"')\
                                                     [['user_id', 'step_id']]\
                                                     .drop_duplicates()
        users_points_table = users_correct_submissions.merge(exercises_rank)
        users_points_table['solved_users'] = users_points_table['solved_users'].astype(float)\
                                               .apply(lambda x: np.power(x, -1))
        users_points_table = users_points_table.rename({'solved_users': 'points'}, axis=1)
        user_total_score = users_points_table[['user_id', 'points']].groupby('user_id', as_index=False).sum()
        df = df.merge(user_total_score, how="outer")
        df['points'] = df['points'].fillna(0)
        return df
    
    @check_no_inf_and_nan(['events_days', 'submissions_days', 'active_hours'])
    @ensure_data_not_lost
    def _add_activity_data(self, df: pd.DataFrame, events: pd.DataFrame, submissions: pd.DataFrame) -> pd.DataFrame:
        """
        How many days did the user solve exercises
        How many days dif the user watch video lessons etc
        How many hours has the user been active
        """
        events['day'] = pd.to_datetime(events.timestamp, unit='s').dt.day
        events['hour'] = pd.to_datetime(events.timestamp, unit='s').dt.hour
        submissions['day'] = pd.to_datetime(submissions.timestamp, unit='s').dt.day
        submissions['hour'] = pd.to_datetime(submissions.timestamp, unit='s').dt.hour
        users_events = events[['user_id', 'day', 'hour']].drop_duplicates()
        users_submissions = submissions[['user_id', 'day', 'hour']].drop_duplicates()
        users_days_events_activity = users_events[['user_id', 'day']].drop_duplicates()\
                                     .groupby('user_id', as_index=False).count()\
                                     .rename({'day': 'events_days'}, axis=1)
        users_days_submissions_activity = users_submissions[['user_id', 'day']].drop_duplicates()\
                                     .groupby('user_id', as_index=False).count()\
                                     .rename({'day': 'submissions_days'}, axis=1)
        users_active_hours = users_events[['user_id', 'hour']].groupby('user_id', as_index=False).count()\
                                         .rename({'hour': 'active_hours'}, axis=1)
        df = df.merge(users_days_events_activity, how='outer')
        df = df.merge(users_days_submissions_activity, how='outer')
        df['submissions_days'] = df['submissions_days'].fillna(0)
        df = df.merge(users_active_hours, how='outer')
        return df

    def get_first_days_chunk(self, df: pd.DataFrame, events: pd.DataFrame, submissions: pd.DataFrame) -> tuple:
        """
        We are interested only in first days of user activity
        Cut the datasets accordingly
        """
        events_merged = events.merge(df, how="outer")
        submissions_merged = submissions.merge(df, how="outer")
        first_events = events_merged[events_merged.timestamp <= events_merged.min_timestamp 
                                       + self.FIRST_DAYS_CHUNK * 24 * 60 * 60 ]
        first_submissions = submissions_merged[submissions_merged.timestamp <= submissions_merged.min_timestamp 
                                               + self.FIRST_DAYS_CHUNK * 24 * 60 * 60 ]
        return first_events, first_submissions
    
    def get_finished_users(self) -> pd.DataFrame:
        """
        Some users haven't passed the course yet, but they are still active 
        We can't be sure about them, so it's better to delete them
        """
        first_last_action_time = self._get_users_actions_time(self.events)
        timed_submissions = self.submissions.merge(first_last_action_time, how="outer")
        all_correct_submissions = self._add_correct_submissions(timed_submissions)
        finished_users_scores_results = self._drop_in_progress_users(all_correct_submissions, 
                                                                     self.events, self.submissions)
        return finished_users_scores_results
    
    def build_metrics_df(self, events=None, submissions=None) -> pd.DataFrame:
        """
        Build our ABT from raw data
        """
        if events is None:
            events = self.events
        if submissions is None:
            submissions = self.submissions

        metrics = self._get_correct_score_df(events, submissions)   
        metrics = self._add_wrong_submissions_df(metrics, submissions)
        metrics = self._add_success_rate(metrics)
        metrics = self._add_last_wrong_step_tries(metrics, submissions)
        metrics = self._add_events_stats(metrics, events)
        metrics = self._add_viewer_coef(metrics)
        metrics = self._add_curiosity_coef(metrics)
        metrics = self._add_perfectionist_coef(metrics, submissions)
        metrics = self._add_max_wrong_tries(metrics, submissions)
        metrics = self._add_max_views_one_step(metrics, events)
        metrics = self._add_enrolled_date_dayname(metrics, events)
        metrics = self._add_enrolled_day_hour(metrics, events)
        metrics = self._add_avg_period_between_correct_submissions(metrics, submissions)
        metrics = self._add_period_from_first_action_to_first_submission(metrics, events, submissions)
        metrics = self._add_period_from_last_submission_to_last_action(metrics, events, submissions)
        metrics = self._add_correct_combo_length(metrics, submissions)
        metrics = self._add_difficulty_coef(metrics, submissions)
        metrics = self._add_activity_data(metrics, events, submissions)
        return metrics
        
    @classmethod
    def process_train_data(cls, events: pd.DataFrame, submissions: pd.DataFrame) -> tuple:
        metrics_builder = cls(events, submissions)
        finished_users = metrics_builder.get_finished_users()
        y_train = finished_users[['user_id', 'passed_course']].drop_duplicates()
        unique_users = finished_users.drop(['passed_course', 
                                            'step_id', 'timestamp', 
                                            'submission_status'], axis=1).drop_duplicates('user_id')
        
        first_events, first_submissions = metrics_builder.get_first_days_chunk(unique_users, events, submissions)
        return metrics_builder.build_metrics_df(first_events, first_submissions), y_train
        
    @classmethod
    def process_test_data(cls, events: pd.DataFrame, submissions: pd.DataFrame) -> pd.DataFrame:
        metrics_builder = cls(events, submissions)
        return metrics_builder.build_metrics_df()
        

In [2]:
import random
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier


class Predictor:
    
    def __init__(self, X_train, y_train, 
                 X_test=None, y_test=None, 
                 test_size=0.33, random_state=42, 
                 scoring='roc_auc',
                 scoring_func=lambda fact, pred: roc_auc_score(fact, pred)):
        if X_test is None or y_test is None:
            X_train_, X_test_, y_train_, y_test_ = train_test_split(X_train, y_train, 
                                                                    test_size=test_size, 
                                                                    random_state=random_state,
                                                                    stratify=y_train)
            self.X_train = X_train_
            self.X_test = X_test_
            self.y_train = y_train_
            self.y_test = y_test_
        else:
            self.X_train = X_train
            self.X_test = X_test
            self.y_train = y_train
            self.y_test = y_test
        
        self.scoring = scoring
        self.scoring_func = scoring_func
        self._classifiers_result_log = []
    
    def train_randomized(self, params, random_state=42) -> tuple:    
        rf = RandomForestClassifier(random_state=random_state)
        rscv = RandomizedSearchCV(rf, params, n_jobs=-1, random_state=random_state, scoring=self.scoring)
        rscv.fit(self.X_train, self.y_train)
        est = rscv.best_estimator_
        features = pd.DataFrame({"features": list(self.X_train), "importances": est.feature_importances_})
        return self.scoring_func(self.y_test, est.predict_proba(self.X_test)[:, 1]), est, features.sort_values('importances', ascending=False)

    def train_detailed(self, params, random_state=42) -> tuple:
        rf = RandomForestClassifier(random_state=random_state)
        rscv = GridSearchCV(rf, params, n_jobs=-1, scoring=self.scoring)
        rscv.fit(self.X_train, self.y_train)
        est = rscv.best_estimator_
        features = pd.DataFrame({"features": list(self.X_train), "importances": est.feature_importances_})
        return self.scoring_func(self.y_test, est.predict_proba(self.X_test)[:, 1]), est, features.sort_values('importances', ascending=False)
    
    def choose_best_classifier(self, total_counts=100, estimates_per_random_state=10, 
                               accept_score_better_than=0, skip_detailed_training=False):
        random_states = [random.randint(1, 1000000) for _ 
                         in range(math.ceil(total_counts/estimates_per_random_state))]
        
        params = {
                  "n_estimators": range(10, 201, 2), 
                  "max_depth": range(5, 50),
                  "min_samples_leaf": range(10, 60),
                  "min_samples_split": range(10, 60),
                  "criterion": ["gini", "entropy"],
                  "bootstrap": [True, False]
                 }
        
        print("Starting randomized training...")
        already_added = set()
        for random_state in random_states:
            scores = []
            for _ in range(estimates_per_random_state):
                score, est, feature_importances = self.train_randomized(params, random_state=random_state)                                                                
                if score < accept_score_better_than:
                    print("Bad score:", score, "use next random state...")
                    break

                scores.append(score)
                if score not in already_added:
                    self._classifiers_result_log.append((score, est, feature_importances, random_state))
                    already_added.add(score)

            if scores:
                print("Randomized training done for random state:", random_state, "with max score", max(scores))
        
        score, est, features, random_score = max(self._classifiers_result_log, key=lambda x: x[0])
        if skip_detailed_training:
            return score, est, features, random_score

        print("Detailed training...")
        est_range = 5
        depth_range = 3
        leafs_range = 3
        split_range = 3
        new_params = {
                  "n_estimators": range(est.n_estimators - est_range, est.n_estimators + est_range), 
                  "max_depth": range(est.max_depth - depth_range, est.max_depth + depth_range), 
                  "min_samples_leaf": range(est.min_samples_leaf - leafs_range, est.min_samples_leaf + leafs_range),
                  "min_samples_split": range(est.min_samples_split - split_range, est.min_samples_split + split_range),
                  "criterion": ["gini", "entropy"],
                  "bootstrap": [True, False]
        }
        new_score, new_est, new_features = self.train_detailed(new_params, random_state=random_score)
        print("Old score:", score, "new score:", new_score, "difference:", new_score - score)
        return new_score, new_est, new_features, random_score
    

In [3]:
def pick_similar_users(train_ABT: pd.DataFrame, test_ABT: pd.DataFrame, features: list, key='user_id') -> list:
    """
    Pick from train set similar users to test set and return list with their keys.
    Similarity is based on the given list of features.
    
    <features> - list with features (in order of descending importance)
    <key> - name of key field in row
    """
    
    assert train_ABT.shape[0], "Empty train set"
    assert test_ABT.shape[0], "Empty test set"
    assert features, "Empty features list"
    assert key, "Empty row key"
    
    work_array = []
    TEST_MARK = 0
    TRAIN_MARK = 1
    for row in test_ABT.itertuples():
        row_features = [getattr(row, feature) for feature in features] + [TEST_MARK, getattr(row, key)]
        work_array.append(row_features)
    for row in train_ABT.itertuples():
        row_features = [getattr(row, feature) for feature in features] + [TRAIN_MARK, getattr(row, key)]
        work_array.append(row_features)
    work_array.sort(reverse=True)
    
    final_user_ids = []
    ready_user_ids = []
    test_debt = 0
    for row in work_array:
        *args, mark, key_value = row
        if mark == TRAIN_MARK:
            if test_debt > 0:
                final_user_ids.append(key_value)
                test_debt -= 1
            else:
                ready_user_ids.append(key_value)
        else:
            if ready_user_ids:
                final_user_ids.append(ready_user_ids.pop())
            else:
                test_debt += 1
    
    assert test_ABT.shape[0] == len(final_user_ids), "Unique pair for some user was not found"
    return final_user_ids


# Download train and test data

In [4]:
train_events = pd.read_csv('https://stepik.org/media/attachments/course/4852/event_data_train.zip')
train_submissions = pd.read_csv('https://stepik.org/media/attachments/course/4852/submissions_data_train.zip')

In [5]:
test_events = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')
test_submissions = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')

# Prepare train and test data

In [6]:
X, y = MOOCMetricsBuilder.process_train_data(train_events, train_submissions)
test_ABT = MOOCMetricsBuilder.process_test_data(test_events, test_submissions)

# Pick users similar to test from train data

In [7]:
features = ['correct_submissions', 'viewed', 'active_hours', 'wrong_submissions']
similar_users_for_train = pick_similar_users(X, test_ABT, features)

In [8]:
X_train = X.sort_values('user_id').drop('user_id', axis=1)
y_train = y.sort_values('user_id').drop('user_id', axis=1).T.squeeze()

X_val = X[X.user_id.isin(similar_users_for_train)].sort_values('user_id').drop('user_id', axis=1)
y_val = y[y.user_id.isin(similar_users_for_train)].sort_values('user_id').drop('user_id', axis=1).T.squeeze()

X_train_res = X[~X.user_id.isin(similar_users_for_train)].sort_values('user_id').drop('user_id', axis=1)
y_train_res = y[~y.user_id.isin(similar_users_for_train)].sort_values('user_id').drop('user_id', axis=1).T.squeeze()

# Train our model, validate on picked similar users

In [9]:
# Specify set to test (use the similar ones)
predictor = Predictor(X_train, y_train, X_test=X_val, y_test=y_val)
threshold = 0.9
res = predictor.choose_best_classifier(accept_score_better_than=threshold)
res

Starting randomized training...
Bad score: 0.8942740137591055 use next random state...
Bad score: 0.8797894855216892 use next random state...
Bad score: 0.8923922672409855 use next random state...
Bad score: 0.8846164451291527 use next random state...
Bad score: 0.8922957604880809 use next random state...
Randomized training done for random state: 354983 with max score 0.9111697240204226
Bad score: 0.8973503354623391 use next random state...
Bad score: 0.8934335669950149 use next random state...
Bad score: 0.8937487683224289 use next random state...
Bad score: 0.8927801865418765 use next random state...
Detailed training...
Old score: 0.9111697240204226 new score: 0.9112700153518726 difference: 0.00010029133144995317


(0.9112700153518726,
 RandomForestClassifier(criterion='entropy', max_depth=20, min_samples_leaf=16,
                        min_samples_split=41, n_estimators=99,
                        random_state=354983),
                                    features  importances
 0                       correct_submissions     0.117928
 9                               all_correct     0.097092
 49                                   points     0.094293
 6                                    passed     0.083597
 5                                discovered     0.067924
 4                                    viewed     0.060283
 48                    success_series_length     0.051055
 52                             active_hours     0.044336
 51                         submissions_days     0.044322
 8                            curiosity_coef     0.042601
 45     avg_diff_between_correct_submissions     0.036020
 7                               viewer_coef     0.034677
 46  diff_between_first_action_and_t

# Show our model result

In [10]:
from sklearn.metrics import confusion_matrix, classification_report

score, est, features, rs = res
pred_y = est.predict(X_val)

print(confusion_matrix(y_val, pred_y))
print(classification_report(y_val, pred_y))

[[5455   58]
 [ 452  219]]
              precision    recall  f1-score   support

       False       0.92      0.99      0.96      5513
        True       0.79      0.33      0.46       671

    accuracy                           0.92      6184
   macro avg       0.86      0.66      0.71      6184
weighted avg       0.91      0.92      0.90      6184



In [11]:
users = test_ABT['user_id'].sort_values()
X_test = test_ABT.sort_values('user_id').drop('user_id', axis=1)

predictions = est.predict_proba(X_test)[:, 1]
answer = pd.DataFrame({'user_id': users, 'is_gone': predictions})

# Save result in file

In [12]:
answer = answer.reset_index().drop('index', axis=1)
answer.to_csv("stepik_answer.csv", index=False)