In [24]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

In [4]:
# Load the raw contest tables (events and submissions for train and test).
# Forward slashes are used because '.\S...' contains the invalid escape
# sequence '\S' and only resolves as a path separator on Windows.
event_train = pd.read_csv('./StepikMLcontest/event_data_train.csv')
submission_train = pd.read_csv('./StepikMLcontest/submissions_data_train.csv')
event_test = pd.read_csv('./StepikMLcontest/events_data_test.csv')
submission_test = pd.read_csv('./StepikMLcontest/submission_data_test.csv')

In [None]:
# Creating a dataset with basic features from two tables

In [9]:
def base_feat(event_train, submission_train):
    """Build a per-user table of event-action counts and submission-status counts.

    Parameters
    ----------
    event_train : pandas.DataFrame
        Must contain the columns ``user_id``, ``action`` and ``step_id``.
    submission_train : pandas.DataFrame
        Must contain the columns ``user_id``, ``submission_status`` and ``step_id``.

    Returns
    -------
    pandas.DataFrame
        One row per user: ``user_id``, one count column per distinct action,
        and one count column per distinct submission status. Users missing
        from one of the two tables get 0 in that table's columns.

    Raises
    ------
    AssertionError
        If the number of distinct users in the result differs from the
        number of distinct users in ``event_train``.
    """
    def _count_by(frame, category):
        # Count step_id rows per (user_id, category value); absent pairs become 0.
        return frame.pivot_table(values='step_id', index='user_id',
                                 columns=category, aggfunc='count',
                                 fill_value=0).reset_index()

    actions_per_user = _count_by(event_train, 'action')
    statuses_per_user = _count_by(submission_train, 'submission_status')

    # Outer join keeps users present in only one table; their gaps are zero-filled.
    users_data = actions_per_user.merge(statuses_per_user, on='user_id', how='outer').fillna(0)
    assert users_data.user_id.nunique() == event_train.user_id.nunique()
    return users_data

In [630]:
# keeping only each user's records from the first `days` days after their first timestamp

In [10]:
def time_fil(data, days=2):
    """Keep only the rows that fall within ``days`` days of each user's first row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``user_id`` and ``timestamp``; the window arithmetic
        adds a number of seconds to ``timestamp``.
    days : int or float, default 2
        Length of the window, counted from the user's minimum ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` with ``timestamp <= first timestamp + window``
        (the boundary is inclusive), without any helper column.

    Raises
    ------
    AssertionError
        If the filtered frame does not contain the same number of distinct
        users as ``data``.
    """
    # Window length expressed in seconds.
    window_seconds = days * 24 * 60 * 60

    # Earliest timestamp per user, attached to every row of that user.
    first_seen = data.groupby('user_id')['timestamp'].min() \
        .rename('min_timestamp') \
        .reset_index()
    with_start = data.merge(first_seen, on='user_id', how='outer')

    in_window = with_start['timestamp'] <= with_start['min_timestamp'] + window_seconds
    kept = with_start[in_window]

    assert kept.user_id.nunique() == data.user_id.nunique()

    return kept.drop(columns=['min_timestamp'])

In [649]:
# building the target: a user is considered to have passed the course
# if they made at least 'threshold' correct submissions


In [11]:
def target(submission_data, threshold=40):
    """Label users by whether their number of correct submissions reaches ``threshold``.

    Parameters
    ----------
    submission_data : pandas.DataFrame
        Must contain ``user_id``, ``submission_status`` and ``step_id``.
    threshold : int, default 40
        Minimum count of rows with status ``'correct'`` for a user to be
        labelled 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``passed_course`` (1 or 0). Only users with
        at least one ``'correct'`` row appear in the result.
    """
    is_correct = submission_data['submission_status'] == 'correct'

    # Number of correct submission rows per user (repeated steps are all counted).
    correct_per_user = submission_data.loc[is_correct].groupby('user_id')['step_id'].count()

    passed = (correct_per_user >= threshold).astype('int').rename('passed_course')
    return passed.reset_index()


In [12]:
# how many unique steps has the user tried

In [13]:
def tried(submission_data):
    """Count how many distinct steps each user has submitted to.

    Parameters
    ----------
    submission_data : pandas.DataFrame
        Must contain ``user_id`` and ``step_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``steps_tried`` (number of unique ``step_id``
        values for that user).
    """
    return (
        submission_data.groupby('user_id')['step_id']
        .nunique()
        .rename('steps_tried')
        .reset_index()
    )

In [679]:
# auxiliary feature - the share of correct answers among all graded (correct + wrong) submissions

In [14]:
def answers_ratio(data):
    """Add ``correct_ratio`` = correct / (correct + wrong) to a per-user table.

    The input frame is left untouched; a new frame with the extra column is
    returned. Rows where ``correct + wrong`` is 0 get a ratio of 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Per-user counts. The ``correct`` and ``wrong`` columns are used when
        present; a missing column is treated as all zeros (this happens when
        a dataset contains no submissions with that status).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an additional ``correct_ratio`` column.
    """
    def _counts(column):
        # A status that never occurs yields no pivot column; count it as zero.
        if column in data.columns:
            return data[column]
        return pd.Series(0, index=data.index)

    correct = _counts('correct')
    wrong = _counts('wrong')

    # 0 / 0 produces NaN in pandas; such users simply have no answers yet.
    ratio = (correct / (correct + wrong)).fillna(0)

    # assign() returns a new frame, so the caller's object is not mutated.
    return data.assign(correct_ratio=ratio)


In [677]:
# Generating X and y datasets with target variables

In [15]:
def create_df(events_data, submission_data, days=2, threshold=40):
    """Build the training feature matrix ``X`` and the target ``y``.

    Parameters
    ----------
    events_data : pandas.DataFrame
        Raw events table (``user_id``, ``timestamp``, ``action``, ``step_id``).
    submission_data : pandas.DataFrame
        Raw submissions table (``user_id``, ``timestamp``,
        ``submission_status``, ``step_id``).
    days : int or float, default 2
        Length of the per-user observation window passed to ``time_fil``.
    threshold : int, default 40
        Number of correct submissions passed to ``target``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` holds one row per user with ``user_id``, the action and status
        counts, ``correct_ratio`` and ``steps_tried``; ``y`` is the integer
        ``passed_course`` label aligned with ``X``.
    """
    # Restrict both tables to the first `days` days of each user.
    # NOTE(review): time_fil measures the submissions window from the user's
    # first *submission*, while the events window starts at the first *event*;
    # confirm this matches how the test data was cut.
    events_nd = time_fil(events_data, days=days)
    submissions_nd = time_fil(submission_data, days=days)

    # Basic count features plus the share of correct answers.
    users_data = answers_ratio(base_feat(events_nd, submissions_nd))

    # The target is computed on the full (unfiltered) submission history.
    users_target_feature = target(submission_data, threshold=threshold)

    # Number of distinct steps attempted inside the window.
    users_steps_tried = tried(submissions_nd)

    # Explicit join key: never silently join on other shared column names.
    features = users_data.merge(users_steps_tried, on='user_id', how='outer').fillna(0)

    # Users without any correct submission are absent from the target table;
    # the outer join plus fillna(0) labels them 0.
    labelled = features.merge(users_target_feature, on='user_id', how='outer').fillna(0)

    # Separate the target from the features.
    y = labelled['passed_course'].astype(int)
    X = labelled.drop(columns=['passed_course'])

    return X, y

In [16]:
# Formation of a test dataset without a target

In [17]:
def create_test_df(events_data, submission_data, days=2):
    """Build the feature matrix for data that has no target.

    Parameters
    ----------
    events_data : pandas.DataFrame
        Raw events table (``user_id``, ``timestamp``, ``action``, ``step_id``).
    submission_data : pandas.DataFrame
        Raw submissions table (``user_id``, ``timestamp``,
        ``submission_status``, ``step_id``).
    days : int or float, default 2
        Length of the per-user observation window passed to ``time_fil``.

    Returns
    -------
    pandas.DataFrame
        One row per user with ``user_id``, the action and status counts,
        ``correct_ratio`` and ``steps_tried``.
    """
    # Restrict both tables to the first `days` days of each user.
    events_nd = time_fil(events_data, days=days)
    submissions_nd = time_fil(submission_data, days=days)

    # Basic count features plus the share of correct answers.
    users_data = answers_ratio(base_feat(events_nd, submissions_nd))

    # Number of distinct steps attempted inside the window.
    users_steps_tried = tried(submissions_nd)

    # Explicit join key; users without submissions get steps_tried = 0.
    X = users_data.merge(users_steps_tried, on='user_id', how='outer').fillna(0)

    return X


In [20]:
# Build the training feature matrix and its target from the train tables.
X_train, y = create_df(event_train, submission_train)

# Build the feature matrix for the test tables (no target is produced here).
X_test = create_test_df(event_test, submission_test)

In [61]:
def random_with_grid(train_data, y, size=0.20):
    """Grid-search a random forest and report its hold-out performance.

    The data is split into a fitting part and a hold-out part; a 5-fold
    ``GridSearchCV`` over ``n_estimators`` and ``max_depth`` runs on the
    fitting part, and the refitted best model is evaluated on the hold-out
    part. Results are printed; nothing is returned.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Feature matrix.
    y : array-like
        Binary target aligned with ``train_data``.
    size : float, default 0.20
        Fraction of the data held out for evaluation.
    """
    X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=size, random_state=42)

    param_grid = {'randomforestclassifier__n_estimators': range(20, 100, 2),
                  'randomforestclassifier__max_depth': range(1, 14)}

    # Seed the forest so repeated runs select the same parameters and scores.
    pipe = make_pipeline(RandomForestClassifier(random_state=42))

    # GridSearchCV clones and fits the pipeline itself (and refits the best
    # candidate), so no separate pipe.fit() is needed beforehand.
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    print(f"best_params: {grid.best_params_}")

    # Probability of the positive class on the hold-out part.
    ypred_prob = grid.predict_proba(X_test)

    roc_score = roc_auc_score(y_test, ypred_prob[:, 1])
    # grid.score uses the search's scorer (the estimator's default: accuracy).
    score = grid.score(X_test, y_test)
    print(f"test_score: {score:.2f}")
    print(roc_score)



In [62]:
# Run the grid search on the training features; prints the best parameters and the hold-out scores.
random_with_grid(X_train,y)

best_params: {'randomforestclassifier__max_depth': 4, 'randomforestclassifier__n_estimators': 60}
test_score: 0.90
0.8735898880255601


In [None]:
# Final training on the training dataset with the best parameters found above,
# then writing predict_proba for the test dataset to a csv file

In [63]:
def random_final(train_data, y, test_data, size=0.20, max_depth=4, n_estimators=60):
    """Fit a calibrated random forest, report validation metrics and save predictions.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training feature matrix.
    y : array-like
        Binary target aligned with ``train_data``.
    test_data : pandas.DataFrame
        Feature matrix to predict on; must contain ``user_id``.
    size : float, default 0.20
        Fraction of the training data held out for validation.
    max_depth : int, default 4
        ``max_depth`` of the random forest.
    n_estimators : int, default 60
        ``n_estimators`` of the random forest.

    Side effects
    ------------
    Prints accuracy and ROC AUC on the validation part and writes
    ``./StepikMLcontest/prediction_<roc_auc>.csv`` with the columns
    ``user_id`` and ``is_gone`` (predicted probability of class 1).
    """
    # Present the test features in exactly the training column order; a
    # column absent from the test data (e.g. a status that never occurred)
    # is added as zeros. This is a no-op when the columns already match.
    if hasattr(train_data, 'columns'):
        test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

    test_data = test_data.sort_values('user_id')

    X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=size, random_state=42)

    # Seeded forest so the reported metrics and the saved file are reproducible.
    forest = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=42)
    calibrated_clf = CalibratedClassifierCV(forest, method='isotonic', cv=5)
    calibrated_clf.fit(X_train, y_train)

    ypred_prob = calibrated_clf.predict_proba(X_test)

    roc_score = roc_auc_score(y_test, ypred_prob[:, 1])
    score = calibrated_clf.score(X_test, y_test)
    print(f"Score on valid: {score:.3f}")
    print(f"Roc_auc_score on valid: {roc_score:.5f}")

    ypred_prob_final = calibrated_clf.predict_proba(test_data)
    result = test_data['user_id'].to_frame()
    result['is_gone'] = ypred_prob_final[:, 1]
    # Forward slash: '.\S...' is an invalid escape sequence and is not a
    # directory separator outside Windows.
    result[['user_id', 'is_gone']].to_csv(f'./StepikMLcontest/prediction_{roc_score:.5f}.csv', index=False)
    print(f'The results are recorded in a file prediction_{roc_score:.5f}.csv')


In [64]:
# Fit the calibrated forest, print the validation metrics and write the test predictions to a csv file.
random_final(X_train, y, X_test)

Score on valid: 0.902
Roc_auc_score on valid: 0.87343
The results are recorded in a file prediction_0.87343.csv
