### 1. Загрузка библиотек и данных

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import featuretools as ft
from sklearn.feature_selection import GenericUnivariateSelect, mutual_info_classif, SelectFromModel

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
submission_data= pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')
submission_data.head(2)

Unnamed: 0,step_id,timestamp,submission_status,user_id
0,31971,1526800961,wrong,24370
1,31971,1526800976,wrong,24370


In [5]:
events_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')
events_data.head(2)

Unnamed: 0,step_id,timestamp,action,user_id
0,30456,1526893787,viewed,24417
1,30456,1526893797,viewed,24417


### 2. Первичный анализ данных, EDA

In [6]:
submission_data['day'] = pd.to_datetime(submission_data.timestamp, unit='s').dt.date
events_data['day'] = pd.to_datetime(events_data.timestamp, unit='s').dt.date

In [7]:
threshold = 2 * 24 * 60 * 60
cut_events_data = events_data[(events_data['timestamp'] < events_data.groupby('user_id')['timestamp'].transform('min') + threshold)]
users_events_data = cut_events_data.pivot_table(index='user_id',
                       columns='action',
                       values='step_id',
                       aggfunc='count',
                       fill_value=0).reset_index()

In [8]:
cut_submissions_data = submission_data[(submission_data['timestamp'] < 
                                        submission_data.groupby('user_id')['timestamp'].transform('min') + threshold)]
users_scores = cut_submissions_data.pivot_table(index='user_id',
                       columns='submission_status',
                       values='step_id',
                       aggfunc=lambda x: len(x.unique()),
                       fill_value=0).reset_index()

In [9]:
data = users_events_data.merge(users_scores,how='outer', on="user_id")

In [10]:
data['is_gone'] = data.correct.apply(lambda x: 1 if x>=40 else 0)

In [11]:
data = data.fillna(0)

### 3. Конструирование признаков

In [12]:
data['corr%'] = data.correct/(data.correct+data.wrong)

In [13]:
data = data.merge(submission_data[["user_id","day"]].drop_duplicates().groupby("user_id")[['day']].apply(list).apply(lambda x: len(x)).reset_index(),how='outer', on="user_id")

In [14]:
data = data.merge(events_data[["user_id","day"]].drop_duplicates().groupby("user_id")[['day']].apply(list).apply(lambda x: len(x)).reset_index(),how='outer', on="user_id")

In [16]:
data = data.merge (submission_data.groupby("user_id", as_index=False)[['step_id']].max(),how='outer', on = 'user_id')

In [17]:
data = data.merge (events_data.groupby("user_id", as_index=False)[['step_id']].max(),how='outer', on = 'user_id')

In [18]:
data.head(2)

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,correct,wrong,is_gone,corr%,0_x,0_y,step_id_x,step_id_y
0,4,1,1,0,1,0.0,0.0,0,,,1,,32815
1,6,1,1,0,1,0.0,0.0,0,,,1,,32815


In [19]:
data = data.rename(columns ={'0_x':'submission_day','0_y':'events_day',
                            'step_id_x':'max_step_sub','step_id_y':'max_step_events'}).fillna(0)

In [24]:
es = ft.EntitySet(id = 'index')
es = es.add_dataframe(
      dataframe_name="data",
      dataframe=data[[ 'discovered', 'passed', 'started_attempt', 'viewed',
        'wrong', 'corr%', 'submission_day', 'events_day',
       'max_step_sub', 'max_step_events']],
      index="index",
)

In [25]:
feature_matrix, feature_defs = ft.dfs(entityset = es, target_dataframe_name = 'data',
                                      trans_primitives = ['add_numeric', 'multiply_numeric'])

### 4. Отбор признаков

In [28]:
selector = GenericUnivariateSelect(score_func=mutual_info_classif, 
                                   mode='k_best', 
                                   param=6)
selector.fit(feature_matrix, data.is_gone)
pd.DataFrame(data={'score':selector.scores_,
                   'support':selector.get_support()}, 
             index=X.columns).sort_values(by='score',ascending=False).head(7)

Unnamed: 0,score,support
events_day,0.00986,True
submission_day,0.008889,True
max_step_sub,0.007016,True
events_day + max_step_sub,0.00673,True
max_step_sub * submission_day,0.006684,True
max_step_sub + submission_day,0.006677,True
events_day * submission_day,0.006108,False


### 5. Обучение модели

In [30]:
X = feature_matrix [['events_day','submission_day','max_step_sub', 
'events_day + max_step_sub', 
'max_step_sub * submission_day', 
'max_step_sub + submission_day']]
Y = data.is_gone

In [31]:
X_train, X_test, y_train, y_test = train_test_split(
 X, Y, test_size=0.3, random_state=42)

In [32]:
parametr = parametr = {'n_estimators': range(1,10,2),'max_depth':range(1,12,2),
                       'min_samples_leaf':range(1,7), 'min_samples_split':range(2,9,2)}

In [33]:
grid_forest=GridSearchCV(RandomForestClassifier(random_state=42),parametr, n_jobs=-1,cv=5)

In [34]:
grid_forest.fit(X_train,y_train)

In [35]:
grid_forest_best= grid_forest.best_estimator_

In [36]:
roc_auc_score(y_test, grid_forest_best.predict_proba(X_test)[:, 1])

0.875876956287102

In [41]:
predict_proba = grid_forest_best.predict_proba(X)[:, 1]

In [42]:
stepik = pd.DataFrame({'user_id':data.user_id,'is_gone':predict_proba})

In [43]:
stepik.to_csv('file.csv')