https://www.kaggle.com/c/linking-writing-processes-to-writing-quality

# Imports

In [None]:
import numpy as np 
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures

import lightgbm as lgb

from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

from collections import Counter

import datetime
import re
import os

In [1]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


pd.set_option('display.max_rows', 200)



/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv
/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv


# Pipeline steps

In [2]:
%%time
def catcolumns(logs):
    #корректировка колонки move from ....
    print('catcolumns')
    logs['activity2'] = logs.activity.map(lambda x: 'Move' if x[:4] == 'Move' else x)

    #из нажатия клавиш сократить редкоиспользуемые в новое поле key
    
    logs.loc[logs.down_event.str.startswith('Arrow'), 'down_event'] = 'Arrows'
    
    for s in ['.', ',', '?', ';', ':', '!', '-']:
        logs.loc[logs.down_event==s, 'down_event'] = 'punc_marks'
    for s in ['"', '\'', '(', ')', '[', ']', '/', '\\', '%', '+', '=']:
        logs.loc[logs.down_event==s, 'down_event'] = 'other_marks'
        
        
    downs = logs.down_event.value_counts()
    logs['key'] = 'Other'

    for ind in downs.index:
        
        logs.loc[#(logs.activity!='Input') &
                 (logs.down_event == ind), 'key'] = logs['down_event']

        if downs[ind] < 500:
            break
            
    #из замены текста сократить редкоиспользуемые в новое поле txt

    logs['txt_ch'] = logs.text_change.map(lambda x: 'change' if '=>' in x else 'Other')

    logs.loc[logs.activity=='Input', 'txt_ch'] = 'Input'
    
    for s in ['.', ',', '?', ';', ':', '!', '-']:
        logs.loc[logs.text_change==s, 'txt_ch'] = 'punc_marks'
              
    for s in ['"', '\'', '(', ')', '[', ']', '/', '\\', '%', '+', '=']:
        logs.loc[logs.text_change==s, 'txt_ch'] = 'other_marks'
    
    
    logs.loc[logs.text_change==' ',  'txt_ch'] = 'Space'
    logs.loc[logs.text_change=='\n', 'txt_ch'] = 'Enter'
    logs.loc[logs.text_change=='NoChange', 'txt_ch'] = 'NoChange'
    logs.loc[logs.text_change=='q', 'txt_ch'] = 'q'

    tc = logs[logs.activity!='Input'].txt_ch.value_counts()

    for ind in tc.index:
        logs.loc[(logs.text_change==ind) & (logs.activity!='Input'),
                 'txt_ch'] = logs['text_change']

        if tc[ind] < 1000:
            break
              
    return logs

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.3 µs


In [3]:
#добавление признака - непрерывный ввод текста. 
def inputwave(df):
    
    rename_cols = {}
    for col in df.columns:
        if col[0:11]=='remainder__':
            rename_cols[col] = col[11:]
    logs = df.rename(columns=rename_cols)
    
    logs['input'] = logs.activity.map(lambda x: 1 if x=='Input' else 0).astype('int8')
    
    #исключить 1 НЕ инпут событие внутри волны. м.б. опечатка 
    logs['input_pr_or_nx'] = (((logs['input'].shift(periods=-1)==1) &
                               (logs['input'].shift(periods=1)==1)) * 1 ).astype('int8')
    logs['input_f'] = (((logs['input']==1) | (logs['input_pr_or_nx']==1))* 1).astype('int8')
    logs['in_diff'] = logs['input_f'].diff().fillna(0).astype('int8')
    
    logs.loc[(logs.event_id==1), 'in_diff'] = -1
    logs.loc[(logs.event_id==1) &  (logs['input']==1), 'in_diff'] = 1

    # теперь in_diff = 1 для начала волны инпутов и 0 внутри волны. 
    # зафиксировать номер события для начала волны инпутов 
    logs['id_input'] = (logs.in_diff * logs.event_id).fillna(0).astype('int32')

    # расставим номера событий для каждой волны, чтобы потом по ним группировать

    i = 0
    while logs.loc[(logs.id_input==0)]['input'].count()!=0:
        i += 1
        logs.loc[(logs.in_diff==0) & (logs.id_input==0),
                 'id_input'] = logs.id_input.shift(periods=1)

    print(i, 'окончание inputwave')
    return logs.drop(columns=['input_pr_or_nx'])

In [4]:
def addcolumns(logs):

    # паузы между событиями. как в мсек, так и логикал
    logs['time_lag'] = (logs.down_time - logs.up_time.shift(periods=1)).fillna(0)
    logs.loc[logs.event_id==1, 'time_lag'] = 0
    
    logs['pauses'] = (logs.down_time - logs.down_time.shift(periods=1)).fillna(0)
    logs.loc[logs.event_id==1, 'pauses'] = 0
    
    logs['pauses'] = logs.pauses.map(lambda x: x if 0 < x else 0).astype('int32')
    logs['hs_pauses'] = logs.pauses.map(lambda x: x if 0 < x <= 500 else 0).astype('int32')
    logs['1s_pauses'] = logs.pauses.map(lambda x: x if  500 < x <= 1000 else 0).astype('int32')
    logs['2s_pauses'] = logs.pauses.map(lambda x: x if 1000 < x <= 2000 else 0).astype('int32')
    logs['3s_pauses'] = logs.pauses.map(lambda x: x if 2000 < x <= 3000 else 0).astype('int32')
    #logs['4s_pauses'] = logs.pauses.map(lambda x: x if 3000 < x <= 4000 else 0).astype('int32')
    logs['5s_pauses'] = logs.pauses.map(lambda x: x if 4000 < x <= 5000 else 0).astype('int32')
    logs['30s_pauses'] = logs.pauses.map(lambda x: x if 5000 < x <= 30000 else 0).astype('int32')
    #logs['1m_pauses'] = logs.pauses.map(lambda x: x if 30000 < x <= 60000 else 0).astype('int32')
    logs['2m_pauses'] = logs.pauses.map(lambda x: x if 60000 < x <= 120000 else 0).astype('int32')
    logs['big_pauses'] = logs.pauses.map(lambda x: x if 30000 < x else 0).astype('int32')
    
    logs['less5s_pauses'] = logs.pauses.map(lambda x: x if 500 < x <= 5000 else 0).astype('int32')
    logs['less10s_pauses'] = logs.pauses.map(lambda x: x if 500 < x <= 10000 else 0).astype('int32')
    
    logs['more5s_pauses'] = logs.pauses.map(lambda x: x if 5000 < x else 0).astype('int32')
    logs['more10s_pauses'] = logs.pauses.map(lambda x: x if 10000 < x else 0).astype('int32')
    
    
    logs['no_pause']  = logs.pauses.map(lambda x: 1 if x <= 2000 else 0).astype('int8')
    logs['hs_pause'] = logs.pauses.map(lambda x: 1 if 0 < x <= 500 else 0).astype('int8')
    logs['1s_pause'] = logs.pauses.map(lambda x: 1 if  500 < x <= 1000 else 0).astype('int8')
    logs['2s_pause'] = logs.pauses.map(lambda x: 1 if 1000 < x <= 2000 else 0).astype('int8')
    logs['3s_pause'] = logs.pauses.map(lambda x: 1 if 2000 < x <= 3000 else 0).astype('int8')
    #logs['4s_pause'] = logs.pauses.map(lambda x: 1 if 3000 < x <= 4000 else 0).astype('int8')
    logs['5s_pause'] = logs.pauses.map(lambda x: 1 if 4000 < x <= 5000 else 0).astype('int8')
    logs['30s_pause'] = logs.pauses.map(lambda x: 1 if 5000 < x <= 30000 else 0).astype('int8')
    #logs['1m_pause'] = logs.pauses.map(lambda x: 1 if 30000 < x <= 60000 else 0).astype('int8')
    logs['2m_pause'] = logs.pauses.map(lambda x: 1 if 60000 < x <= 120000 else 0).astype('int8')
    logs['big_pause'] = logs.pauses.map(lambda x: 1 if 120000 < x else 0).astype('int8')
    
    logs['less5s_pause'] = logs.pauses.map(lambda x: 1 if 500 < x <= 5000 else 0).astype('int32')
    logs['less10s_pause'] = logs.pauses.map(lambda x: 1 if 500 < x <= 10000 else 0).astype('int32')
    
    logs['more5s_pause'] = logs.pauses.map(lambda x: 1 if 5000 < x else 0).astype('int32')
    logs['more10s_pause'] = logs.pauses.map(lambda x: 1 if 10000 < x else 0).astype('int32')
    
    #action_time for inputs only
    logs['inp_ac_time'] = logs['action_time'].astype('int32')
    logs.loc[logs.activity!='Input', 'inp_ac_time'] = 0

    logs['non_inp_ac_time'] = logs['action_time'].astype('int32')
    logs.loc[logs.activity=='Input', 'inp_ac_time'] = 0
    
    
    #добавилось слово
    logs['chnword'] = (logs.word_count - logs.word_count.shift(periods=1)
                      ).fillna(0).astype('int16')
    logs.loc[(logs.event_id==1), 'addword'] = logs.word_count

    logs.loc[logs.addword>0, 'addword'] = logs['chnword'].astype('int16')
    logs.loc[logs.addword<0, 'delword'] = logs['chnword'].astype('int16')
    
    return logs

In [5]:
# формирование эссе, обработка его, формирование dataseta
def f_dataset(df):
    logs = df.fillna(0).copy()
    
    # формирование будущих фичей: списка колонок и агрегирующих функций для каждой колонки
    agg_func = {}
    
    for col in logs.columns:
        if (col[0:4]=='cat_') | (col[-5:]=='pause'):
            agg_func[col] = ['sum', 'mean']
        elif (col[-6:]=='pauses'):
            agg_func[col] = ['sum', 'mean', 'max', 'min', 'std', 'var', 'quantile'] # , 'std'
    
    for col in ['cursor_position', 'word_count',
                'action_time', 'inp_ac_time', 
                #'non_inp_ac_time', 'delword'
                'addword', 'time_lag']: # 'chnword',
        agg_func[col] = ['sum', 'mean', 'max', 'std', 'var', 'quantile'] # , 'min'] # , 'std'
        
    agg_func['up_time'] = ['max']
    agg_func['down_time'] = ['min']
    agg_func['event_id'] = ['max']
    for u_col in ['activity', 'down_event', 'up_event', 'text_change',
             'cursor_position']:
        agg_func[u_col] = 'nunique'
        
    dataset = logs.groupby(by='id').agg(agg_func)

    new_cols = []
    for col in dataset.columns:
        new_cols.append(col[0]+'_'+col[1])
    dataset.columns=new_cols
    
    dataset['alltime'] = dataset.up_time_max - dataset.down_time_min
    
#    print('относительные показатели по эссе')
    dataset['all_t_fin_word'] = (dataset['alltime'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['all_t_inp_word'] = (dataset['alltime'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
    dataset['all_t_event']    = (dataset['alltime'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)

    dataset['ev_inp_word'] = (dataset['event_id_max'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)

    dataset['inp_fin_word'] = (dataset['addword_sum'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)

    
    dataset['clean_time'] = (dataset['alltime'] - dataset['pauses_sum']).replace([np.inf, -np.inf], 0)
    
    dataset['cl_t_fin_word'] = (dataset['clean_time'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['cl_t_inp_word'] = (dataset['clean_time'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
    dataset['cl_t_event']    = (dataset['clean_time'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    
    dataset['inp_event']    = (dataset['inp_ac_time_sum'] / dataset['cat__activity2_Input_sum']).replace([np.inf, -np.inf], 0)
    
#    print('inputs. количество событий инпут подряд (одно событие НЕинпут не считается)')
    
    inputs = logs[logs.id_input>0] \
        .groupby(by=['id', 'id_input']) \
        .agg(n_words=('addword', 'sum'), n_symb=('input', 'sum')) \
        .reset_index()

    inputs['symb_word'] = (inputs['n_symb'] / inputs[('n_words')]).replace([np.inf, -np.inf], 0)    
    
    inputs_df = inputs.groupby(by='id') \
        .agg(inputs=('id_input', 'count'),
             n_words_mean1=('n_words', 'mean'), n_words_max1=('n_words','max'),
             n_words_sum1=('n_words', 'sum'),
             n_symbs_mean1=('n_symb', 'mean'), n_symbs_max1=('n_symb','max'),
             symb_word_mean1=('symb_word', 'mean'), symb_word_max1=('symb_word','max')
             )

    dataset = pd.merge(dataset, inputs_df, how='left', left_index=True, right_index=True)

    print('Эссе')
    bad_inputs = set(logs.loc[(logs.activity=='Input') & (logs.text_change!='q') & 
                              (logs.text_change.str.len()>1), 'text_change'
                             ].values
                    ).union({'Â´', 'Ä±', 'Å\x9f', 'Ë\x86', 'â\x80\x93', 'â\x80\x99'
                            })
    
    for s in bad_inputs:
        #print(s)
        logs.loc[(logs.activity=='Input') & (logs.text_change==s), 'text_change'] = 'q'
        logs.loc[(logs.activity!='Input') & (logs.text_change.str.contains(s)),
                 'text_change'] = logs.text_change.str.replace(s, 'q')
    
    #Анализ текстов эссе

    symb_cols = {}
    symb_cols['.']  = 'dot'
    symb_cols[' ']  = 'space'
    symb_cols[',']  = 'comma'
    symb_cols['\n'] = 'enter'
    symb_cols['\''] = 'quot'
    symb_cols['"']  = 'dblquot'
    symb_cols['-']  = 'dash'
    symb_cols['?']  = 'question'
    symb_cols[';']  = 'semicolon'
    symb_cols['(']  = 'bracket1'
    symb_cols[')']  = 'bracket2'
    symb_cols[':']  = 'colon'
    symb_cols['!']  = 'excl'
    symb_cols['/']  = 'slash'
    symb_cols['%']  = 'percent'
    symb_cols_titl = []

    for key in symb_cols.keys():
        symb_cols_titl.append('symb_' + symb_cols[key]) 
    
    for key in range(1, 22):
        col = 'q' + str(key)
        symb_cols_titl.append(col)

    q_symb_data = []
    esse_data = []

    i=0
    dataset['text'] = ' '
    for esse_id in sorted(logs.id.unique()):
        #if i % 500 == 0:
        #    print(i)
        #i=i+1

        idActivity = logs[(logs.id==esse_id) & (logs.activity!='Nonproduction')][
            ['activity', 'cursor_position', 'text_change', 'event_id']].copy()

        esse_Text = ""

        for event, position, text_change, event_id in idActivity.values:
            if event == 'Input':
                esse_Text = (esse_Text[:position-len(text_change)]
                             + text_change + 
                             esse_Text[position-len(text_change):])

            elif event == 'Remove/Cut':

                esse_Text = esse_Text[:position] + esse_Text[position+len(text_change):]

            elif event == 'Replace':
                asis, tobe = text_change.split(' => ')

                esse_Text = (esse_Text[:position-len(tobe)]
                             + tobe + 
                             esse_Text[position-len(tobe)+len(asis):])

            elif event == 'Paste':
                esse_Text = (esse_Text[:position-len(text_change)]
                             + text_change + 
                             esse_Text[position-len(text_change):])

            elif 'Move' in event:
                coords_from, coords_to = event[11:-1].split('] To [')
                coords_from = [int(x) for x in coords_from.split(', ')]
                coords_to = [int(x) for x in coords_to.split(', ')]

                # .........coords_from[0]-text_change-coords_from[1]......coords_to[0][1]....
                # .........coords_from[0][1]......coords_to[0]-text_change-[1]....
                if coords_from[0] < coords_to[0]:
                    esse_Text = (esse_Text[:coords_from[0]] + esse_Text[coords_from[1]:coords_to[0]+len(text_change)]
                                 + text_change +
                                 esse_Text[coords_to[0]+len(text_change):])

                # ........coords_to[0][1].......coords_from[0]-text_change-coords_from[1].......
                # ........coords_to[0]-text_change-[1]......coords_from[0][1]......
                elif coords_from[0] > coords_to[0]:
                    esse_Text = (esse_Text[:coords_to[0]] + text_change +
                                 esse_Text[coords_to[0]:coords_from[0]] + esse_Text[coords_from[1]:])

        #все события одного эссе обработаны. Эссе в переменной esse_Text

        #подсчёт слов разной длины
        dbl_article = esse_Text.count('\n\n')
        t_dots = esse_Text.count('...')
        esse_Text = esse_Text.replace('\n\n', '\n')
        esse_Text1 = esse_Text.replace('...', '.')

        symb_data = []

        for s in symb_cols.keys():
            symb_data.append(esse_Text1.count(s))

            esse_Text1 = esse_Text1.replace(s, ' ')

        ss = set(esse_Text)
        ss.discard('q')
        for s in ss:
            esse_Text1 = esse_Text1.replace(s, ' ')


        result = Counter(esse_Text1.split())
        w3plus = 0
        w3plus_s = 0

        q_list =[]

        for key in range(1, 3):
            col = 'q'*key
            symb_data.append(result[col])

        for key in range(3, 22):
            col = 'q'*key
            symb_data.append(result[col])
            w3plus += result[col]
            w3plus_s += result[col]*key


        esse_data.append([dbl_article, t_dots, w3plus, w3plus_s, len(esse_Text)])
        q_symb_data.append(symb_data)
        
        dataset.loc[esse_id, ['text']] = esse_Text


    # Все эссе обработаны
    dataset[['dbl_article', 't_dots', 'w3plus', 'w3plus_s', 'fin_len']] = esse_data
    dataset[symb_cols_titl] = q_symb_data
    
    # абзацы. фичи по абзацам. кол-во слов/предложений в них
    dataset['paragraph'] = dataset['text'].apply(lambda x: x.split('\n'))
    
    par_df = dataset[['paragraph']].explode('paragraph')
    par_df['par_sent']  = par_df['paragraph'].apply(lambda x: len(re.split("\.|\?|\!", x)))
    par_df['par_len']   = par_df['paragraph'].str.len()
    par_df['par_words'] = par_df['paragraph'].apply(lambda x: len(x.split(' ')))

    agg_func = ['sum', 'mean', 'first', 'last', 'max', 'std', 'var', 'quantile', 'count']
    dataset1 = par_df.drop(columns=['paragraph']).groupby(level=0).agg(agg_func)
    
    new_cols = []
    for col in dataset1.columns:
        new_cols.append(col[0]+'_'+col[1])
    dataset1.columns=new_cols    

    dataset = pd.merge(dataset,
                       dataset1,
                       how='left', left_index=True, right_index=True)
   

    #предложения   
    dataset['sentenses_list'] = dataset['text'].apply(lambda x: re.split("\.|\?|\!", x))

    par_df = dataset[['sentenses_list']].explode('sentenses_list')
    par_df['sent_len']   = par_df['sentenses_list'].str.len()
    par_df['sent_words'] = par_df['sentenses_list'].apply(lambda x: len(x.split(' ')))

    agg_func = ['sum', 'mean', 'first', 'last', 'max', 'std', 'var', 'quantile', 'count']
    dataset1 = par_df.drop(columns=['sentenses_list']).groupby(level=0).agg(agg_func)
    
    new_cols = []
    for col in dataset1.columns:
        new_cols.append(col[0]+'_'+col[1])
    dataset1.columns=new_cols    

    dataset = pd.merge(dataset,
                       dataset1,
                       how='left', left_index=True, right_index=True)
    
    #new_time_feats. вычисляем по каждому эссе его статусы на 5й минуте, 10й, .... 30й
    
    first_events = logs[logs.event_id==1].set_index('id')
    
    logs['start_time'] = logs.id.map(first_events['down_time'])
    logs['timing'] = logs['down_time'] - logs['start_time']
    logs['period'] = logs.timing // 300000
    logs['period'] = logs.period.map(lambda x: 6 if x>6 else x)
    
    time_feats = logs.groupby(by=['id', 'period']).agg(
                        {'event_id': 'max', 
                         'cursor_position': 'max',
                         'word_count': 'max',
                         'input': 'sum'
                        }).reset_index()
    
    time_feats['add_words'] = time_feats.word_count.diff()
    time_feats['events'] = time_feats.event_id.diff()    
    time_feats.loc[time_feats.period==0, 'add_words'] = time_feats.word_count
    time_feats.loc[time_feats.period==0, 'events'] = time_feats.events
    
    dataset['00m_event_id'] = dataset.index.map(time_feats[time_feats.period==0].set_index('id')['event_id'])
    dataset['00m_cursor_max'] = dataset.index.map(time_feats[time_feats.period==0].set_index('id')['cursor_position'])
    dataset['00m_word_max'] = dataset.index.map(time_feats[time_feats.period==0].set_index('id')['word_count'])
    dataset['00m_add_words'] = dataset.index.map(time_feats[time_feats.period==0].set_index('id')['add_words'])
    dataset['00m_events'] = dataset.index.map(time_feats[time_feats.period==0].set_index('id')['events'])
    dataset['00m_inputs'] = dataset.index.map(time_feats[time_feats.period==0].set_index('id')['input'])
    dataset['00m_per_ev'] = (dataset['00m_events'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    dataset['00m_per_word'] = (dataset['00m_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['00m_per_input'] = (dataset['00m_inputs'] / dataset['00m_events']).replace([np.inf, -np.inf], 0)
       
    dataset['05m_event_id'] = dataset.index.map(time_feats[time_feats.period==1].set_index('id')['event_id'])
    dataset['05m_cursor_max'] = dataset.index.map(time_feats[time_feats.period==1].set_index('id')['cursor_position'])
    dataset['05m_word_max'] = dataset.index.map(time_feats[time_feats.period==1].set_index('id')['word_count'])
    dataset['05m_add_words'] = dataset.index.map(time_feats[time_feats.period==1].set_index('id')['add_words'])
    dataset['05m_events'] = dataset.index.map(time_feats[time_feats.period==1].set_index('id')['events'])
    dataset['05m_inputs'] = dataset.index.map(time_feats[time_feats.period==1].set_index('id')['input'])
    dataset['05m_per_ev'] = (dataset['05m_events'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    dataset['05m_per_word'] = (dataset['05m_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['05m_per_input'] = (dataset['05m_inputs'] / dataset['05m_events']).replace([np.inf, -np.inf], 0)
    
    dataset['10m_event_id']   = dataset.index.map(time_feats[time_feats.period==2].set_index('id')['event_id'])
    dataset['10m_cursor_max'] = dataset.index.map(time_feats[time_feats.period==2].set_index('id')['cursor_position'])
    dataset['10m_word_max']   = dataset.index.map(time_feats[time_feats.period==2].set_index('id')['word_count'])
    dataset['10m_add_words']  = dataset.index.map(time_feats[time_feats.period==2].set_index('id')['add_words'])
    dataset['10m_events'] = dataset.index.map(time_feats[time_feats.period==2].set_index('id')['events'])
    dataset['10m_inputs'] = dataset.index.map(time_feats[time_feats.period==2].set_index('id')['input'])
    dataset['10m_per_ev'] = (dataset['10m_events'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    dataset['10m_per_word']  = (dataset['10m_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['10m_per_input'] = (dataset['10m_inputs'] / dataset['10m_events']).replace([np.inf, -np.inf], 0)
       
    dataset['15m_event_id']   = dataset.index.map(time_feats[time_feats.period==3].set_index('id')['event_id'])
    dataset['15m_cursor_max'] = dataset.index.map(time_feats[time_feats.period==3].set_index('id')['cursor_position'])
    dataset['15m_word_max']   = dataset.index.map(time_feats[time_feats.period==3].set_index('id')['word_count'])
    dataset['15m_add_words']  = dataset.index.map(time_feats[time_feats.period==3].set_index('id')['add_words'])
    dataset['15m_events'] = dataset.index.map(time_feats[time_feats.period==3].set_index('id')['events'])
    dataset['15m_inputs'] = dataset.index.map(time_feats[time_feats.period==3].set_index('id')['input'])
    dataset['15m_per_ev'] = (dataset['15m_events'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    dataset['15m_per_word'] = (dataset['15m_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['15m_per_input'] = (dataset['15m_inputs'] / dataset['15m_events']).replace([np.inf, -np.inf], 0)
    
    dataset['20m_event_id'] = dataset.index.map(time_feats[time_feats.period==4].set_index('id')['event_id'])
    dataset['20m_cursor_max'] = dataset.index.map(time_feats[time_feats.period==4].set_index('id')['cursor_position'])
    dataset['20m_word_max'] = dataset.index.map(time_feats[time_feats.period==4].set_index('id')['word_count'])
    dataset['20m_add_words'] = dataset.index.map(time_feats[time_feats.period==4].set_index('id')['add_words'])
    dataset['20m_events'] = dataset.index.map(time_feats[time_feats.period==4].set_index('id')['events'])
    dataset['20m_inputs'] = dataset.index.map(time_feats[time_feats.period==4].set_index('id')['input'])
    dataset['20m_per_ev'] = (dataset['20m_events'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    dataset['20m_per_word'] = (dataset['20m_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['20m_per_input'] = (dataset['20m_inputs'] / dataset['20m_events']).replace([np.inf, -np.inf], 0)
    
    dataset['25m_event_id'] = dataset.index.map(time_feats[time_feats.period==5].set_index('id')['event_id'])
    dataset['25m_cursor_max'] = dataset.index.map(time_feats[time_feats.period==5].set_index('id')['cursor_position'])
    dataset['25m_word_max'] = dataset.index.map(time_feats[time_feats.period==5].set_index('id')['word_count'])
    dataset['25m_add_words'] = dataset.index.map(time_feats[time_feats.period==5].set_index('id')['add_words'])
    dataset['25m_events'] = dataset.index.map(time_feats[time_feats.period==5].set_index('id')['events'])
    dataset['25m_inputs'] = dataset.index.map(time_feats[time_feats.period==5].set_index('id')['input'])
    dataset['25m_per_ev'] = (dataset['25m_events'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    dataset['25m_per_word'] = (dataset['25m_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['25m_per_input'] = (dataset['25m_inputs'] / dataset['25m_events']).replace([np.inf, -np.inf], 0)
    
    dataset['39m_event_id'] = dataset.index.map(time_feats[time_feats.period>=6].set_index('id')['event_id'])
    dataset['39m_cursor_max'] = dataset.index.map(time_feats[time_feats.period>=6].set_index('id')['cursor_position'])
    dataset['39m_word_max'] = dataset.index.map(time_feats[time_feats.period>=6].set_index('id')['word_count'])
    dataset['39m_add_words'] = dataset.index.map(time_feats[time_feats.period>=6].set_index('id')['add_words'])
    dataset['39m_events'] = dataset.index.map(time_feats[time_feats.period==6].set_index('id')['events'])
    dataset['39m_inputs'] = dataset.index.map(time_feats[time_feats.period==6].set_index('id')['input'])
    dataset['39m_per_ev'] = (dataset['39m_events'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    dataset['39m_per_word'] = (dataset['39m_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['39m_per_input'] = (dataset['39m_inputs'] / dataset['39m_events']).replace([np.inf, -np.inf], 0)
    
    dataset['00m_work'] = dataset['00m_event_id'].map(lambda x: 1 if x>0 else 0)
    dataset['05m_work'] = dataset['05m_event_id'].map(lambda x: 1 if x>0 else 0)
    dataset['10m_work'] = dataset['10m_event_id'].map(lambda x: 1 if x>0 else 0)
    dataset['15m_work'] = dataset['15m_event_id'].map(lambda x: 1 if x>0 else 0)
    dataset['20m_work'] = dataset['20m_event_id'].map(lambda x: 1 if x>0 else 0)
    dataset['25m_work'] = dataset['25m_event_id'].map(lambda x: 1 if x>0 else 0)
    dataset['39m_work'] = dataset['39m_event_id'].map(lambda x: 1 if x>0 else 0)
    
    #new feats - статусы каждого эссе после каждой 1000 событий
    
    logs['step'] = logs.event_id // 1000
    logs['step'] = logs.step.map(lambda x: 8 if x>8 else x)
    
    event_feats = logs.groupby(by=['id', 'step']).agg(
                        {'down_time': 'min',
                         'up_time': 'max', 
                         'cursor_position': 'max',
                         'word_count': 'max',
                         'input': 'sum'
                        }).reset_index()
    
    event_feats['timing'] = event_feats.up_time - event_feats.down_time
    
    event_feats['add_words'] = event_feats.word_count.diff()
    event_feats.loc[event_feats.step==0, 'add_words'] = event_feats.word_count
    
    dataset['01t_timing'] = dataset.index.map(event_feats[event_feats.step==0].set_index('id')['timing'])
    dataset['01t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==0].set_index('id')['cursor_position'])
    dataset['01t_word_max'] = dataset.index.map(event_feats[event_feats.step==0].set_index('id')['word_count'])
    dataset['01t_add_words'] = dataset.index.map(event_feats[event_feats.step==0].set_index('id')['add_words'])
    dataset['01t_inputs'] = dataset.index.map(event_feats[event_feats.step==0].set_index('id')['input'])
    dataset['01t_per_word'] = (dataset['01t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
        
    dataset['12t_timing'] = dataset.index.map(event_feats[event_feats.step==1].set_index('id')['timing'])
    dataset['12t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==1].set_index('id')['cursor_position'])
    dataset['12t_word_max'] = dataset.index.map(event_feats[event_feats.step==1].set_index('id')['word_count'])
    dataset['12t_add_words'] = dataset.index.map(event_feats[event_feats.step==1].set_index('id')['add_words'])
    dataset['12t_inputs'] = dataset.index.map(event_feats[event_feats.step==1].set_index('id')['input'])
    dataset['12t_per_word'] = (dataset['12t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    
    dataset['23t_timing'] = dataset.index.map(event_feats[event_feats.step==2].set_index('id')['timing'])
    dataset['23t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==2].set_index('id')['cursor_position'])
    dataset['23t_word_max'] = dataset.index.map(event_feats[event_feats.step==2].set_index('id')['word_count'])
    dataset['23t_add_words'] = dataset.index.map(event_feats[event_feats.step==2].set_index('id')['add_words'])
    dataset['23t_inputs'] = dataset.index.map(event_feats[event_feats.step==2].set_index('id')['input'])
    dataset['23t_per_word'] = (dataset['23t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    
    dataset['34t_timing'] = dataset.index.map(event_feats[event_feats.step==3].set_index('id')['timing'])
    dataset['34t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==3].set_index('id')['cursor_position'])
    dataset['34t_word_max'] = dataset.index.map(event_feats[event_feats.step==3].set_index('id')['word_count'])
    dataset['34t_add_words'] = dataset.index.map(event_feats[event_feats.step==3].set_index('id')['add_words'])
    dataset['34t_inputs'] = dataset.index.map(event_feats[event_feats.step==3].set_index('id')['input'])
    dataset['34t_per_word'] = (dataset['34t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    
    dataset['45t_timing'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['timing'])
    dataset['45t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['cursor_position'])
    dataset['45t_word_max'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['word_count'])
    dataset['45t_add_words'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['add_words'])
    dataset['45t_inputs'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['input'])
    dataset['45t_per_word'] = (dataset['45t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    
    dataset['56t_timing'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['timing'])
    dataset['56t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['cursor_position'])
    dataset['56t_word_max'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['word_count'])
    dataset['56t_add_words'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['add_words'])
    dataset['56t_inputs'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['input'])
    dataset['56t_per_word'] = (dataset['56t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    
    dataset['67t_timing'] = dataset.index.map(event_feats[event_feats.step==6].set_index('id')['timing'])
    dataset['67t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==6].set_index('id')['cursor_position'])
    dataset['67t_word_max'] = dataset.index.map(event_feats[event_feats.step==6].set_index('id')['word_count'])
    dataset['67t_add_words'] = dataset.index.map(event_feats[event_feats.step==6].set_index('id')['add_words'])
    dataset['67t_inputs'] = dataset.index.map(event_feats[event_feats.step==6].set_index('id')['input'])
    dataset['67t_per_word'] = (dataset['67t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    
    dataset['78t_timing'] = dataset.index.map(event_feats[event_feats.step==7].set_index('id')['timing'])
    dataset['78t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==7].set_index('id')['cursor_position'])
    dataset['78t_word_max'] = dataset.index.map(event_feats[event_feats.step==7].set_index('id')['word_count'])
    dataset['78t_add_words'] = dataset.index.map(event_feats[event_feats.step==7].set_index('id')['add_words'])
    dataset['78t_inputs'] = dataset.index.map(event_feats[event_feats.step==7].set_index('id')['input'])
    dataset['78t_per_word'] = (dataset['78t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    
    dataset['8t_timing'] = dataset.index.map(event_feats[event_feats.step>=8].set_index('id')['timing'])
    dataset['8t_cursor_max'] = dataset.index.map(event_feats[event_feats.step>=8].set_index('id')['cursor_position'])
    dataset['8t_word_max'] = dataset.index.map(event_feats[event_feats.step>=8].set_index('id')['word_count'])
    dataset['8t_add_words'] = dataset.index.map(event_feats[event_feats.step>=8].set_index('id')['add_words'])
    dataset['8t_inputs'] = dataset.index.map(event_feats[event_feats.step>=8].set_index('id')['input'])
    dataset['8t_per_word'] = (dataset['8t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    
    dataset['12t_work'] = dataset['12t_timing'].map(lambda x: 1 if x>0 else 0)
    dataset['23t_work'] = dataset['23t_timing'].map(lambda x: 1 if x>0 else 0)
    dataset['34t_work'] = dataset['34t_timing'].map(lambda x: 1 if x>0 else 0)
    dataset['45t_work'] = dataset['45t_timing'].map(lambda x: 1 if x>0 else 0)
    dataset['56t_work'] = dataset['56t_timing'].map(lambda x: 1 if x>0 else 0)
    dataset['67t_work'] = dataset['67t_timing'].map(lambda x: 1 if x>0 else 0)
    dataset['78t_work'] = dataset['78t_timing'].map(lambda x: 1 if x>0 else 0)
    dataset['8t_work']  = dataset['8t_timing'].map(lambda x: 1 if x>0 else 0)

    return dataset.drop(columns=['paragraph', 'text', 'sentenses_list']).fillna(0)

In [6]:
def feats_Essays(df):
    #print('feats_Essays')

    dataset = df.copy()

    dataset['sentenses']  = (dataset['symb_dot'] + dataset['symb_question'] + dataset['symb_excl'])
    #dataset['sent_words'] = (dataset['w3plus'] / dataset['sentenses']).replace([np.inf, -np.inf], 0)
    #dataset['words_symb'] = (dataset['w3plus_s'] / dataset['w3plus']).replace([np.inf, -np.inf], 0)

    dataset['cl_t_word3']      = (dataset['clean_time'] / dataset['w3plus']).replace([np.inf, -np.inf], 0)
    dataset['cl_t_word3ss']    = (dataset['clean_time'] / dataset['w3plus_s']).replace([np.inf, -np.inf], 0)
    dataset['cl_t_fin_len']    = (dataset['clean_time'] / dataset['fin_len']).replace([np.inf, -np.inf], 0)

    dataset['cl_t_sents']      = (dataset.clean_time / dataset.sentenses).replace([np.inf, -np.inf], 0)
    #dataset['len_sents']       = (dataset.fin_len / dataset.sentenses).replace([np.inf, -np.inf], 0)
    #dataset['len_article']     = (dataset.sentenses / dataset.symb_enter).replace([np.inf, -np.inf], 0)
    dataset['per_inp_event']   = (dataset.cat__activity2_Input_sum / 
                                  dataset.event_id_max).replace([np.inf, -np.inf], 0)
    dataset['per_nonpr_event'] = (dataset.cat__activity2_Nonproduction_sum / 
                                  dataset.event_id_max).replace([np.inf, -np.inf], 0)
    dataset['cl_t_inp_word']   = (dataset['clean_time'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
    dataset['cl_t_event']      = (dataset['clean_time'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
    dataset['all_t_fin_word']  = (dataset['alltime'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['all_t_inp_word']  = (dataset['alltime'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
    dataset['all_t_event']     = (dataset['alltime'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)

    dataset['ev_fin_word']     = (dataset['event_id_max'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
    dataset['ev_inp_word']     = (dataset['event_id_max'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)

    dataset['inp_fin_word']    = (dataset['addword_sum'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)

    #dataset['sent_len']        = (dataset['fin_len'] / dataset['sentenses']).replace([np.inf, -np.inf], 0)
    #dataset['articlelen']      = (dataset['fin_len'] / dataset['symb_enter']).replace([np.inf, -np.inf], 0)
    #dataset['art_sent']        = (dataset['sentenses'] / dataset['symb_enter']).replace([np.inf, -np.inf], 0)
    
    dataset['q3_q4'] = (dataset['q3'] + dataset['q4']).astype('int32')
    dataset['q3_q4s'] = (dataset['q3']*3 + dataset['q4']*4).astype('int32')
    
    for i in range(5, 15):
        prev_col  = 'q3_q' + str(i-1)
        prev_cols = prev_col + 's'
        col       = 'q' + str(i)
        icol      = 'q3_q' + str(i)
        icols     = icol + 's'
        dataset[icol]  = (dataset[prev_col] + dataset[col]).astype('int32')
        dataset[icols] = (dataset[prev_cols] + dataset[col]*i).astype('int32')        
        
    
    dataset['q15_']  = (dataset['q15'] + dataset['q16'] + dataset['q17'] + dataset['q18'] + 
                        dataset['q19'] + dataset['q20'] + dataset['q21']).astype('int32')
    
    dataset['q15_s'] = (dataset['q15']*15 + dataset['q16']*16 + dataset['q17']*17 + dataset['q18']*18 + 
                        dataset['q19']*19 + dataset['q20']*20 + dataset['q21']*21).astype('int32')
    
    for i in range(14, 0, -1):
        prev_col  = 'q' + str(i+1) +'_'
        prev_cols = 'q' + str(i+1) +'_s'        
        col       = 'q' + str(i)
        icol      = 'q' + str(i) + '_'
        icols     = 'q' + str(i) + '_s'
        
        dataset[icol]  = (dataset[prev_col] + dataset[col]).astype('int32')
        dataset[icols] = (dataset[prev_cols] + dataset[col]*i).astype('int32')
    
    dataset.drop(columns=['q15', 'q16', 'q17', 'q18', 'q19', 'q20', 'q21'], inplace=True)

    dataset['q2_q4_20']    = ((dataset['q1'] + dataset['q2'] )
                               / dataset['w3plus']).replace([np.inf, -np.inf], 0)

    return dataset.fillna(0)

In [7]:
def outliers(df):
    d_set = df.copy()
    #print(d_set.info())
    total_n = d_set.shape[0]
    tr_min = 0.02
    tr_max = 1 - tr_min

    k = 1.5
    for col in d_set.columns:
        q1 = d_set[col].quantile(0.25)
        q3 = d_set[col].quantile(0.75)
        iqr = q3 - q1

        norm_range = (min(round(q1 - k * iqr, 2),
                          d_set[col].sort_values()[int(total_n*tr_min)]),
                      max(round(q3 + k * iqr, 2),
                          d_set[col].sort_values()[int(total_n*tr_max)]))

        d_set.loc[d_set[col] < norm_range[0], col] = norm_range[0]
        d_set.loc[d_set[col] > norm_range[1], col] = norm_range[1]
    
    drop_columns = [
         'start'
        ,'end'
        ,'cat__txt_ch_Input'
    ]
    
    return d_set.drop(axis=1, columns=drop_columns)

In [8]:
%%time
print('Dataset Pipeline')

data_prep = Pipeline(steps=[
    ('catcolumns', FunctionTransformer(catcolumns))
])

encoder = ColumnTransformer(transformers=[
    ('cat',
     OneHotEncoder(sparse_output=False, drop=None, dtype='int8', handle_unknown='ignore'),
     ['activity2', 'txt_ch', 'key'])], #
    remainder='passthrough')
encoder.set_output(transform="pandas")
    
step_dataset = Pipeline(steps=[
    ('inputwave', FunctionTransformer(inputwave)),
    ('addcolumns', FunctionTransformer(addcolumns)),
    ('dataset', FunctionTransformer(f_dataset)),
    ('feats_Essays', FunctionTransformer(feats_Essays))
   
#    ('outliers', FunctionTransformer(outliers)),    
#    ('polynom', PolynomialFeatures(degree=2))
])


preprocessor = Pipeline(steps=[
                    ('data_preparation',  data_prep)
                   ,('encoder', encoder)
                   ,('step_df_groupbyid', step_dataset)
#                   ,('step_scaler', step_scaler),
#                   ,('data_fin', data_fin)
                    ], 
                    verbose=True)

Dataset Pipeline
CPU times: user 109 µs, sys: 25 µs, total: 134 µs
Wall time: 127 µs


# Read files

In [9]:
logs = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv'
#                   ,nrows = 100500
                  )
scores = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv'
#                     ,nrows = logs.id.nunique()
                    )

logs.loc[(logs.id=='0081af50') & (logs.event_id==1987), 'text_change'] = ' qqqq'

test_logs = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')


print(logs.shape, logs.id.nunique(), scores.shape, test_logs.shape)

(8405898, 11) 2471 (2471, 2) (6, 11)


In [10]:
X_trainkf = preprocessor.fit_transform(logs)

catcolumns
[Pipeline] .. (step 1 of 3) Processing data_preparation, total= 1.8min


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


[Pipeline] ........... (step 2 of 3) Processing encoder, total=  10.3s
inputs. новые колонки. количество событий инпут подряд
4874 окончание inputwave
addcolumns
формирование dataseta


  dataset['alltime'] = dataset.up_time_max - dataset.down_time_min
  dataset['all_t_fin_word'] = (dataset['alltime'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['all_t_inp_word'] = (dataset['alltime'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
  dataset['all_t_event']    = (dataset['alltime'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
  dataset['ev_inp_word'] = (dataset['event_id_max'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
  dataset['inp_fin_word'] = (dataset['addword_sum'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['clean_time'] = (dataset['alltime'] - dataset['pauses_sum']).replace([np.inf, -np.inf], 0)
  dataset['cl_t_fin_word'] = (dataset['clean_time'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['cl_t_inp_word'] = (dataset['clean_time'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
  dataset['cl_t_event']    = (dataset['clean_time'] / dataset['event_id_

Эссе


  dataset['34t_per_word'] = (dataset['34t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['45t_timing'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['timing'])
  dataset['45t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['cursor_position'])
  dataset['45t_word_max'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['word_count'])
  dataset['45t_add_words'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['add_words'])
  dataset['45t_inputs'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['input'])
  dataset['45t_per_word'] = (dataset['45t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['56t_timing'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['timing'])
  dataset['56t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['cursor_position'])
  dataset

feats_Essays
[Pipeline]  (step 3 of 3) Processing step_df_groupbyid, total=112.1min


In [11]:
X_test = preprocessor.transform(test_logs)

catcolumns
inputs. новые колонки. количество событий инпут подряд
1 окончание inputwave
addcolumns
формирование dataseta
Эссе


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  dataset['alltime'] = dataset.up_time_max - dataset.down_time_min
  dataset['all_t_fin_word'] = (dataset['alltime'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['all_t_inp_word'] = (dataset['alltime'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
  dataset['all_t_event']    = (dataset['alltime'] / dataset['event_id_max']).replace([np.inf, -np.inf], 0)
  dataset['ev_inp_word'] = (dataset['event_id_max'] / dataset['addword_sum']).replace([np.inf, -np.inf], 0)
  dataset['inp_fin_word'] = (dataset['addword_sum'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['clean_time'] = (dataset['alltime'] - dataset['pauses_sum']).replace([np.inf, -np.inf], 0)
  dataset

feats_Essays


  dataset['34t_per_word'] = (dataset['34t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['45t_timing'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['timing'])
  dataset['45t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['cursor_position'])
  dataset['45t_word_max'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['word_count'])
  dataset['45t_add_words'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['add_words'])
  dataset['45t_inputs'] = dataset.index.map(event_feats[event_feats.step==4].set_index('id')['input'])
  dataset['45t_per_word'] = (dataset['45t_add_words'] / dataset['word_count_max']).replace([np.inf, -np.inf], 0)
  dataset['56t_timing'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['timing'])
  dataset['56t_cursor_max'] = dataset.index.map(event_feats[event_feats.step==5].set_index('id')['cursor_position'])
  dataset

# LGBMRegressor

In [12]:
y_test = pd.DataFrame({'id': sorted(test_logs.id.unique()),
                       'score': 0})

In [13]:
kf = KFold(n_splits=5, shuffle=True, random_state=13)

for i, (train_index, test_index) in enumerate(kf.split(X_trainkf)):
    X_train = X_trainkf.iloc[train_index]
    y_train = scores['score'].iloc[train_index]
    X_val = X_trainkf.iloc[test_index]
    y_val = scores['score'].iloc[test_index]

    model = lgb.LGBMRegressor(num_leaves=16,
                  max_depth=4,
                  learning_rate=0.005,
                  n_estimators=10000,
                  subsample=0.75,
                  colsample_bytree=0.8,
                  reg_alpha=0.0019,
                  reg_lambda=0.81,
                  verbosity=0,  
                  random_state=13)

    stop_callback = lgb.early_stopping(stopping_rounds=100)

    lgb_model = model.fit(X_train,
                      y_train,
                      eval_names=['train', 'val'],
                      eval_set=[(X_train, y_train), (X_val, y_val)],
                      eval_metric='rmse',
                      callbacks=[stop_callback])
    
    test_predict = np.array([(0.5 if x<0.5 else (6 if x>6 else x)) for x in 
                              lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)])
    y_test['score'] = y_test['score'] + test_predict / kf.n_splits

You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1331]	train's rmse: 0.425601	train's l2: 0.181136	val's rmse: 0.600499	val's l2: 0.360599
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[961]	train's rmse: 0.449257	train's l2: 0.201832	val's rmse: 0.60543	val's l2: 0.366546
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[972]	train's rmse: 0.443766	train's l2: 0.196928	val's rmse: 0.635081	val's l2: 0.403328
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1077]	train's rmse: 0.442916	train's l2: 0.196174	val's rmse: 0.597995	val's l2: 0.357598
You can set `force_

In [14]:
y_test.to_csv('/kaggle/working/submission.csv', index=False)