# Introduction

This notebook I created with extra classes, functions that I developed during the competition. About most of these features I read in papers, but not all of them proved effective when added to the best competition solutions/notebooks. But some of them did improve the ensemble of several top solutions (I added some functions to different models). So I just decided to test all of these my preprocessors in one notebook without common best notebook features.

The notebook is created as a utility script, and can be used in main inference notebook by importing its code and executing needed functions. Also it contains dumped model and oof predictions.

# Libraries

In [1]:
print('__name__ is: ', __name__)

__name__ is:  __main__


In [2]:
class CONFIG:
#     solution_name = 'LGBMRegressor'
    solution_name = 'XGBRegressor'
    n_splits = 10
    random_state = 422
    data_path = '/kaggle/input/linking-writing-processes-to-writing-quality/'

In [3]:
import pandas as pd
import numpy as np
import re
from scipy.stats import skew, kurtosis
import warnings

from tqdm import tqdm
from pandas import Index

from scipy.stats import skew, kurtosis, iqr, entropy
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import jensenshannon
from scipy.stats import skew, kurtosis, iqr, entropy, gmean, tmean
from scipy.signal import argrelmin, argrelmax
import joblib

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

if __name__ == '__main__':
    print('__name__ is: ', __name__)
    from lightgbm import LGBMRegressor
    from xgboost import XGBRegressor
    from sklearn.model_selection import StratifiedKFold
    from copy import deepcopy
    from sklearn.ensemble import VotingRegressor   

__name__ is:  __main__


# Feature Engineering

In [4]:
def reconstruct_essay(currTextInput):
    essayText = ""
    for Input in currTextInput.values:
        if Input[0] == 'Replace':
            replaceTxt = Input[2].split(' => ')
            essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
            continue
        if Input[0] == 'Paste':
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
            continue
        if Input[0] == 'Remove/Cut':
            essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
            continue
        if "M" in Input[0]:
            croppedTxt = Input[0][10:]
            splitTxt = croppedTxt.split(' To ')
            valueArr = [item.split(', ') for item in splitTxt]
            moveData = (int(valueArr[0][0][1:]), int(valueArr[0][1][:-1]), int(valueArr[1][0][1:]), int(valueArr[1][1][:-1]))
            if moveData[0] != moveData[2]:
                if moveData[0] < moveData[2]:
                    essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                else:
                    essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
            continue
        essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
    return essayText

def get_essay_df(df):
    df = df[df.activity != 'Nonproduction']
    temp = df.groupby('id').apply(lambda x: reconstruct_essay(x[['activity', 'cursor_position', 'text_change']]))
    essay_df = pd.DataFrame({'id': df['id'].unique().tolist()})
    essay_df = essay_df.merge(temp.rename('essay'), on='id')
    return essay_df

In [5]:
def iqr_nan(x):
    return iqr(x.to_numpy(), nan_policy='omit')

def entropy_nan(x, notna=True):
    if notna:
        return entropy(x[x.notna()].to_numpy().astype('float32'))
    else:
        return entropy(x.to_numpy().astype('float32'))
    
def relextrema_count(x):
    x = x.to_numpy()
    min_count = len(argrelmin(x)[0])
    max_count = len(argrelmax(x)[0])
    return (min_count + max_count) #/(len(x)+1e-7)

def jensenshannon_degree(x):
    a = x[x.notna()].to_numpy()
    if len(a) == 0:
        return np.NaN
    b = np.repeat(1/len(a), len(a))
    return jensenshannon(a, b)

In [6]:
class PreprocessorTimeWindows_Base:
    def __init__(self):
        pass
        
    def get_time_windows(self, df, time_win):
        bins = range(0, df['up_time'].max() + time_win, time_win)
        window_series = pd.cut(df['up_time'], bins, labels=False, right=False)
        return window_series

In [7]:
class PreprocessorTimeWindows_Activities(PreprocessorTimeWindows_Base):
    def __init__(self, agg_methods, time_windows=[30], filter_field=None, filter_values=[], field_suffix=''):
        self.agg_methods = agg_methods
        self.time_windows = time_windows
        self.filter_field = filter_field
        self.filter_values = filter_values
        self.field_suffix = field_suffix
    
    def get_time_window_agg(self, data, win_field='window', agg_method='mean'):
        win_max = data.iloc[-1][win_field]
        new_index = Index(np.arange(win_max+1), name=win_field)
        agg_windows_df = data.groupby(win_field).apply(
            lambda x: x.count()).rename(columns={win_field: 'events_count'}).reindex(new_index).fillna(0)
        agg_result = agg_windows_df['events_count'].agg(agg_method)
        return agg_result
    
    def get_windows_stats(self, df, win_field):
        feats_df = pd.DataFrame({'id': df['id'].unique().tolist()})
        stat_type = 'activity_count'
        pbar = tqdm(self.agg_methods)
        for method in pbar:
            if isinstance(method, str):
                method_name = method
            else:
                method_name = method.__name__
            pbar.set_postfix(method=method_name)
            agg_field_name = f'{win_field}_{stat_type}_{method_name}{self.field_suffix}'
            tmp_df = df.groupby('id')[[win_field]].apply(
                lambda x: self.get_time_window_agg(
                    x, win_field=win_field, agg_method=method
                )).to_frame().rename(columns={0: agg_field_name})
            feats_df = feats_df.merge(tmp_df, on='id', how='left')
        return feats_df                
    
    def make_feats(self, df, fillna=None):
        print("Starting to engineer features")
        filter_col = [self.filter_field] if self.filter_field != None else []
        data_df = df[['id', 'down_time', 'up_time'] + filter_col]
        merged_df = pd.DataFrame({'id': df['id'].unique().tolist()})
        for t_win in self.time_windows:
            win_field = f'window_{t_win}s'
            data_df[win_field] = self.get_time_windows(data_df, time_win=t_win*1000)
            if self.filter_field != None:
                data_df = data_df[data_df[self.filter_field].isin(self.filter_values)]
            if data_df.empty:
                data_df['id'] = df['id'].unique()
                data_df[win_field] = data_df[win_field].fillna(0)
            print('Calculating window stats')
            feats_df = self.get_windows_stats(data_df, win_field) #.reset_index()
            merged_df = merged_df.merge(feats_df, on='id', how='left')
            if fillna != None:
                merged_df = merged_df.fillna(fillna)
        
        return merged_df

In [8]:
class PreprocessorTimeWindows_Pauses(PreprocessorTimeWindows_Base):
    def __init__(self, agg_methods, time_windows=[30]):
        self.agg_methods = agg_methods
        self.time_windows = time_windows
        self.pauses_splits = [
#             ('pause_05_sec', [0.5, 1]),
            ('pause_1_sec', [1, 1.5]),
            ('pause_15_sec', [1.5, 2]),
            ('pause_2_sec', [2, 3]),
#             ('pause_3_sec', [3, np.inf]),
        ]
        
    def get_time_diff(self, df):
        up_time_lagged = df.groupby('id')['up_time'].shift(1).fillna(df['down_time'])
        time_diff = abs(df['down_time'] - up_time_lagged) / 1000
        return time_diff
        
    def get_time_window_agg(self, data, win_field='window', agg_method='mean', lim_lower=1, lim_upper=1.5):
        win_max = data.iloc[-1][win_field]
        new_index = Index(np.arange(win_max+1), name=win_field)
        group_window = data.groupby(win_field)['time_diff']
        agg_windows_df = group_window.apply(
            lambda x: ((x >= lim_lower) & (x < lim_upper)).sum()
        ).reindex(new_index).fillna(0) #.rename(columns={win_field: 'events_count'}).reindex(new_index).fillna(0)
        agg_result = agg_windows_df.agg(agg_method)
        return agg_result
    
    def get_windows_stats(self, df, win_field):
        feats_df = pd.DataFrame({'id': df['id'].unique().tolist()})
        pbar = tqdm(self.pauses_splits)
        for params in pbar:
            pause_name, lim_lower, lim_upper = params[0], params[1][0], params[1][1]
            for method in self.agg_methods:
                if isinstance(method, str):
                    method_name = method
                else:
                    method_name = method.__name__
                pbar.set_postfix(pause=pause_name, method=method_name)
                agg_field_name = f'{win_field}_{pause_name}_{method_name}'
                tmp_df = df.groupby('id')[['id', win_field, 'time_diff']].apply(
                    lambda x: self.get_time_window_agg(
                        x, win_field=win_field, agg_method=method, lim_lower=lim_lower, lim_upper=lim_upper
                    )).to_frame().reset_index().rename(columns={0: agg_field_name})
                feats_df = feats_df.merge(tmp_df, on='id', how='left')
        return feats_df                
    
    def make_feats(self, df, fillna=None):
        print("Starting to engineer features")
        data_df = df[['id', 'down_time', 'up_time']]
        merged_df = pd.DataFrame({'id': df['id'].unique().tolist()})
        data_df['time_diff'] = self.get_time_diff(data_df)
        
        for t_win in self.time_windows:
            win_field = f'window_{t_win}s'
            data_df[win_field] = self.get_time_windows(data_df, time_win=t_win*1000)

            print('Calculating window stats')
            feats_df = self.get_windows_stats(data_df, win_field)
            merged_df = merged_df.merge(feats_df, on='id', how='left')
        
        return merged_df

In [9]:
def update_activity(value):
    if value.startswith('Move'):
        return 'Move'
    if value.startswith('Remove/Cut'):
        return 'Remove'
    return value

In [10]:
class ShortcutsPreprocessor:
    def __init__(
        self, 
        train_df,
        column = 'down_event',
        vocab = [
            "control arrowleft", 
            "control arrowright", 
            "control c", 
            "control v", 
            "control x", 
            "control z",
            "control a",
            "leftclick", 
            "rightclick", 
            "middleclick", 
            "unknownclick"
        ],
        ngram_range = (1, 3),
        max_features = None
    ):
        self.train_df = train_df.copy()
        self.column = column
        self.vocab = vocab
        self.ngram_range = ngram_range
        self.token_pattern = r'(?u)\b\w+\b' # default=r”(?u)\b\w\w+\b”
        self.vectorizer = None
        self.initialized = False
        self.max_features = max_features
        
    def initialize(self):
        if self.initialized:
            return
        if self.column.__contains__('activity'):
            self.train_df['activity_changed'] = self.train_df['activity'].apply(update_activity)
            self.column = 'activity_changed'
            
        train_events_str_df = self.events_to_str(self.train_df)
        self.vectorizer = TfidfVectorizer(
                ngram_range=(1, 3), 
                token_pattern=self.token_pattern, 
                vocabulary=self.vocab
            )
        self.vectorizer.fit(train_events_str_df[self.column].to_numpy())
        self.train_df = None
        self.initialized = True
        
    def events_to_str(self, logs_df):
        events_str_df = logs_df.groupby(['id']).agg({self.column: list})[self.column].apply(
            lambda x: ' '.join(x)).to_frame().reset_index()
        return events_str_df
        
    def make_feats(self, logs_df):
        if not self.initialized:
            self.initialize()
            
        if self.column.__contains__('activity'):
            logs_df['activity_changed'] = logs_df['activity'].apply(update_activity)
            self.column = 'activity_changed'
            
        logs_events_str_df = self.events_to_str(logs_df)
        X_tokenized = self.vectorizer.transform(logs_events_str_df[self.column].to_numpy()).todense()
            
        feats_df = pd.DataFrame(
            X_tokenized, 
            index=logs_events_str_df['id'], 
            columns=[f'shortcuts_feature_{i}' for i in range(X_tokenized.shape[1])]
        )
#         feats_df.index.name = "id"
        return feats_df

In [11]:
def get_shortcuts_feats(logs_df, load_prepr=False, load_prepr_path="/kaggle/working"):
    if load_prepr:
        preprocessor_shortcuts = joblib.load(f"{load_prepr_path}/preprocessor_shortcuts")
    else:
        preprocessor_shortcuts = ShortcutsPreprocessor(logs_df)
        preprocessor_shortcuts.initialize()
        joblib.dump(preprocessor_shortcuts, f"preprocessor_shortcuts")

    shortcuts_feats = preprocessor_shortcuts.make_feats(logs_df).reset_index()
    return shortcuts_feats

In [12]:
def get_inner_word_pauses(x, min_len = 4):
    if x.shape[0] <= min_len:
        return np.NaN
    else:
        return x.iloc[1:].astype('float32').values

In [13]:
def get_iwp_stats(df):          
    df['up_time_lagged'] = df.groupby('id')['up_time'].shift(1).fillna(df['down_time'])
    df['time_diff'] = abs(df['down_time'] - df['up_time_lagged']) / 1000                          
    char_groups = df[df.down_event == 'q'].groupby(['id', 'word_count'])['time_diff']

    iw_pauses_sum = char_groups.apply(lambda x: np.sum(get_inner_word_pauses(x))).explode()
    iw_pauses_sum_mean = iw_pauses_sum.groupby('id').agg('mean')
    iw_pauses_sum_std = iw_pauses_sum.groupby('id').agg('std')
    iw_pauses_sum_entropy = iw_pauses_sum.groupby('id').agg(entropy_nan)
    iw_pauses_sum_first = iw_pauses_sum.groupby('id').agg('first')
    iw_pauses_sum_last = iw_pauses_sum.groupby('id').agg('last')
    
    result = pd.DataFrame({     
        'iw_pauses_sum_mean': iw_pauses_sum_mean,
        'iw_pauses_sum_std': iw_pauses_sum_std,
        'iw_pauses_sum_entropy': iw_pauses_sum_entropy,
        'iw_pauses_sum_first': iw_pauses_sum_first,
        'iw_pauses_sum_last': iw_pauses_sum_last,
    })
    return result

In [14]:
def get_before_word_pauses(x, min_len = 4):
    if x.shape[0] <= min_len:
        return np.NaN
    else:
        return x.iloc[0].astype('float32')

In [15]:
def get_bwp_stats(df):          
    df['up_time_lagged'] = df.groupby('id')['up_time'].shift(1).fillna(df['down_time'])
    df['time_diff'] = abs(df['down_time'] - df['up_time_lagged']) / 1000                          
    char_groups = df[df.down_event == 'q'].groupby(['id', 'word_count'])['time_diff']
    shift_chars = df[df.down_event == 'Shift'].groupby(['id'])['time_diff']

    bw_pauses_sum = char_groups.apply(lambda x: get_before_word_pauses(x)).explode()
    bw_pauses_sum_mean = bw_pauses_sum.groupby('id').agg('mean')
    bw_pauses_sum_std = bw_pauses_sum.groupby('id').agg('std')
    bw_pauses_sum_entropy = bw_pauses_sum.groupby('id').agg(entropy_nan)
    bw_pauses_sum_first = bw_pauses_sum.groupby('id').agg('first')
    bw_pauses_sum_last = bw_pauses_sum.groupby('id').agg('last')
    
    result = pd.DataFrame({
        'bw_pauses_sum_mean': bw_pauses_sum_mean,
        'bw_pauses_sum_std': bw_pauses_sum_std,
        'bw_pauses_sum_entropy': bw_pauses_sum_entropy,
        'bw_pauses_sum_first': bw_pauses_sum_first,
        'bw_pauses_sum_last': bw_pauses_sum_last,
    })
    result.fillna(0, inplace=True)
    return result

In [16]:
def get_tw_activities(logs_df, preprocessor):
    tw_feats = preprocessor.make_feats(logs_df, fillna=0)
    print("tw_feats.shape: ", tw_feats.shape)
    return tw_feats

In [17]:
class RepetedGroupsPreprocessor:
    def __init__(self, column, word_to_find, aggregates = ['mean', 'std'], prefix='repeated_group_'):
        self.column = column
        self.word_to_find = word_to_find
        self.aggregates = aggregates
        self.prefix = prefix
        
    def get_matches(self, input_string):
        pattern = rf'\b{self.word_to_find}\b(?:\s+\b{self.word_to_find}\b)*'
        matches = re.findall(pattern, input_string)
        counts = [len(match.split()) for match in matches]
        return counts
        
    def make_feats(self, logs_df):
        events_str_df = logs_df.groupby(['id']).agg({self.column: list})[self.column].apply(
            lambda x: ' '.join(x)).to_frame()
        feats_df = events_str_df[self.column].apply(self.get_matches).explode().groupby('id').agg(self.aggregates)
        feats_df = feats_df.add_prefix(self.prefix).astype('float32').reset_index()
        feats_df.fillna(0, inplace=True)
        return feats_df

In [18]:
def get_tfidf_essay_feats(essay_df, load_prepr=False, load_prepr_path="/kaggle/working"):
    tfidf_feats_all = pd.DataFrame({'id': essay_df.id.unique()})
    token_pattern = r"(?u)\b[a-zA-Z][a-zA-Z]+\b|!|\?|\"|\'|\.|\,|\:\|\;|\-|\\|\/"
    vocab = ['qqqqqqqqq qqq', 'qqq qqqqqqqqqq', 'qqqqqqq .', 'qqq qq',
       'qqqq qqq', 'qqqqqq qqqqq', 'qq', 'qqqqq qqqq', 'qqqqqqq qqq',
       'qq qqqqq', 'qq qqqqqqqqq', 'qqqqqq ,', 'qqqqqq', 'qqq qqqqqqqq',
       'qqq qqqqqq', 'qqqqqqqqqq', 'qqqq qqqqqqq', 'qqqqqqq qqqqqqq',
       'qq qqq', 'qqqq .', ', qq', '. qqq', 'qqqqqq qqq', 'qqq qqqq', '.',
       'qqqq', 'qqqqqqqqqqq qqqq', 'qqqq qqqqqqqqq', 'qq qqqqqqq',
       'qqqqqqq ,', 'qqqq qqqqqqqqqqq', 'qqqqqq qqqqqqqqq',
       'qqqqqqqq qqqqq', 'qqqqqqqqqq qqq', 'qqqqqqqq qqq',
       'qqqqqq qqqqqqq', 'qqqqqqq', 'qqqqq qq', 'qqqqqqqq .', '. qqqqq',
       'qqqqq qqqqqqqq', 'qq .', 'qqqq qqqqq', 'qqqqqqqq ,',
       'qqqqqqqqqqq qq', '. qqqqqqq', 'qqqqq qqqqq', 'qqqqqqq qqqqq',
       '. qqqq', 'qqqqqqqqqqq .', ',', '!', '?', ':', ';', '-', '/', '\\']
    if load_prepr:
        vectorizer = joblib.load(f"{load_prepr_path}/vectorizer_essay")
    else:
        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2), 
            token_pattern=token_pattern,
            vocabulary = vocab
        )
        vectorizer.fit(essay_df['essay'])
        joblib.dump(vectorizer, f"vectorizer_essay")
    X_tokenizer = vectorizer.transform(essay_df['essay']).todense()
    feats_vect_df = pd.DataFrame(
        X_tokenizer, 
        index=essay_df['id'], 
        columns=[f'tfidf_essay_feature_{i}' for i in range(X_tokenizer.shape[1])]
    )
    return feats_vect_df.reset_index()

In [19]:
class VectorizedEventsPreprocessor():
    def __init__(self, train_df, column, vocab, lowercase=False, token_pattern=r"[^\|]+", ngram_range=(1, 1)):
        self.train_df = train_df
        self.column = column
        self.vocab = vocab
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.ngram_range = ngram_range
        self.initialized = False
        self.vectorizer = None
        
    def initialize(self):
        if self.initialized:
            return
        train_events_str_df = self.events_to_str(self.train_df)
        self.vectorizer = TfidfVectorizer(
            ngram_range=self.ngram_range, 
            vocabulary=self.vocab, 
            lowercase=self.lowercase, 
            token_pattern=self.token_pattern
        )
        self.vectorizer.fit(train_events_str_df[self.column].to_numpy())
        self.train_df = None
        self.initialized = True
        
    def events_to_str(self, logs_df):
        events_str_df = logs_df.groupby(['id']).agg({self.column: list})[self.column].apply(
            lambda x: '|'.join(x)).to_frame().reset_index()
        return events_str_df
        
    def make_feats(self, logs_df):
        if not self.initialized:
            self.initialize()
        logs_events_str_df = self.events_to_str(logs_df)
        X_tokenizer = self.vectorizer.transform(logs_events_str_df[self.column].to_numpy()).todense()
        feats_vect_df = pd.DataFrame(
            X_tokenizer, 
            index=logs_events_str_df['id'], 
            columns=[f'tfidf_{self.column}_feature_{i}' for i in range(X_tokenizer.shape[1])]
        )
        return feats_vect_df

In [20]:
def get_tfidf_feats(logs_df, load_prepr=False, load_prepr_path="/kaggle/working"):
    activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
    events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
    text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
    fields_values_dict = {
        'activity': activities,
        'down_event': events,
        'up_event': events,
        'text_change': text_changes,
    }
    tfidf_feats_all = pd.DataFrame({'id': logs_df.id.unique()})
    
    for field, values in fields_values_dict.items():
        print(f" TfIdf, field: {field}")
        if load_prepr:
            preprocessor_tfidf = joblib.load(f"{load_prepr_path}/preprocessor_tfidf_{field}")
        else:
            preprocessor_tfidf = VectorizedEventsPreprocessor(logs_df, field, values)
            preprocessor_tfidf.initialize()
            joblib.dump(preprocessor_tfidf, f"preprocessor_tfidf_{field}")

        tfidf_feats = preprocessor_tfidf.make_feats(logs_df).reset_index()
        tfidf_feats_all = tfidf_feats_all.merge(tfidf_feats, on='id', how='left')
    return tfidf_feats_all

# Extract features

In [21]:
agg_methods = ['mean', 'std', entropy_nan]

In [22]:
preprocessor_tw_a_1 = PreprocessorTimeWindows_Activities(
    agg_methods, 
    time_windows=[60], 
    filter_field='activity', 
    filter_values=['Input'],
    field_suffix='_inp',
)

In [23]:
preprocessor_tw_a_2 = PreprocessorTimeWindows_Activities(
    agg_methods, 
    time_windows=[60], 
    filter_field='activity', 
    filter_values=['Remove/Cut'],
    field_suffix='_rem_all',
)

In [24]:
# preprocessor_tw_a_3 = PreprocessorTimeWindows_Activities(
#     agg_methods, 
#     time_windows=[60], 
#     filter_field='down_event', 
#     filter_values=['Backspace'], 
#     field_suffix='_bsp'
# )

In [25]:
# preprocessor_tw_a_4 = PreprocessorTimeWindows_Activities(
#     agg_methods, 
#     time_windows=[60], 
#     filter_field='down_event', 
#     filter_values=['q'], 
#     field_suffix='_q'
# )

In [26]:
preprocessor_tw_p = PreprocessorTimeWindows_Pauses(
    agg_methods,
    time_windows=[60]
)

In [27]:
preprocessor_rep_groups_1 = RepetedGroupsPreprocessor(
    column='activity', 
    word_to_find='Remove/Cut',
    aggregates = agg_methods,
    prefix='repeated_remove_'
)

In [28]:
preprocessor_rep_groups_2 = RepetedGroupsPreprocessor(
    column='down_event', 
    word_to_find='q', 
    aggregates = agg_methods,
    prefix='repeated_q_'
)

In [29]:
def get_feats_all(logs_file_name='train_logs.csv', load_prepr_path="/kaggle/working"):
    print('get_feats_all')
    logs_df = pd.read_csv(CONFIG.data_path + logs_file_name)
    essays_df = get_essay_df(logs_df)

    feats_df = logs_df.groupby('id')['event_id'].agg('max').to_frame().add_suffix("_max").reset_index()
        
    print("< Get Essay tfidf stats >")
    load_prepr = logs_file_name.__contains__("test")
    feats_df = feats_df.merge(
        get_tfidf_essay_feats(essays_df, load_prepr=load_prepr, load_prepr_path=load_prepr_path), 
        on='id', how='left'
    )
    
    print("< Get Logs tfidf stats >")
    feats_df = feats_df.merge(get_tfidf_feats(logs_df), on='id', how='left')
    
#     print("< Get shortcuts stats >")
#     feats_df = feats_df.merge(get_shortcuts_feats(logs_df), on='id', how='left')
    
    print("< get iwp stats >")
    feats_df = feats_df.merge(get_bwp_stats(logs_df), on='id', how='left')
    
    print("< tw_activities Preprocessor 1 >")
    tw_activities_feats = preprocessor_tw_a_1.make_feats(logs_df, fillna=0)
    feats_df = feats_df.merge(tw_activities_feats, on='id', how='left')

    print("< tw_activities Preprocessor 2 >")
    tw_activities_feats = preprocessor_tw_a_2.make_feats(logs_df, fillna=0)
    feats_df = feats_df.merge(tw_activities_feats, on='id', how='left')
    
    print("< tw_pauses Preprocessor >")
    tw_activities_feats = preprocessor_tw_p.make_feats(logs_df, fillna=0)
    feats_df = feats_df.merge(tw_activities_feats, on='id', how='left')
    
    print("< Get RepetedGroups stats >")
    feats_rep_gr_df = preprocessor_rep_groups_1.make_feats(logs_df)
    feats_df = feats_df.merge(feats_rep_gr_df, on='id', how='left')
    
    feats_rep_gr_df = preprocessor_rep_groups_2.make_feats(logs_df)
    feats_df = feats_df.merge(feats_rep_gr_df, on='id', how='left')
    
    return feats_df

In [30]:
if __name__ == '__main__':
    train_feats = get_feats_all('train_logs.csv')
    print()
    display(train_feats)

get_feats_all
< Get Essay tfidf stats >
< Get Logs tfidf stats >
 TfIdf, field: activity
 TfIdf, field: down_event
 TfIdf, field: up_event
 TfIdf, field: text_change
< get iwp stats >
< tw_activities Preprocessor 1 >
Starting to engineer features
Calculating window stats


100%|██████████| 3/3 [01:38<00:00, 32.84s/it, method=entropy_nan]


< tw_activities Preprocessor 2 >
Starting to engineer features
Calculating window stats


100%|██████████| 3/3 [01:27<00:00, 29.32s/it, method=entropy_nan]


< tw_pauses Preprocessor >
Starting to engineer features
Calculating window stats


100%|██████████| 3/3 [04:11<00:00, 83.89s/it, method=entropy_nan, pause=pause_2_sec]


< Get RepetedGroups stats >



Unnamed: 0,id,event_id_max,tfidf_essay_feature_0,tfidf_essay_feature_1,tfidf_essay_feature_2,tfidf_essay_feature_3,tfidf_essay_feature_4,tfidf_essay_feature_5,tfidf_essay_feature_6,tfidf_essay_feature_7,tfidf_essay_feature_8,tfidf_essay_feature_9,tfidf_essay_feature_10,tfidf_essay_feature_11,tfidf_essay_feature_12,tfidf_essay_feature_13,tfidf_essay_feature_14,tfidf_essay_feature_15,tfidf_essay_feature_16,tfidf_essay_feature_17,tfidf_essay_feature_18,tfidf_essay_feature_19,tfidf_essay_feature_20,tfidf_essay_feature_21,tfidf_essay_feature_22,tfidf_essay_feature_23,tfidf_essay_feature_24,tfidf_essay_feature_25,tfidf_essay_feature_26,tfidf_essay_feature_27,tfidf_essay_feature_28,tfidf_essay_feature_29,tfidf_essay_feature_30,tfidf_essay_feature_31,tfidf_essay_feature_32,tfidf_essay_feature_33,tfidf_essay_feature_34,tfidf_essay_feature_35,tfidf_essay_feature_36,tfidf_essay_feature_37,tfidf_essay_feature_38,tfidf_essay_feature_39,tfidf_essay_feature_40,tfidf_essay_feature_41,tfidf_essay_feature_42,tfidf_essay_feature_43,tfidf_essay_feature_44,tfidf_essay_feature_45,tfidf_essay_feature_46,tfidf_essay_feature_47,tfidf_essay_feature_48,tfidf_essay_feature_49,tfidf_essay_feature_50,tfidf_essay_feature_51,tfidf_essay_feature_52,tfidf_essay_feature_53,tfidf_essay_feature_54,tfidf_essay_feature_55,tfidf_essay_feature_56,tfidf_essay_feature_57,tfidf_activity_feature_0,tfidf_activity_feature_1,tfidf_activity_feature_2,tfidf_activity_feature_3,tfidf_activity_feature_4,tfidf_down_event_feature_0,tfidf_down_event_feature_1,tfidf_down_event_feature_2,tfidf_down_event_feature_3,tfidf_down_event_feature_4,tfidf_down_event_feature_5,tfidf_down_event_feature_6,tfidf_down_event_feature_7,tfidf_down_event_feature_8,tfidf_down_event_feature_9,tfidf_down_event_feature_10,tfidf_down_event_feature_11,tfidf_down_event_feature_12,tfidf_down_event_feature_13,tfidf_down_event_feature_14,tfidf_down_event_feature_15,tfidf_up_event_feature_0,tfidf_up_event_feature_1,tfidf_up_event_feature_2,tfidf_up_event_feature_3,tfidf_up_event_feature_4,tfidf_up_event_feature_5,tfidf_up_event_feature_6,tfidf_up_event_feature_7,tfidf_up_event_feature_8,tfidf_up_event_feature_9,tfidf_up_event_feature_10,tfidf_up_event_feature_11,tfidf_up_event_feature_12,tfidf_up_event_feature_13,tfidf_up_event_feature_14,tfidf_up_event_feature_15,tfidf_text_change_feature_0,tfidf_text_change_feature_1,tfidf_text_change_feature_2,tfidf_text_change_feature_3,tfidf_text_change_feature_4,tfidf_text_change_feature_5,tfidf_text_change_feature_6,tfidf_text_change_feature_7,tfidf_text_change_feature_8,tfidf_text_change_feature_9,tfidf_text_change_feature_10,tfidf_text_change_feature_11,tfidf_text_change_feature_12,tfidf_text_change_feature_13,bw_pauses_sum_mean,bw_pauses_sum_std,bw_pauses_sum_entropy,bw_pauses_sum_first,bw_pauses_sum_last,window_60s_activity_count_mean_inp,window_60s_activity_count_std_inp,window_60s_activity_count_entropy_nan_inp,window_60s_activity_count_mean_rem_all,window_60s_activity_count_std_rem_all,window_60s_activity_count_entropy_nan_rem_all,window_60s_pause_1_sec_mean,window_60s_pause_1_sec_std,window_60s_pause_1_sec_entropy_nan,window_60s_pause_15_sec_mean,window_60s_pause_15_sec_std,window_60s_pause_15_sec_entropy_nan,window_60s_pause_2_sec_mean,window_60s_pause_2_sec_std,window_60s_pause_2_sec_entropy_nan,repeated_remove_mean,repeated_remove_std,repeated_remove_entropy_nan,repeated_q_mean,repeated_q_std,repeated_q_entropy_nan
0,001519c8,2557,0.030876,0.033073,0.030573,0.039390,0.090986,0.043700,0.687499,0.026207,0.041034,0.078621,0.058290,0.034905,0.272405,0.041502,0.065438,0.105168,0.013606,0.073942,0.155723,0.042078,0.016433,0.014204,0.052563,0.025975,0.181604,0.466981,0.000000,0.043986,0.013317,0.017137,0.000000,0.000000,0.017305,0.000000,0.000000,0.047729,0.246462,0.130401,0.000000,0.000000,0.018051,0.000000,0.078526,0.020245,0.052224,0.000000,0.085085,0.031169,0.041485,0.021090,0.137168,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.977447,0.202866,0.058355,0.006176,0.000000,0.945302,0.208445,0.244070,0.016590,0.001962,0.053739,0.002137,0.012266,0.007275,0.000000,0.000000,0.002380,0.000000,0.002179,0.0,0.0,0.945302,0.208445,0.244070,0.016590,0.001962,0.053739,0.002137,0.012266,0.007275,0.000000,0.000000,0.002380,0.000000,0.002179,0.0,0.0,0.975533,0.219244,0.014080,0.007310,0.002050,0.003126,0.000000,0.000000,0.000000,0.001069,0.000000,0.0,0.0,0.000000,1.003597,2.785475,3.688558,0.115,1.561,67.000000,49.235535,3.105981,13.900000,12.712606,2.982507,1.709677,1.574665,2.971915,0.967742,1.079626,2.730704,0.677419,1.012821,2.425480,4.343750,4.426780,4.144963,4.260526,2.389074,5.789355
1,0022f953,2454,0.000000,0.024555,0.000000,0.097483,0.164054,0.032445,0.587475,0.058372,0.050776,0.097286,0.000000,0.000000,0.221507,0.030813,0.077735,0.039041,0.010102,0.000000,0.115616,0.031241,0.048803,0.031637,0.048781,0.134994,0.115569,0.645259,0.000000,0.000000,0.009887,0.025447,0.000000,0.000000,0.012848,0.000000,0.000000,0.023624,0.115569,0.077452,0.000000,0.000000,0.000000,0.015557,0.087451,0.015031,0.000000,0.000000,0.021057,0.011571,0.061601,0.000000,0.183311,0.0,0.054753,0.0,0.0,0.075567,0.0,0.0,0.982854,0.131912,0.128816,0.000920,0.001547,0.948197,0.248822,0.165859,0.064958,0.049187,0.035651,0.057061,0.009549,0.013876,0.004486,0.003267,0.003892,0.000000,0.002375,0.0,0.0,0.948197,0.248822,0.165859,0.064958,0.049187,0.035651,0.057061,0.009549,0.013876,0.004486,0.003267,0.003892,0.000000,0.002375,0.0,0.0,0.968917,0.246509,0.010271,0.014220,0.004071,0.002838,0.006968,0.006455,0.003196,0.000000,0.000000,0.0,0.0,0.000000,0.324325,0.804238,3.463406,0.143,0.048,71.777778,70.338923,2.775079,9.629630,7.509728,2.928341,1.233333,1.222866,2.828328,0.433333,0.727932,2.204785,0.666667,0.994236,2.362728,3.170732,3.317760,4.006828,3.616505,1.937847,5.890656
2,0042269b,4136,0.032427,0.011578,0.000000,0.064352,0.072805,0.010199,0.581261,0.045873,0.057460,0.055047,0.040812,0.024439,0.263384,0.038744,0.064144,0.055226,0.028579,0.025886,0.072687,0.029462,0.023012,0.029835,0.082805,0.118212,0.172562,0.554015,0.014476,0.061594,0.037297,0.107990,0.041520,0.045587,0.109045,0.012456,0.041961,0.011139,0.336042,0.045650,0.012132,0.045547,0.037917,0.029342,0.045817,0.028350,0.036565,0.037576,0.069501,0.021823,0.019364,0.014767,0.172871,0.0,0.000000,0.0,0.0,0.017816,0.0,0.0,0.991070,0.123828,0.049342,0.003581,0.000000,0.970628,0.184500,0.147087,0.013717,0.003370,0.043134,0.000000,0.007022,0.007982,0.000000,0.000000,0.005791,0.000000,0.000000,0.0,0.0,0.970531,0.184799,0.147326,0.013740,0.003375,0.043204,0.000000,0.007033,0.007995,0.000000,0.000000,0.005801,0.000000,0.000000,0.0,0.0,0.982549,0.185529,0.006938,0.008144,0.007072,0.000000,0.001228,0.000569,0.000000,0.000000,0.003033,0.0,0.0,0.000000,0.557424,4.734651,2.031375,0.059,0.029,117.166667,101.026826,2.997400,14.633333,15.960043,2.819634,1.533333,1.455864,2.844912,0.833333,1.176885,2.477073,0.833333,1.315251,2.321541,4.525773,5.960098,3.998324,4.737357,2.586641,6.271934
3,0059420b,1556,0.000000,0.017204,0.000000,0.095622,0.094660,0.000000,0.593802,0.068163,0.042691,0.054531,0.015161,0.018157,0.296901,0.000000,0.040848,0.041031,0.042467,0.076928,0.094507,0.029185,0.000000,0.059110,0.054685,0.108095,0.175441,0.580306,0.021510,0.000000,0.083131,0.000000,0.020565,0.000000,0.000000,0.000000,0.000000,0.033104,0.310396,0.067833,0.018027,0.000000,0.000000,0.000000,0.081697,0.021063,0.000000,0.037224,0.029507,0.032428,0.043161,0.000000,0.042812,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.990540,0.114749,0.075202,0.001378,0.002317,0.961533,0.225099,0.141145,0.066286,0.000000,0.016681,0.000000,0.012047,0.002886,0.000000,0.000000,0.002832,0.004889,0.002305,0.0,0.0,0.961533,0.225099,0.141145,0.066286,0.000000,0.016681,0.000000,0.012047,0.002886,0.000000,0.000000,0.002832,0.004889,0.002305,0.0,0.0,0.971101,0.238115,0.011016,0.002640,0.003455,0.003161,0.000000,0.000000,0.000000,0.000000,0.010651,0.0,0.0,0.000000,2.031143,3.125509,3.909962,0.304,1.238,54.333333,35.657387,2.901195,6.291667,6.868195,2.672402,3.375000,3.187100,2.739871,1.416667,1.815792,2.379838,1.333333,1.372610,2.618376,2.287879,3.805927,3.630921,3.830258,2.327128,5.431554
4,0075873a,2531,0.000000,0.029498,0.013634,0.117108,0.150709,0.012992,0.566908,0.058436,0.012199,0.093497,0.000000,0.015566,0.150404,0.037016,0.070038,0.070350,0.000000,0.000000,0.162039,0.025020,0.073285,0.063342,0.000000,0.139003,0.161974,0.613186,0.036881,0.000000,0.035633,0.045855,0.017630,0.000000,0.030869,0.000000,0.013363,0.014190,0.173543,0.046522,0.030908,0.000000,0.000000,0.018689,0.093384,0.000000,0.046579,0.000000,0.025296,0.000000,0.061669,0.018811,0.281385,0.0,0.043850,0.0,0.0,0.000000,0.0,0.0,0.965697,0.257192,0.035803,0.000000,0.000000,0.928798,0.195283,0.312366,0.024736,0.000000,0.019898,0.000000,0.013868,0.015020,0.000000,0.000000,0.006143,0.000000,0.012745,0.0,0.0,0.928798,0.195283,0.312366,0.024736,0.000000,0.019898,0.000000,0.013868,0.015020,0.000000,0.000000,0.006143,0.000000,0.012745,0.0,0.0,0.979827,0.198061,0.015965,0.012951,0.006102,0.015508,0.000000,0.000000,0.001863,0.000000,0.002508,0.0,0.0,0.000000,0.361107,1.020375,3.597630,0.222,0.257,69.357143,56.025929,2.989368,18.464286,27.359719,2.559043,0.857143,1.297127,2.322894,0.392857,0.628890,2.145842,0.607143,0.737327,2.507026,8.913794,16.931528,3.178015,4.065963,2.433970,5.771085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,4739,0.000000,0.000000,0.040896,0.058546,0.092730,0.000000,0.566827,0.058427,0.048791,0.128540,0.012996,0.031127,0.219790,0.061685,0.046685,0.035170,0.060668,0.000000,0.046290,0.025017,0.000000,0.038000,0.035156,0.057910,0.150383,0.636234,0.000000,0.039226,0.083133,0.045848,0.000000,0.000000,0.015432,0.000000,0.026722,0.000000,0.300765,0.104660,0.015452,0.043510,0.016098,0.018686,0.058357,0.018054,0.015524,0.015953,0.012646,0.013898,0.024664,0.000000,0.159021,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.964745,0.258230,0.050819,0.000976,0.000000,0.924095,0.211528,0.312689,0.056076,0.000000,0.007801,0.000000,0.013978,0.010796,0.000000,0.000000,0.002318,0.000000,0.002829,0.0,0.0,0.924095,0.211528,0.312689,0.056076,0.000000,0.007801,0.000000,0.013978,0.010796,0.000000,0.000000,0.002318,0.000000,0.002829,0.0,0.0,0.975294,0.219948,0.015962,0.011798,0.003033,0.002691,0.003304,0.000510,0.000000,0.000000,0.000000,0.0,0.0,0.001590,0.305762,0.773772,4.175142,0.027,0.058,132.888889,111.745361,2.901412,32.000000,40.986120,2.708464,1.366667,1.629117,2.711311,0.600000,0.932183,2.293118,0.366667,0.764890,1.846220,11.428572,35.313496,2.778181,4.348624,2.553715,6.319776
2467,ffbef7e5,2604,0.016177,0.008664,0.016018,0.055035,0.129393,0.030528,0.502932,0.027462,0.050165,0.068655,0.007635,0.027432,0.231077,0.028993,0.048000,0.020663,0.064159,0.000000,0.054393,0.044093,0.025830,0.029768,0.020655,0.156507,0.197095,0.693231,0.000000,0.015364,0.027910,0.035916,0.000000,0.011371,0.018133,0.000000,0.015700,0.008336,0.203891,0.068322,0.009078,0.017042,0.037832,0.010979,0.096000,0.042429,0.000000,0.009373,0.052009,0.000000,0.101434,0.000000,0.172483,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.997784,0.025007,0.061658,0.000756,0.000000,0.970207,0.231421,0.031139,0.057749,0.000000,0.018645,0.000000,0.016056,0.012902,0.000000,0.000000,0.006332,0.000000,0.005152,0.0,0.0,0.970207,0.231421,0.031139,0.057749,0.000000,0.018645,0.000000,0.016056,0.012902,0.000000,0.000000,0.006332,0.000000,0.005152,0.0,0.0,0.972575,0.231493,0.016716,0.012623,0.006195,0.005668,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.441056,0.936876,4.343417,0.507,0.139,79.833333,53.860665,3.110362,2.000000,6.721043,1.357438,1.433333,1.454679,2.840452,0.800000,1.030567,2.578629,0.800000,1.063501,2.520867,5.000000,9.890307,1.539671,3.978769,2.179102,6.008812
2468,ffccd6fd,3063,0.019345,0.000000,0.000000,0.082265,0.114012,0.018253,0.682689,0.032839,0.034279,0.114938,0.018261,0.000000,0.390108,0.086676,0.180398,0.032946,0.034099,0.000000,0.146350,0.017576,0.020592,0.017798,0.148197,0.097645,0.065018,0.422617,0.025908,0.018372,0.050063,0.000000,0.000000,0.054392,0.021684,0.022292,0.018774,0.019936,0.178800,0.114382,0.000000,0.000000,0.000000,0.000000,0.049199,0.000000,0.021813,0.000000,0.053309,0.078115,0.017328,0.026428,0.017188,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.998548,0.030856,0.044162,0.000000,0.000000,0.460542,0.884487,0.041926,0.000000,0.023159,0.004279,0.046095,0.002377,0.000987,0.032387,0.000000,0.005813,0.007526,0.000000,0.0,0.0,0.460542,0.884487,0.041926,0.000000,0.023159,0.004279,0.046095,0.002377,0.000987,0.032387,0.000000,0.005813,0.007526,0.000000,0.0,0.0,0.481021,0.876662,0.002799,0.001453,0.008559,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1.895989,3.829346,3.604955,3.639,0.427,86.333333,133.378846,2.732648,3.520000,5.205126,2.553076,2.484848,2.526376,2.946798,1.151515,1.502523,2.654880,0.878788,1.082750,2.709767,2.095238,2.886550,3.295090,4.004132,2.503935,5.305804
2469,ffec5b38,3242,0.009107,0.009755,0.054106,0.085202,0.053674,0.017186,0.612178,0.023190,0.024207,0.069570,0.008597,0.020591,0.283132,0.016322,0.038603,0.093061,0.032106,0.032715,0.145451,0.016549,0.019389,0.050274,0.062015,0.145569,0.206610,0.505047,0.000000,0.017299,0.086417,0.060658,0.023322,0.000000,0.010208,0.020989,0.008839,0.000000,0.306089,0.092311,0.030664,0.057564,0.010649,0.000000,0.069486,0.023886,0.051346,0.021107,0.033462,0.009194,0.032631,0.024883,0.186111,0.0,0.000000,0.0,0.0,0.015011,0.0,0.0,0.995186,0.094916,0.024407,0.000000,0.000000,0.974925,0.188709,0.114245,0.022596,0.000000,0.005783,0.000000,0.012806,0.011577,0.000000,0.000000,0.002525,0.002179,0.002055,0.0,0.0,0.974925,0.188709,0.114245,0.022596,0.000000,0.005783,0.000000,0.012806,0.011577,0.000000,0.000000,0.002525,0.002179,0.002055,0.0,0.0,0.982446,0.185653,0.012882,0.011409,0.003089,0.001884,0.004626,0.000714,0.000000,0.001611,0.000000,0.0,0.0,0.000000,0.808656,1.652555,4.429623,0.056,0.270,115.800000,71.294577,2.956822,11.040000,10.721785,2.770271,1.346154,1.412581,2.639113,1.038462,1.310901,2.477556,0.576923,0.808608,2.303489,3.000000,4.169486,3.950353,4.404851,2.619534,6.119018


In [31]:
if __name__ == '__main__':
    train_feats.to_csv('train_feats_extra.csv', index=False)

In [32]:
if __name__ == '__main__':
    test_feats = get_feats_all('test_logs.csv')
    print()
    display(test_feats)

get_feats_all
< Get Essay tfidf stats >
< Get Logs tfidf stats >
 TfIdf, field: activity
 TfIdf, field: down_event
 TfIdf, field: up_event
 TfIdf, field: text_change
< get iwp stats >
< tw_activities Preprocessor 1 >
Starting to engineer features
Calculating window stats


100%|██████████| 3/3 [00:00<00:00, 80.43it/s, method=entropy_nan]


< tw_activities Preprocessor 2 >
Starting to engineer features
Calculating window stats


100%|██████████| 3/3 [00:00<00:00, 104.07it/s, method=entropy_nan]


< tw_pauses Preprocessor >
Starting to engineer features
Calculating window stats


100%|██████████| 3/3 [00:00<00:00, 25.77it/s, method=entropy_nan, pause=pause_2_sec]


< Get RepetedGroups stats >



Unnamed: 0,id,event_id_max,tfidf_essay_feature_0,tfidf_essay_feature_1,tfidf_essay_feature_2,tfidf_essay_feature_3,tfidf_essay_feature_4,tfidf_essay_feature_5,tfidf_essay_feature_6,tfidf_essay_feature_7,tfidf_essay_feature_8,tfidf_essay_feature_9,tfidf_essay_feature_10,tfidf_essay_feature_11,tfidf_essay_feature_12,tfidf_essay_feature_13,tfidf_essay_feature_14,tfidf_essay_feature_15,tfidf_essay_feature_16,tfidf_essay_feature_17,tfidf_essay_feature_18,tfidf_essay_feature_19,tfidf_essay_feature_20,tfidf_essay_feature_21,tfidf_essay_feature_22,tfidf_essay_feature_23,tfidf_essay_feature_24,tfidf_essay_feature_25,tfidf_essay_feature_26,tfidf_essay_feature_27,tfidf_essay_feature_28,tfidf_essay_feature_29,tfidf_essay_feature_30,tfidf_essay_feature_31,tfidf_essay_feature_32,tfidf_essay_feature_33,tfidf_essay_feature_34,tfidf_essay_feature_35,tfidf_essay_feature_36,tfidf_essay_feature_37,tfidf_essay_feature_38,tfidf_essay_feature_39,tfidf_essay_feature_40,tfidf_essay_feature_41,tfidf_essay_feature_42,tfidf_essay_feature_43,tfidf_essay_feature_44,tfidf_essay_feature_45,tfidf_essay_feature_46,tfidf_essay_feature_47,tfidf_essay_feature_48,tfidf_essay_feature_49,tfidf_essay_feature_50,tfidf_essay_feature_51,tfidf_essay_feature_52,tfidf_essay_feature_53,tfidf_essay_feature_54,tfidf_essay_feature_55,tfidf_essay_feature_56,tfidf_essay_feature_57,tfidf_activity_feature_0,tfidf_activity_feature_1,tfidf_activity_feature_2,tfidf_activity_feature_3,tfidf_activity_feature_4,tfidf_down_event_feature_0,tfidf_down_event_feature_1,tfidf_down_event_feature_2,tfidf_down_event_feature_3,tfidf_down_event_feature_4,tfidf_down_event_feature_5,tfidf_down_event_feature_6,tfidf_down_event_feature_7,tfidf_down_event_feature_8,tfidf_down_event_feature_9,tfidf_down_event_feature_10,tfidf_down_event_feature_11,tfidf_down_event_feature_12,tfidf_down_event_feature_13,tfidf_down_event_feature_14,tfidf_down_event_feature_15,tfidf_up_event_feature_0,tfidf_up_event_feature_1,tfidf_up_event_feature_2,tfidf_up_event_feature_3,tfidf_up_event_feature_4,tfidf_up_event_feature_5,tfidf_up_event_feature_6,tfidf_up_event_feature_7,tfidf_up_event_feature_8,tfidf_up_event_feature_9,tfidf_up_event_feature_10,tfidf_up_event_feature_11,tfidf_up_event_feature_12,tfidf_up_event_feature_13,tfidf_up_event_feature_14,tfidf_up_event_feature_15,tfidf_text_change_feature_0,tfidf_text_change_feature_1,tfidf_text_change_feature_2,tfidf_text_change_feature_3,tfidf_text_change_feature_4,tfidf_text_change_feature_5,tfidf_text_change_feature_6,tfidf_text_change_feature_7,tfidf_text_change_feature_8,tfidf_text_change_feature_9,tfidf_text_change_feature_10,tfidf_text_change_feature_11,tfidf_text_change_feature_12,tfidf_text_change_feature_13,bw_pauses_sum_mean,bw_pauses_sum_std,bw_pauses_sum_entropy,bw_pauses_sum_first,bw_pauses_sum_last,window_60s_activity_count_mean_inp,window_60s_activity_count_std_inp,window_60s_activity_count_entropy_nan_inp,window_60s_activity_count_mean_rem_all,window_60s_activity_count_std_rem_all,window_60s_activity_count_entropy_nan_rem_all,window_60s_pause_1_sec_mean,window_60s_pause_1_sec_std,window_60s_pause_1_sec_entropy_nan,window_60s_pause_15_sec_mean,window_60s_pause_15_sec_std,window_60s_pause_15_sec_entropy_nan,window_60s_pause_2_sec_mean,window_60s_pause_2_sec_std,window_60s_pause_2_sec_entropy_nan,repeated_remove_mean,repeated_remove_std,repeated_remove_entropy_nan,repeated_q_mean,repeated_q_std,repeated_q_entropy_nan
0,0000aaaa,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.153846,0.375534,0.693147,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
1,2222bbbb,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.447214,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,2.0,0.0,0.0
2,4444cccc,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.5,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0


In [33]:
if __name__ == '__main__':
    drop_features = ['id']
    train_scores = pd.read_csv(CONFIG.data_path + 'train_scores.csv')
    data = train_feats.merge(train_scores, on='id', how='left')
    x = data.drop(drop_features + ['score'], axis=1)
    y = data['score'].values
    print(f'Number of features: {len(x.columns)}')

Number of features: 136


# Training and Evaluation

In [34]:
def train_valid_split(data_x, data_y, train_idx, valid_idx):
    x_train = data_x.iloc[train_idx]
    y_train = data_y[train_idx]
    x_valid = data_x.iloc[valid_idx]
    y_valid = data_y[valid_idx]
    return x_train, y_train, x_valid, y_valid

In [35]:
def evaluate(data_x, data_y, model, random_state=42, n_splits=5):
    all_models = []
    skf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    oof_pred = np.zeros(len(data_x))
    for i, (train_index, valid_index) in enumerate(skf.split(data_x, data_y.astype(str))):
        train_x, train_y, valid_x, valid_y = train_valid_split(data_x, data_y, train_index, valid_index)
        model.fit(
            train_x, 
            train_y,
#             eval_metric='rmse',
#             eval_set=[(valid_x, valid_y)],
#             early_stopping_rounds=50,
#             verbose=1, 
        )
        all_models.append(deepcopy(model))
        oof_pred[valid_index] = model.predict(valid_x)
    return oof_pred, all_models

In [36]:
if __name__ == '__main__' and CONFIG.solution_name == 'XGBRegressor':
    param = {
        'n_estimators': 974,
        'max_depth': 4,
        'reg_alpha': 0.5941098854452742,
        'reg_lambda': 2.640144679549458,
        'subsample': 0.7346364905474035,
        'colsample_bynode': 0.45241336160800116,
        'eta': 0.06581840471615191,
        'eval_metric': 'rmse',
        'random_state': 41
    }
    solution = XGBRegressor(**param)

In [37]:
if __name__ == '__main__':
    test_ids = test_feats['id'].values
    testin_x = test_feats.drop(drop_features, axis=1)

    print('< Learning and Evaluation >')
    oof_pred, all_models = evaluate(
        x.copy(),
        y.copy(), 
        solution, 
        random_state=CONFIG.random_state,
        n_splits=CONFIG.n_splits
    )

< Learning and Evaluation >


In [38]:
if __name__ == '__main__':
    rmse = mean_squared_error(y, oof_pred, squared=False)
    print(f"OOF RMSE: {rmse}")

OOF RMSE: 0.6360308904562758


In [39]:
if __name__ == '__main__':
    voting_reg = VotingRegressor(estimators=all_models)
    voting_reg.estimators_ = all_models
    voting_reg_preds = voting_reg.predict(testin_x.copy())

In [40]:
if __name__ == '__main__':
    joblib.dump(voting_reg, "extra_features.model")
    joblib.dump(oof_pred, "extra_features_oof_preds")

# Submission

In [41]:
if __name__ == '__main__':
    submission_df = pd.DataFrame({
        'id': test_ids, 
        'score': voting_reg_preds
    })
    submission_df.to_csv('submission.csv', index=False)
    display(submission_df)

Unnamed: 0,id,score
0,0000aaaa,1.320873
1,2222bbbb,1.283517
2,4444cccc,1.366141
