По аналогии с созданием обучающего и валидационного датасетов создадим тестовый датасет, добавив информацию о будущей сессии пользователя, для которой нам известны только дата и время.

In [1]:
import numpy as np
import pandas as pd
import gc
import holidays

In [2]:
def reduce_mem_usage(df, *, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Column handling:

    * object / pandas string columns are converted to ``category``;
    * signed and unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min/max
      (bounds are inclusive);
    * float columns are cast to ``float16`` (if ``use_float16``) or
      ``float32`` when the min/max fit; note that this is lossy, values are
      rounded to the precision of the narrower type;
    * every other dtype (bool, datetime, timedelta, category and other
      extension dtypes) is left untouched.

    A column is never widened. Memory usage before and after, and the
    relative reduction, are printed.

    Args:
        df: DataFrame to optimise; it is modified in place.
        use_float16: when False, ``float16`` is skipped and ``float32`` is
            the narrowest float type considered.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}'
                     'MB').format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # cannot be safely mapped onto plain NumPy types here.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            # bool, datetime64, timedelta64, complex: nothing to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by size; stop once they are not narrower.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fail both comparisons,
            # which leaves the column unchanged.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage after optimization is: {:.2f}'
                              'MB').format(end_mem))
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(reduction))

    return df

In [3]:
# Load the two CSV inputs: the expanded training table and the file with the
# timestamps of the sessions to predict.
df = pd.read_csv("../input/alfabattle-1-parq/alfa1_train_expend.csv")
df_test = pd.read_csv('../input/alfabattle1/alfabattle2_prediction_session_timestamp.csv')

In [4]:
df = pd.concat((df, df_test)).reset_index(drop=True)

In [5]:
# Order the rows by client and timestamp; the groupby(...).tail()/shift()
# calls further down rely on this order to pick each client's latest rows.
# NOTE(review): 'timestamp' is only parsed with pd.to_datetime in the next
# cell, so this presumably sorts the raw CSV strings - confirm that both
# inputs use a format whose string order is chronological.
df = df.sort_values(by=['client_pin', 'timestamp'])

In [6]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [7]:
# Rows without a session_id (presumably the appended prediction rows) get a
# synthetic id: client_pin + '0'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment and leaves df unchanged under pandas
# Copy-on-Write.
df['session_id'] = df['session_id'].fillna(df['client_pin'] + '0')

In [8]:
df_temp = df.copy()

In [9]:
df_stat = pd.DataFrame()
df_stat['client_pin'] = df_temp['client_pin'].unique()

In [10]:
# session_id of the last row of every client (rows are ordered by client_pin,
# timestamp); used below to exclude those sessions from the history features.
last_ses = df_temp.groupby('client_pin')['session_id'].tail(1).values

In [11]:
# 'most_popular': per client, the most frequent multi_class_target among rows
# whose session is not one of the last sessions. Series.mode may return
# several values on a tie; [0] takes the first one.
df_stat = df_stat.merge(df_temp.loc[~df_temp['session_id'].isin(last_ses)].groupby(['client_pin'])['multi_class_target'].agg(
    lambda x: pd.Series.mode(x)[0]).rename('most_popular'), how='left', on='client_pin')

In [12]:
# Attach to every row 'next_time': the timestamp of the same client's last row.
df_temp = df_temp.merge(df_temp[['client_pin', 'timestamp']].groupby(['client_pin']).tail(1).rename({'timestamp':'next_time'}, axis=1), how='left', on='client_pin')

In [13]:
# Hours between each row and the client's last row ('next_time').
df_temp['timedelta_act'] = (df_temp['next_time'] - df_temp['timestamp']) / np.timedelta64(1, 'h')

In [14]:
# Per client, keep timedelta_act of the latest row that does not belong to a
# last session, i.e. the gap in hours between that row and the client's last row.
df_stat = df_stat.merge(df_temp[['client_pin', 'timedelta_act']].loc[~df_temp['session_id'].isin(last_ses)].groupby('client_pin').tail(1), how='left', on='client_pin')

In [15]:
# Per-action timing features. For every distinct multi_class_target value, add to df_stat:
#   mean_timedelta_<action> - mean gap (hours) between consecutive rows of that
#                             action for the client;
#   timedelta_<action>      - hours from the client's latest row of that action
#                             to the client's last row;
#   err_timedelta_<action>  - absolute difference of the two.
# Missing values are replaced with the sentinel 10000.
# The "not a last session" mask does not depend on the action, so it is built once.
not_last = ~df_temp['session_id'].isin(last_ses)
for action in df_temp['multi_class_target'].unique():
    mean_col = f'mean_timedelta_{action}'
    last_col = f'timedelta_{action}'
    err_col = f'err_timedelta_{action}'

    hist = df_temp.loc[not_last & (df_temp['multi_class_target'] == action)]

    # Explicit copy: new columns are added below, and writing into a filtered
    # frame is what triggered SettingWithCopyWarning.
    df_time = hist.copy()
    df_time['next_time_act'] = df_time.groupby('client_pin')['timestamp'].shift(-1)
    # NOTE(review): dropna() removes rows with a missing value in ANY column,
    # not only rows without a following action - kept as is to preserve the
    # feature values; confirm this is intended.
    df_time = df_time.dropna()
    df_time[mean_col] = (df_time['next_time_act'] - df_time['timestamp']) / np.timedelta64(1, 'h')
    df_time = df_time[['client_pin', mean_col]].groupby('client_pin').mean()
    df_stat = df_stat.merge(df_time, how='left', on='client_pin')

    last_delta = hist.groupby(['client_pin']).tail(1)[['client_pin', 'timedelta_act']].rename(
        {'timedelta_act': last_col}, axis=1)
    df_stat = df_stat.merge(last_delta, how='left', on='client_pin')

    # The error must be computed before the sentinel fill.
    df_stat[err_col] = np.abs(df_stat[mean_col] - df_stat[last_col])
    # Assign back instead of the chained fillna(inplace=True), which does not
    # update df_stat under pandas Copy-on-Write.
    filled_cols = [last_col, mean_col, err_col]
    df_stat[filled_cols] = df_stat[filled_cols].fillna(10000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats 

In [16]:
from_parq = ['application_id', 'event_type', 'event_category', 'event_name',
       'event_label', 'device_screen_name', 'timezone', 'device_is_webview',
       'page_urlhost', 'page_urlpath_full', 'net_connection_type',
       'net_connection_tech', 'timedelta', 'count']

In [17]:
# Copy the from_parq columns of each client's latest row that does not belong
# to a last session into df_stat.
df_stat = df_stat.merge(df_temp.loc[~df_temp['session_id'].isin(last_ses)].groupby('client_pin').tail(1)[from_parq+['client_pin']], 
                            how='left', on='client_pin')

In [18]:
# Build the lag table: one row per client, columns lag_1..lag_35 holding the
# multi_class_target of the client's most recent rows (lag_1 = latest row),
# plus a constant 'weight' column.
n_lags = 35

# Last n_lags rows of every client; .copy() so the column added below is not
# written into a slice of df (the source of SettingWithCopyWarning).
exp_df = df[['client_pin', 'multi_class_target']].groupby('client_pin').tail(n_lags).copy()
# Position counted from the client's latest row (1 = latest). This is what the
# descending rank of the running row number produced, without ranking the
# non-numeric target column as well.
exp_df['row_number'] = (exp_df.groupby('client_pin').cumcount(ascending=False) + 1).astype('int32')
exp_df = exp_df.set_index(['client_pin', 'row_number'])

# Full (client, lag) grid so that clients with fewer than n_lags rows still get
# every lag column (filled with NaN).
pins = df['client_pin'].unique()
lags = list(range(1, n_lags + 1))
index = pd.MultiIndex.from_product([pins, lags], names=['client_pin', 'row_number'])
df_lag = pd.DataFrame({'total': 0}, index=index)
df_lag = df_lag.merge(exp_df, how='left', on=['client_pin', 'row_number'])
df_lag = df_lag.drop(['total'], axis=1)

# Pivot the lag level into columns and flatten the names to lag_<n>.
df_lag = df_lag.unstack().add_prefix('lag_').rename_axis([None, None], axis=1)
df_lag.columns = df_lag.columns.droplevel(0)
df_lag['weight'] = 1
df_lag = df_lag.reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [19]:
df_lag = df_lag.merge(df_stat, how='left', on='client_pin')

In [20]:
df_train = pd.read_csv('../input/alfabattle-1-stat/alfa1_df_train6.csv')

In [21]:
df_valid = pd.read_csv('../input/alfabattle-1-stat/alfa1_df_valid6.csv')

In [22]:
df_train.fillna('nothing', inplace=True)

In [23]:
df_valid.fillna('nothing', inplace=True)

In [24]:
df_lag.fillna('nothing', inplace=True)

In [25]:
# Overwrite lag_1 with lag_2.
# NOTE(review): lag_1 comes from each client's latest row, which is presumably
# the appended prediction row with no known target - confirm that this matches
# how lag_1 was built for the train/valid sets.
df_lag['lag_1'] = df_lag['lag_2']

In [26]:
# Keep exactly the columns of df_valid, in the same order; raises KeyError if
# df_lag lacks any of them.
df_lag = df_lag[df_valid.columns]

In [27]:
df_lag.to_csv('df_test_nn.csv', index=False)

In [28]:
# Downcast df_lag in memory. The CSV was already written in the previous cell,
# so the saved file is not affected by this conversion.
df_lag = reduce_mem_usage(df_lag)

Memory usage of dataframe is 50.80MB
Memory usage after optimization is: 13.10MB
Decreased by 74.2%


In [29]:
# Drop the large intermediate frames and force a garbage-collection pass
# before the next input file is loaded.
del df
del df_test
del df_temp
gc.collect()

30

In [30]:
df = pd.read_csv("../input/alfa-parq3/alfa1_train_expend3.csv")

In [31]:
df_test = pd.read_csv('../input/alfabattle1/alfabattle2_prediction_session_timestamp.csv')

In [32]:
df = pd.concat((df, df_test)).reset_index(drop=True)

In [33]:
df = df.sort_values(by=['client_pin', 'timestamp'])

In [34]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [35]:
# Rows without a session_id (presumably the appended prediction rows) get a
# synthetic id: client_pin + '0'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment and leaves df unchanged under pandas
# Copy-on-Write.
df['session_id'] = df['session_id'].fillna(df['client_pin'] + '0')

In [36]:
df_temp = df.copy()

In [37]:
df_stat = pd.DataFrame()
df_stat['client_pin'] = df_temp['client_pin'].unique()

In [38]:
last_ses = df_temp.groupby('client_pin')['session_id'].tail(1).values

In [39]:
from_parq = list(df_temp.drop(['session_id', 'client_pin', 'timestamp', 'multi_class_target'], axis=1).columns)

In [40]:
df_stat = df_stat.merge(df_temp[['client_pin'] + from_parq].loc[~df_temp['session_id'].isin(last_ses)].groupby(['client_pin']).tail(1),
                        how='left', on='client_pin')

In [41]:
# Build the lag table: one row per client, columns lag_1..lag_2 holding the
# multi_class_target of the client's most recent rows (lag_1 = latest row),
# plus a constant 'weight' column.
n_lags = 2

# Last n_lags rows of every client; .copy() so the column added below is not
# written into a slice of df (the source of SettingWithCopyWarning).
exp_df = df[['client_pin', 'multi_class_target']].groupby('client_pin').tail(n_lags).copy()
# Position counted from the client's latest row (1 = latest). This is what the
# descending rank of the running row number produced, without ranking the
# non-numeric target column as well.
exp_df['row_number'] = (exp_df.groupby('client_pin').cumcount(ascending=False) + 1).astype('int32')
exp_df = exp_df.set_index(['client_pin', 'row_number'])

# Full (client, lag) grid so that clients with fewer than n_lags rows still get
# every lag column (filled with NaN).
pins = df['client_pin'].unique()
lags = list(range(1, n_lags + 1))
index = pd.MultiIndex.from_product([pins, lags], names=['client_pin', 'row_number'])
df_lag_exp = pd.DataFrame({'total': 0}, index=index)
df_lag_exp = df_lag_exp.merge(exp_df, how='left', on=['client_pin', 'row_number'])
df_lag_exp = df_lag_exp.drop(['total'], axis=1)

# Pivot the lag level into columns and flatten the names to lag_<n>.
df_lag_exp = df_lag_exp.unstack().add_prefix('lag_').rename_axis([None, None], axis=1)
df_lag_exp.columns = df_lag_exp.columns.droplevel(0)
df_lag_exp['weight'] = 1
df_lag_exp = df_lag_exp.reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [42]:
df_lag_exp = df_lag_exp.merge(df_stat, how='left', on='client_pin')

In [43]:
df_lag_exp.to_csv('df_test_exp.csv', index=False)

In [44]:
df_lag_exp = reduce_mem_usage(df_lag_exp)

Memory usage of dataframe is 39.31MB
Memory usage after optimization is: 13.31MB
Decreased by 66.1%


In [45]:
del df
del df_test
del df_temp
gc.collect()

20

In [46]:
# Load the expanded training table and the prediction-session timestamps,
# stack them into one frame and order the rows per client by timestamp.
df = pd.read_csv("../input/alfa-parq3/alfa1_train_expend4.csv")
df_test = pd.read_csv('../input/alfabattle1/alfabattle2_prediction_session_timestamp.csv')
df = pd.concat((df, df_test)).reset_index(drop=True)
# NOTE(review): the sort runs before 'timestamp' is parsed, so it presumably
# orders the raw CSV strings - confirm that both inputs use a format whose
# string order is chronological.
df = df.sort_values(by=['client_pin', 'timestamp'])
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Rows without a session_id get a synthetic id (client_pin + '0'). Assigned
# back to the column because fillna(inplace=True) on a column selection is
# chained assignment and leaves df unchanged under pandas Copy-on-Write.
df['session_id'] = df['session_id'].fillna(df['client_pin'] + '0')

In [47]:
df_temp = df.copy()

In [48]:
df_stat = pd.DataFrame()
df_stat['client_pin'] = df_temp['client_pin'].unique()

In [49]:
last_ses = df_temp.groupby('client_pin')['session_id'].tail(1).values

In [50]:
from_parq = list(df_temp.drop(['session_id', 'client_pin', 'timestamp', 'multi_class_target'], axis=1).columns)

In [51]:
df_stat = df_stat.merge(df_temp[['client_pin'] + from_parq].loc[~df_temp['session_id'].isin(last_ses)].groupby(['client_pin']).tail(1),
                        how='left', on='client_pin')

In [52]:
# Build the lag table: one row per client, columns lag_1..lag_2 holding the
# multi_class_target of the client's most recent rows (lag_1 = latest row),
# plus a constant 'weight' column.
n_lags = 2

# Last n_lags rows of every client; .copy() so the column added below is not
# written into a slice of df (the source of SettingWithCopyWarning).
exp_df = df[['client_pin', 'multi_class_target']].groupby('client_pin').tail(n_lags).copy()
# Position counted from the client's latest row (1 = latest). This is what the
# descending rank of the running row number produced, without ranking the
# non-numeric target column as well.
exp_df['row_number'] = (exp_df.groupby('client_pin').cumcount(ascending=False) + 1).astype('int32')
exp_df = exp_df.set_index(['client_pin', 'row_number'])

# Full (client, lag) grid so that clients with fewer than n_lags rows still get
# every lag column (filled with NaN).
pins = df['client_pin'].unique()
lags = list(range(1, n_lags + 1))
index = pd.MultiIndex.from_product([pins, lags], names=['client_pin', 'row_number'])
df_lag_exp1 = pd.DataFrame({'total': 0}, index=index)
df_lag_exp1 = df_lag_exp1.merge(exp_df, how='left', on=['client_pin', 'row_number'])
df_lag_exp1 = df_lag_exp1.drop(['total'], axis=1)

# Pivot the lag level into columns and flatten the names to lag_<n>.
df_lag_exp1 = df_lag_exp1.unstack().add_prefix('lag_').rename_axis([None, None], axis=1)
df_lag_exp1.columns = df_lag_exp1.columns.droplevel(0)
df_lag_exp1['weight'] = 1
df_lag_exp1 = df_lag_exp1.reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [53]:
df_lag_exp1 = df_lag_exp1.merge(df_stat, how='left', on='client_pin')

In [54]:
df_lag_exp1.to_csv('df_test_exp1.csv', index=False)

In [55]:
df_lag_exp1 = reduce_mem_usage(df_lag_exp1)

Memory usage of dataframe is 39.31MB
Memory usage after optimization is: 13.31MB
Decreased by 66.1%


In [56]:
del df
del df_test
del df_temp
gc.collect()

20

In [57]:
# Load the expanded training table and the prediction-session timestamps,
# stack them into one frame and order the rows per client by timestamp.
df = pd.read_csv("../input/alfa-parq3/alfa1_train_expend5.csv")
df_test = pd.read_csv('../input/alfabattle1/alfabattle2_prediction_session_timestamp.csv')
df = pd.concat((df, df_test)).reset_index(drop=True)
# NOTE(review): the sort runs before 'timestamp' is parsed, so it presumably
# orders the raw CSV strings - confirm that both inputs use a format whose
# string order is chronological.
df = df.sort_values(by=['client_pin', 'timestamp'])
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Rows without a session_id get a synthetic id (client_pin + '0'). Assigned
# back to the column because fillna(inplace=True) on a column selection is
# chained assignment and leaves df unchanged under pandas Copy-on-Write.
df['session_id'] = df['session_id'].fillna(df['client_pin'] + '0')

In [58]:
df_temp = df.copy()

In [59]:
df_stat = pd.DataFrame()
df_stat['client_pin'] = df_temp['client_pin'].unique()

In [60]:
last_ses = df_temp.groupby('client_pin')['session_id'].tail(1).values

In [61]:
from_parq = list(df_temp.drop(['session_id', 'client_pin', 'timestamp', 'multi_class_target'], axis=1).columns)

In [62]:
df_stat = df_stat.merge(df_temp[['client_pin'] + from_parq].loc[~df_temp['session_id'].isin(last_ses)].groupby(['client_pin']).tail(1),
                        how='left', on='client_pin')

In [63]:
# Build the lag table: one row per client, columns lag_1..lag_2 holding the
# multi_class_target of the client's most recent rows (lag_1 = latest row),
# plus a constant 'weight' column.
n_lags = 2

# Last n_lags rows of every client; .copy() so the column added below is not
# written into a slice of df (the source of SettingWithCopyWarning).
exp_df = df[['client_pin', 'multi_class_target']].groupby('client_pin').tail(n_lags).copy()
# Position counted from the client's latest row (1 = latest). This is what the
# descending rank of the running row number produced, without ranking the
# non-numeric target column as well.
exp_df['row_number'] = (exp_df.groupby('client_pin').cumcount(ascending=False) + 1).astype('int32')
exp_df = exp_df.set_index(['client_pin', 'row_number'])

# Full (client, lag) grid so that clients with fewer than n_lags rows still get
# every lag column (filled with NaN).
pins = df['client_pin'].unique()
lags = list(range(1, n_lags + 1))
index = pd.MultiIndex.from_product([pins, lags], names=['client_pin', 'row_number'])
df_lag_exp2 = pd.DataFrame({'total': 0}, index=index)
df_lag_exp2 = df_lag_exp2.merge(exp_df, how='left', on=['client_pin', 'row_number'])
df_lag_exp2 = df_lag_exp2.drop(['total'], axis=1)

# Pivot the lag level into columns and flatten the names to lag_<n>.
df_lag_exp2 = df_lag_exp2.unstack().add_prefix('lag_').rename_axis([None, None], axis=1)
df_lag_exp2.columns = df_lag_exp2.columns.droplevel(0)
df_lag_exp2['weight'] = 1
df_lag_exp2 = df_lag_exp2.reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [64]:
df_lag_exp2 = df_lag_exp2.merge(df_stat, how='left', on='client_pin')

In [65]:
df_lag_exp2.to_csv('df_test_exp2.csv', index=False)