In [1]:
import numpy as np
import pandas as pd
import gc
import holidays

In [2]:
def reduce_mem_usage(df):
    """ 
    iterate through all the columns of a dataframe and 
    modify the data type to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}' 
                     'MB').format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max <\
                  np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max <\
                   np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max <\
                   np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max <\
                   np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max <\
                   np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max <\
                   np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage after optimization is: {:.2f}' 
                              'MB').format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) 
                                             / start_mem))
    
    return df

In [3]:
df = pd.read_csv("../input/alfa-parq3/alfa1_train_expend4.csv")

In [4]:
df = df.sort_values(by=['client_pin', 'timestamp'])

In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [6]:
df_temp = df.copy()

In [7]:
df_stat = pd.DataFrame()
df_stat['client_pin'] = df_temp['client_pin'].unique()

In [8]:
last_ses = df_temp.groupby('client_pin')['session_id'].tail(1).values

In [9]:
from_parq = list(df_temp.drop(['session_id', 'client_pin', 'timestamp', 'multi_class_target'], axis=1).columns)

In [10]:
df_stat = df_stat.merge(df_temp[['client_pin'] + from_parq].loc[~df_temp['session_id'].isin(last_ses)].groupby(['client_pin']).tail(1),
                        how='left', on='client_pin')

In [11]:
exp_df = df[['session_id', 'client_pin', 'multi_class_target']]
exp_df['row_number'] = exp_df.groupby(['client_pin']).cumcount()+1
exp_df.drop(['session_id'], axis=1, inplace=True)
exp_df.set_index(['client_pin', 'row_number'])
exp_df = exp_df.groupby(['client_pin']).tail(2)
exp_df['row_number'] = exp_df.groupby('client_pin').rank(ascending=False).astype('int32')
exp_df.set_index(['client_pin', 'row_number'], inplace=True)
df_lag = pd.DataFrame()
df_lag['total'] = [0]*len(df['client_pin'].unique())*2
pins = df['client_pin'].unique()
lags = list(range(1, 3))
index = pd.MultiIndex.from_product([pins, lags], names=['client_pin', 'row_number'])
df_lag.set_index(index, inplace=True)
df_lag = df_lag.merge(exp_df, how='left', on=['client_pin', 'row_number'])
df_lag.drop(['total'], axis=1, inplace=True)
df_lag = df_lag.unstack().add_prefix('lag_').rename_axis([None, None], axis=1)
df_lag.columns = df_lag.columns.droplevel(0)
df_lag = df_lag[df_lag['lag_2'].notnull()]
df_lag = df_lag[df_lag['lag_1'].notnull()]
df_lag['weight'] = 1
df_lag.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
df_lag = df_lag.merge(df_stat, how='left', on='client_pin')

In [13]:
df_lag.to_csv('alfa1_df_valid11.csv', index=False)

In [14]:
concat_list = []
for i in range(1, 65):
    exp_df = df[['session_id', 'client_pin', 'multi_class_target']]
    exp_df['multi_class_target'] = df.groupby('client_pin')['multi_class_target'].shift(i)
    exp_df['row_number'] = exp_df.groupby(['client_pin']).cumcount()+1
    exp_df.drop(['session_id'], axis=1, inplace=True)
    exp_df.set_index(['client_pin', 'row_number'])
    exp_df = exp_df.groupby(['client_pin']).tail(2)
    exp_df['row_number'] = exp_df.groupby('client_pin').rank(ascending=False).astype('int32')
    exp_df.set_index(['client_pin', 'row_number'], inplace=True)
    df_lag = pd.DataFrame()
    df_lag['total'] = [0]*len(df['client_pin'].unique())*2
    pins = df['client_pin'].unique()
    lags = list(range(1, 3))
    index = pd.MultiIndex.from_product([pins, lags], names=['client_pin', 'row_number'])
    df_lag.set_index(index, inplace=True)
    df_lag = df_lag.merge(exp_df, how='left', on=['client_pin', 'row_number'])
    df_lag.drop(['total'], axis=1, inplace=True)
    df_lag = df_lag.unstack().add_prefix('lag_').rename_axis([None, None], axis=1)
    df_lag.columns = df_lag.columns.droplevel(0)
    df_lag = df_lag[df_lag['lag_2'].notnull()]
    df_lag = df_lag[df_lag['lag_1'].notnull()]
    df_lag.reset_index(inplace=True)
    
    if (i >= 1) and (i <= 5):
        df_lag['weight'] = 1
    elif i <= 10:
        df_lag['weight'] = 6/7
    elif i <= 15:
        df_lag['weight'] = 5/7
    elif i <= 20:
        df_lag['weight'] = 4/7
    elif i <= 25:
        df_lag['weight'] = 3/7
    elif i <= 30:
        df_lag['weight'] = 2/7
    else:
        df_lag['weight'] = 1/7
        
        
    df_temp = df.copy()
    cols = df_temp.drop(['client_pin'], axis=1).columns
    df_temp[cols] = df.groupby('client_pin')[cols].shift(i)
    df_temp.dropna(inplace=True)

    df_stat = pd.DataFrame()
    df_stat['client_pin'] = df_temp['client_pin'].unique()
    last_ses = df_temp.groupby('client_pin')['session_id'].tail(1).values
    
    df_stat = df_stat.merge(df_temp[['client_pin'] + from_parq].loc[~df_temp['session_id'].isin(last_ses)].groupby(['client_pin']).tail(1),
                        how='left', on='client_pin')
    
    df_lag = df_lag.merge(df_stat, how='left', on='client_pin')
  
    concat_list.append(df_lag)
    del exp_df
    del df_lag
    del df_stat
    del df_temp
    gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stabl

In [15]:
df_all = pd.concat(concat_list).reset_index(drop=True)

In [16]:
df_all.to_csv('alfa1_df_train11.csv', index=False)