In [None]:
import os

def scale_input_data(scale_factor):
  file_bases = ['./input/test', './input/train', './input/historical_transactions',
                './input/new_merchant_transactions']
  for file_base in file_bases:
    import pandas as pd
    import shutil
    if scale_factor == 1.0:
      shutil.copyfile(file_base + '.csv', file_base + '.scaled.csv')
      continue
    df_to_scale = pd.read_csv(file_base + '.csv')
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      while len(df_to_scale) < new_num_rows:
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(new_num_rows - len(df_to_scale), len(df_to_scale))]])
    df_to_scale.to_csv(file_base + '.scaled.csv', index=False)

if 'INPUT_SCALE_FACTOR' in os.environ:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

This is a version of Ashish Gupta's notebook, which in turn was based on Chau Ngoc Huynh's kernel (3.699) and Panchajanya Banerjee's idea to include the holidays. This purpose of this notebook is just to save teh feature-engineered train and test sets, so that they can be further used in other kenels. 


Special Brazil holidays:

For 2017-18

1. Mother's Day : Second Sunday of May : May 13 2018
2. Father's Day : Second Sunday of August :  August 13 2017
3. Valentine's Day : 12th June, 2017
4. Children's Day : 12th October 2017
5. Black Friday : 24th November 2017
6. Christmas day : 25th December 2017


In [1]:
# Import the necessary libraries
import numpy as np
# import pandas as pd
exec(os.environ['IREWR_IMPORTS'])
# FIRST-AUTHOR: remove plotting, GC code
# import os
# import time
# import warnings
# import gc
# import os
# from six.moves import urllib
# import matplotlib
# import matplotlib.pyplot as plt
# import seaborn as sns
import datetime
# FIRST-AUTHOR: remove plotting, ML code
# warnings.filterwarnings('ignore')
# %matplotlib inline
# plt.style.use('seaborn')
# from scipy.stats import norm, skew
# from sklearn.preprocessing import StandardScaler

In [2]:
# FIRST-AUTHOR: remove ML code
# #Add All the Models Libraries

# # Scalers
# from sklearn.utils import shuffle
# from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion

# # Models

# from sklearn.linear_model import Lasso
# from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error

# from sklearn.model_selection import train_test_split #training and testing data split
# from sklearn import metrics #accuracy measure
# from sklearn.metrics import confusion_matrix #for confusion matrix
# from scipy.stats import reciprocal, uniform

# from sklearn.model_selection import StratifiedKFold

# # Cross-validation
# from sklearn.model_selection import KFold #for K-fold cross validation
# from sklearn.model_selection import cross_val_score #score evaluation
# from sklearn.model_selection import cross_val_predict #prediction
# from sklearn.model_selection import cross_validate

# # GridSearchCV
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RandomizedSearchCV

# #Common data processors
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# from sklearn import feature_selection
# from sklearn import model_selection
# from sklearn import metrics
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.utils import check_array
# from scipy import sparse

In [3]:
# to make this notebook's output stable across runs
np.random.seed(123)
# FIRST-AUTHOR: remove GC code, plotting
# gc.collect()
# # To plot pretty figures
# %matplotlib inline
# plt.rcParams['axes.labelsize'] = 14
# plt.rcParams['xtick.labelsize'] = 12
# plt.rcParams['ytick.labelsize'] = 12

In [4]:
#Reduce the memory usage - Inspired by Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
train = reduce_mem_usage(pd.read_csv('./input/train.scaled.csv',parse_dates=["first_active_month"]))
test = reduce_mem_usage(pd.read_csv('./input/test.scaled.csv', parse_dates=["first_active_month"]))

Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)


In [6]:
train.shape

(201917, 6)

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201917 entries, 0 to 201916
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   first_active_month  201917 non-null  datetime64[ns]
 1   card_id             201917 non-null  object        
 2   feature_1           201917 non-null  int8          
 3   feature_2           201917 non-null  int8          
 4   feature_3           201917 non-null  int8          
 5   target              201917 non-null  float16       
dtypes: datetime64[ns](1), float16(1), int8(3), object(1)
memory usage: 4.0+ MB


In [8]:
# Now extract the month, year, day, weekday
train["month"] = train["first_active_month"].dt.month
train["year"] = train["first_active_month"].dt.year
# FIRST-AUTHOR: make notebook run
# train['week'] = train["first_active_month"].dt.weekofyear
train['week'] = train["first_active_month"].dt.isocalendar().week
train['dayofweek'] = train['first_active_month'].dt.dayofweek
# FIRST-AUTHOR: make notebook run
# train['days'] = (datetime.date(2018, 2, 1) - train['first_active_month'].dt.date).dt.days
train['days'] = pd.to_timedelta((datetime.date(2018, 2, 1) - train['first_active_month'].dt.date)).dt.days

test["month"] = test["first_active_month"].dt.month
test["year"] = test["first_active_month"].dt.year
# FIRST-AUTHOR: make notebook run
# test['week'] = test["first_active_month"].dt.weekofyear
test['week'] = test["first_active_month"].dt.isocalendar().week
test['dayofweek'] = test['first_active_month'].dt.dayofweek
# FIRST-AUTHOR: make notebook run
# test['days'] = (datetime.date(2018, 2, 1) - test['first_active_month'].dt.date).dt.days
test['days'] = pd.to_timedelta(datetime.date(2018, 2, 1) - test['first_active_month'].dt.date).dt.days

Now we will try to extract more features from Transactions Data

In [9]:
# Taking Reference from Other Kernels
def aggregate_transaction_hist(trans, prefix):  
        
    agg_func = {
        'purchase_date' : ['max','min'],
        'month_diff' : ['mean'],
        'weekend' : ['sum', 'mean'],
        'authorized_flag': ['sum', 'mean'],
        'category_1': ['sum','mean'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],  
        #'merchant_id': ['nunique'],
        'month_lag': ['max','min','mean','var'],
        'month_diff' : ['mean'],
        'card_id' : ['size'],
        'month': ['nunique'],
        'hour': ['nunique'],
        'weekofyear': ['nunique'],
        'dayofweek': ['nunique'],
        'year': ['nunique'],
        'subsector_id': ['nunique'],
        'merchant_category_id' : ['nunique'],
        'Christmas_Day_2017':['mean'],
        #'Mothers_Day_2017':['mean'],
        'fathers_day_2017':['mean'],
        'Children_day_2017':['mean'],
        'Black_Friday_2017':['mean'],
        'Valentine_day_2017':['mean'],
        'Mothers_Day_2018':['mean']
    }
    
    agg_trans = trans.groupby(['card_id']).agg(agg_func)
    agg_trans.columns = [prefix + '_'.join(col).strip() 
                           for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)
    
    df = (trans.groupby('card_id')
          .size()
          .reset_index(name='{}transactions_count'.format(prefix)))
    
    agg_trans = pd.merge(df, agg_trans, on='card_id', how='left')
    
    return agg_trans

In [10]:
transactions = reduce_mem_usage(pd.read_csv('./input/historical_transactions.scaled.csv'))
transactions['authorized_flag'] = transactions['authorized_flag'].map({'Y': 1, 'N': 0})
transactions['category_1'] = transactions['category_1'].map({'Y': 1, 'N': 0})

Mem. usage decreased to 1749.11 Mb (43.7% reduction)


In [11]:
#Feature Engineering - Adding new features inspired by Chau's first kernel
transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])
transactions['year'] = transactions['purchase_date'].dt.year
# FIRST-AUTHOR: make notebook run
# transactions['weekofyear'] = transactions['purchase_date'].dt.weekofyear
transactions['weekofyear'] = transactions['purchase_date'].dt.isocalendar().week
transactions['month'] = transactions['purchase_date'].dt.month
transactions['dayofweek'] = transactions['purchase_date'].dt.dayofweek
transactions['weekend'] = (transactions.purchase_date.dt.weekday >=5).astype(int)
transactions['hour'] = transactions['purchase_date'].dt.hour 
transactions['month_diff'] = ((datetime.datetime.today() - transactions['purchase_date']).dt.days)//30
transactions['month_diff'] += transactions['month_lag']

#impute missing values - This is now excluded.
transactions['category_2'] = transactions['category_2'].fillna(1.0,inplace=True)
transactions['category_3'] = transactions['category_3'].fillna('A',inplace=True)
transactions['merchant_id'] = transactions['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
# FIRST-AUTHOR: remove GC code
# gc.collect()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  transactions['category_2'] = transactions['category_2'].fillna(1.0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  transactions['category_3'] = transactions['category_3'].fillna('A',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never

In [12]:
# FIRST-AUTHOR: make notebook run
# agg_func = {
#         'mean': ['mean'],
#     }
# for col in ['category_2','category_3']:
#     transactions[col+'_mean'] = transactions['purchase_amount'].groupby(transactions[col]).agg(agg_func)
for col in ['category_2','category_3']:
    transactions[col+'_mean'] = transactions['purchase_amount'].groupby(transactions[col]).agg('mean')

In [13]:
# New Features with Key Shopping times considered in the dataset. if the purchase has been made within 60 days, it is considered as an influence
#Christmas : December 25 2017
transactions['Christmas_Day_2017'] = (pd.to_datetime('2017-12-25') - transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Mothers Day: May 14 2017
#transactions['Mothers_Day_2017'] = (pd.to_datetime('2017-05-04') - transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#fathers day: August 13 2017
transactions['fathers_day_2017'] = (pd.to_datetime('2017-08-13') - transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Childrens day: October 12 2017
transactions['Children_day_2017'] = (pd.to_datetime('2017-10-12') - transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Black Friday : 24th November 2017
transactions['Black_Friday_2017'] = (pd.to_datetime('2017-11-24') - transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Valentines Day
transactions['Valentine_day_2017'] = (pd.to_datetime('2017-06-12') - transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

#2018
#Mothers Day: May 13 2018
transactions['Mothers_Day_2018'] = (pd.to_datetime('2018-05-13') - transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

# FIRST-AUTHOR: remove GC code
# gc.collect()

In [14]:
merge_trans = aggregate_transaction_hist(transactions, prefix='hist_')
del transactions
# FIRST-AUTHOR: remove GC code
# gc.collect()
train = pd.merge(train, merge_trans, on='card_id',how='left')
test = pd.merge(test, merge_trans, on='card_id',how='left')
del merge_trans
# FIRST-AUTHOR: remove GC code
# gc.collect()

In [15]:
train.head(2)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,month,year,week,dayofweek,...,hist_dayofweek_nunique,hist_year_nunique,hist_subsector_id_nunique,hist_merchant_category_id_nunique,hist_Christmas_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Black_Friday_2017_mean,hist_Valentine_day_2017_mean,hist_Mothers_Day_2018_mean
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820312,6,2017,22,3,...,7,2,21,41,13.123077,6.265385,26.765385,16.465385,0.0,7.753846
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392822,1,2017,52,6,...,7,2,24,57,10.648571,12.551429,13.788571,9.834286,6.568571,0.0


In [16]:
#Feature Engineering - Adding new features inspired by Chau's first kernel
train['hist_purchase_date_max'] = pd.to_datetime(train['hist_purchase_date_max'])
train['hist_purchase_date_min'] = pd.to_datetime(train['hist_purchase_date_min'])
train['hist_purchase_date_diff'] = (train['hist_purchase_date_max'] - train['hist_purchase_date_min']).dt.days
train['hist_purchase_date_average'] = train['hist_purchase_date_diff']/train['hist_card_id_size']
train['hist_purchase_date_uptonow'] = (datetime.datetime.today() - train['hist_purchase_date_max']).dt.days
train['hist_first_buy'] = (train['hist_purchase_date_min'] - train['first_active_month']).dt.days
for feature in ['hist_purchase_date_max','hist_purchase_date_min']:
    train[feature] = train[feature].astype(np.int64) * 1e-9


In [17]:
#Feature Engineering - Adding new features inspired by Chau's first kernel
test['hist_purchase_date_max'] = pd.to_datetime(test['hist_purchase_date_max'])
test['hist_purchase_date_min'] = pd.to_datetime(test['hist_purchase_date_min'])
test['hist_purchase_date_diff'] = (test['hist_purchase_date_max'] - test['hist_purchase_date_min']).dt.days
test['hist_purchase_date_average'] = test['hist_purchase_date_diff']/test['hist_card_id_size']
test['hist_purchase_date_uptonow'] = (datetime.datetime.today() - test['hist_purchase_date_max']).dt.days
test['hist_first_buy'] = (test['hist_purchase_date_min'] - test['first_active_month']).dt.days
for feature in ['hist_purchase_date_max','hist_purchase_date_min']:
    test[feature] = test[feature].astype(np.int64) * 1e-9

In [18]:
# Taking Reference from Other Kernels
def aggregate_transaction_new(trans, prefix):  
        
    agg_func = {
        'purchase_date' : ['max','min'],
        'month_diff' : ['mean'],
        'weekend' : ['sum', 'mean'],
        'authorized_flag': ['sum'],
        'category_1': ['sum','mean'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],  
        #'merchant_id': ['nunique'],
        'month_lag': ['max','min','mean','var'],
        'month_diff' : ['mean'],
        'card_id' : ['size'],
        'month': ['nunique'],
        'hour': ['nunique'],
        'weekofyear': ['nunique'],
        'dayofweek': ['nunique'],
        'year': ['nunique'],
        'subsector_id': ['nunique'],
        'merchant_category_id' : ['nunique'],
        'Christmas_Day_2017':['mean'],
        #'Mothers_Day_2017':['mean'],
        'fathers_day_2017':['mean'],
        'Children_day_2017':['mean'],
        'Black_Friday_2017':['mean'],
        'Valentine_Day_2017' : ['mean'],
        'Mothers_Day_2018':['mean']
    }
    
    agg_trans = trans.groupby(['card_id']).agg(agg_func)
    agg_trans.columns = [prefix + '_'.join(col).strip() 
                           for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)
    
    df = (trans.groupby('card_id')
          .size()
          .reset_index(name='{}transactions_count'.format(prefix)))
    
    agg_trans = pd.merge(df, agg_trans, on='card_id', how='left')
    
    return agg_trans

In [19]:
# Now extract the data from the new transactions
new_transactions = reduce_mem_usage(pd.read_csv('./input/new_merchant_transactions.scaled.csv'))
new_transactions['authorized_flag'] = new_transactions['authorized_flag'].map({'Y': 1, 'N': 0})
new_transactions['category_1'] = new_transactions['category_1'].map({'Y': 1, 'N': 0})

Mem. usage decreased to 114.20 Mb (45.5% reduction)


In [20]:
#Feature Engineering - Adding new features inspired by Chau's first kernel
new_transactions['purchase_date'] = pd.to_datetime(new_transactions['purchase_date'])
new_transactions['year'] = new_transactions['purchase_date'].dt.year
# FIRST-AUTHOR: make notebook run
# new_transactions['weekofyear'] = new_transactions['purchase_date'].dt.weekofyear
new_transactions['weekofyear'] = new_transactions['purchase_date'].dt.isocalendar().week
new_transactions['month'] = new_transactions['purchase_date'].dt.month
new_transactions['dayofweek'] = new_transactions['purchase_date'].dt.dayofweek
new_transactions['weekend'] = (new_transactions.purchase_date.dt.weekday >=5).astype(int)
new_transactions['hour'] = new_transactions['purchase_date'].dt.hour 
new_transactions['month_diff'] = ((datetime.datetime.today() - new_transactions['purchase_date']).dt.days)//30
new_transactions['month_diff'] += new_transactions['month_lag']

#impute missing values
new_transactions['category_2'] = new_transactions['category_2'].fillna(1.0,inplace=True)
new_transactions['category_3'] = new_transactions['category_3'].fillna('A',inplace=True)
new_transactions['merchant_id'] = new_transactions['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)

# New Features with Key Shopping times considered in the dataset. if the purchase has been made within 60 days, 
# it is considered as an influence

#Christmas : December 25 2017
new_transactions['Christmas_Day_2017'] = (pd.to_datetime('2017-12-25') - new_transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Mothers Day: May 14 2017 - Was not significant in Feature Importance
#new_transactions['Mothers_Day_2017'] = (pd.to_datetime('2017-06-04') - new_transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#fathers day: August 13 2017
new_transactions['fathers_day_2017'] = (pd.to_datetime('2017-08-13') - new_transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Childrens day: October 12 2017
new_transactions['Children_day_2017'] = (pd.to_datetime('2017-10-12') - new_transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Valentine's Day : 12th June, 2017
new_transactions['Valentine_Day_2017'] = (pd.to_datetime('2017-06-12') - new_transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Black Friday : 24th November 2017
new_transactions['Black_Friday_2017'] = (pd.to_datetime('2017-11-24') - new_transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

#2018
#Mothers Day: May 13 2018
new_transactions['Mothers_Day_2018'] = (pd.to_datetime('2018-05-13') - new_transactions['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

# FIRST-AUTHOR: make notebook run
# agg_func = {
#         'mean': ['mean'],
#     }
# for col in ['category_2','category_3']:
#     new_transactions[col+'_mean'] = new_transactions['purchase_amount'].groupby(new_transactions[col]).agg(agg_func)
for col in ['category_2','category_3']:
    new_transactions[col+'_mean'] = new_transactions['purchase_amount'].groupby(new_transactions[col]).agg('mean')

# FIRST-AUTHOR: remove GC code
# gc.collect()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_transactions['category_2'] = new_transactions['category_2'].fillna(1.0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_transactions['category_3'] = new_transactions['category_3'].fillna('A',inplace=True)
The behavior will change in pandas 3.0. This inplace m

In [21]:
merge_new = aggregate_transaction_new(new_transactions, prefix='new_')
del new_transactions
# FIRST-AUTHOR: remove GC code
# gc.collect()

train = pd.merge(train, merge_new, on='card_id',how='left')
test = pd.merge(test, merge_new, on='card_id',how='left')
del merge_new

# FIRST-AUTHOR: remove GC code
# gc.collect()

In [22]:
#Feature Engineering - Adding new features inspired by Chau's first kernel
train['new_purchase_date_max'] = pd.to_datetime(train['new_purchase_date_max'])
train['new_purchase_date_min'] = pd.to_datetime(train['new_purchase_date_min'])
train['new_purchase_date_diff'] = (train['new_purchase_date_max'] - train['new_purchase_date_min']).dt.days
train['new_purchase_date_average'] = train['new_purchase_date_diff']/train['new_card_id_size']
train['new_purchase_date_uptonow'] = (datetime.datetime.today() - train['new_purchase_date_max']).dt.days
train['new_first_buy'] = (train['new_purchase_date_min'] - train['first_active_month']).dt.days
for feature in ['new_purchase_date_max','new_purchase_date_min']:
    train[feature] = train[feature].astype(np.int64) * 1e-9

#Feature Engineering - Adding new features inspired by Chau's first kernel
test['new_purchase_date_max'] = pd.to_datetime(test['new_purchase_date_max'])
test['new_purchase_date_min'] = pd.to_datetime(test['new_purchase_date_min'])
test['new_purchase_date_diff'] = (test['new_purchase_date_max'] - test['new_purchase_date_min']).dt.days
test['new_purchase_date_average'] = test['new_purchase_date_diff']/test['new_card_id_size']
test['new_purchase_date_uptonow'] = (datetime.datetime.today() - test['new_purchase_date_max']).dt.days
test['new_first_buy'] = (test['new_purchase_date_min'] - test['first_active_month']).dt.days
for feature in ['new_purchase_date_max','new_purchase_date_min']:
    test[feature] = test[feature].astype(np.int64) * 1e-9
    
#added new feature - Interactive
train['card_id_total'] = train['new_card_id_size'] + train['hist_card_id_size']
train['purchase_amount_total'] = train['new_purchase_amount_sum'] + train['hist_purchase_amount_sum']

test['card_id_total'] = test['new_card_id_size'] + test['hist_card_id_size']
test['purchase_amount_total'] = test['new_purchase_amount_sum'] + test['hist_purchase_amount_sum']

# FIRST-AUTHOR: remove GC code
# gc.collect()

In [23]:
# Now check the shape of Train and Test Data
train.shape

(201917, 96)

In [24]:
test.shape

(123623, 95)

In [25]:
train.head(5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,month,year,week,dayofweek,...,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Valentine_Day_2017_mean,new_Mothers_Day_2018_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_first_buy,card_id_total,purchase_amount_total
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820312,6,2017,22,3,...,0.0,0.0,0.0,41.73913,54.0,2.347826,2382.0,277.0,283.0,-179.212631
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392822,1,2017,52,6,...,0.0,0.0,0.0,56.833333,56.0,9.333333,2412.0,396.0,356.0,-214.361801
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.687988,8,2016,31,0,...,0.0,0.0,0.0,14.0,0.0,0.0,2383.0,635.0,44.0,-29.867586
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142456,9,2017,35,4,...,0.0,0.0,0.0,37.714286,41.0,5.857143,2393.0,187.0,84.0,-54.146149
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.15979,11,2017,44,2,...,0.0,0.0,0.0,41.194444,57.0,1.583333,2382.0,121.0,169.0,-68.6138


Imputations and Data Transformation

In [26]:
#Check for missing values in training set
nulls = np.sum(train.isnull())
nullcols = nulls.loc[(nulls != 0)]
dtypes = train.dtypes
dtypes2 = dtypes.loc[(nulls != 0)]
info = pd.concat([nullcols, dtypes2], axis=1).sort_values(by=0, ascending=False)
#print(info)
#print("There are", len(nullcols), "columns with missing values in data set")

  return reduction(axis=axis, out=out, **passkwargs)


In [27]:
#Check for missing values in training set
nulls = np.sum(test.isnull())
nullcols = nulls.loc[(nulls != 0)]
dtypes = test.dtypes
dtypes2 = dtypes.loc[(nulls != 0)]
info = pd.concat([nullcols, dtypes2], axis=1).sort_values(by=0, ascending=False)
#print(info)
#print("There are", len(nullcols), "columns with missing values in test set")

  return reduction(axis=axis, out=out, **passkwargs)


Impute any values will significantly affect the RMSE score for test set. So Imputations have been excluded

In [28]:
numeric_dtypes = ['float64']
numerics = []
for i in train.columns:
    if train[i].dtype in numeric_dtypes: 
        numerics.append(i)
        
#train.update(train[numerics].fillna(0))

In [29]:
numeric_dtypes = ['float64']
numerics = []
for i in test.columns:
    if test[i].dtype in numeric_dtypes: 
        numerics.append(i)
        
#test.update(test[numerics].fillna(0))

Detect and Correct Outliers

In [30]:
# Remove the Outliers if any 
train['outliers'] = 0
train.loc[train['target'] < -30, 'outliers'] = 1
train['outliers'].value_counts()

outliers
0    199710
1      2207
Name: count, dtype: int64

In [31]:
for features in ['feature_1','feature_2','feature_3']:
    order_label = train.groupby([features])['outliers'].mean()
    train[features] = train[features].map(order_label)
    test[features] =  test[features].map(order_label)

In [32]:
# Get the X and Y
df_train_columns = [c for c in train.columns if c not in ['card_id', 'first_active_month', 'outliers']]
df_test_columns = [c for c in train.columns if c not in ['card_id', 'first_active_month','target','outliers']]


In [33]:
train['fold'] = 0
# FIRST-AUTHOR: remove ML code
# folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1974)

# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['outliers'].values)):
#     train['fold'].iloc[val_idx] = fold_
_ = train['outliers'].values
    



In [34]:
df_train_columns.append('fold')

In [35]:
train.head(20)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,month,year,week,dayofweek,...,new_Valentine_Day_2017_mean,new_Mothers_Day_2018_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_first_buy,card_id_total,purchase_amount_total,outliers,fold
0,2017-06-01,C_ID_92a2005557,0.013145,0.008752,0.011428,-0.820312,6,2017,22,3,...,0.0,41.73913,54.0,2.347826,2382.0,277.0,283.0,-179.212631,0,0
1,2017-01-01,C_ID_3d0044924f,0.010712,0.011385,0.010283,0.392822,1,2017,52,6,...,0.0,56.833333,56.0,9.333333,2412.0,396.0,356.0,-214.361801,0,0
2,2016-08-01,C_ID_d639edf6cd,0.01061,0.008752,0.010283,0.687988,8,2016,31,0,...,0.0,14.0,0.0,0.0,2383.0,635.0,44.0,-29.867586,0,0
3,2017-09-01,C_ID_186d6a6901,0.010712,0.014166,0.010283,0.142456,9,2017,35,4,...,0.0,37.714286,41.0,5.857143,2393.0,187.0,84.0,-54.146149,0,0
4,2017-11-01,C_ID_cdbd2c0db2,0.008058,0.014166,0.010283,-0.15979,11,2017,44,2,...,0.0,41.194444,57.0,1.583333,2382.0,121.0,169.0,-68.6138,0,0
5,2016-09-01,C_ID_0894217f2f,0.010712,0.008752,0.010283,0.871582,9,2016,35,3,...,0.0,0.0,31.0,7.75,2607.0,348.0,37.0,-15.17598,0,0
6,2016-12-01,C_ID_7e63323c00,0.010479,0.008752,0.011428,0.230103,12,2016,48,3,...,0.0,56.6,31.0,6.2,2405.0,460.0,265.0,-189.024124,0,0
7,2017-09-01,C_ID_dfa21fc124,0.010479,0.008752,0.011428,2.136719,9,2017,35,4,...,0.0,87.333333,12.0,4.0,2451.0,158.0,25.0,-15.663989,0,0
8,2017-08-01,C_ID_fe0fdac8ea,0.01061,0.011385,0.010283,-0.06543,8,2017,31,1,...,0.0,54.5,3.0,1.5,2421.0,228.0,17.0,-9.971558,0,0
9,2016-08-01,C_ID_bf62c0b49d,0.01061,0.008752,0.010283,0.300049,8,2016,31,0,...,0.0,38.666667,7.0,2.333333,2403.0,608.0,116.0,-79.13726,0,0


In [36]:
train[df_train_columns].to_csv('new_train.csv', index=False)
test[df_test_columns].to_csv('new_test.csv', index=False)
train[['card_id', 'outliers']].to_csv('outliers', index=False)