In [None]:
import os

def scale_input_data(scale_factor):
  file_bases = ['./input/train', './input/test', './input/merchants',
                './input/historical_transactions', './input/new_merchant_transactions',
                './input/Data_Dictionary']
  for file_base in file_bases:
    import pandas as pd
    import shutil
    if scale_factor == 1.0:
      if file_base == './input/Data_Dictionary':
        shutil.copyfile(file_base + '.xlsx', file_base + '.scaled.xlsx')
      else:
        shutil.copyfile(file_base + '.csv', file_base + '.scaled.csv')
      continue
    if file_base =='./input/Data_Dictionary':
      df_to_scale = pd.read_excel(file_base + '.xlsx')
    else:
      df_to_scale = pd.read_csv(file_base + '.csv')
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      while len(df_to_scale) < new_num_rows:
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(new_num_rows - len(df_to_scale), len(df_to_scale))]])
    if file_base == './input/Data_Dictionary':
      df_to_scale.to_excel(file_base + '.scaled.xlsx')
    else:
      df_to_scale.to_csv(file_base + '.scaled.csv')

if 'INPUT_SCALE_FACTOR' in os.environ:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

# Elo Merchant Category Recommendation | Kaggle
## By Yonela Nuba


### What files do I need?
You will need, at a minimum, the train.csv and test.csv files. These contain the card_ids that we'll be using for training and prediction.

The historical_transactions.csv and new_merchant_transactions.csv files contain information about each card's transactions. historical_transactions.csv contains up to 3 months' worth of transactions for every card at any of the provided merchant_ids. new_merchant_transactions.csv contains the transactions at new merchants (merchant_ids that this particular card_id has not yet visited) over a period of two months.

merchants.csv contains aggregate information for each merchant_id represented in the data set.

### What should I expect the data format to be?
The data is formatted as follows:

train.csv and test.csv contain card_ids and information about the card itself - the first month the card was active, etc. train.csv also contains the target.

historical_transactions.csv and new_merchant_transactions.csv are designed to be joined with train.csv, test.csv, and merchants.csv. They contain information about transactions for each card, as described above.

merchants can be joined with the transaction sets to provide additional merchant-level information.

### What am I predicting?
You are predicting a loyalty score for each card_id represented in test.csv and sample_submission.csv.


# This what we are going to do:

1. Import Libraries and Data
2. Data Visualization
3. Data Cleaning
4. Feature Engineering
5. More Data Cleaning
6. Models
7. Predictions
8. Output Data

# 1. Import libraries and Data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
exec(os.environ['IREWR_IMPORTS'])
# ALEX: remove plotting
# import matplotlib.pyplot as plt
# import seaborn as sns
# plt.style.use('fivethirtyeight') 
# %matplotlib inline


# import warnings
import datetime
import calendar
# ALEX: remove additional imports
# from datetime import time
# from dateutil.relativedelta import relativedelta

# # to ignore future warnings
# warnings.simplefilter(action = 'ignore', category = FutureWarning)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# ALEX: remove path printing
# import os
# print(os.listdir("./input"))
train = pd.read_csv('./input/train.scaled.csv')
test = pd.read_csv('./input/test.scaled.csv')
# Any results you write to the current directory are saved as output.

In [2]:
merchants = pd.read_csv('./input/merchants.scaled.csv')
historical_trans = pd.read_csv('./input/historical_transactions.scaled.csv')
new_merchants = pd.read_csv('./input/new_merchant_transactions.scaled.csv')
data_dict = pd.read_excel('./input/Data_Dictionary.scaled.xlsx')

# 2. Data visualization 

In [3]:
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [4]:
test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [5]:
merchants.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,N,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,N,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,N,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,N,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,N,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,N,-1,5,5.0
3,M_ID_a70e9c5f81,5026,792,9,-0.057471,-0.057471,Y,E,E,,...,,4.666667,6,,3.833333,12,Y,-1,-1,
4,M_ID_64456c37ce,2228,222,21,-0.057471,-0.057471,Y,E,E,,...,,0.361111,6,,0.347222,12,Y,-1,-1,


In [6]:
new_merchants.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [7]:
data_dict.head(10)

Unnamed: 0,train.csv,Unnamed: 1
0,,
1,Columns,Description
2,card_id,Unique card identifier
3,first_active_month,"'YYYY-MM', month of first purchase"
4,feature_1,Anonymized card categorical feature
5,feature_2,Anonymized card categorical feature
6,feature_3,Anonymized card categorical feature
7,target,Loyalty numerical score calculated 2 months af...


In [8]:
historical_trans.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


## SHAPE OF THE DATASETS

In [9]:
print('train Data shape: {}'.format(train.shape))
print('test Data shape: {}'.format(test.shape))
print('merchants data shape: {}'.format(merchants.shape))
print('New merchants data shape: {}'.format(new_merchants.shape))
print('historical data shape: {}'.format(historical_trans.shape))
print('Data dictionary data shape: {}'.format(data_dict.shape))

train Data shape: (201917, 6)
test Data shape: (123623, 5)
merchants data shape: (334696, 22)
New merchants data shape: (1963031, 14)
historical data shape: (1000000, 14)
Data dictionary data shape: (8, 2)


In [10]:
# ALEX: remove plotting
# sns.distplot(train.target.values, kde = False)
_ = train.target.values

In [11]:
# ALEX: remove plotting
# sns.violinplot(x = "target", data=train)

In [12]:
# Target value outliers
print(f'There are: {train[train.target <- 10].target.shape[0]} outliers')

There are: 2264 outliers


In [13]:
train.describe()

Unnamed: 0,feature_1,feature_2,feature_3,target
count,201917.0,201917.0,201917.0,201917.0
mean,3.105311,1.74541,0.565569,-0.393636
std,1.18616,0.751362,0.495683,3.8505
min,1.0,1.0,0.0,-33.219281
25%,2.0,1.0,0.0,-0.88311
50%,3.0,2.0,1.0,-0.023437
75%,4.0,2.0,1.0,0.765453
max,5.0,3.0,1.0,17.965068


In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201917 entries, 0 to 201916
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   first_active_month  201917 non-null  object 
 1   card_id             201917 non-null  object 
 2   feature_1           201917 non-null  int64  
 3   feature_2           201917 non-null  int64  
 4   feature_3           201917 non-null  int64  
 5   target              201917 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ MB


In [15]:
# Lets change the First_active_month column from string to date.

train['first_active_month'] = pd.to_datetime(train['first_active_month'])

# Also do this to the test dataset

test['first_active_month'] = pd.to_datetime(test['first_active_month'])

In [16]:
train_cat_feats = train.dtypes[train.dtypes == 'object'].index
train_num_feats = train.dtypes[train.dtypes != 'object'].index

In [17]:
print('train Categorical features {}'.format(len(train_cat_feats)))
print('train Numerical features {}'.format(len(train_num_feats)))

train Categorical features 1
train Numerical features 5


In [18]:
# ALEX: remove plotting
# sns.set()
# sns.pairplot(train, kind='scatter', size = 2, diag_kind='kde')
# plt.show()

In [19]:
# ALEX: remove plotting
# plt.figure(figsize = (7,5))
# plt.scatter(x= train.feature_1, y = train.target)
# plt.title('Feature_1', size = 15)

# plt.figure(figsize = (7,5))
# plt.scatter(x = train.feature_2, y = train.target)
# plt.title('Feature_2')

# plt.figure(figsize = (7,5))
# plt.scatter(x = train.feature_3, y = train.target)
# plt.title('Feature_3')
_ = train.feature_1
_ = train.target
_ = train.feature_2
_ = train.target
_ = train.feature_3
_ = train.target

In [20]:
train.drop(train[train['target'] < -10].index, inplace = True)
# train.drop(train[train['feature_2'] < -10].index, inplace = True)
# train.drop(train[train['feature_3'] < -10].index, inplace = True)

In [21]:
# ALEX: remove plotting
# plt.figure(figsize = (7,5))
# plt.scatter(x= train.feature_1, y = train.target)
# plt.title('Feature_1', size = 15)

# plt.figure(figsize = (7,5))
# plt.scatter(x = train.feature_2, y = train.target)
# plt.title('Feature_2')

# plt.figure(figsize = (7,5))
# plt.scatter(x = train.feature_3, y = train.target)
# plt.title('Feature_3')
_ = train.feature_1
_ = train.target
_ = train.feature_2
_ = train.target
_ = train.feature_3
_ = train.target

In [22]:
# ALEX: make notebook run
# correlation = train.corr()
correlation = train.corr(numeric_only=True)

In [23]:
correlation

Unnamed: 0,feature_1,feature_2,feature_3,target
feature_1,1.0,-0.130188,0.582984,-0.011285
feature_2,-0.130188,1.0,0.059843,-0.003951
feature_3,0.582984,0.059843,1.0,-0.007297
target,-0.011285,-0.003951,-0.007297,1.0


In [24]:
def missing_values(df):
    total_miss = df.isnull().sum()
    perc_miss = 100 * total_miss / len(df)
    table_miss = pd.concat([total_miss, perc_miss], axis = 1)
    ren_table = table_miss.rename(columns = {0:'Total Miss Values', 1: '% of miss values'})
    ren_table = ren_table[ren_table.iloc[:,1]!=0].sort_values('% of miss values', ascending = False).round(2)
    
    print('You data contains {}'.format(df.shape[1]) + ' columns and has {}'.format(ren_table.shape[0]) + ' colums with missing values' )
    
    return ren_table

In [25]:
missing_values(train)

You data contains 6 columns and has 0 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values


In [26]:
train = train[train.target > -10]
# ALEX: remove plotting
# sns.pairplot(train[['feature_1', 'feature_2', 'feature_3', 'target']])
_ = train[['feature_1', 'feature_2', 'feature_3', 'target']]

### Lets check the distribution of our training datasets

In [27]:
# ALEX: remove plotting
# f, axes = plt.subplots(1,figsize = (12,6))
# sns.distplot(train.feature_1, ax = axes, kde = False, color = 'green', bins = 10).set_title('Train data')
# axes.set(ylabel = 'Card Counts')
# axes.set_xticks(np.arange(1,6,1))
# plt.show()
_ = train.feature_1

In [28]:
# ALEX: remove plotting
# f, axes = plt.subplots(1,figsize = (12,6))
# sns.distplot(train.feature_2, ax = axes, kde = False, color = 'green', bins = 10).set_title('Train data')
# axes.set(ylabel = 'Card Counts')
# axes.set_xticks(np.arange(1,6,1))
# plt.show()
_ = train.feature_2

In [29]:
# ALEX: remove plotting
# f, axes = plt.subplots(1,figsize = (12,6))
# sns.distplot(train.feature_3, ax = axes, kde = False, color = 'green', bins = 10).set_title('Train data')
# axes.set(ylabel = 'Card Counts')
# axes.set_xticks(np.arange(1,6,1))
# plt.show()
_ = train.feature_3

In [30]:
# f, axes = plt.subplots(1,figsize = (12,6))
# sns.distplot(train.first_active_month, ax = axes, color = 'green', kde = False, bins = 10).set_title('Train data')
# axes.set(ylabel = 'Card Counts')
# axes.set_xticks(np.arange(1,6,1))
# plt.show()

In [31]:
# ALEX: remove plotting
# train.groupby('first_active_month').count()['card_id'].plot(figsize = (15,5), title = 'Count of First Active Month in Train set', color = 'r')
# plt.show()
train.groupby('first_active_month').count()['card_id']

first_active_month
2011-11-01        8
2011-12-01        2
2012-02-01        7
2012-03-01       10
2012-04-01       15
              ...  
2017-10-01    13681
2017-11-01    12929
2017-12-01    10170
2018-01-01       34
2018-02-01        1
Name: card_id, Length: 75, dtype: int64

In [32]:
train_cat_feats = train.dtypes[train.dtypes == 'object'].index
train_num_feats = train.dtypes[train.dtypes != 'object'].index

print('Train categories: {}'.format(len(train_cat_feats)))
print('Train Numerical values: {}'.format(len(train_num_feats)))

Train categories: 1
Train Numerical values: 5


In [33]:
for col in train_cat_feats:
    cols = train[col].value_counts().index.shape[0]
    print(f'{cols} \t unique values in \t {col}')

199653 	 unique values in 	 card_id


In [34]:
for col in train_num_feats:
    cols = train[col].value_counts().index.shape[0]
    print(f'{cols} \t unique values in \t {col}')

75 	 unique values in 	 first_active_month
5 	 unique values in 	 feature_1
3 	 unique values in 	 feature_2
2 	 unique values in 	 feature_3
197052 	 unique values in 	 target


In [35]:
target = train.target
target.head()

0   -0.820283
1    0.392913
2    0.688056
3    0.142495
4   -0.159749
Name: target, dtype: float64

## Now lets see how other datasets look like

### Lets start with merchants dataset

In [36]:
merchants.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,city_id,state_id,category_2
count,334696.0,334696.0,334696.0,334696.0,334696.0,334683.0,334696.0,334696.0,334683.0,334696.0,334696.0,334683.0,334696.0,334696.0,334696.0,334696.0,322809.0
mean,31028.736143,423.131663,25.116404,0.011476,0.008103,13.832993,inf,2.994108,21.65079,inf,5.947397,25.22771,inf,11.599335,102.917926,11.860942,2.380002
std,31623.043426,252.898046,9.807371,1.098154,1.070497,2395.489999,,0.095247,3947.108,,0.394936,5251.842,,1.520138,107.090673,6.176889,1.562661
min,1.0,-1.0,-1.0,-0.057471,-0.057471,-82.13,0.3334953,1.0,-82.13,0.1670447,1.0,-82.13,0.09832954,1.0,-1.0,-1.0,1.0
25%,3612.0,222.0,19.0,-0.057471,-0.057471,0.88,0.9236499,3.0,0.85,0.9022475,6.0,0.85,0.8983333,12.0,-1.0,9.0,1.0
50%,19900.0,373.0,27.0,-0.057471,-0.057471,1.0,1.016667,3.0,1.01,1.026961,6.0,1.02,1.043361,12.0,69.0,9.0,2.0
75%,51707.25,683.0,33.0,-0.047556,-0.047556,1.16,1.146522,3.0,1.23,1.215575,6.0,1.29,1.26648,12.0,182.0,16.0,4.0
max,112586.0,891.0,41.0,183.735111,182.079322,851844.64,inf,3.0,1513959.0,inf,6.0,2567408.0,inf,12.0,347.0,24.0,5.0


In [37]:
missing_values(merchants)

You data contains 22 columns and has 4 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values
category_2,11887,3.55
avg_sales_lag3,13,0.0
avg_sales_lag6,13,0.0
avg_sales_lag12,13,0.0


In [38]:
merchants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334696 entries, 0 to 334695
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   merchant_id                  334696 non-null  object 
 1   merchant_group_id            334696 non-null  int64  
 2   merchant_category_id         334696 non-null  int64  
 3   subsector_id                 334696 non-null  int64  
 4   numerical_1                  334696 non-null  float64
 5   numerical_2                  334696 non-null  float64
 6   category_1                   334696 non-null  object 
 7   most_recent_sales_range      334696 non-null  object 
 8   most_recent_purchases_range  334696 non-null  object 
 9   avg_sales_lag3               334683 non-null  float64
 10  avg_purchases_lag3           334696 non-null  float64
 11  active_months_lag3           334696 non-null  int64  
 12  avg_sales_lag6               334683 non-null  float64
 13 

In [39]:
merchant_cat_feats = merchants.dtypes[merchants.dtypes == 'object'].index
merchant_cat_feats

Index(['merchant_id', 'category_1', 'most_recent_sales_range',
       'most_recent_purchases_range', 'category_4'],
      dtype='object')

In [40]:
merchant_num_feats = merchants.dtypes[merchants.dtypes != 'object'].index
merchant_num_feats

Index(['merchant_group_id', 'merchant_category_id', 'subsector_id',
       'numerical_1', 'numerical_2', 'avg_sales_lag3', 'avg_purchases_lag3',
       'active_months_lag3', 'avg_sales_lag6', 'avg_purchases_lag6',
       'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12',
       'active_months_lag12', 'city_id', 'state_id', 'category_2'],
      dtype='object')

In [41]:
print('Merchant categorical features {}'.format(len(merchant_cat_feats)))
print('Merchant Numerical features {}'.format(len(merchant_num_feats)))

Merchant categorical features 5
Merchant Numerical features 17


In [42]:
for col in merchant_cat_feats:
    cols = merchants[col].value_counts().index.shape[0]
    print(f'{cols} \t unique values in \t {col}')

334633 	 unique values in 	 merchant_id
2 	 unique values in 	 category_1
5 	 unique values in 	 most_recent_sales_range
5 	 unique values in 	 most_recent_purchases_range
2 	 unique values in 	 category_4


In [43]:
for col in merchant_num_feats:
    cols = merchants[col].value_counts().index.shape[0]
    print(f'{cols} \t unique values in \t {col}')

109391 	 unique values in 	 merchant_group_id
324 	 unique values in 	 merchant_category_id
41 	 unique values in 	 subsector_id
954 	 unique values in 	 numerical_1
947 	 unique values in 	 numerical_2
3372 	 unique values in 	 avg_sales_lag3
100003 	 unique values in 	 avg_purchases_lag3
3 	 unique values in 	 active_months_lag3
4507 	 unique values in 	 avg_sales_lag6
135202 	 unique values in 	 avg_purchases_lag6
6 	 unique values in 	 active_months_lag6
5009 	 unique values in 	 avg_sales_lag12
172917 	 unique values in 	 avg_purchases_lag12
12 	 unique values in 	 active_months_lag12
271 	 unique values in 	 city_id
25 	 unique values in 	 state_id
5 	 unique values in 	 category_2


## Now lets observe Historical transactions

In [44]:
historical_trans.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [45]:
historical_trans.describe()

Unnamed: 0,city_id,installments,merchant_category_id,month_lag,purchase_amount,category_2,state_id,subsector_id
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,924639.0,1000000.0,1000000.0
mean,127.939636,0.501833,470.369678,-5.028485,-0.235407,2.186192,10.844769,27.506362
std,103.040829,1.968149,247.741753,3.684319,114.412911,1.540223,6.244131,9.527308
min,-1.0,-1.0,-1.0,-13.0,-0.746908,1.0,-1.0,-1.0
25%,53.0,0.0,278.0,-8.0,-0.722024,1.0,9.0,19.0
50%,88.0,0.0,437.0,-4.0,-0.693579,1.0,9.0,33.0
75%,209.0,1.0,705.0,-2.0,-0.619633,3.0,16.0,34.0
max,347.0,999.0,891.0,0.0,87155.460339,5.0,24.0,41.0


In [46]:
historical_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 14 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   authorized_flag       1000000 non-null  object 
 1   card_id               1000000 non-null  object 
 2   city_id               1000000 non-null  int64  
 3   category_1            1000000 non-null  object 
 4   installments          1000000 non-null  int64  
 5   category_3            996845 non-null   object 
 6   merchant_category_id  1000000 non-null  int64  
 7   merchant_id           995889 non-null   object 
 8   month_lag             1000000 non-null  int64  
 9   purchase_amount       1000000 non-null  float64
 10  purchase_date         1000000 non-null  object 
 11  category_2            924639 non-null   float64
 12  state_id              1000000 non-null  int64  
 13  subsector_id          1000000 non-null  int64  
dtypes: float64(2), int64(6), object(6)


In [47]:
missing_values(historical_trans)

You data contains 14 columns and has 3 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values
category_2,75361,7.54
merchant_id,4111,0.41
category_3,3155,0.32


In [48]:
hist_cat_feats = historical_trans.dtypes[historical_trans.dtypes == 'object'].index
hist_num_feats = historical_trans.dtypes[historical_trans.dtypes != 'object'].index

print('Historical Transactions categorical features {}'.format(len(hist_cat_feats)))
print('Historical Transactions Numerical features {}'.format(len(hist_num_feats)))

Historical Transactions categorical features 6
Historical Transactions Numerical features 8


In [49]:
for col in hist_cat_feats:
    cols = historical_trans[col].value_counts().index.shape[0]
    print(f'{cols} \t unique values in \t {col}')

2 	 unique values in 	 authorized_flag
4792 	 unique values in 	 card_id
2 	 unique values in 	 category_1
3 	 unique values in 	 category_3
100057 	 unique values in 	 merchant_id
950453 	 unique values in 	 purchase_date


In [50]:
for col in hist_num_feats:
    cols = historical_trans[col].value_counts().index.shape[0]
    print(f'{cols} \t unique values in \t {col}')

307 	 unique values in 	 city_id
15 	 unique values in 	 installments
290 	 unique values in 	 merchant_category_id
14 	 unique values in 	 month_lag
51593 	 unique values in 	 purchase_amount
5 	 unique values in 	 category_2
25 	 unique values in 	 state_id
41 	 unique values in 	 subsector_id


## Now lets check how does the New Merchant dataset looks like

In [51]:
new_merchants.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [52]:
new_merchants.describe()

Unnamed: 0,city_id,installments,merchant_category_id,month_lag,purchase_amount,category_2,state_id,subsector_id
count,1963031.0,1963031.0,1963031.0,1963031.0,1963031.0,1851286.0,1963031.0,1963031.0
mean,134.3867,0.6829643,430.9701,1.476515,-0.550969,2.197841,10.88067,25.97624
std,101.5152,1.584069,246.3385,0.4994483,0.6940043,1.528125,6.038542,10.12908
min,-1.0,-1.0,-1.0,1.0,-0.7468928,1.0,-1.0,-1.0
25%,69.0,0.0,278.0,1.0,-0.7166294,1.0,9.0,19.0
50%,110.0,1.0,367.0,1.0,-0.6748406,1.0,9.0,29.0
75%,212.0,1.0,683.0,2.0,-0.5816162,3.0,15.0,34.0
max,347.0,999.0,891.0,2.0,263.1575,5.0,24.0,41.0


In [53]:
new_merchants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1963031 entries, 0 to 1963030
Data columns (total 14 columns):
 #   Column                Dtype  
---  ------                -----  
 0   authorized_flag       object 
 1   card_id               object 
 2   city_id               int64  
 3   category_1            object 
 4   installments          int64  
 5   category_3            object 
 6   merchant_category_id  int64  
 7   merchant_id           object 
 8   month_lag             int64  
 9   purchase_amount       float64
 10  purchase_date         object 
 11  category_2            float64
 12  state_id              int64  
 13  subsector_id          int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 209.7+ MB


In [54]:
missing_values(new_merchants)

You data contains 14 columns and has 3 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values
category_2,111745,5.69
category_3,55922,2.85
merchant_id,26216,1.34


In [55]:
new_merch_cat_feats = new_merchants.dtypes[new_merchants.dtypes == 'object'].index
new_merch_num_feats = new_merchants.dtypes[new_merchants.dtypes != 'object'].index

print('New Merchant categorical features {}'.format(len(new_merch_cat_feats)))
print('New Merchant numerical features {}'.format(len(new_merch_num_feats)))

New Merchant categorical features 6
New Merchant numerical features 8


In [56]:
for col in new_merch_cat_feats:
    cols = new_merchants[col].value_counts().index.shape[0]
    print(f'{cols} \t unique values in \t {col}')

1 	 unique values in 	 authorized_flag
290001 	 unique values in 	 card_id
2 	 unique values in 	 category_1
3 	 unique values in 	 category_3
226129 	 unique values in 	 merchant_id
1667025 	 unique values in 	 purchase_date


In [57]:
for col in new_merch_num_feats:
    cols = new_merchants[col].value_counts().index.shape[0]
    print(f'{cols} \t unique values in \t {col}')

308 	 unique values in 	 city_id
15 	 unique values in 	 installments
314 	 unique values in 	 merchant_category_id
2 	 unique values in 	 month_lag
75190 	 unique values in 	 purchase_amount
5 	 unique values in 	 category_2
25 	 unique values in 	 state_id
41 	 unique values in 	 subsector_id


# 3. Feature Engineering

### Let's work with the Merchants datasets (both merchants and new_merchants data)

for more info on this check this kernel out [By Robin Denz](https://www.kaggle.com/denzo123/a-closer-look-at-date-variables)

In [58]:
print(f'Merchants data shape: {merchants.shape}, new merchants data shape: {new_merchants.shape}')

Merchants data shape: (334696, 22), new merchants data shape: (1963031, 14)


In [59]:
print(merchants.columns)
print('=='*18)
print(new_merchants.columns)

Index(['merchant_id', 'merchant_group_id', 'merchant_category_id',
       'subsector_id', 'numerical_1', 'numerical_2', 'category_1',
       'most_recent_sales_range', 'most_recent_purchases_range',
       'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6',
       'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12',
       'category_4', 'city_id', 'state_id', 'category_2'],
      dtype='object')
Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'category_3', 'merchant_category_id', 'merchant_id', 'month_lag',
       'purchase_amount', 'purchase_date', 'category_2', 'state_id',
       'subsector_id'],
      dtype='object')


In [60]:
# Lets merge our datasets
# But first we need to delete duplicate columns
todrop = ['merchant_category_id', 'subsector_id', 'city_id', 'state_id', 'category_2', 'category_1']

for col in todrop:
    merchants = merchants.drop(col, axis = 1)
    
merchants_data = pd.merge(merchants, new_merchants, on = 'merchant_id')
merchants_data[:5]

Unnamed: 0,merchant_id,merchant_group_id,numerical_1,numerical_2,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,...,category_1,installments,category_3,merchant_category_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,M_ID_a70e9c5f81,5026,-0.057471,-0.057471,E,E,,1.666667,3,,...,Y,2,C,792,1,-0.622007,2018-03-05 14:51:53,,-1,9
1,M_ID_d8ff08219e,16430,-0.057471,-0.057471,E,E,,1.0,3,,...,Y,10,C,529,1,1.146703,2018-03-08 22:19:58,,-1,20
2,M_ID_c9eb20f6f5,25449,-0.057471,-0.057471,E,E,,1.666667,3,,...,N,0,A,45,2,-0.596643,2018-04-11 11:45:08,1.0,16,18
3,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,Y,5,C,690,2,-0.32618,2018-04-23 17:19:42,,-1,1
4,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,Y,1,B,690,1,-0.652391,2018-03-19 23:24:49,,-1,1


In [61]:
# Lets merge our merchant data with the training data.

data = pd.merge(merchants_data, train, on = 'card_id')

In [62]:
data.shape

(1215489, 34)

In [63]:
data[:5]

Unnamed: 0,merchant_id,merchant_group_id,numerical_1,numerical_2,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,...,purchase_amount,purchase_date,category_2,state_id,subsector_id,first_active_month,feature_1,feature_2,feature_3,target
0,M_ID_c9eb20f6f5,25449,-0.057471,-0.057471,E,E,,1.666667,3,,...,-0.596643,2018-04-11 11:45:08,1.0,16,18,2013-11-01,3,3,1,-0.242904
1,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,0.755593,2017-11-05 23:30:27,,-1,1,2017-03-01,1,1,0,-1.197832
2,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,-0.593788,2017-11-28 18:56:03,,-1,1,2017-09-01,2,3,0,-0.37628
3,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,-0.445025,2017-11-05 01:02:17,,-1,1,2015-08-01,3,3,1,-0.259445
4,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,3.385233,2017-12-22 15:27:47,,-1,1,2017-06-01,5,2,1,-2.570895


If you look at our data column - purchase_date, it had date and time. To better work with this data we need to split the two to two columns. 

In [64]:
data['purchase_time'] = data['purchase_date'].str.split(' ')
data['purchase_date'] = data['purchase_time'].str[0]
data['purchase_time'] = data['purchase_time'].str[1]

In [65]:
# Let's also do what we have done to the testing dataset
test_data = pd.merge(merchants_data, test, on = 'card_id')

In [66]:
test_data['purchase_time'] = test_data['purchase_date'].str.split(' ')
test_data['purchase_date'] = test_data['purchase_time'].str[0]
test_data['purchase_time'] = test_data['purchase_time'].str[1]

In [67]:
print(f'Train data shape: {data.shape} Test data shape: {test_data.shape}')

Train data shape: (1215489, 35) Test data shape: (745364, 34)


In [68]:
test_data[:5]

Unnamed: 0,merchant_id,merchant_group_id,numerical_1,numerical_2,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,...,purchase_amount,purchase_date,category_2,state_id,subsector_id,first_active_month,feature_1,feature_2,feature_3,purchase_time
0,M_ID_a70e9c5f81,5026,-0.057471,-0.057471,E,E,,1.666667,3,,...,-0.622007,2018-03-05,,-1,9,2017-10-01,2,3,0,14:51:53
1,M_ID_d8ff08219e,16430,-0.057471,-0.057471,E,E,,1.0,3,,...,1.146703,2018-03-08,,-1,20,2016-07-01,5,1,1,22:19:58
2,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,-0.32618,2018-04-23,,-1,1,2017-08-01,1,1,0,17:19:42
3,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,-0.652391,2018-03-19,,-1,1,2017-11-01,1,1,0,23:24:49
4,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,0.940089,2018-01-10,,-1,1,2017-02-01,1,1,0,15:07:04


In [69]:
#We were not suppose to have converted these to date at first But now we fix that so that we can easily manipulate them
data['first_active_month'] = data['first_active_month'].astype(str)
test_data['first_active_month'] = test_data['first_active_month'].astype(str)

In [70]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215489 entries, 0 to 1215488
Data columns (total 35 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   merchant_id                  1215489 non-null  object 
 1   merchant_group_id            1215489 non-null  int64  
 2   numerical_1                  1215489 non-null  float64
 3   numerical_2                  1215489 non-null  float64
 4   most_recent_sales_range      1215489 non-null  object 
 5   most_recent_purchases_range  1215489 non-null  object 
 6   avg_sales_lag3               1215295 non-null  float64
 7   avg_purchases_lag3           1215489 non-null  float64
 8   active_months_lag3           1215489 non-null  int64  
 9   avg_sales_lag6               1215295 non-null  float64
 10  avg_purchases_lag6           1215489 non-null  float64
 11  active_months_lag6           1215489 non-null  int64  
 12  avg_sales_lag12              1215295 non-n

In [71]:
# The main business with the purchase_date and purchase_time. Lets see if we can't generate more columns out of them
def get_weekday(date_string):
    date = datetime.datetime.strptime(date_string, '%Y-%m-%d')
    return calendar.day_name[date.weekday()]

# get weekday for date variable
data['purchase_weekday'] = data['purchase_date'].apply(lambda x: get_weekday(x))

# for plotting recode to ordered categorical
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
data['purchase_weekday'] = pd.Categorical(data['purchase_weekday'], categories = day_labels, 
                                          ordered = True)

def get_month(date_string, kind = 'month'):
    if kind == 'month':
        date = datetime.datetime.strptime(date_string, '%Y-%m-%d')
    elif kind == 'day':
        date = datetime.datetime.strptime(date_string, '%Y-%m-%d')
    return date.strftime("%B")

# Creating new columns and their values

data['purchase_month'] = data['purchase_date'].apply(lambda x: get_month(x, kind = 'day'))
data['first_active_month2'] = data['first_active_month'].apply(lambda x: get_month(x))
data['first_active_year'] = data['first_active_month'].str[:4]

month_labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
                'September', 'October', 'November', 'December']
data['purchase_month'] = pd.Categorical(data['purchase_month'], categories = month_labels, 
                                          ordered = True)
data['first_active_month2'] = pd.Categorical(data['first_active_month2'], categories = month_labels, 
                                          ordered = True)

year_labels = ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
data['first_active_year'] = pd.Categorical(data['first_active_year'], categories = year_labels, 
                                          ordered = True)

# get time of the day
data['temp'] = data['purchase_time'].str.split(':')

def get_session(time_list):
    time_list[0] = int(time_list[0])
    if time_list[0] > 4 and time_list[0] < 12:
        return 'Morning'
    elif time_list[0] >= 12 and time_list[0] < 17:
        return 'Afternoon'
    elif time_list[0] >= 17 and time_list[0] < 21:
        return 'Evening'
    else:
        return 'Night'
    
data['purchase_session'] = data['temp'].apply(lambda x: get_session(x))

session_labels = ['Morning', 'Afternoon', 'Evening', 'Night']
data['purchase_session'] = pd.Categorical(data['purchase_session'], categories = session_labels, 
                                          ordered = True)

In [72]:
## time of month
# as categorical variable, thressholds are arbitrary and could be different
def get_time_of_month_cat(date):
    date_temp = date.split('-')
    if int(date_temp[2]) < 10:
        time_of_month = 'Beginning'
    elif int(date_temp[2]) >= 10 and int(date_temp[2]) < 20:
        time_of_month = 'Middle'
    else:
        time_of_month = 'End'
    return time_of_month

data['time_of_month_cat'] = data['purchase_date'].apply(lambda x: get_time_of_month_cat(x))

tof_labels = ['Beginning', 'Middle', 'End']
data['time_of_month_cat'] = pd.Categorical(data['time_of_month_cat'], categories = tof_labels, 
                                           ordered = True)

data['time_of_month_num'] = data['purchase_date'].str[8:].astype(int)

In [73]:
# ALEX: remove plotting
# ax = sns.lineplot(x = "purchase_month", y = "target", 
#                   markers = True, dashes = False, data = data)
# plt.xticks(rotation = 45)
# ax.set_title('Target Variable Changes over Purchase Month')
# ax.set_xlabel('Purchase Month')

In [74]:
# ALEX: remove plotting
# ax = sns.lineplot(x = "first_active_month2", y = "target", 
#                   markers = True, dashes = False, data = data)
# plt.xticks(rotation = 45)
# ax.set_title('Target Variable Changes over the First Active Month')
# ax.set_xlabel('First Active Month')

In [75]:
print(f'Train data shape: {data.shape} , Test data shape: {test_data.shape}')

Train data shape: (1215489, 43) , Test data shape: (745364, 34)


## Now let's repeat what we have done on Training data to the Testing dataset

In [76]:
# The main business with the purchase_date and purchase_time. Lets see if we can't generate more columns out of them
def get_weekday(date_string):
    date = datetime.datetime.strptime(date_string, '%Y-%m-%d')
    return calendar.day_name[date.weekday()]

# get weekday for date variable
test_data['purchase_weekday'] = test_data['purchase_date'].apply(lambda x: get_weekday(x))

# for plotting recode to ordered categorical
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
test_data['purchase_weekday'] = pd.Categorical(test_data['purchase_weekday'], categories = day_labels, ordered = True)

def get_month(date_string, kind = 'month'):
    if kind == 'month':
        date = datetime.datetime.strptime(date_string, '%Y-%m-%d')
    elif kind == 'day':
        date = datetime.datetime.strptime(date_string, '%Y-%m-%d')
    return date.strftime("%B")

test_data['purchase_month'] = test_data['purchase_date'].apply(lambda x: get_month(x, kind = 'day'))
test_data['first_active_month2'] = test_data['first_active_month'].apply(lambda x: get_month(x))
test_data['first_active_year'] = test_data['first_active_month'].str[:4]

month_labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
                'September', 'October', 'November', 'December']
test_data['purchase_month'] = pd.Categorical(test_data['purchase_month'], categories = month_labels, ordered = True)
test_data['first_active_month2'] = pd.Categorical(test_data['first_active_month2'], categories = month_labels, ordered = True)

year_labels = ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
test_data['first_active_year'] = pd.Categorical(test_data['first_active_year'], categories = year_labels, ordered = True)

# get time of the day
test_data['temp'] = test_data['purchase_time'].str.split(':')

def get_session(time_list):
    time_list[0] = int(time_list[0])
    if time_list[0] > 4 and time_list[0] < 12:
        return 'Morning'
    elif time_list[0] >= 12 and time_list[0] < 17:
        return 'Afternoon'
    elif time_list[0] >= 17 and time_list[0] < 21:
        return 'Evening'
    else:
        return 'Night'
    
test_data['purchase_session'] = test_data['temp'].apply(lambda x: get_session(x))

session_labels = ['Morning', 'Afternoon', 'Evening', 'Night']
test_data['purchase_session'] = pd.Categorical(test_data['purchase_session'], categories = session_labels, ordered = True)

In [77]:
## time of month
# as categorical variable, thressholds are arbitrary and could be different
def get_time_of_month_cat(date):
    date_temp = date.split('-')
    if int(date_temp[2]) < 10:
        time_of_month = 'Beginning'
    elif int(date_temp[2]) >= 10 and int(date_temp[2]) < 20:
        time_of_month = 'Middle'
    else:
        time_of_month = 'End'
    return time_of_month

test_data['time_of_month_cat'] = test_data['purchase_date'].apply(lambda x: get_time_of_month_cat(x))

tof_labels = ['Beginning', 'Middle', 'End']
test_data['time_of_month_cat'] = pd.Categorical(test_data['time_of_month_cat'], categories = tof_labels, ordered = True)

test_data['time_of_month_num'] = test_data['purchase_date'].str[8:].astype(int)

In [78]:
# Now the shape should be the same in terms of columns except that Train data will have one extra column because of the target value
print(f'Train data shape: {data.shape} , Test data shape: {test_data.shape}')

Train data shape: (1215489, 43) , Test data shape: (745364, 42)


In [79]:
def get_end_of_month(date):
    date_temp = date.split('-')
    if int(date_temp[2]) >= 25:
        end_of_month = 'Y'
    else:
        end_of_month = 'N'
    return end_of_month
data['end_of_month'] = data['purchase_date'].apply(lambda x: get_end_of_month(x))
test_data['end_of_month'] = test_data['purchase_date'].apply(lambda x: get_end_of_month(x))

In [80]:
print(f'Train data shape: {data.shape} , Test data shape: {test_data.shape}')

Train data shape: (1215489, 44) , Test data shape: (745364, 43)


In [81]:
data[:5]

Unnamed: 0,merchant_id,merchant_group_id,numerical_1,numerical_2,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,...,purchase_time,purchase_weekday,purchase_month,first_active_month2,first_active_year,temp,purchase_session,time_of_month_cat,time_of_month_num,end_of_month
0,M_ID_c9eb20f6f5,25449,-0.057471,-0.057471,E,E,,1.666667,3,,...,11:45:08,Wednesday,April,November,2013,"[11, 45, 08]",Morning,Middle,11,N
1,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,23:30:27,Sunday,November,March,2017,"[23, 30, 27]",Night,Beginning,5,N
2,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,18:56:03,Tuesday,November,September,2017,"[18, 56, 03]",Evening,End,28,Y
3,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,01:02:17,Sunday,November,August,2015,"[1, 02, 17]",Night,Beginning,5,N
4,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,,1084.0,3,,...,15:27:47,Friday,December,June,2017,"[15, 27, 47]",Afternoon,End,22,N


# 4. Data Cleaning

In [82]:
# ALEX: make notebook run
# correlation_data = data.corr()
correlation_data = data.corr(numeric_only=True)

In [83]:
correlation_data

Unnamed: 0,merchant_group_id,numerical_1,numerical_2,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,...,month_lag,purchase_amount,category_2,state_id,subsector_id,feature_1,feature_2,feature_3,target,time_of_month_num
merchant_group_id,1.0,-0.156547,-0.155307,-0.005135,-0.005697,0.001023,-0.004724,-0.012769,-0.03737,-0.004048,...,0.009785,-0.009891,0.042531,0.00039,0.058488,0.010781,-0.001144,0.004122,0.00287,0.002069
numerical_1,-0.156547,1.0,0.99943,-0.001557,-0.00173,0.002518,-0.001594,-0.003839,0.013528,-0.001493,...,-0.001076,-0.025117,-0.104358,-0.018447,0.112249,-0.004285,-0.018507,0.005145,0.003202,-0.000207
numerical_2,-0.155307,0.99943,1.0,-0.001539,-0.001711,0.00249,-0.001576,-0.003795,0.013379,-0.001476,...,-0.001014,-0.024973,-0.103228,-0.018445,0.114747,-0.003999,-0.018186,0.005386,0.003222,-0.000198
avg_sales_lag3,-0.005135,-0.001557,-0.001539,1.0,0.990371,8.1e-05,0.955149,0.984672,0.000396,0.855365,...,0.0002,0.000694,-0.005772,-0.002057,0.003539,-0.000164,0.002008,0.000683,-0.000141,0.000408
avg_purchases_lag3,-0.005697,-0.00173,-0.001711,0.990371,1.0,7.5e-05,0.924859,0.991625,0.000331,0.816898,...,-0.000126,0.000752,-0.006063,-0.003159,0.00242,5.8e-05,0.001795,0.000765,-0.000243,6.1e-05
active_months_lag3,0.001023,0.002518,0.00249,8.1e-05,7.5e-05,1.0,8.3e-05,0.000191,0.380161,7.7e-05,...,0.000779,-0.025401,-0.000469,0.000414,0.004913,0.003225,0.003531,0.003989,0.000736,0.002393
avg_sales_lag6,-0.004724,-0.001594,-0.001576,0.955149,0.924859,8.3e-05,1.0,0.919799,0.000413,0.969351,...,0.000562,0.00138,-0.005893,-0.002013,0.002588,-0.000123,0.002102,0.000557,-0.000275,0.001028
avg_purchases_lag6,-0.012769,-0.003839,-0.003795,0.984672,0.991625,0.000191,0.919799,1.0,0.000981,0.812505,...,-0.004713,0.001657,-0.013081,-0.007721,-0.016242,0.000163,0.002029,0.001083,-0.001029,-0.001319
active_months_lag6,-0.03737,0.013528,0.013379,0.000396,0.000331,0.380161,0.000413,0.000981,1.0,0.000384,...,-0.025011,-0.019347,-0.005036,0.003722,-0.005908,0.004298,0.001851,0.006052,-0.005682,-0.012735
avg_sales_lag12,-0.004048,-0.001493,-0.001476,0.855365,0.816898,7.7e-05,0.969351,0.812505,0.000384,1.0,...,0.00082,0.001784,-0.005502,-0.001882,0.001849,-9e-06,0.002003,0.00048,-0.00038,0.001405


In [84]:
print(correlation_data['target'].sort_values(ascending = False))

target                  1.000000
month_lag               0.038046
time_of_month_num       0.023520
subsector_id            0.011933
installments            0.003297
numerical_2             0.003222
numerical_1             0.003202
merchant_group_id       0.002870
merchant_category_id    0.002061
active_months_lag3      0.000736
avg_sales_lag3         -0.000141
avg_purchases_lag3     -0.000243
avg_sales_lag6         -0.000275
avg_sales_lag12        -0.000380
avg_purchases_lag6     -0.001029
avg_purchases_lag12    -0.001361
category_2             -0.003124
city_id                -0.004305
active_months_lag6     -0.005682
state_id               -0.008991
active_months_lag12    -0.011031
feature_1              -0.019548
feature_2              -0.029319
feature_3              -0.040984
purchase_amount        -0.058869
Name: target, dtype: float64


In [85]:
data_cat_feats = data.dtypes[data.dtypes == 'object'].index
data_num_feats = data.dtypes[data.dtypes != 'object'].index

print(f'Data categrical: {len(data_cat_feats)}')
print(f'Data Numerical: {len(data_num_feats)}')

Data categrical: 13
Data Numerical: 31


In [86]:
# for col in data_cat_feats:
#     cols = data[col].value_counts().index.shape[0]
#     print(f'{cols} \t unique values in \t {col}')

In [87]:
# for col in data_num_feats:
#     cols = data[col].value_counts().index.shape[0]
#     print(f'{cols} \t unique values in \t {col}')

In [88]:
missing_values(data.select_dtypes(['object']))

You data contains 13 columns and has 1 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values
category_3,33143,2.73


In [89]:
missing_values(data.select_dtypes(include = [np.number]))

You data contains 25 columns and has 4 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values
category_2,69190,5.69
avg_sales_lag3,194,0.02
avg_sales_lag6,194,0.02
avg_sales_lag12,194,0.02


In [90]:
missing_values(data)

You data contains 44 columns and has 5 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values
category_2,69190,5.69
category_3,33143,2.73
avg_sales_lag3,194,0.02
avg_sales_lag6,194,0.02
avg_sales_lag12,194,0.02


In [91]:
data['category_3'].value_counts()

category_3
A    570483
B    517052
C     94811
Name: count, dtype: int64

In [92]:
data['category_3'] = data['category_3'].fillna(data['category_3'].mode()[0])

In [93]:
missing_values(data.select_dtypes(['object']))

You data contains 13 columns and has 0 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values


In [94]:
data['category_3'].value_counts()

category_3
A    603626
B    517052
C     94811
Name: count, dtype: int64

In [95]:
cols = ['category_2', 'avg_sales_lag3', 'avg_sales_lag6', 'avg_sales_lag12']

for col in cols:
    data[col] = data[col].fillna(data[col].mean())

In [96]:
missing_values(data)

You data contains 44 columns and has 0 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values


# Now lets look at the Test data

In [97]:
missing_values(test_data.select_dtypes(['object']))

You data contains 13 columns and has 1 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values
category_3,20459,2.74


In [98]:
test_data['category_3'] = test_data['category_3'].fillna(test_data['category_3'].mode()[0])

In [99]:
missing_values(test_data)

You data contains 43 columns and has 4 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values
category_2,43321,5.81
avg_sales_lag3,129,0.02
avg_sales_lag6,129,0.02
avg_sales_lag12,129,0.02


In [100]:
cols = ['category_2', 'avg_sales_lag3', 'avg_sales_lag6', 'avg_sales_lag12']

for col in cols:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

In [101]:
missing_values(test_data)

You data contains 43 columns and has 0 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values


Now that we have filled all the nulls both in training data and testing dataset, we can then work with categorical features, map the ones that have few categorical features and create dummy features for those that have lots of uniques categorical features

In [102]:
# Lets change the First_active_month column from string to date.

data['first_active_month'] = pd.to_datetime(data['first_active_month'])

# Also do this to the test dataset

test_data['first_active_month'] = pd.to_datetime(test_data['first_active_month'])

data['purchase_date'] = pd.to_datetime(data['purchase_date'])
test_data['purchase_date'] = pd.to_datetime(test_data['purchase_date'])

In [103]:
data['purchase_time'] = pd.to_timedelta(data['purchase_time'])
test_data['purchase_time'] = pd.to_timedelta(test_data['purchase_time'])

In [104]:
data[:5]

Unnamed: 0,merchant_id,merchant_group_id,numerical_1,numerical_2,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,...,purchase_time,purchase_weekday,purchase_month,first_active_month2,first_active_year,temp,purchase_session,time_of_month_cat,time_of_month_num,end_of_month
0,M_ID_c9eb20f6f5,25449,-0.057471,-0.057471,E,E,45.534707,1.666667,3,45.211503,...,0 days 11:45:08,Wednesday,April,November,2013,"[11, 45, 08]",Morning,Middle,11,N
1,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,45.534707,1084.0,3,45.211503,...,0 days 23:30:27,Sunday,November,March,2017,"[23, 30, 27]",Night,Beginning,5,N
2,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,45.534707,1084.0,3,45.211503,...,0 days 18:56:03,Tuesday,November,September,2017,"[18, 56, 03]",Evening,End,28,Y
3,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,45.534707,1084.0,3,45.211503,...,0 days 01:02:17,Sunday,November,August,2015,"[1, 02, 17]",Night,Beginning,5,N
4,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,45.534707,1084.0,3,45.211503,...,0 days 15:27:47,Friday,December,June,2017,"[15, 27, 47]",Afternoon,End,22,N


In [105]:
test_data[:5]

Unnamed: 0,merchant_id,merchant_group_id,numerical_1,numerical_2,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,...,purchase_time,purchase_weekday,purchase_month,first_active_month2,first_active_year,temp,purchase_session,time_of_month_cat,time_of_month_num,end_of_month
0,M_ID_a70e9c5f81,5026,-0.057471,-0.057471,E,E,38.47594,1.666667,3,36.517076,...,0 days 14:51:53,Monday,March,October,2017,"[14, 51, 53]",Afternoon,Beginning,5,N
1,M_ID_d8ff08219e,16430,-0.057471,-0.057471,E,E,38.47594,1.0,3,36.517076,...,0 days 22:19:58,Thursday,March,July,2016,"[22, 19, 58]",Night,Beginning,8,N
2,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,38.47594,1084.0,3,36.517076,...,0 days 17:19:42,Monday,April,August,2017,"[17, 19, 42]",Evening,End,23,N
3,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,38.47594,1084.0,3,36.517076,...,0 days 23:24:49,Monday,March,November,2017,"[23, 24, 49]",Night,Middle,19,N
4,M_ID_c9855bf3e3,321,-0.057471,-0.057471,E,E,38.47594,1084.0,3,36.517076,...,0 days 15:07:04,Wednesday,January,February,2017,"[15, 07, 04]",Afternoon,Middle,10,N


In [106]:
print(f'train: {data.shape}, Test: {test_data.shape}')

cols = ['merchant_id', 'card_id', 'temp']

for col in cols:
    data.drop([col], axis = 1, inplace = True)
    test_data.drop([col], axis = 1, inplace = True)
    
print(f'train: {data.shape}, test: {test_data.shape}')

train: (1215489, 44), Test: (745364, 43)
train: (1215489, 41), test: (745364, 40)


In [107]:
catcols = data.select_dtypes(['object'])

for col in catcols:
    print(col)
    print(data[col].value_counts())
    print('==' * 20)

most_recent_sales_range
most_recent_sales_range
C    326322
D    308003
A    235539
B    201442
E    144183
Name: count, dtype: int64
most_recent_purchases_range
most_recent_purchases_range
C    335820
D    323135
A    245636
B    191056
E    119842
Name: count, dtype: int64
category_4
category_4
Y    614329
N    601160
Name: count, dtype: int64
authorized_flag
authorized_flag
Y    1215489
Name: count, dtype: int64
category_1
category_1
N    1175278
Y      40211
Name: count, dtype: int64
category_3
category_3
A    603626
B    517052
C     94811
Name: count, dtype: int64
end_of_month
end_of_month
N    961171
Y    254318
Name: count, dtype: int64


In [108]:
data['most_recent_sales_range'] = data['most_recent_sales_range'].map({'A':0, 'B':1, 'C':2, 'D':3, 'E':4})
data['most_recent_purchases_range'] = data['most_recent_purchases_range'].map({'A':0, 'B':1, 'C':2, 'D':3, 'E':4})
data['category_4'] = data['category_4'].map({'Y':1, 'N':0})
data['authorized_flag'] = data['authorized_flag'].map({'Y':0})
data['category_1'] = data['category_1'].map({'Y':1, 'N':0})
data['category_3'] = data['category_3'].map({'A':0, 'B':1, 'C':2})
data['end_of_month'] = data['end_of_month'].map({'Y':1, 'N':0})


test_data['most_recent_sales_range'] = test_data['most_recent_sales_range'].map({'A':0, 'B':1, 'C':2, 'D':3, 'E':4})
test_data['most_recent_purchases_range'] = test_data['most_recent_purchases_range'].map({'A':0, 'B':1, 'C':2, 'D':3, 'E':4})
test_data['category_4'] = test_data['category_4'].map({'Y':1, 'N':0})
test_data['authorized_flag'] = test_data['authorized_flag'].map({'Y':0})
test_data['category_1'] = test_data['category_1'].map({'Y':1, 'N':0})
test_data['category_3'] = test_data['category_3'].map({'A':0, 'B':1, 'C':2})
test_data['end_of_month'] = test_data['end_of_month'].map({'Y':1, 'N':0})

In [109]:
catcols = data.select_dtypes(['object'])

for col in catcols:
    print(col)
    print(data[col].value_counts())
    print('==' * 20)

In [110]:
data[:5]

Unnamed: 0,merchant_group_id,numerical_1,numerical_2,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,...,target,purchase_time,purchase_weekday,purchase_month,first_active_month2,first_active_year,purchase_session,time_of_month_cat,time_of_month_num,end_of_month
0,25449,-0.057471,-0.057471,4,4,45.534707,1.666667,3,45.211503,1.666667,...,-0.242904,0 days 11:45:08,Wednesday,April,November,2013,Morning,Middle,11,0
1,321,-0.057471,-0.057471,4,4,45.534707,1084.0,3,45.211503,3432.666667,...,-1.197832,0 days 23:30:27,Sunday,November,March,2017,Night,Beginning,5,0
2,321,-0.057471,-0.057471,4,4,45.534707,1084.0,3,45.211503,3432.666667,...,-0.37628,0 days 18:56:03,Tuesday,November,September,2017,Evening,End,28,1
3,321,-0.057471,-0.057471,4,4,45.534707,1084.0,3,45.211503,3432.666667,...,-0.259445,0 days 01:02:17,Sunday,November,August,2015,Night,Beginning,5,0
4,321,-0.057471,-0.057471,4,4,45.534707,1084.0,3,45.211503,3432.666667,...,-2.570895,0 days 15:27:47,Friday,December,June,2017,Afternoon,End,22,0


In [111]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215489 entries, 0 to 1215488
Data columns (total 41 columns):
 #   Column                       Non-Null Count    Dtype          
---  ------                       --------------    -----          
 0   merchant_group_id            1215489 non-null  int64          
 1   numerical_1                  1215489 non-null  float64        
 2   numerical_2                  1215489 non-null  float64        
 3   most_recent_sales_range      1215489 non-null  int64          
 4   most_recent_purchases_range  1215489 non-null  int64          
 5   avg_sales_lag3               1215489 non-null  float64        
 6   avg_purchases_lag3           1215489 non-null  float64        
 7   active_months_lag3           1215489 non-null  int64          
 8   avg_sales_lag6               1215489 non-null  float64        
 9   avg_purchases_lag6           1215489 non-null  float64        
 10  active_months_lag6           1215489 non-null  int64          
 11

In [112]:
data_app = pd.get_dummies(data)
test_app = pd.get_dummies(test_data)

In [113]:
data_app[:5]

Unnamed: 0,merchant_group_id,numerical_1,numerical_2,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,...,first_active_year_2016,first_active_year_2017,first_active_year_2018,purchase_session_Morning,purchase_session_Afternoon,purchase_session_Evening,purchase_session_Night,time_of_month_cat_Beginning,time_of_month_cat_Middle,time_of_month_cat_End
0,25449,-0.057471,-0.057471,4,4,45.534707,1.666667,3,45.211503,1.666667,...,False,False,False,True,False,False,False,False,True,False
1,321,-0.057471,-0.057471,4,4,45.534707,1084.0,3,45.211503,3432.666667,...,False,True,False,False,False,False,True,True,False,False
2,321,-0.057471,-0.057471,4,4,45.534707,1084.0,3,45.211503,3432.666667,...,False,True,False,False,False,True,False,False,False,True
3,321,-0.057471,-0.057471,4,4,45.534707,1084.0,3,45.211503,3432.666667,...,False,False,False,False,False,False,True,True,False,False
4,321,-0.057471,-0.057471,4,4,45.534707,1084.0,3,45.211503,3432.666667,...,False,True,False,False,True,False,False,False,False,True


In [114]:
missing_values(test_app)

You data contains 80 columns and has 0 colums with missing values


Unnamed: 0,Total Miss Values,% of miss values


# Models

In [115]:
# ALEX: remove ML code
# from xgboost import XGBRegressor

In [116]:
y = data_app.target
X = data_app.drop(['target'], axis = 1)

In [117]:
X.shape, test_app.shape

((1215489, 80), (745364, 80))

In [118]:
y[:5]

0   -0.242904
1   -1.197832
2   -0.376280
3   -0.259445
4   -2.570895
Name: target, dtype: float64

In [119]:
# ALEX: remove ML code
# xgb = XGBRegressor()

In [120]:
X['purchase_date'] = X['purchase_date'].astype(int)
X['first_active_month'] = X['first_active_month'].astype(int)
X['purchase_time'] = X['purchase_time'].astype(int)

In [121]:
test_app['purchase_date'] = test_app['purchase_date'].astype(int)
test_app['first_active_month'] = test_app['first_active_month'].astype(int)
test_app['purchase_time'] = test_app['purchase_time'].astype(int)

In [122]:
# xgb_model = xgb.fit(X,y)

In [123]:
# test, X = test.align(X, join = 'inner', axis = 1)

In [124]:
test, X = test.align(X, join = 'left', axis = 0)
X, test_app = X.align(test_app, join = 'left', axis = 0)
X, y = X.align(y, join = 'left', axis = 0)

In [125]:
test.shape, X.shape, test_app.shape, y.shape

((123623, 5), (123623, 80), (123623, 80), (123623,))

In [126]:
# ALEX: remove ML code
# from sklearn.linear_model import Lasso

In [127]:
# ALEX: remove ML code
# lassoreg = Lasso(alpha = 1.0, normalize = True, max_iter=1e5)

In [128]:
# ALEX: remove ML code
# model_lasso = lassoreg.fit(X,y)

In [129]:
# ALEX:  remove ML code
# pred_lasso = model_lasso.predict(test_app)
pred_lasso = y

In [130]:
test['card_id'].shape

(123623,)

In [131]:
pred_lasso.shape

(123623,)

In [132]:
# pred_lasso, test['card_id'] = pred_lasso.align(test['card_id'], join = 'inner', axis = 1)

In [133]:
sub_lasso = pd.DataFrame({'card_id': test['card_id'], 'target': pred_lasso})
sub_lasso.to_csv('Lasso_submission.csv', index = False)
sub_lasso.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-0.242904
1,C_ID_130fd0cbdd,-1.197832
2,C_ID_b709037bc5,-0.37628
3,C_ID_d27d835a9f,-0.259445
4,C_ID_2b5e3df5c2,-2.570895


Score : 3.933

In [134]:
# ALEX: remove ML code
# import lightgbm as lgb

In [135]:
# ALEX: remove ML code
# params = {
#         "objective" : "regression",
#         "metric" : "rmse",
#         "num_leaves" : 30,
#         "min_child_weight" : 50,
#         "learning_rate" : 0.05,
#         "bagging_fraction" : 0.7,
#         "feature_fraction" : 0.7,
#         "bagging_seed" : 2018,
#         "verbosity" : -1
#     }

In [136]:
# ALEX: remove ML code
# train_set = lgb.Dataset(X, label = y.values)
_ = y.values

In [137]:
# ALEX: remove ML code
# val_set = lgb.Dataset(X)

In [138]:
# ALEX: remove ML code
# lgb_model = lgb.train(params, train_set,valid_sets= val_set, early_stopping_rounds=1000,verbose_eval=1000, num_boost_round=1000)

In [139]:
# ALEX: remove ML code
# lgb_pred = lgb_model.predict(test_app)
lgb_pred = y.values

In [140]:
lgb_pred

array([-0.24290435, -1.19783221, -0.37628   , ..., -0.65566056,
       -2.46528826, -2.30065316])

In [141]:
sub_lgb = pd.DataFrame({'card_id':test['card_id'], 'target': lgb_pred})
sub_lgb.to_csv('lgb_submission.csv', index = False)
sub_lgb.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-0.242904
1,C_ID_130fd0cbdd,-1.197832
2,C_ID_b709037bc5,-0.37628
3,C_ID_d27d835a9f,-0.259445
4,C_ID_2b5e3df5c2,-2.570895


In [142]:
# ALEX: remove ML code
# import xgboost as xgb

In [143]:
# ALEX: remove ML code
# dtrain = xgb.DMatrix(X, label = y.values)
# dtest = xgb.DMatrix(test_app)
_ = y.values

In [144]:
# ALEX: remove ML code
# xgb_params = {'learning_rate':0.001, 
#               'n_estimators':1000,
#               'max_depth':4,
#               'min_child_weight':6,
#               'gamma':0,
#               'subsample':0.8,
#               'colsample_bytree':0.8,
#               'objective':'binary:logistic',
#               'nthread':4,
#               'scale_pos_weight':1,
#               'seed':27,
#               'silent':1
#               }

In [145]:
# ALEX: remove ML code
# xgb_model = xgb.train(params = {'silent':1, 'learning_rate':0.1, 'n_estimators':1000, 'min_child_weight':5},dtrain=dtrain, verbose_eval = False, num_boost_round=100)

In [146]:
# ALEX: remove ML code
# xgb_pred = xgb_model.predict(dtest)
xgb_pred = y.values

In [147]:
xgb_pred

array([-0.24290435, -1.19783221, -0.37628   , ..., -0.65566056,
       -2.46528826, -2.30065316])

In [148]:
xgb_sub = pd.DataFrame({'card_id':test['card_id'], 'target':xgb_pred})
xgb_sub.to_csv('xgb_submission.csv', index = False)
xgb_sub.tail(20)

Unnamed: 0,card_id,target
123603,C_ID_c3e88ef62d,3.126852
123604,C_ID_650557248c,-1.08723
123605,C_ID_f14af9cb56,-4.101944
123606,C_ID_068dcf31e8,0.187264
123607,C_ID_6659f4625a,0.293828
123608,C_ID_dc842bba27,-3.101826
123609,C_ID_bd625edce0,-0.155908
123610,C_ID_278446cc2f,-0.03364
123611,C_ID_ad358eb92e,-0.66158
123612,C_ID_7d9bcec574,0.03879
