# FEATURE ENGINEERING
# Feature Engineering on Transactions and Members
## Abstract
This notebook will be expanded into the full kernel; for the moment it contains only the feature engineering part.
## Imports
### Import libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime
from collections import Counter

### Import datasets

In [7]:
# Directory holding the competition CSV files; change it here to run the
# notebook on another machine instead of editing every read below.
DATA_DIR = 'F:/Kaggle Data/MMBOX Churn/churn'

# Labelled training sets.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
train_2 = pd.read_csv(f'{DATA_DIR}/train_v2.csv')
# The sample submission file is loaded as the test set.
test_2 = pd.read_csv(f'{DATA_DIR}/sample_submission_v2.csv')
# Transaction logs (original file and its v2 counterpart).
transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv')
transactions_2 = pd.read_csv(f'{DATA_DIR}/transactions_v2.csv')
# Member attributes.
members_3 = pd.read_csv(f'{DATA_DIR}/members_v3.csv')


Concatenate transactions datasets

In [8]:

# Stack both transaction files into one frame. ignore_index=True rebuilds a
# fresh 0..n-1 index; without it the row labels of the two files would repeat
# in the merged frame.
transactions_merged = pd.concat([transactions, transactions_2], ignore_index=True)

## Reduce memory consumption
By changing variable type

In [9]:
def change_datatype(df):
    """Downcast the integer columns of ``df`` in place to save memory.

    Each column returned by ``select_dtypes(include=['int'])`` is converted to
    the first of int8, int16, int32 whose range contains both the column's
    minimum and its maximum; if none of them fits, the column is stored as
    int64. Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    None
    """
    int_cols = list(df.select_dtypes(include=['int']).columns)
    for col in int_cols:
        # Scan the column once rather than once per candidate type.
        col_min = df[col].min()
        col_max = df[col].max()
        for candidate in (np.int8, np.int16, np.int32):
            bounds = np.iinfo(candidate)
            # Compare the scalar extrema against the type's limits (the lower
            # bound is checked on the minimum itself, not on a boolean mask).
            if bounds.min <= col_min and col_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No narrower type can hold the values.
            df[col] = df[col].astype(np.int64)
            
def change_datatype_float(df):
    """Store every float column of ``df`` as float32, modifying ``df`` in place.

    The columns are those returned by ``select_dtypes(include=['float'])``;
    all other columns are left as they are. Nothing is returned.
    """
    float_part = df.select_dtypes(include=['float'])
    for name in float_part.columns.tolist():
        narrowed = df[name].astype(np.float32)
        df[name] = narrowed

In [11]:
# Shrink both large frames: integer columns first, then float columns.
for frame in (transactions_merged, members_3):
    change_datatype(frame)
    change_datatype_float(frame)



## Create "discount" feature
Create a feature giving the different discounts received by users, and print the unique values of discounts

In [12]:
# Discount = list price minus the amount actually paid; the value is negative
# when the amount paid is higher than the list price.
transactions_merged['discount'] = transactions_merged['plan_list_price'].sub(
    transactions_merged['actual_amount_paid']
)

# Show the distinct discount values.
transactions_merged.discount.unique()

array([    0,   149,  -149,    30,  -119,  -150,  -129,  -536,  -894,
       -1788,  -134,    20,  -799,   120,   180,   -35,    50, -1599,
        -100,  -131,  -300,  -480,  -450,    -1, -1000,  -105,     1,
        -447, -1200,  -930,   -30,   -50,   699,   -11,  -350,  -500,
           4,  -400,   -41,   -14, -2000,   -95,   -20,  -596,    45,
       -1150,   108,  -760,   -12,    99,   129,  1599,    10,     8,
         -90,    -3,   894], dtype=int64)

## Create "is_discount" binary feature

In [13]:
# 1 when the list price exceeds the amount paid (positive discount), else 0.
# A vectorised comparison replaces the per-row Python lambda; the result is
# kept as int64 so the column has the same dtype as before.
transactions_merged['is_discount'] = (transactions_merged['discount'] > 0).astype(np.int64)
print(transactions_merged['is_discount'].head())
print(transactions_merged['is_discount'].unique())

0    0
1    0
2    0
3    0
4    0
Name: is_discount, dtype: int64
[0 1]


## Create "amt_per_day" feature
This feature gives the amount paid per day: the amount actually paid divided by the number of days in the payment plan.

In [14]:
# Amount paid per day of the plan. A plan length of 0 is mapped to NaN before
# dividing, so such rows yield NaN rather than +/-inf in the feature.
transactions_merged['amt_per_day'] = (
    transactions_merged['actual_amount_paid']
    / transactions_merged['payment_plan_days'].replace(0, np.nan)
)
transactions_merged['amt_per_day'].head()

0    4.300000
1    4.966667
2    4.300000
3    4.966667
4    4.966667
Name: amt_per_day, dtype: float64

## Convert transaction_date and membership_expire_date to datetime format

In [15]:
# Columns holding dates encoded as YYYYMMDD values.
date_cols = ['transaction_date', 'membership_expire_date']
for col in date_cols:
    raw_dates = transactions_merged[col]
    # Parse with an explicit format so the values are read as year-month-day.
    transactions_merged[col] = pd.to_datetime(raw_dates, format='%Y%m%d')

transactions_merged.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,discount,is_discount,amt_per_day
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2015-09-30,2015-11-01,0,0,0,4.3
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,2015-09-30,2015-10-31,0,0,0,4.966667
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,2015-09-30,2016-04-27,0,0,0,4.3
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,2015-09-30,2015-11-28,0,0,0,4.966667
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,2015-09-30,2015-11-21,0,0,0,4.966667


## Create "membership_duration" feature

In [17]:
#--- difference in days ---
# Both columns were parsed from YYYYMMDD values, so their difference is a whole
# number of days; .dt.days reads it directly as an integer, avoiding the
# float division and the platform-dependent astype(int) cast.
transactions_merged['membership_duration'] = (
    transactions_merged['membership_expire_date'] - transactions_merged['transaction_date']
).dt.days

In [19]:
# Downcast again so the columns created above are stored compactly as well:
# integer columns first, then float columns.
for shrink in (change_datatype, change_datatype_float):
    shrink(transactions_merged)

## Member_v3 
### inspect content

In [20]:
# Preview the first five member rows.
members_3.head(n=5)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915


### Convert dates

In [22]:
# Registration date is stored as a YYYYMMDD value; parse it to datetime.
date_cols = ['registration_init_time']

for col in date_cols:
    members_3[col] = members_3[col].pipe(pd.to_datetime, format='%Y%m%d')

## Merge transactions and members

In [24]:
#-- join member attributes onto every transaction ---
# Inner join on msno: only users present in both frames are kept.
combination = transactions_merged.merge(members_3, how='inner', on='msno')

#--- the source frames are no longer needed; drop them to free memory
del transactions_merged, members_3

combination.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,discount,is_discount,amt_per_day,membership_duration,city,bd,gender,registered_via,registration_init_time
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2015-09-30,2015-11-01,0,0,0,4.3,32,1,0,,7,2011-06-29
1,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2015-10-31,2015-12-01,0,0,0,4.3,31,1,0,,7,2011-06-29
2,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2015-06-30,2015-08-01,0,0,0,4.3,32,1,0,,7,2011-06-29
3,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2015-02-28,2015-04-01,0,0,0,4.3,32,1,0,,7,2011-06-29
4,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2016-11-30,2017-01-01,0,0,0,4.3,32,1,0,,7,2011-06-29


## Create "autorenew_&_not_cancel" on combination

In [25]:
# 1 only when the transaction is on auto-renew AND was not cancelled.
# The two conditions are combined with & (logical AND); comparing the masks
# with == would also flag rows where both conditions are false.
combination['autorenew_&_not_cancel'] = ((combination.is_auto_renew == 1) & (combination.is_cancel == 0)).astype(np.int8)
combination['autorenew_&_not_cancel'].unique()

array([1, 0], dtype=int64)

## Create "notAutorenew_&_cancel" on combination

In [26]:
# 1 only when the transaction is NOT on auto-renew AND was cancelled.
# The two conditions are combined with & (logical AND); comparing the masks
# with == would also flag rows where both conditions are false.
combination['notAutorenew_&_cancel'] = ((combination.is_auto_renew == 0) & (combination.is_cancel == 1)).astype(np.int8)
combination['notAutorenew_&_cancel'].unique()

array([1, 0], dtype=int64)

# Feature engineering on "user_logs"