# Feature Engineering

[Reference](https://towardsdatascience.com/the-hitchhikers-guide-to-feature-extraction-b4c157e96631)

In [1]:
import featuretools as ft
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
!pip install category_encoders
from category_encoders.binary import BinaryEncoder
from category_encoders.hashing import HashingEncoder

from sklearn import base
from sklearn.model_selection import KFold

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 17.4MB/s eta 0:00:01[K     |████████▏                       | 20kB 1.7MB/s eta 0:00:01[K     |████████████▏                   | 30kB 2.2MB/s eta 0:00:01[K     |████████████████▎               | 40kB 2.5MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 2.3MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.2MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


  import pandas.util.testing as tm


In [2]:
import os
from google.colab import drive

# Mount Google Drive so the CSV files read later in the notebook are reachable.
drive.mount('/content/gdrive',force_remount=True)
print(os.getcwd())
# Absolute path on purpose: a relative chdir only resolves while the working
# directory is still /content, so re-running this cell after the first
# successful run would raise FileNotFoundError.
os.chdir('/content/gdrive/My Drive/Colab Notebooks/')
print(os.getcwd())

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
/content
/content/gdrive/My Drive/Colab Notebooks


## 1. Automatic Feature Creation using featuretools

In [3]:
data = ft.demo.load_mock_customer()

In [4]:
customers_df = data["customers"]

In [5]:
customers_df.head()

Unnamed: 0,customer_id,zip_code,join_date,date_of_birth
0,1,60091,2011-04-17 10:48:33,1994-07-18
1,2,13244,2012-04-15 23:31:04,1986-08-18
2,3,13244,2011-08-13 15:42:34,2003-11-21
3,4,60091,2011-04-08 20:08:14,2006-08-15
4,5,60091,2010-07-17 05:27:50,1984-07-28


In [6]:
sessions_df = data['sessions']

In [7]:
sessions_df.head(5)

Unnamed: 0,session_id,customer_id,device,session_start
0,1,2,desktop,2014-01-01 00:00:00
1,2,5,mobile,2014-01-01 00:17:20
2,3,4,mobile,2014-01-01 00:28:10
3,4,1,mobile,2014-01-01 00:44:25
4,5,4,mobile,2014-01-01 01:11:30


In [8]:
transactions_df = data["transactions"]

In [9]:
transactions_df.head(5)

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount
0,298,1,2014-01-01 00:00:00,5,127.64
1,2,1,2014-01-01 00:01:05,2,109.48
2,308,1,2014-01-01 00:02:10,3,95.06
3,116,1,2014-01-01 00:03:15,4,78.92
4,371,1,2014-01-01 00:04:20,3,31.54


In [10]:
# Create new entityset
es = ft.EntitySet(id = 'customers')

In [11]:
# Register the customers dataframe as the "customers" entity, keyed by
# customer_id and time-indexed by join_date.
# NOTE(review): zip_code is declared Numeric although zip codes are labels
# rather than quantities — confirm whether a categorical type was intended.
es = es.entity_from_dataframe(
    entity_id='customers',
    dataframe=customers_df,
    index='customer_id',
    time_index='join_date',
    variable_types={"zip_code": ft.variable_types.Numeric},
)

In [12]:
es

Entityset: customers
  Entities:
    customers [Rows: 5, Columns: 4]
  Relationships:
    No relationships

In [13]:
# Register the transactions dataframe as the "transactions" entity;
# product_id is an identifier, so it is typed as Categorical.
es = es.entity_from_dataframe(
    entity_id="transactions",
    dataframe=transactions_df,
    index="transaction_id",
    time_index="transaction_time",
    variable_types={"product_id": ft.variable_types.Categorical},
)

In [14]:
ft.variable_types.ALL_VARIABLE_TYPES

[featuretools.variable_types.variable.Datetime,
 featuretools.variable_types.variable.Numeric,
 featuretools.variable_types.variable.Timedelta,
 featuretools.variable_types.variable.Categorical,
 featuretools.variable_types.variable.Text,
 featuretools.variable_types.variable.Ordinal,
 featuretools.variable_types.variable.Boolean,
 featuretools.variable_types.variable.LatLong]

In [15]:
es

Entityset: customers
  Entities:
    customers [Rows: 5, Columns: 4]
    transactions [Rows: 500, Columns: 5]
  Relationships:
    No relationships

In [16]:
# Register the sessions dataframe as the "sessions" entity, keyed by
# session_id and time-indexed by session_start.
es = es.entity_from_dataframe(
    entity_id="sessions",
    dataframe=sessions_df,
    index="session_id",
    time_index='session_start',
)


In [17]:
es

Entityset: customers
  Entities:
    customers [Rows: 5, Columns: 4]
    transactions [Rows: 500, Columns: 5]
    sessions [Rows: 35, Columns: 4]
  Relationships:
    No relationships

In [18]:
# Link sessions to customers through customer_id
# (customers is the parent side, sessions the child side).
cust_relationship = ft.Relationship(
    es["customers"]["customer_id"],
    es["sessions"]["customer_id"],
)

# Register the link on the entity set.
es = es.add_relationship(cust_relationship)

In [19]:
# Link transactions to sessions through session_id
# (sessions is the parent side, transactions the child side).
sess_relationship = ft.Relationship(
    es["sessions"]["session_id"],
    es["transactions"]["session_id"],
)

# Register the link on the entity set.
es = es.add_relationship(sess_relationship)

In [20]:
es

Entityset: customers
  Entities:
    customers [Rows: 5, Columns: 4]
    transactions [Rows: 500, Columns: 5]
    sessions [Rows: 35, Columns: 4]
  Relationships:
    sessions.customer_id -> customers.customer_id
    transactions.session_id -> sessions.session_id

In [21]:
# Deep Feature Synthesis: build one row of features per customer, stacking
# primitives up to three levels deep across the related entities.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    max_depth=3,
)

In [22]:
feature_matrix

Unnamed: 0_level_0,zip_code,COUNT(sessions),NUM_UNIQUE(sessions.device),MODE(sessions.device),SUM(transactions.amount),STD(transactions.amount),MAX(transactions.amount),SKEW(transactions.amount),MIN(transactions.amount),MEAN(transactions.amount),COUNT(transactions),NUM_UNIQUE(transactions.product_id),MODE(transactions.product_id),DAY(join_date),DAY(date_of_birth),YEAR(join_date),YEAR(date_of_birth),MONTH(join_date),MONTH(date_of_birth),WEEKDAY(join_date),WEEKDAY(date_of_birth),SUM(sessions.STD(transactions.amount)),SUM(sessions.MAX(transactions.amount)),SUM(sessions.SKEW(transactions.amount)),SUM(sessions.MIN(transactions.amount)),SUM(sessions.MEAN(transactions.amount)),SUM(sessions.NUM_UNIQUE(transactions.product_id)),STD(sessions.SUM(transactions.amount)),STD(sessions.MAX(transactions.amount)),STD(sessions.SKEW(transactions.amount)),STD(sessions.MIN(transactions.amount)),STD(sessions.MEAN(transactions.amount)),STD(sessions.COUNT(transactions)),STD(sessions.NUM_UNIQUE(transactions.product_id)),MAX(sessions.SUM(transactions.amount)),MAX(sessions.STD(transactions.amount)),MAX(sessions.SKEW(transactions.amount)),MAX(sessions.MIN(transactions.amount)),MAX(sessions.MEAN(transactions.amount)),MAX(sessions.COUNT(transactions)),...,NUM_UNIQUE(transactions.DAY(transaction_time)),NUM_UNIQUE(transactions.YEAR(transaction_time)),NUM_UNIQUE(transactions.MONTH(transaction_time)),NUM_UNIQUE(transactions.WEEKDAY(transaction_time)),MODE(transactions.DAY(transaction_time)),MODE(transactions.YEAR(transaction_time)),MODE(transactions.MONTH(transaction_time)),MODE(transactions.WEEKDAY(transaction_time)),SUM(sessions.NUM_UNIQUE(transactions.DAY(transaction_time))),SUM(sessions.NUM_UNIQUE(transactions.YEAR(transaction_time))),SUM(sessions.NUM_UNIQUE(transactions.MONTH(transaction_time))),SUM(sessions.NUM_UNIQUE(transactions.WEEKDAY(transaction_time))),STD(sessions.NUM_UNIQUE(transactions.DAY(transaction_time))),STD(sessions.NUM_UNIQUE(transactions.YEAR(transaction_time))),STD(sessions.NU
M_UNIQUE(transactions.MONTH(transaction_time))),STD(sessions.NUM_UNIQUE(transactions.WEEKDAY(transaction_time))),MAX(sessions.NUM_UNIQUE(transactions.DAY(transaction_time))),MAX(sessions.NUM_UNIQUE(transactions.YEAR(transaction_time))),MAX(sessions.NUM_UNIQUE(transactions.MONTH(transaction_time))),MAX(sessions.NUM_UNIQUE(transactions.WEEKDAY(transaction_time))),SKEW(sessions.NUM_UNIQUE(transactions.DAY(transaction_time))),SKEW(sessions.NUM_UNIQUE(transactions.YEAR(transaction_time))),SKEW(sessions.NUM_UNIQUE(transactions.MONTH(transaction_time))),SKEW(sessions.NUM_UNIQUE(transactions.WEEKDAY(transaction_time))),MIN(sessions.NUM_UNIQUE(transactions.DAY(transaction_time))),MIN(sessions.NUM_UNIQUE(transactions.YEAR(transaction_time))),MIN(sessions.NUM_UNIQUE(transactions.MONTH(transaction_time))),MIN(sessions.NUM_UNIQUE(transactions.WEEKDAY(transaction_time))),MEAN(sessions.NUM_UNIQUE(transactions.DAY(transaction_time))),MEAN(sessions.NUM_UNIQUE(transactions.YEAR(transaction_time))),MEAN(sessions.NUM_UNIQUE(transactions.MONTH(transaction_time))),MEAN(sessions.NUM_UNIQUE(transactions.WEEKDAY(transaction_time))),NUM_UNIQUE(sessions.MODE(transactions.DAY(transaction_time))),NUM_UNIQUE(sessions.MODE(transactions.YEAR(transaction_time))),NUM_UNIQUE(sessions.MODE(transactions.MONTH(transaction_time))),NUM_UNIQUE(sessions.MODE(transactions.WEEKDAY(transaction_time))),MODE(sessions.MODE(transactions.DAY(transaction_time))),MODE(sessions.MODE(transactions.YEAR(transaction_time))),MODE(sessions.MODE(transactions.MONTH(transaction_time))),MODE(sessions.MODE(transactions.WEEKDAY(transaction_time)))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,60091,8,3,mobile,9025.62,40.442059,139.43,0.019698,5.81,71.631905,126,5,4,17,18,2011,1994,4,7,6,0,312.745952,1057.97,-0.476122,78.59,582.193117,40,279.510713,7.322191,0.589386,6.954507,13.759314,4.062019,0.0,1613.93,46.905665,0.640252,26.36,88.755625,25,...,1,1,1,1,1,2014,1,2,8,8,8,8,0.0,0.0,0.0,0.0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2014,1,2
2,13244,7,3,desktop,7200.28,37.705178,146.81,0.098259,8.73,77.422366,93,5,4,15,18,2012,1986,4,8,6,0,258.700528,931.63,-0.27764,154.6,548.905851,35,251.609234,17.221593,0.509798,15.874374,11.477071,3.450328,0.0,1320.64,47.93592,0.755711,56.46,96.581,18,...,1,1,1,1,1,2014,1,2,7,7,7,7,0.0,0.0,0.0,0.0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2014,1,2
3,13244,6,3,desktop,6236.62,43.683296,149.15,0.41823,5.89,67.06043,93,5,1,13,21,2011,2003,8,11,5,4,257.299895,847.63,2.286086,66.21,405.237462,29,219.02142,10.724241,0.429374,5.424407,11.174282,2.428992,0.408248,1477.97,50.11012,0.854976,20.06,82.109444,18,...,1,1,1,1,1,2014,1,2,6,6,6,6,0.0,0.0,0.0,0.0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2014,1,2
4,60091,8,3,mobile,8727.68,45.068765,149.95,-0.036348,5.73,80.070459,109,5,2,8,15,2011,2006,4,8,4,1,356.125829,1157.99,0.002764,131.51,649.657515,37,235.992478,3.514421,0.387884,16.960575,13.027258,3.335416,0.517549,1351.46,54.293903,0.382868,54.83,110.45,18,...,1,1,1,1,1,2014,1,2,8,8,8,8,0.0,0.0,0.0,0.0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2014,1,2
5,60091,6,3,mobile,6349.66,44.09563,149.02,-0.025941,7.55,80.375443,79,5,5,17,28,2010,1984,7,7,5,5,259.873954,839.76,0.014384,86.49,472.231119,30,402.775486,7.928001,0.415426,4.961414,11.007471,3.600926,0.0,1700.67,51.14925,0.602209,20.65,94.481667,18,...,1,1,1,1,1,2014,1,2,6,6,6,6,0.0,0.0,0.0,0.0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2014,1,2


In [23]:
len(feature_defs)

113

In [24]:
feature_defs

[<Feature: zip_code>,
 <Feature: COUNT(sessions)>,
 <Feature: NUM_UNIQUE(sessions.device)>,
 <Feature: MODE(sessions.device)>,
 <Feature: SUM(transactions.amount)>,
 <Feature: STD(transactions.amount)>,
 <Feature: MAX(transactions.amount)>,
 <Feature: SKEW(transactions.amount)>,
 <Feature: MIN(transactions.amount)>,
 <Feature: MEAN(transactions.amount)>,
 <Feature: COUNT(transactions)>,
 <Feature: NUM_UNIQUE(transactions.product_id)>,
 <Feature: MODE(transactions.product_id)>,
 <Feature: DAY(join_date)>,
 <Feature: DAY(date_of_birth)>,
 <Feature: YEAR(join_date)>,
 <Feature: YEAR(date_of_birth)>,
 <Feature: MONTH(join_date)>,
 <Feature: MONTH(date_of_birth)>,
 <Feature: WEEKDAY(join_date)>,
 <Feature: WEEKDAY(date_of_birth)>,
 <Feature: SUM(sessions.STD(transactions.amount))>,
 <Feature: SUM(sessions.MAX(transactions.amount))>,
 <Feature: SUM(sessions.SKEW(transactions.amount))>,
 <Feature: SUM(sessions.MIN(transactions.amount))>,
 <Feature: SUM(sessions.MEAN(transactions.amount))>,
 <

In [25]:
# Let's talk about categorical features
sessions_df.head()

Unnamed: 0,session_id,customer_id,device,session_start
0,1,2,desktop,2014-01-01 00:00:00
1,2,5,mobile,2014-01-01 00:17:20
2,3,4,mobile,2014-01-01 00:28:10
3,4,1,mobile,2014-01-01 00:44:25
4,5,4,mobile,2014-01-01 01:11:30


In [26]:
pd.get_dummies(sessions_df['device'],drop_first=True).head()

Unnamed: 0,mobile,tablet
0,0,0
1,1,0
2,1,0
3,1,0
4,1,0


## 2. Handling Categorical Features: Label/Binary/Hashing and Target/Mean Encoding

### Ordinal Encoding

In [27]:
# Toy frame for the ordinal-encoding example: one ordered level per city.
df = pd.DataFrame({
    'Temperature': ['low', 'medium', 'high'],
    'City': ['London', 'New York', 'Dubai'],
})

In [28]:
df

Unnamed: 0,Temperature,City
0,low,London
1,medium,New York
2,high,Dubai


In [29]:
# Temperature levels in increasing order; a level's position is its ordinal code.
map_dict = {level: code for code, level in enumerate(('low', 'medium', 'high'))}


def map_values(x):
    """Return the ordinal code that ``map_dict`` assigns to level ``x``.

    Raises ``KeyError`` when ``x`` is not a key of ``map_dict``.
    """
    code = map_dict[x]
    return code
# Hand the function to apply directly; wrapping it in a lambda only adds an
# extra call per element. Unknown levels still raise KeyError.
df['Temperature_oe'] = df['Temperature'].apply(map_values)

In [30]:
df

Unnamed: 0,Temperature,City,Temperature_oe
0,low,London,0
1,medium,New York,1
2,high,Dubai,2


### Label Encoder

In [31]:
# Learn the distinct device names, then store their integer codes in a new
# device_le column next to the original text column.
le = LabelEncoder()
le.fit(sessions_df['device'])
sessions_df['device_le'] = le.transform(sessions_df['device'])
sessions_df.head()

Unnamed: 0,session_id,customer_id,device,session_start,device_le
0,1,2,desktop,2014-01-01 00:00:00,0
1,2,5,mobile,2014-01-01 00:17:20,1
2,3,4,mobile,2014-01-01 00:28:10,1
3,4,1,mobile,2014-01-01 00:44:25,1
4,5,4,mobile,2014-01-01 01:11:30,1


In [32]:
sessions_df.head()

Unnamed: 0,session_id,customer_id,device,session_start,device_le
0,1,2,desktop,2014-01-01 00:00:00,0
1,2,5,mobile,2014-01-01 00:17:20,1
2,3,4,mobile,2014-01-01 00:28:10,1
3,4,1,mobile,2014-01-01 00:44:25,1
4,5,4,mobile,2014-01-01 01:11:30,1


### Binary Encoder

In [33]:
players = pd.read_csv("./fifa_data.csv")

In [34]:
len(players.Club.unique())

652

In [35]:
# Binary-encode the high-cardinality Club column: the single text column is
# replaced by a handful of 0/1 columns (Club_0, Club_1, ...).
be = BinaryEncoder(cols=['Club'])
# Fit on the full table and keep the encoded result under the same name.
players = be.fit_transform(players)

In [36]:
players.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club_0,Club_1,Club_2,Club_3,Club_4,Club_5,Club_6,Club_7,Club_8,Club_9,Club_10,Club Logo,Value,Wage,Special,Preferred Foot,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Real Face,Position,Jersey Number,Joined,Loaned From,Contract Valid Until,Height,Weight,LS,ST,...,LB,LCB,CB,RCB,RB,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,0,0,0,0,0,0,0,0,0,0,1,https://cdn.sofifa.org/teams/2/light/241.png,€110.5M,€565K,2202,Left,5.0,4.0,4.0,Medium/ Medium,Messi,Yes,RF,10.0,"Jul 1, 2004",,2021,5'7,159lbs,88+2,88+2,...,59+2,47+2,47+2,47+2,59+2,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,0,0,0,0,0,0,0,0,0,1,0,https://cdn.sofifa.org/teams/2/light/45.png,€77M,€405K,2228,Right,5.0,4.0,5.0,High/ Low,C. Ronaldo,Yes,ST,7.0,"Jul 10, 2018",,2022,6'2,183lbs,91+3,91+3,...,61+3,53+3,53+3,53+3,61+3,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,0,0,0,0,0,0,0,0,0,1,1,https://cdn.sofifa.org/teams/2/light/73.png,€118.5M,€290K,2143,Right,5.0,5.0,5.0,High/ Medium,Neymar,Yes,LW,10.0,"Aug 3, 2017",,2022,5'9,150lbs,84+3,84+3,...,60+3,47+3,47+3,47+3,60+3,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,0,0,0,0,0,0,0,0,1,0,0,https://cdn.sofifa.org/teams/2/light/11.png,€72M,€260K,1471,Right,4.0,3.0,1.0,Medium/ Medium,Lean,Yes,GK,1.0,"Jul 1, 2011",,2020,6'4,168lbs,,,...,,,,,,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,0,0,0,0,0,0,0,0,1,0,1,https://cdn.sofifa.org/teams/2/light/10.png,€102M,€355K,2281,Right,4.0,5.0,4.0,High/ High,Normal,Yes,RCM,7.0,"Aug 30, 2015",,2023,5'11,154lbs,82+3,82+3,...,73+3,66+3,66+3,66+3,73+3,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


### Hashing Encoder

In [37]:
# Reload the raw table: the binary-encoding cell overwrote `players` with a
# version in which Club has already been replaced by Club_* columns.
players = pd.read_csv("./fifa_data.csv")

# Hash-encode Club into a fixed number of columns (col_0, col_1, ...).
be = HashingEncoder(cols=['Club'])
# Fit on the full table and keep the encoded result under the same name.
players = be.fit_transform(players)

In [38]:
players.head()

Unnamed: 0.1,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club Logo,Value,Wage,Special,Preferred Foot,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Real Face,Position,Jersey Number,Joined,Loaned From,Contract Valid Until,Height,Weight,LS,ST,RS,LW,LF,...,LB,LCB,CB,RCB,RB,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,1,0,0,0,0,0,0,0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,https://cdn.sofifa.org/teams/2/light/241.png,€110.5M,€565K,2202,Left,5.0,4.0,4.0,Medium/ Medium,Messi,Yes,RF,10.0,"Jul 1, 2004",,2021,5'7,159lbs,88+2,88+2,88+2,92+2,93+2,...,59+2,47+2,47+2,47+2,59+2,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,0,0,0,0,0,1,0,0,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,https://cdn.sofifa.org/teams/2/light/45.png,€77M,€405K,2228,Right,5.0,4.0,5.0,High/ Low,C. Ronaldo,Yes,ST,7.0,"Jul 10, 2018",,2022,6'2,183lbs,91+3,91+3,91+3,89+3,90+3,...,61+3,53+3,53+3,53+3,61+3,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,0,0,0,0,0,1,0,0,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,https://cdn.sofifa.org/teams/2/light/73.png,€118.5M,€290K,2143,Right,5.0,5.0,5.0,High/ Medium,Neymar,Yes,LW,10.0,"Aug 3, 2017",,2022,5'9,150lbs,84+3,84+3,84+3,89+3,89+3,...,60+3,47+3,47+3,47+3,60+3,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,0,0,0,0,1,0,0,0,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,https://cdn.sofifa.org/teams/2/light/11.png,€72M,€260K,1471,Right,4.0,3.0,1.0,Medium/ Medium,Lean,Yes,GK,1.0,"Jul 1, 2011",,2020,6'4,168lbs,,,,,,...,,,,,,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,0,0,0,0,0,1,0,0,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,https://cdn.sofifa.org/teams/2/light/10.png,€102M,€355K,2281,Right,4.0,5.0,4.0,High/ High,Normal,Yes,RCM,7.0,"Aug 30, 2015",,2023,5'11,154lbs,82+3,82+3,82+3,87+3,87+3,...,73+3,66+3,66+3,66+3,73+3,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


### Target/Mean Encoding

In [39]:
train = pd.read_csv("./titanic_train.csv")

In [40]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """K-fold target (mean) encoder for one categorical column of a training frame.

    Each row is encoded with the per-category mean of ``targetName`` computed
    on the *other* folds only, so a row's own target value never contributes
    to its own encoding.

    Parameters
    ----------
    colnames : str
        Name of the single column to encode.
    targetName : str
        Name of the target column; it must be present in the frame passed to
        ``transform``.
    n_fold : int, default 5
        Number of folds handed to ``KFold``.
    verbosity : bool, default True
        When true, print the correlation between the new feature and the target.
    discardOriginal_col : bool, default False
        When true, drop the original ``colnames`` column from the result.
    """

    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """Nothing is learned ahead of time; all work happens in ``transform``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the column ``<colnames>_Kfold_Target_Enc`` added.

        The input frame is left unmodified. The fold split is shuffled with a
        fixed ``random_state`` of 2019, so repeated calls give the same folds.

        Raises
        ------
        AssertionError
            If ``colnames``/``targetName`` are not strings or are missing
            from ``X``.
        """
        assert isinstance(self.targetName, str), 'targetName must be a string'
        assert isinstance(self.colnames, str), 'colnames must be a string'
        assert self.colnames in X.columns, 'colnames is not a column of X'
        assert self.targetName in X.columns, 'targetName is not a column of X'

        # Work on a copy so the caller's frame is never modified.
        X = X.copy()
        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = True, random_state=2019)
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'

        # Filled by position, so duplicate index labels cannot misroute values.
        encoded = np.full(len(X), np.nan)
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            fold_means = X_tr.groupby(self.colnames)[self.targetName].mean()
            encoded[val_ind] = X_val[self.colnames].map(fold_means).to_numpy(dtype=float)
        # Categories missing from a fold's training part fall back to the
        # global target mean.
        encoded[np.isnan(encoded)] = mean_of_target
        X[col_mean_name] = encoded

        if self.verbosity:
            correlation = np.corrcoef(X[self.targetName].values, encoded)[0][1]
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName,
                   correlation))
        if self.discardOriginal_col:
            # Drop the encoded source column, not the target.
            X = X.drop(self.colnames, axis=1)
        return X

In [42]:
targetc = KFoldTargetEncoderTrain('Pclass','Survived',n_fold=5)
new_train = targetc.fit_transform(train)

Correlation between the new feature, Pclass_Kfold_Target_Enc and, Survived is 0.33349480268464116.


In [43]:
new_train[['Pclass_Kfold_Target_Enc','Pclass']].head()

Unnamed: 0,Pclass_Kfold_Target_Enc,Pclass
0,0.242268,3
1,0.642045,1
2,0.248756,3
3,0.640244,1
4,0.242268,3


## 3. How best to use Latitude and Longitude features - Part 1:

In [44]:
train = pd.read_csv("./nyc_train.csv")

In [45]:
# Keep a 500-row subset so the row-level examples run quickly; the fixed seed
# makes the subset (and every output derived from it) reproducible.
train = train.sample(500, random_state=2019)

In [46]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Great-circle (haversine) distance in km between two points.

    All four arguments are in degrees and may be scalars or numpy arrays;
    the computation is element-wise.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

In [47]:
# haversine_array is element-wise on numpy arrays, so call it once on the
# coordinate columns instead of once per row through DataFrame.apply.
train['haversine_distance'] = haversine_array(
    train['pickup_latitude'].values,
    train['pickup_longitude'].values,
    train['dropoff_latitude'].values,
    train['dropoff_longitude'].values,
)

In [48]:
def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate Manhattan distance between two points.

    Sums two haversine legs that both start at point 1: one changing only
    the longitude (to ``lng2``) and one changing only the latitude
    (to ``lat2``). The result is in the units returned by ``haversine_array``.
    """
    longitude_leg = haversine_array(lat1, lng1, lat1, lng2)
    latitude_leg = haversine_array(lat1, lng1, lat2, lng1)
    total = longitude_leg + latitude_leg
    return total

In [49]:
# dummy_manhattan_distance only combines element-wise haversine calls, so it
# can be applied to the whole coordinate columns in one call.
train['manhattan_distance'] = dummy_manhattan_distance(
    train['pickup_latitude'].values,
    train['pickup_longitude'].values,
    train['dropoff_latitude'].values,
    train['dropoff_longitude'].values,
)

In [50]:
def bearing_array(lat1, lng1, lat2, lng2):
    """Initial bearing from point 1 to point 2, in degrees.

    All four arguments are in degrees and may be scalars or numpy arrays;
    the computation is element-wise. The result comes from ``np.arctan2``,
    so it lies in [-180, 180]: 0 is due north, 90 due east, -90 due west.
    """
    lng_delta_rad = np.radians(lng2 - lng1)
    # Only the latitudes are needed in radians beyond the longitude delta.
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [51]:
# bearing_array is element-wise on numpy arrays, so call it once on the
# coordinate columns instead of once per row through DataFrame.apply.
train['bearing'] = bearing_array(
    train['pickup_latitude'].values,
    train['pickup_longitude'].values,
    train['dropoff_latitude'].values,
    train['dropoff_longitude'].values,
)

In [52]:
# Trip midpoint: plain average of the pickup and dropoff coordinates.
train.loc[:, 'center_latitude'] = (
    train['pickup_latitude'].values + train['dropoff_latitude'].values
) / 2
train.loc[:, 'center_longitude'] = (
    train['pickup_longitude'].values + train['dropoff_longitude'].values
) / 2

In [53]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,haversine_distance,manhattan_distance,bearing,center_latitude,center_longitude
1227341,id3668879,1,2016-05-12 08:27:27,2016-05-12 08:34:55,1,-73.981995,40.74078,-73.975533,40.755886,N,448,1.765739,2.224159,17.954238,40.748333,-73.978764
235133,id3630719,1,2016-01-18 20:15:19,2016-01-18 20:25:36,1,-73.960388,40.775887,-73.974815,40.761909,N,617,1.972714,2.769012,-141.979117,40.768898,-73.967602
273157,id2587382,1,2016-01-28 21:47:30,2016-01-28 21:59:12,1,-73.981941,40.773285,-73.985214,40.751987,N,702,2.384158,2.643782,-173.35952,40.762636,-73.983578
608932,id1718826,2,2016-03-10 09:08:06,2016-03-10 09:31:30,5,-73.99472,40.740398,-73.966988,40.769585,N,1404,3.99864,5.581843,35.736766,40.754992,-73.980854
311199,id3915979,1,2016-05-13 23:32:57,2016-05-13 23:42:48,1,-73.948074,40.776314,-73.939209,40.804707,N,591,3.244152,3.903633,13.297522,40.79051,-73.943642


### Log feature transformation

In [54]:
!pip install plotly_express
import plotly_express as px

Collecting plotly_express
  Downloading https://files.pythonhosted.org/packages/d4/d6/8a2906f51e073a4be80cab35cfa10e7a34853e60f3ed5304ac470852a08d/plotly_express-0.4.1-py2.py3-none-any.whl
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1


In [55]:
px.histogram(train,x='trip_duration')

In [56]:
# log(1 + x) on the whole column at once; np.log1p is vectorized and avoids
# a Python-level lambda call per row.
train['log_trip_duration'] = np.log1p(train['trip_duration'])

In [57]:
px.histogram(train,x='log_trip_duration')