### Objective:

In [1]:
## Loading packages
import numpy as np
import pandas as pd

In [2]:
## Loading dataset
train = pd.read_csv("E:/Github/Feature_Engineering/titanic/train.csv")
test = pd.read_csv("E:/Github/Feature_Engineering/titanic/test.csv")

## Glimpse throught the data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
## Removing dummy variables
train.drop(labels = ["Cabin", "Ticket"], axis = 1, inplace = True)
test.drop(labels = ["Cabin", "Ticket"], axis = 1, inplace = True)

In [5]:
## Fill missing values with NaN
train = train.fillna(np.nan)
test = test.fillna(np.nan)


In [6]:
## Check for Null values
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [7]:
## Missing Values Imputation
train["Age"].fillna(train["Age"].median(), inplace = True)
train["Embarked"].fillna("S", inplace = True)

In [8]:
## Lets create a variable called title from the name variable
for name in train["Name"]:
    train["Title"] = train["Name"].str.extract("([A-Za-z]+)\.",expand=True)

title_replacements = {"Mlle": "Other", "Major": "Other", "Col": "Other", "Sir": "Other", "Don": "Other", "Mme": "Other",
          "Jonkheer": "Other", "Lady": "Other", "Capt": "Other", "Countess": "Other", "Ms": "Other", "Dona": "Other"}

train.replace({"Title": title_replacements}, inplace=True)
train.replace({"Title": title_replacements}, inplace=True)


#### One Hot Encoding

In [9]:
## subset categorical variables which you want to encode
x = train[['Embarked','Pclass','Title']]

x = pd.get_dummies(x, columns=['Embarked','Pclass','Title'], drop_first=False)
x.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Title_Rev
0,0,0,1,0,0,1,0,0,0,1,0,0,0
1,1,0,0,1,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,1,0,0,1,0,0,0,0
3,0,0,1,1,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,1,0,0,0,1,0,0,0


#### Label Encoding

In [11]:
## subset categorical variables which you want to encode
x = train[['Embarked','Pclass','Title']]

from sklearn.preprocessing import LabelEncoder
x = x.apply(LabelEncoder().fit_transform)
x.head()

Unnamed: 0,Embarked,Pclass,Title
0,2,2,3
1,0,0,4
2,2,2,2
3,2,0,4
4,2,2,3


In [12]:
## sample train dataset
sample_train = train[['Embarked','Pclass','Title']]

## Frequency Encoding title variable
y = sample_train.groupby(['Title']).size().reset_index()
y.columns = ['Title', 'Freq_Encoded_Title']
y.head()

Unnamed: 0,Title,Freq_Encoded_Title
0,Dr,7
1,Master,40
2,Miss,182
3,Mr,517
4,Mrs,125


In [13]:
sample_train = pd.merge(sample_train,y,on = 'Title',how = 'left')
sample_train.head()

Unnamed: 0,Embarked,Pclass,Title,Freq_Encoded_Title
0,S,3,Mr,517
1,C,1,Mrs,125
2,S,3,Miss,182
3,S,1,Mrs,125
4,S,3,Mr,517


In [14]:
sample_train = train[['Title','Survived']]

## Mean encoding 
x = sample_train.groupby(['Title'])['Survived'].sum().reset_index()
x = x.rename(columns={"Survived" : "Title_Survived_sum"})

y = sample_train.groupby(['Title'])['Survived'].count().reset_index()
y = y.rename(columns={"Survived" : "Title_Survived_count"})

z = pd.merge(x,y,on = 'Title',how = 'inner')
z['Target_Encoded_over_Title'] = z['Title_Survived_sum']/z['Title_Survived_count']
z.head()

Unnamed: 0,Title,Title_Survived_sum,Title_Survived_count,Target_Encoded_over_Title
0,Dr,3,7,0.428571
1,Master,23,40,0.575
2,Miss,127,182,0.697802
3,Mr,81,517,0.156673
4,Mrs,99,125,0.792


In [15]:
## Joining this back with the sample_train dataset

z = z[['Title','Target_Encoded_over_Title']]

sample_train = pd.merge(sample_train,z,on = 'Title',how = 'left')
sample_train.head()

Unnamed: 0,Title,Survived,Target_Encoded_over_Title
0,Mr,0,0.156673
1,Mrs,1,0.792
2,Miss,1,0.697802
3,Mrs,1,0.792
4,Mr,0,0.156673


In [16]:
## Direct Method
## TYPE 1
## Selecting title (categorical) and Fare (numeric) from the train dataset

sample_train = train[['Title','Fare']]

## Mean encoding 
x = sample_train.groupby(['Title'])['Fare'].mean().reset_index()
x = x.rename(columns={"Fare" : "Title" +"_Mean_Encoded"})
x.head()

Unnamed: 0,Title,Title_Mean_Encoded
0,Dr,49.168457
1,Master,34.703125
2,Miss,43.797873
3,Mr,24.44156
4,Mrs,45.138533


In [17]:
## Joining this back with the sample_train dataset

sample_train = pd.merge(sample_train,x,on = 'Title',how = 'left')
sample_train.head()

Unnamed: 0,Title,Fare,Title_Mean_Encoded
0,Mr,7.25,24.44156
1,Mrs,71.2833,45.138533
2,Miss,7.925,43.797873
3,Mrs,53.1,45.138533
4,Mr,8.05,24.44156


In [18]:
## K-Fold Method  
## TYPE 2
## Selecting title (categorical) and Fare (numeric) from the train dataset

x = train[['Embarked','Pclass','Title','Fare']]
cols = ['Embarked','Pclass','Title']

## Loading k-fold from sklearn
import sklearn
from sklearn.model_selection import StratifiedKFold

## 10 fold cv
kf = sklearn.model_selection.KFold(n_splits = 10, shuffle = False) 

In [19]:
for i in cols: ## Looping through all features   
    x['Mean_Encoded_on'] = np.nan

    for tr_ind, val_ind in kf.split(x):
        X_tr, X_val = x.iloc[tr_ind], x.iloc[val_ind] ## train-test hold out
        x.loc[x.index[val_ind], 'Mean_Encoded_on'] = X_val[i].map(X_tr.groupby(i).Fare.mean())

    x = x.rename(index=str, columns={"Mean_Encoded_on": i +"_K_Encoded"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
x.head()

Unnamed: 0,Embarked,Pclass,Title,Fare,Embarked_K_Encoded,Pclass_K_Encoded,Title_K_Encoded
0,S,3,Mr,7.25,26.849641,13.511521,24.102051
1,C,1,Mrs,71.2833,63.086356,84.304041,46.607722
2,S,3,Miss,7.925,26.849641,13.511521,45.713516
3,S,1,Mrs,53.1,26.849641,84.304041,46.607722
4,S,3,Mr,8.05,26.849641,13.511521,24.102051
