In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
test = pd.read_csv('test.csv')

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [7]:
# Return missing values in dataframe for training set
print("Total Records: {}\n".format(train["Age"].count()))

print("Training set missing values")
print(train.isnull().sum())
print("\n")

print("Testing set missing values")
print(test.isnull().sum())

Total Records: 714

Training set missing values
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Testing set missing values
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [8]:
# apply the same preprocessing to test set
test["Fare"].fillna(test["Fare"].mean(), inplace=True)

In [10]:
train.Cabin.fillna("unknown", inplace=True)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [12]:
#feature engineering

In [None]:
#embarked - feature engineering

In [12]:
freq_port = train.Embarked.dropna().mode()[0]
freq_port

'S'

In [13]:
train['EmbarkedFill'] = train['Embarked']
train.loc[train['Embarked'].isnull(), 'EmbarkedFill'] = freq_port
train[train['Embarked'].isnull()][['Embarked','EmbarkedFill']].head()

Unnamed: 0,Embarked,EmbarkedFill
61,,S
829,,S


In [15]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [16]:
#drop the Embarked feature from our datasets
test['EmbarkedFill'] = test['Embarked']
train = train.drop(['Embarked'], axis=1)
test = test.drop(['Embarked'], axis=1)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedFill
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,unknown,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,unknown,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,unknown,S


In [17]:
#convert the EmbarkedFill feature by creating a new numeric Port feature

Ports = list(enumerate(np.unique(train['EmbarkedFill'])))
Ports_dict = { name : i for i, name in Ports }              
train['Port'] = train.EmbarkedFill.map( lambda x: Ports_dict[x]).astype(int)

Ports = list(enumerate(np.unique(test['EmbarkedFill'])))
Ports_dict = { name : i for i, name in Ports }
test['Port'] = test.EmbarkedFill.map( lambda x: Ports_dict[x]).astype(int)

train[['EmbarkedFill', 'Port']].head(10)

Unnamed: 0,EmbarkedFill,Port
0,S,2
1,C,0
2,S,2
3,S,2
4,S,2
5,Q,1
6,S,2
7,S,2
8,S,2
9,C,0


In [18]:
#get dummy for Sex
train['Sex'] = train['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
test['Sex'] = test['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

In [19]:
#feature engineering: get mean age for female and male seperately
train.groupby("Sex")["Age"].mean()

Sex
0    27.915709
1    30.726645
Name: Age, dtype: float64

In [20]:
for i in range(len(train)):
    if train['Age'].isnull()[i]:
        if train['Sex'][i] == 1: 
            train['Age'][i] = 30.726645
        else:
            train['Age'][i] = 27.915709

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [21]:
for i in range(len(test)):
    if test['Age'].isnull()[i]:
        if test['Sex'][i] == 1: 
            test['Age'][i] = 30.726645
        else:
            test['Age'][i] = 27.915709

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
#children
train["Child"] = ["Child" if int(x) < 18 else "Adult" for x in train["Age"]]
test["Child"] = ["Child" if int(x) < 18 else "Adult" for x in test["Age"]]

In [23]:
# Family size column
train["Family_size"] = train["Parch"].astype(np.int64) + train["SibSp"].astype(np.int64) + 1 
test["Family_size"] = test["Parch"].astype(np.int64) + test["SibSp"].astype(np.int64) + 1 

print("See below a record for a child with sibling and parent, we know have a Child and Family Size indicator: \n")
print(train.iloc[10])

See below a record for a child with sibling and parent, we know have a Child and Family Size indicator: 

PassengerId                                  11
Survived                                      1
Pclass                                        3
Name            Sandstrom, Miss. Marguerite Rut
Sex                                           0
Age                                           4
SibSp                                         1
Parch                                         1
Ticket                                  PP 9549
Fare                                       16.7
Cabin                                        G6
EmbarkedFill                                  S
Port                                          2
Child                                     Child
Family_size                                   3
Name: 10, dtype: object


In [24]:
# Use regex and str.extract method to extract title from name for test and train
train["Title"] = train["Name"].str.extract("\,\s(.*?)\." , expand=True)
train["Title"].str.strip(" ")
test["Title"] = test["Name"].str.extract("\,\s(.*?)\." , expand=True)
test["Title"].str.strip(" ")

# Print list of values and the count for that data frame series
train["Title"].value_counts(ascending = False)

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
Lady              1
Jonkheer          1
Don               1
Ms                1
Mme               1
Capt              1
the Countess      1
Sir               1
Name: Title, dtype: int64

In [25]:
#titles
train["Title"] = [x if x in ["Miss", "Mr", "Mrs", "Master", "Dr", "Rev"] else "Others" for x in train["Title"] ]
test["Title"] = [x if x in ["Miss", "Mr", "Mrs", "Master", "Dr", "Rev"] else "Others" for x in test["Title"] ]

In [26]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedFill,Port,Child,Family_size,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,unknown,S,2,Adult,2,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,0,Adult,2,Mrs


In [27]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedFill,Port,Child,Family_size,Title
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q,1,Adult,1,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S,2,Adult,2,Mrs
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q,1,Adult,1,Mr
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S,2,Adult,1,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S,2,Adult,3,Mrs


In [28]:
# create mothers

def mother(row):
  if row["Child"] == "Adult" and row["Sex"] == "female" and row["Title"] == "Mrs" and row["Parch"] > 0:
    return "Mother"
  else:
    return "Not Mother"

train["Mother"] = train.apply(mother, axis=1)
test["Mother"] = test.apply(mother, axis=1)

#print("See below and example record for who we believe to be a mother:\n")
#print(train.iloc[25])

In [29]:
z = {'Mother':1, 'Not Mother':0}
train['Mother'] = train['Mother'].map(z)
test['Mother'] = test['Mother'].map(z)

In [30]:
z = {'Child':1, 'Adult':0}
train['Child'] = train['Child'].map(z)
test['Child'] = test['Child'].map(z)

In [32]:
feature_cols = ['Pclass', 'Fare', 'Child', 'Family_size', 'Mother', 'Sex', 'Age','Port']
X = train[feature_cols]
y = train[['Survived']]

In [33]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)



In [34]:
X_train.shape, y_train.shape

((623, 8), (623, 1))

In [35]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
result = logreg.fit(X_train, y_train)
#logreg.score(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [36]:
#class predictions (not predicted probabilities)
y_pred_Survived = logreg.predict(X_test)

In [37]:
#y_pred_Survived = result.predict(X_test)

In [38]:
# calculate classification accuracy

from sklearn import metrics
print metrics.accuracy_score(y_test, y_pred_Survived)

0.798507462687


In [39]:
X_test.shape

(268, 8)

In [40]:
new_cols = ['Pclass', 'Fare', 'Child', 'Family_size', 'Mother', 'Sex', 'Age', 'Port']
x1 = test[new_cols]

In [41]:
x1.head()

Unnamed: 0,Pclass,Fare,Child,Family_size,Mother,Sex,Age,Port
0,3,7.8292,0,1,0,1,34.5,1
1,3,7.0,0,2,0,0,47.0,2
2,2,9.6875,0,1,0,1,62.0,1
3,3,8.6625,0,1,0,1,27.0,2
4,3,12.2875,0,3,0,0,22.0,2


In [42]:
x1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Pclass         418 non-null int64
Fare           418 non-null float64
Child          418 non-null int64
Family_size    418 non-null int64
Mother         418 non-null int64
Sex            418 non-null int64
Age            418 non-null float64
Port           418 non-null int64
dtypes: float64(2), int64(6)
memory usage: 26.2 KB


In [43]:
x1.shape

(418, 8)

In [44]:
test_Survived = result.predict(x1)

In [45]:
test_Survived

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [46]:
import numpy as np
test['Survived']= test_Survived

In [47]:
test.head(15)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedFill,Port,Child,Family_size,Title,Mother,Survived
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q,1,0,1,Mr,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S,2,0,2,Mrs,0,1
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q,1,0,1,Mr,0,0
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S,2,0,1,Mr,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S,2,0,3,Mrs,0,1
5,897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,7538,9.225,,S,2,1,1,Mr,0,0
6,898,3,"Connolly, Miss. Kate",0,30.0,0,0,330972,7.6292,,Q,1,0,1,Miss,0,1
7,899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,248738,29.0,,S,2,0,3,Mr,0,0
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",0,18.0,0,0,2657,7.2292,,C,0,0,1,Mrs,0,1
9,901,3,"Davies, Mr. John Samuel",1,21.0,2,0,A/4 48871,24.15,,S,2,0,3,Mr,0,0


In [48]:
x2 = test[["PassengerId","Survived"]]

In [49]:
x2.to_csv('LRFeatureEngineering3-2.csv', index=False)