In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
titanic = pd.concat([train, test], axis=0, ignore_index=True)
titanic.shape

(1309, 12)

In [4]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [6]:
titanic['Survived'] = titanic['Survived'].map({0.0: 0, 1.0: 1})
titanic['Survived'].head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

In [7]:
titanic['Name_tag'] = titanic['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
titanic['Name_tag'].head()

0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Name_tag, dtype: object

In [8]:
titanic.Name_tag.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [9]:
titanic.Name_tag.value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Major             2
Mlle              2
Ms                2
Dona              1
Lady              1
Sir               1
Capt              1
the Countess      1
Jonkheer          1
Don               1
Mme               1
Name: Name_tag, dtype: int64

In [10]:
pd.pivot_table(titanic, index='Name_tag', columns='Survived', values='Name', aggfunc='count').sort_values(by=1, ascending=False)

Survived,0.0,1.0
Name_tag,Unnamed: 1_level_1,Unnamed: 2_level_1
Miss,55.0,127.0
Mrs,26.0,99.0
Mr,436.0,81.0
Master,17.0,23.0
Dr,4.0,3.0
Mlle,,2.0
Col,1.0,1.0
Lady,,1.0
Major,1.0,1.0
Mme,,1.0


In [11]:
titanic['Is_married'] = 0
titanic['Is_married'].loc[titanic['Name_tag'] == 'Mrs'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [12]:
titanic['Name_tag_pro'] = titanic['Name_tag'].copy()

print(titanic['Name_tag_pro'].unique())
titanic['Name_tag_pro'].replace(to_replace=['Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Lady', 'Sir', 'Mlle', 'Col',
                                                           'Capt', 'the Countess', 'Jonkheer', 'Dona'], value='Other',inplace=True)
print(titanic['Name_tag_pro'].unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer' 'Dona']
['Mr' 'Mrs' 'Miss' 'Master' 'Other']


In [13]:
titanic[titanic['PassengerId'] == 444]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_tag,Is_married,Name_tag_pro
443,444,1.0,2,"Reynaldo, Ms. Encarnacion",female,28.0,0,0,230434,13.0,,S,Ms,0,Other


In [14]:
titanic['Sex'] = titanic['Sex'].map({'male':1, 'female':0})

In [15]:
titanic['Age'].fillna(titanic['Age'].mean(), inplace=True)
titanic['Age'] = titanic['Age'].astype(int)
bins = [0, 20, 40, 60, 80]
labels = ['young', 'young_adults', 'adults', 'old_adults']
titanic['Age_groups'] = pd.cut(titanic['Age'], bins=bins, labels=labels, include_lowest=True)
titanic.groupby('Age_groups').size()

Age_groups
young           249
young_adults    836
adults          192
old_adults       32
dtype: int64

In [16]:
titanic['Family'] = titanic['SibSp'] + titanic['Parch']

In [17]:
titanic['Is_alone'] = titanic['Family'] == 0

In [18]:
titanic['Is_alone'] = titanic['Is_alone'].astype(int)
titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_tag,Is_married,Name_tag_pro,Age_groups,Family,Is_alone
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.25,,S,Mr,0,Mr,young_adults,1,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,C85,C,Mrs,1,Mrs,young_adults,1,0
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.925,,S,Miss,0,Miss,young_adults,0,1


In [19]:
bins = [0,1,3,6,11]
labels = ['alone', 'duo', 'small', 'large']
titanic['Family_group'] = pd.cut(titanic['Family'], bins=bins, labels=labels, include_lowest=True, right=False)

In [20]:
titanic['Ticket_number'] = titanic['Ticket'].apply(lambda x: x.split(' ')[-1])

In [21]:
titanic[titanic['Ticket_number'] == 'LINE']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_tag,Is_married,Name_tag_pro,Age_groups,Family,Is_alone,Family_group,Ticket_number
179,180,0.0,3,"Leonard, Mr. Lionel",1,36,0,0,LINE,0.0,,S,Mr,0,Mr,young_adults,0,1,alone,LINE
271,272,1.0,3,"Tornquist, Mr. William Henry",1,25,0,0,LINE,0.0,,S,Mr,0,Mr,young_adults,0,1,alone,LINE
302,303,0.0,3,"Johnson, Mr. William Cahoone Jr",1,19,0,0,LINE,0.0,,S,Mr,0,Mr,young,0,1,alone,LINE
597,598,0.0,3,"Johnson, Mr. Alfred",1,49,0,0,LINE,0.0,,S,Mr,0,Mr,adults,0,1,alone,LINE


In [22]:
titanic['Fare'].fillna(titanic['Fare'].mean(), inplace=True)

bins = [0, 8, 16, 30, 513]
labels = ['low', 'middle', 'high', 'top']
titanic['Fare_groups'] = pd.cut(titanic['Fare'], bins=bins, labels=labels, include_lowest=True)
titanic.groupby('Fare_groups').size()

Fare_groups
low       360
middle    356
high      249
top       344
dtype: int64

In [23]:
pd.pivot_table(titanic, index='Fare_groups', columns='Survived', values='Name', aggfunc='count')

Survived,0.0,1.0
Fare_groups,Unnamed: 1_level_1,Unnamed: 2_level_1
low,189,52
middle,168,76
high,94,78
top,98,136


In [24]:
titanic['Embarked'].fillna('S', inplace=True)

In [25]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,Name_tag,Is_married,Name_tag_pro,Age_groups,Family,Is_alone,Family_group,Ticket_number,Fare_groups
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.25,...,S,Mr,0,Mr,young_adults,1,0,duo,21171,low
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,...,C,Mrs,1,Mrs,young_adults,1,0,duo,17599,top
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.925,...,S,Miss,0,Miss,young_adults,0,1,alone,3101282,low
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.1,...,S,Mrs,1,Mrs,young_adults,1,0,duo,113803,top
4,5,0.0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.05,...,S,Mr,0,Mr,young_adults,0,1,alone,373450,middle


In [26]:
titanic.isnull().sum()

PassengerId         0
Survived          418
Pclass              0
Name                0
Sex                 0
Age                 0
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin            1014
Embarked            0
Name_tag            0
Is_married          0
Name_tag_pro        0
Age_groups          0
Family              0
Is_alone            0
Family_group        0
Ticket_number       0
Fare_groups         0
dtype: int64

In [27]:
titanic.nunique()

PassengerId      1309
Survived            2
Pclass              3
Name             1307
Sex                 2
Age                73
SibSp               7
Parch               8
Ticket            929
Fare              282
Cabin             186
Embarked            3
Name_tag           18
Is_married          2
Name_tag_pro        5
Age_groups          4
Family              9
Is_alone            2
Family_group        4
Ticket_number     924
Fare_groups         4
dtype: int64

In [28]:
new = titanic.copy()
new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,Name_tag,Is_married,Name_tag_pro,Age_groups,Family,Is_alone,Family_group,Ticket_number,Fare_groups
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.25,...,S,Mr,0,Mr,young_adults,1,0,duo,21171,low
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,...,C,Mrs,1,Mrs,young_adults,1,0,duo,17599,top
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.925,...,S,Miss,0,Miss,young_adults,0,1,alone,3101282,low
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.1,...,S,Mrs,1,Mrs,young_adults,1,0,duo,113803,top
4,5,0.0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.05,...,S,Mr,0,Mr,young_adults,0,1,alone,373450,middle


In [29]:
new.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Name_tag',
       'Is_married', 'Name_tag_pro', 'Age_groups', 'Family', 'Is_alone',
       'Family_group', 'Ticket_number', 'Fare_groups'],
      dtype='object')

In [30]:
drop_cols = ['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name_tag', 'Family', 'Ticket_number']
new.drop(columns = drop_cols, inplace=True)
new.columns

Index(['Survived', 'Pclass', 'Sex', 'Embarked', 'Is_married', 'Name_tag_pro',
       'Age_groups', 'Is_alone', 'Family_group', 'Fare_groups'],
      dtype='object')

In [31]:
new_train = new.loc[:890]
new_test = new.loc[891:]
print(new_train.shape)
print(new_test.shape)

(891, 10)
(418, 10)


In [32]:
new_train['Survived'] = new_train['Survived'].map({0.0: 0, 1.0: 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train['Survived'] = new_train['Survived'].map({0.0: 0, 1.0: 1})


In [33]:
new_test.drop(columns='Survived', inplace=True)
new_test.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


(418, 9)

In [34]:
new_train.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,Is_married,Name_tag_pro,Age_groups,Is_alone,Family_group,Fare_groups
0,0,3,1,S,0,Mr,young_adults,0,duo,low
1,1,1,0,C,1,Mrs,young_adults,0,duo,top
2,1,3,0,S,0,Miss,young_adults,1,alone,low
3,1,1,0,S,1,Mrs,young_adults,0,duo,top
4,0,3,1,S,0,Mr,young_adults,1,alone,middle


In [35]:
train_dummies = pd.get_dummies(new_train, columns = ['Pclass', 'Sex', 'Embarked', 'Is_married', 'Name_tag_pro', 'Age_groups', 'Is_alone', 'Family_group', 'Fare_groups'])

In [36]:
train_dummies

Unnamed: 0,Survived,Pclass_1,Pclass_2,Pclass_3,Sex_0,Sex_1,Embarked_C,Embarked_Q,Embarked_S,Is_married_0,...,Is_alone_0,Is_alone_1,Family_group_alone,Family_group_duo,Family_group_small,Family_group_large,Fare_groups_low,Fare_groups_middle,Fare_groups_high,Fare_groups_top
0,0,0,0,1,0,1,0,0,1,1,...,1,0,0,1,0,0,1,0,0,0
1,1,1,0,0,1,0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
2,1,0,0,1,1,0,0,0,1,1,...,0,1,1,0,0,0,1,0,0,0
3,1,1,0,0,1,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1
4,0,0,0,1,0,1,0,0,1,1,...,0,1,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,1,0,0,1,0,0,1,1,...,0,1,1,0,0,0,0,1,0,0
887,1,1,0,0,1,0,0,0,1,1,...,0,1,1,0,0,0,0,0,1,0
888,0,0,0,1,1,0,0,0,1,1,...,1,0,0,0,1,0,0,0,1,0
889,1,1,0,0,0,1,1,0,0,1,...,0,1,1,0,0,0,0,0,1,0


In [37]:
train_dummies.shape

(891, 30)

In [38]:
X = train_dummies.iloc[:, 1:]
y = train_dummies.iloc[:, 0]
print(X.shape)
print(y.shape)

(891, 29)
(891,)


In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [40]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.score(X_train, y_train), logreg.score(X_test, y_test)

(0.8323353293413174, 0.820627802690583)

In [41]:
from sklearn.neighbors import KNeighborsClassifier
logreg = KNeighborsClassifier(n_neighbors=3)
logreg.fit(X_train, y_train)
logreg.score(X_train, y_train), logreg.score(X_test, y_test)

(0.8383233532934131, 0.8116591928251121)

In [42]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.8023952095808383, 0.7713004484304933)

In [43]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.8517964071856288, 0.8295964125560538)

In [44]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.8787425149700598, 0.7937219730941704)

In [45]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.8787425149700598, 0.8071748878923767)

In [47]:
help(GradientBoostingClassifier)

Help on class GradientBoostingClassifier in module sklearn.ensemble._gb:

class GradientBoostingClassifier(sklearn.base.ClassifierMixin, BaseGradientBoosting)
 |  GradientBoostingClassifier(*, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
 |  
 |  Gradient Boosting for classification.
 |  
 |  GB builds an additive model in a
 |  forward stage-wise fashion; it allows for the optimization of
 |  arbitrary differentiable loss functions. In each stage ``n_classes_``
 |  regression trees are fit on the negative gradient of the
 |  binomial or multinomial deviance loss function. Binary classification
 |  is a special case where only a sin